diff options
Diffstat (limited to 'net/ipv4')
85 files changed, 3887 insertions, 2863 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 70491d9035eb..0c94a1ac2946 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -166,7 +166,7 @@ config IP_PNP_DHCP If unsure, say Y. Note that if you want to use DHCP, a DHCP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_BOOTP bool "IP: BOOTP support" @@ -181,7 +181,7 @@ config IP_PNP_BOOTP does BOOTP itself, providing all necessary information on the kernel command line, you can say N here. If unsure, say Y. Note that if you want to use BOOTP, a BOOTP server must be operating on your network. - Read <file:Documentation/filesystems/nfsroot.txt> for details. + Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details. config IP_PNP_RARP bool "IP: RARP support" @@ -194,7 +194,7 @@ config IP_PNP_RARP older protocol which is being obsoleted by BOOTP and DHCP), say Y here. Note that if you want to use RARP, a RARP server must be operating on your network. Read - <file:Documentation/filesystems/nfsroot.txt> for details. + <file:Documentation/filesystems/nfs/nfsroot.txt> for details. # not yet ready.. # bool ' IP: ARP support' CONFIG_IP_PNP_ARP diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 57737b8d1711..f71357422380 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -86,6 +86,7 @@ #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> @@ -174,12 +175,12 @@ static int inet_autobind(struct sock *sk) /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); - if (!inet->num) { + if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } - inet->sport = htons(inet->num); + inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; @@ -262,7 +263,8 @@ static inline int inet_netns_ok(struct net *net, int protocol) * Create an inet socket. */ -static int inet_create(struct net *net, struct socket *sock, int protocol) +static int inet_create(struct net *net, struct socket *sock, int protocol, + int kern) { struct sock *sk; struct inet_protosw *answer; @@ -325,7 +327,7 @@ lookup_protocol: } err = -EPERM; - if (answer->capability > 0 && !capable(answer->capability)) + if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW)) goto out_rcu_unlock; err = -EAFNOSUPPORT; @@ -354,7 +356,7 @@ lookup_protocol: inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0; if (SOCK_RAW == sock->type) { - inet->num = protocol; + inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet->hdrincl = 1; } @@ -364,7 +366,7 @@ lookup_protocol: else inet->pmtudisc = IP_PMTUDISC_WANT; - inet->id = 0; + inet->inet_id = 0; sock_init_data(sock, sk); @@ -381,13 +383,13 @@ lookup_protocol: sk_refcnt_debug_inc(sk); - if (inet->num) { + if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ - inet->sport = htons(inet->num); + inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ sk->sk_prot->hash(sk); } @@ -494,27 +496,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* Check these errors (active socket, double bind). */ err = -EINVAL; - if (sk->sk_state != TCP_CLOSE || inet->num) + if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) - inet->saddr = 0; /* Use device */ + inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (sk->sk_prot->get_port(sk, snum)) { - inet->saddr = inet->rcv_saddr = 0; + inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; goto out_release_sock; } - if (inet->rcv_saddr) + if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; - inet->sport = htons(inet->num); - inet->daddr = 0; - inet->dport = 0; + inet->inet_sport = htons(inet->inet_num); + inet->inet_daddr = 0; + inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: @@ -529,10 +531,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, { struct sock *sk = sock->sk; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); } @@ -572,6 +576,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; + if (addr_len < sizeof(uaddr->sa_family)) + return -EINVAL; + lock_sock(sk); if (uaddr->sa_family == AF_UNSPEC) { @@ -685,21 +692,21 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); - struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); sin->sin_family = AF_INET; if (peer) { - if (!inet->dport || + if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) return -ENOTCONN; - sin->sin_port = inet->dport; - sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->inet_dport; + sin->sin_addr.s_addr = inet->inet_daddr; } else { - __be32 addr = inet->rcv_saddr; + __be32 addr = inet->inet_rcv_saddr; if (!addr) - addr = inet->saddr; - sin->sin_port = inet->sport; + addr = inet->inet_saddr; + sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); @@ -714,7 +721,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->sendmsg(iocb, sk, msg, size); @@ -728,7 +735,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, struct sock *sk = sock->sk; /* We may need to bind the socket. */ - if (!inet_sk(sk)->num && inet_autobind(sk)) + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; if (sk->sk_prot->sendpage) @@ -931,7 +938,7 @@ static const struct proto_ops inet_sockraw_ops = { #endif }; -static struct net_proto_family inet_family_ops = { +static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, @@ -947,7 +954,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, - .capability = -1, .no_check = 0, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, @@ -958,7 +964,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_PERMANENT, }, @@ -969,7 +974,6 @@ static struct inet_protosw inetsw_array[] = .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, - .capability = CAP_NET_RAW, .no_check = UDP_CSUM_DEFAULT, .flags = INET_PROTOSW_REUSE, } @@ -1059,9 +1063,9 @@ static int inet_sk_reselect_saddr(struct sock *sk) struct inet_sock *inet = inet_sk(sk); int err; struct rtable *rt; - __be32 old_saddr = inet->saddr; + __be32 old_saddr = inet->inet_saddr; __be32 new_saddr; - __be32 daddr = inet->daddr; + __be32 daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; @@ -1071,7 +1075,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, sk->sk_protocol, - inet->sport, inet->dport, sk, 0); + inet->inet_sport, inet->inet_dport, sk, 0); if (err) return err; @@ -1087,7 +1091,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) __func__, &old_saddr, &new_saddr); } - inet->saddr = inet->rcv_saddr = new_saddr; + inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; /* * XXX The only one ugly spot where we need to @@ -1113,7 +1117,7 @@ int inet_sk_rebuild_header(struct sock *sk) return 0; /* Reroute. */ - daddr = inet->daddr; + daddr = inet->inet_daddr; if (inet->opt && inet->opt->srr) daddr = inet->opt->faddr; { @@ -1123,7 +1127,7 @@ int inet_sk_rebuild_header(struct sock *sk) .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = inet->saddr, + .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk), }, }, @@ -1131,8 +1135,8 @@ int inet_sk_rebuild_header(struct sock *sk) .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = { - .sport = inet->sport, - .dport = inet->dport, + .sport = inet->inet_sport, + .dport = inet->inet_dport, }, }, }; @@ -1387,7 +1391,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family, } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); -unsigned long snmp_fold_field(void *mib[], int offt) +unsigned long snmp_fold_field(void __percpu *mib[], int offt) { unsigned long res = 0; int i; @@ -1400,7 +1404,7 @@ unsigned long snmp_fold_field(void *mib[], int offt) } EXPORT_SYMBOL_GPL(snmp_fold_field); -int snmp_mib_init(void *ptr[2], size_t mibsize) +int snmp_mib_init(void __percpu *ptr[2], size_t mibsize) { BUG_ON(ptr == NULL); ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long)); @@ -1418,7 +1422,7 @@ err0: } EXPORT_SYMBOL_GPL(snmp_mib_init); -void snmp_mib_free(void *ptr[2]) +void snmp_mib_free(void __percpu *ptr[2]) { BUG_ON(ptr == NULL); free_percpu(ptr[0]); @@ -1462,25 +1466,25 @@ static const struct net_protocol icmp_protocol = { static __net_init int ipv4_mib_init_net(struct net *net) { - if (snmp_mib_init((void **)net->mib.tcp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics, sizeof(struct tcp_mib)) < 0) goto err_tcp_mib; - if (snmp_mib_init((void **)net->mib.ip_statistics, + if (snmp_mib_init((void __percpu **)net->mib.ip_statistics, sizeof(struct ipstats_mib)) < 0) goto err_ip_mib; - if (snmp_mib_init((void **)net->mib.net_statistics, + if (snmp_mib_init((void __percpu **)net->mib.net_statistics, sizeof(struct linux_mib)) < 0) goto err_net_mib; - if (snmp_mib_init((void **)net->mib.udp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.udp_statistics, sizeof(struct udp_mib)) < 0) goto err_udp_mib; - if (snmp_mib_init((void **)net->mib.udplite_statistics, + if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics, sizeof(struct udp_mib)) < 0) goto err_udplite_mib; - if (snmp_mib_init((void **)net->mib.icmp_statistics, + if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics, sizeof(struct icmp_mib)) < 0) goto err_icmp_mib; - if (snmp_mib_init((void **)net->mib.icmpmsg_statistics, + if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics, sizeof(struct icmpmsg_mib)) < 0) goto err_icmpmsg_mib; @@ -1488,30 +1492,30 @@ static __net_init int ipv4_mib_init_net(struct net *net) return 0; err_icmpmsg_mib: - snmp_mib_free((void **)net->mib.icmp_statistics); + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); err_icmp_mib: - snmp_mib_free((void **)net->mib.udplite_statistics); + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); err_udplite_mib: - snmp_mib_free((void **)net->mib.udp_statistics); + snmp_mib_free((void __percpu **)net->mib.udp_statistics); err_udp_mib: - snmp_mib_free((void **)net->mib.net_statistics); + snmp_mib_free((void __percpu **)net->mib.net_statistics); err_net_mib: - snmp_mib_free((void **)net->mib.ip_statistics); + snmp_mib_free((void __percpu **)net->mib.ip_statistics); err_ip_mib: - snmp_mib_free((void **)net->mib.tcp_statistics); + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { - snmp_mib_free((void **)net->mib.icmpmsg_statistics); - snmp_mib_free((void **)net->mib.icmp_statistics); - snmp_mib_free((void **)net->mib.udplite_statistics); - snmp_mib_free((void **)net->mib.udp_statistics); - snmp_mib_free((void **)net->mib.net_statistics); - snmp_mib_free((void **)net->mib.ip_statistics); - snmp_mib_free((void **)net->mib.tcp_statistics); + snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics); + snmp_mib_free((void __percpu **)net->mib.icmp_statistics); + snmp_mib_free((void __percpu **)net->mib.udplite_statistics); + snmp_mib_free((void __percpu **)net->mib.udp_statistics); + snmp_mib_free((void __percpu **)net->mib.net_statistics); + snmp_mib_free((void __percpu **)net->mib.ip_statistics); + snmp_mib_free((void __percpu **)net->mib.tcp_statistics); } static __net_initdata struct pernet_operations ipv4_mib_ops = { diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 5c662703eb1e..880a5ec6dce0 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -1,14 +1,73 @@ +#include <crypto/hash.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> -#include <linux/spinlock.h> +#include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> +struct ah_skb_cb { + struct xfrm_skb_cb xfrm; + void *tmp; +}; + +#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) + +static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, + unsigned int size) +{ + unsigned int len; + + len = size + crypto_ahash_digestsize(ahash) + + (crypto_ahash_alignmask(ahash) & + ~(crypto_tfm_ctx_alignment() - 1)); + + len = ALIGN(len, crypto_tfm_ctx_alignment()); + + len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); + len = ALIGN(len, __alignof__(struct scatterlist)); + + len += sizeof(struct scatterlist) * nfrags; + + return kmalloc(len, GFP_ATOMIC); +} + +static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) +{ + return tmp + offset; +} + +static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp, + unsigned int offset) +{ + return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1); +} + +static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, + u8 *icv) +{ + struct ahash_request *req; + + req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), + crypto_tfm_ctx_alignment()); + + ahash_request_set_tfm(req, ahash); + + return req; +} + +static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, + struct ahash_request *req) +{ + return (void *)ALIGN((unsigned long)(req + 1) + + crypto_ahash_reqsize(ahash), + __alignof__(struct scatterlist)); +} /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked @@ -54,20 +113,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr) return 0; } +static void ah_output_done(struct crypto_async_request *base, int err) +{ + u8 *icv; + struct iphdr *iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = skb_dst(skb)->xfrm; + struct ah_data *ahp = x->data; + struct iphdr *top_iph = ip_hdr(skb); + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + + iph = AH_SKB_CB(skb)->tmp; + icv = ah_tmp_icv(ahp->ahash, iph, ihl); + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); + + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + + err = ah->nexthdr; + + kfree(AH_SKB_CB(skb)->tmp); + xfrm_output_resume(skb, err); +} + static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; + int nfrags; + int ihl; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; + + ahp = x->data; + ahash = ahp->ahash; + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; skb_push(skb, -skb_network_offset(skb)); + ah = ip_auth_hdr(skb); + ihl = ip_hdrlen(skb); + + err = -ENOMEM; + iph = ah_alloc_tmp(ahash, nfrags, ihl); + if (!iph) + goto out; + + icv = ah_tmp_icv(ahash, iph, ihl); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); + + memset(ah->auth_data, 0, ahp->icv_trunc_len); + top_iph = ip_hdr(skb); - iph = &tmp_iph.iph; iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; @@ -78,10 +189,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) - goto error; + goto out_free; } - ah = ip_auth_hdr(skb); ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; @@ -91,20 +201,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) top_iph->ttl = 0; top_iph->check = 0; - ahp = x->data; ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output); - spin_lock_bh(&x->lock); - err = ah_mac_digest(ahp, skb, ah->auth_data); - memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len); - spin_unlock_bh(&x->lock); + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); - if (err) - goto error; + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_output_done, skb); + + AH_SKB_CB(skb)->tmp = iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; + } + + memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; @@ -114,28 +235,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } - err = 0; - -error: +out_free: + kfree(iph); +out: return err; } +static void ah_input_done(struct crypto_async_request *base, int err) +{ + u8 *auth_data; + u8 *icv; + struct iphdr *work_iph; + struct sk_buff *skb = base->data; + struct xfrm_state *x = xfrm_input_state(skb); + struct ah_data *ahp = x->data; + struct ip_auth_hdr *ah = ip_auth_hdr(skb); + int ihl = ip_hdrlen(skb); + int ah_hlen = (ah->hdrlen + 2) << 2; + + work_iph = AH_SKB_CB(skb)->tmp; + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len); + + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; + if (err) + goto out; + + skb->network_header += ah_hlen; + memcpy(skb_network_header(skb), work_iph, ihl); + __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); + + err = ah->nexthdr; +out: + kfree(AH_SKB_CB(skb)->tmp); + xfrm_input_resume(skb, err); +} + static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; - int err = -EINVAL; - struct iphdr *iph; + int nfrags; + u8 *auth_data; + u8 *icv; + struct sk_buff *trailer; + struct crypto_ahash *ahash; + struct ahash_request *req; + struct scatterlist *sg; + struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; - char work_buf[60]; + int err = -ENOMEM; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; + ahash = ahp->ahash; + nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; @@ -156,9 +316,24 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); + ihl = ip_hdrlen(skb); + + if ((err = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + nfrags = err; + + work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len); + if (!work_iph) + goto out; + + auth_data = ah_tmp_auth(work_iph, ihl); + icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len); + req = ah_tmp_req(ahash, icv); + sg = ah_req_sg(ahash, req); - ihl = skb->data - skb_network_header(skb); - memcpy(work_buf, iph, ihl); + memcpy(work_iph, iph, ihl); + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; @@ -166,35 +341,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; - if (ip_clear_mutable_options(iph, &dummy)) - goto out; + err = ip_clear_mutable_options(iph, &dummy); + if (err) + goto out_free; } - spin_lock(&x->lock); - { - u8 auth_data[MAX_AH_AUTH_LEN]; + skb_push(skb, ihl); - memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, ihl); - err = ah_mac_digest(ahp, skb, ah->auth_data); - if (err) - goto unlock; - if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len)) - err = -EBADMSG; + sg_init_table(sg, nfrags); + skb_to_sgvec(skb, sg, 0, skb->len); + + ahash_request_set_crypt(req, sg, icv, skb->len); + ahash_request_set_callback(req, 0, ah_input_done, skb); + + AH_SKB_CB(skb)->tmp = work_iph; + + err = crypto_ahash_digest(req); + if (err) { + if (err == -EINPROGRESS) + goto out; + + if (err == -EBUSY) + err = NET_XMIT_DROP; + goto out_free; } -unlock: - spin_unlock(&x->lock); + err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0; if (err) - goto out; + goto out_free; skb->network_header += ah_hlen; - memcpy(skb_network_header(skb), work_buf, ihl); - skb->transport_header = skb->network_header; + memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); + skb_set_transport_header(skb, -ihl); - return nexthdr; + err = nexthdr; +out_free: + kfree (work_iph); out: return err; } @@ -210,7 +394,7 @@ static void ah4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return; printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", @@ -222,7 +406,7 @@ static int ah_init_state(struct xfrm_state *x) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; - struct crypto_hash *tfm; + struct crypto_ahash *ahash; if (!x->aalg) goto error; @@ -231,44 +415,40 @@ static int ah_init_state(struct xfrm_state *x) goto error; ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); - if (ahp == NULL) + if (!ahp) return -ENOMEM; - tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) + ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); + if (IS_ERR(ahash)) goto error; - ahp->tfm = tfm; - if (crypto_hash_setkey(tfm, x->aalg->alg_key, - (x->aalg->alg_key_len + 7) / 8)) + ahp->ahash = ahash; + if (crypto_ahash_setkey(ahash, x->aalg->alg_key, + (x->aalg->alg_key_len + 7) / 8)) goto error; /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here - * after a successful crypto_alloc_hash(). + * after a successful crypto_alloc_ahash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != - crypto_hash_digestsize(tfm)) { + crypto_ahash_digestsize(ahash)) { printk(KERN_INFO "AH: %s digestsize %u != %hu\n", - x->aalg->alg_name, crypto_hash_digestsize(tfm), + x->aalg->alg_name, crypto_ahash_digestsize(ahash), aalg_desc->uinfo.auth.icv_fullbits/8); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; - ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); - ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); - if (!ahp->work_icv) - goto error; - x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) @@ -279,8 +459,7 @@ static int ah_init_state(struct xfrm_state *x) error: if (ahp) { - kfree(ahp->work_icv); - crypto_free_hash(ahp->tfm); + crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; @@ -293,8 +472,7 @@ static void ah_destroy(struct xfrm_state *x) if (!ahp) return; - kfree(ahp->work_icv); - crypto_free_hash(ahp->tfm); + crypto_free_ahash(ahp->ahash); kfree(ahp); } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4e80f336c0cf..6e747065c202 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -70,6 +70,7 @@ * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash + * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. */ #include <linux/module.h> @@ -97,6 +98,7 @@ #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/jhash.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -524,12 +526,15 @@ int arp_bind_neighbour(struct dst_entry *dst) /* * Check if we can use proxy ARP for this path */ - -static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +static inline int arp_fwd_proxy(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; + if (rt->u.dst.dev == dev) + return 0; + if (!IN_DEV_PROXY_ARP(in_dev)) return 0; @@ -548,6 +553,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) } /* + * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) + * + * RFC3069 supports proxy arp replies back to the same interface. This + * is done to support (ethernet) switch features, like RFC 3069, where + * the individual ports are not allowed to communicate with each + * other, BUT they are allowed to talk to the upstream router. As + * described in RFC 3069, it is possible to allow these hosts to + * communicate through the upstream router, by proxy_arp'ing. + * + * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" + * + * This technology is known by different names: + * In RFC 3069 it is called VLAN Aggregation. + * Cisco and Allied Telesyn call it Private VLAN. + * Hewlett-Packard call it Source-Port filtering or port-isolation. + * Ericsson call it MAC-Forced Forwarding (RFC Draft). + * + */ +static inline int arp_fwd_pvlan(struct in_device *in_dev, + struct net_device *dev, struct rtable *rt, + __be32 sip, __be32 tip) +{ + /* Private VLAN is only concerned about the same ethernet segment */ + if (rt->u.dst.dev != dev) + return 0; + + /* Don't reply on self probes (often done by windowz boxes)*/ + if (sip == tip) + return 0; + + if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) + return 1; + else + return 0; +} + +/* * Interface to link layer: send routine and receive handler. */ @@ -833,8 +875,11 @@ static int arp_process(struct sk_buff *skb) } goto out; } else if (IN_DEV_FORWARD(in_dev)) { - if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && - (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + if (addr_type == RTN_UNICAST && + (arp_fwd_proxy(in_dev, dev, rt) || + arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) + { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -863,7 +908,8 @@ static int arp_process(struct sk_buff *skb) devices (strip is candidate) */ if (n == NULL && - arp->ar_op == htons(ARPOP_REPLY) && + (arp->ar_op == htons(ARPOP_REPLY) || + (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) && inet_addr_type(net, sip) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } @@ -1239,8 +1285,7 @@ void __init arp_init(void) dev_add_pack(&arp_packet_type); arp_proc_init(); #ifdef CONFIG_SYSCTL - neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); + neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); } diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 039cc1ffe977..c97cd9ff697e 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -44,6 +44,7 @@ #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> @@ -2017,7 +2018,7 @@ req_setattr_failure: * values on failure. * */ -int cipso_v4_delopt(struct ip_options **opt_ptr) +static int cipso_v4_delopt(struct ip_options **opt_ptr) { int hdr_delta = 0; struct ip_options *opt = *opt_ptr; diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 5e6c5a0f3fde..fb2465811b48 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); oif = sk->sk_bound_dev_if; - saddr = inet->saddr; + saddr = inet->inet_saddr; if (ipv4_is_multicast(usin->sin_addr.s_addr)) { if (!oif) oif = inet->mc_index; @@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, RT_CONN_FLAGS(sk), oif, sk->sk_protocol, - inet->sport, usin->sin_port, sk, 1); + inet->inet_sport, usin->sin_port, sk, 1); if (err) { if (err == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); @@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) ip_rt_put(rt); return -EACCES; } - if (!inet->saddr) - inet->saddr = rt->rt_src; /* Update source address */ - if (!inet->rcv_saddr) - inet->rcv_saddr = rt->rt_src; - inet->daddr = rt->rt_dst; - inet->dport = usin->sin_port; + if (!inet->inet_saddr) + inet->inet_saddr = rt->rt_src; /* Update source address */ + if (!inet->inet_rcv_saddr) + inet->inet_rcv_saddr = rt->rt_src; + inet->inet_daddr = rt->rt_dst; + inet->inet_dport = usin->sin_port; sk->sk_state = TCP_ESTABLISHED; - inet->id = jiffies; + inet->inet_id = jiffies; sk_dst_set(sk, &rt->u.dst); return(0); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 5df2f6a0b0f0..90e3d6379a42 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -50,6 +50,7 @@ #include <linux/notifier.h> #include <linux/inetdevice.h> #include <linux/igmp.h> +#include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif @@ -64,20 +65,20 @@ static struct ipv4_devconf ipv4_devconf = { .data = { - [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, }, }; static struct ipv4_devconf ipv4_devconf_dflt = { .data = { - [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1, - [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1, - [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1, + [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1, + [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1, + [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1, }, }; @@ -140,11 +141,11 @@ void in_dev_finish_destroy(struct in_device *idev) #endif dev_put(dev); if (!idev->dead) - printk("Freeing alive in_device %p\n", idev); - else { + pr_err("Freeing alive in_device %p\n", idev); + else kfree(idev); - } } +EXPORT_SYMBOL(in_dev_finish_destroy); static struct in_device *inetdev_init(struct net_device *dev) { @@ -159,7 +160,8 @@ static struct in_device *inetdev_init(struct net_device *dev) sizeof(in_dev->cnf)); in_dev->cnf.sysctl = NULL; in_dev->dev = dev; - if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) + in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl); + if (!in_dev->arp_parms) goto out_kfree; if (IPV4_DEVCONF(in_dev->cnf, FORWARDING)) dev_disable_lro(dev); @@ -405,13 +407,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; struct in_device *in_dev = NULL; - read_lock(&dev_base_lock); - dev = __dev_get_by_index(net, ifindex); + + rcu_read_lock(); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) in_dev = in_dev_get(dev); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return in_dev; } +EXPORT_SYMBOL(inetdev_by_index); /* Called only from RTNL semaphored context. No locks. */ @@ -557,7 +561,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg * Determine a default network mask, based on the IP address. */ -static __inline__ int inet_abc_len(__be32 addr) +static inline int inet_abc_len(__be32 addr) { int rc = -1; /* Something else, probably a multicast. */ @@ -646,13 +650,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); ret = -ENODEV; - if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL) + dev = __dev_get_by_name(net, ifr.ifr_name); + if (!dev) goto done; if (colon) *colon = ':'; - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { if (tryaddrmatch) { /* Matthias Andree */ /* compare label and address (4.4BSD style) */ @@ -720,7 +726,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (!ifa) { ret = -ENOBUFS; - if ((ifa = inet_alloc_ifa()) == NULL) + ifa = inet_alloc_ifa(); + if (!ifa) break; if (colon) memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); @@ -822,10 +829,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) struct ifreq ifr; int done = 0; - if (!in_dev || (ifa = in_dev->ifa_list) == NULL) + if (!in_dev) goto out; - for (; ifa; ifa = ifa->ifa_next) { + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { if (!buf) { done += sizeof(ifr); continue; @@ -875,36 +882,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!addr) addr = ifa->ifa_local; } endfor_ifa(in_dev); -no_in_dev: - rcu_read_unlock(); if (addr) - goto out; + goto out_unlock; +no_in_dev: /* Not loopback addresses on loopback should be preferred in this case. It is importnat that lo is the first interface in dev_base list. */ - read_lock(&dev_base_lock); - rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) continue; for_primary_ifa(in_dev) { if (ifa->ifa_scope != RT_SCOPE_LINK && ifa->ifa_scope <= scope) { addr = ifa->ifa_local; - goto out_unlock_both; + goto out_unlock; } } endfor_ifa(in_dev); } -out_unlock_both: - read_unlock(&dev_base_lock); +out_unlock: rcu_read_unlock(); -out: return addr; } +EXPORT_SYMBOL(inet_select_addr); static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, __be32 local, int scope) @@ -940,7 +944,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst, } } endfor_ifa(in_dev); - return same? addr : 0; + return same ? addr : 0; } /* @@ -961,17 +965,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev, return confirm_addr_indev(in_dev, dst, local, scope); net = dev_net(in_dev->dev); - read_lock(&dev_base_lock); rcu_read_lock(); - for_each_netdev(net, dev) { - if ((in_dev = __in_dev_get_rcu(dev))) { + for_each_netdev_rcu(net, dev) { + in_dev = __in_dev_get_rcu(dev); + if (in_dev) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) break; } } rcu_read_unlock(); - read_unlock(&dev_base_lock); return addr; } @@ -984,14 +987,16 @@ int register_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&inetaddr_chain, nb); } +EXPORT_SYMBOL(register_inetaddr_notifier); int unregister_inetaddr_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&inetaddr_chain, nb); } +EXPORT_SYMBOL(unregister_inetaddr_notifier); -/* Rename ifa_labels for a device name change. Make some effort to preserve existing - * alias numbering and to create unique labels if possible. +/* Rename ifa_labels for a device name change. Make some effort to preserve + * existing alias numbering and to create unique labels if possible. */ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) { @@ -1010,11 +1015,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) sprintf(old, ":%d", named); dot = old; } - if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) strcat(ifa->ifa_label, dot); - } else { + else strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); - } skip: rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0); } @@ -1061,8 +1065,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, if (!inetdev_valid_mtu(dev->mtu)) break; if (dev->flags & IFF_LOOPBACK) { - struct in_ifaddr *ifa; - if ((ifa = inet_alloc_ifa()) != NULL) { + struct in_ifaddr *ifa = inet_alloc_ifa(); + + if (ifa) { ifa->ifa_local = ifa->ifa_address = htonl(INADDR_LOOPBACK); ifa->ifa_prefixlen = 8; @@ -1170,38 +1175,54 @@ nla_put_failure: static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - int idx, ip_idx; + int h, s_h; + int idx, s_idx; + int ip_idx, s_ip_idx; struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; - int s_ip_idx, s_idx = cb->args[0]; + struct hlist_head *head; + struct hlist_node *node; - s_ip_idx = ip_idx = cb->args[1]; - idx = 0; - for_each_netdev(net, dev) { - if (idx < s_idx) - goto cont; - if (idx > s_idx) - s_ip_idx = 0; - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) - goto cont; - - for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; - ifa = ifa->ifa_next, ip_idx++) { - if (ip_idx < s_ip_idx) - continue; - if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + s_ip_idx = ip_idx = cb->args[2]; + + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { + idx = 0; + head = &net->dev_index_head[h]; + rcu_read_lock(); + hlist_for_each_entry_rcu(dev, node, head, index_hlist) { + if (idx < s_idx) + goto cont; + if (h > s_h || idx > s_idx) + s_ip_idx = 0; + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + goto cont; + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, - RTM_NEWADDR, NLM_F_MULTI) <= 0) - goto done; - } + RTM_NEWADDR, NLM_F_MULTI) <= 0) { + rcu_read_unlock(); + goto done; + } + } cont: - idx++; + idx++; + } + rcu_read_unlock(); } done: - cb->args[0] = idx; - cb->args[1] = ip_idx; + cb->args[0] = h; + cb->args[1] = idx; + cb->args[2] = ip_idx; return skb->len; } @@ -1239,18 +1260,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i) { struct net_device *dev; - read_lock(&dev_base_lock); - for_each_netdev(net, dev) { + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { struct in_device *in_dev; - rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev); if (in_dev && !test_bit(i, in_dev->cnf.state)) in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i]; - rcu_read_unlock(); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); } +/* called with RTNL locked */ static void inet_forward_change(struct net *net) { struct net_device *dev; @@ -1259,7 +1280,6 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - read_lock(&dev_base_lock); for_each_netdev(net, dev) { struct in_device *in_dev; if (on) @@ -1270,7 +1290,6 @@ static void inet_forward_change(struct net *net) IN_DEV_CONF_SET(in_dev, FORWARDING, on); rcu_read_unlock(); } - read_unlock(&dev_base_lock); } static int devinet_conf_proc(ctl_table *ctl, int write, @@ -1293,72 +1312,25 @@ static int devinet_conf_proc(ctl_table *ctl, int write, return ret; } -static int devinet_conf_sysctl(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - struct ipv4_devconf *cnf; - struct net *net; - int *valp = table->data; - int new; - int i; - - if (!newval || !newlen) - return 0; - - if (newlen != sizeof(int)) - return -EINVAL; - - if (get_user(new, (int __user *)newval)) - return -EFAULT; - - if (new == *valp) - return 0; - - if (oldval && oldlenp) { - size_t len; - - if (get_user(len, oldlenp)) - return -EFAULT; - - if (len) { - if (len > table->maxlen) - len = table->maxlen; - if (copy_to_user(oldval, valp, len)) - return -EFAULT; - if (put_user(len, oldlenp)) - return -EFAULT; - } - } - - *valp = new; - - cnf = table->extra1; - net = table->extra2; - i = (int *)table->data - cnf->data; - - set_bit(i, cnf->state); - - if (cnf == net->ipv4.devconf_dflt) - devinet_copy_dflt_conf(net, i); - - return 1; -} - static int devinet_sysctl_forward(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; + loff_t pos = *ppos; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { struct net *net = ctl->extra2; if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) { - if (!rtnl_trylock()) + if (!rtnl_trylock()) { + /* Restore the original values before restarting */ + *valp = val; + *ppos = pos; return restart_syscall(); + } if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) { inet_forward_change(net); } else if (*valp) { @@ -1390,57 +1362,37 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write, return ret; } -int ipv4_doint_and_flush_strategy(ctl_table *table, - void __user *oldval, size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen); - struct net *net = table->extra2; - - if (ret == 1) - rt_cache_flush(net, 0); - - return ret; -} - - -#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \ +#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \ { \ - .ctl_name = NET_IPV4_CONF_ ## attr, \ .procname = name, \ .data = ipv4_devconf.data + \ - NET_IPV4_CONF_ ## attr - 1, \ + IPV4_DEVCONF_ ## attr - 1, \ .maxlen = sizeof(int), \ .mode = mval, \ .proc_handler = proc, \ - .strategy = sysctl, \ .extra1 = &ipv4_devconf, \ } #define DEVINET_SYSCTL_RW_ENTRY(attr, name) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \ - devinet_conf_sysctl) + DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc) #define DEVINET_SYSCTL_RO_ENTRY(attr, name) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \ - devinet_conf_sysctl) + DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc) -#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \ - DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl) +#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \ + DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc) #define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \ - DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \ - ipv4_doint_and_flush_strategy) + DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush) static struct devinet_sysctl_table { struct ctl_table_header *sysctl_header; - struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX]; char *dev_name; } devinet_sysctl = { .devinet_vars = { DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding", - devinet_sysctl_forward, - devinet_conf_sysctl), + devinet_sysctl_forward), DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"), @@ -1450,6 +1402,8 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"), DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE, "accept_source_route"), + DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"), + DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"), DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"), DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"), DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"), @@ -1460,6 +1414,7 @@ static struct devinet_sysctl_table { DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"), DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"), DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"), + DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"), DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"), @@ -1471,7 +1426,7 @@ static struct devinet_sysctl_table { }; static int __devinet_sysctl_register(struct net *net, char *dev_name, - int ctl_name, struct ipv4_devconf *p) + struct ipv4_devconf *p) { int i; struct devinet_sysctl_table *t; @@ -1479,9 +1434,9 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, #define DEVINET_CTL_PATH_DEV 3 struct ctl_path devinet_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "conf", .ctl_name = NET_IPV4_CONF, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "conf", }, { /* to be set */ }, { }, }; @@ -1506,7 +1461,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, goto free; devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name; - devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name; t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path, t->devinet_vars); @@ -1539,10 +1493,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) static void devinet_sysctl_register(struct in_device *idev) { - neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4, - NET_IPV4_NEIGH, "ipv4", NULL, NULL); + neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL); __devinet_sysctl_register(dev_net(idev->dev), idev->dev->name, - idev->dev->ifindex, &idev->cnf); + &idev->cnf); } static void devinet_sysctl_unregister(struct in_device *idev) @@ -1553,14 +1506,12 @@ static void devinet_sysctl_unregister(struct in_device *idev) static struct ctl_table ctl_forward_entry[] = { { - .ctl_name = NET_IPV4_FORWARD, .procname = "ip_forward", .data = &ipv4_devconf.data[ - NET_IPV4_CONF_FORWARDING - 1], + IPV4_DEVCONF_FORWARDING - 1], .maxlen = sizeof(int), .mode = 0644, .proc_handler = devinet_sysctl_forward, - .strategy = devinet_conf_sysctl, .extra1 = &ipv4_devconf, .extra2 = &init_net, }, @@ -1568,8 +1519,8 @@ static struct ctl_table ctl_forward_entry[] = { }; static __net_initdata struct ctl_path net_ipv4_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; #endif @@ -1587,7 +1538,7 @@ static __net_init int devinet_init_net(struct net *net) all = &ipv4_devconf; dflt = &ipv4_devconf_dflt; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL); if (all == NULL) goto err_alloc_all; @@ -1601,20 +1552,18 @@ static __net_init int devinet_init_net(struct net *net) if (tbl == NULL) goto err_alloc_ctl; - tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1]; + tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1]; tbl[0].extra1 = all; tbl[0].extra2 = net; #endif } #ifdef CONFIG_SYSCTL - err = __devinet_sysctl_register(net, "all", - NET_PROTO_CONF_ALL, all); + err = __devinet_sysctl_register(net, "all", all); if (err < 0) goto err_reg_all; - err = __devinet_sysctl_register(net, "default", - NET_PROTO_CONF_DEFAULT, dflt); + err = __devinet_sysctl_register(net, "default", dflt); if (err < 0) goto err_reg_dflt; @@ -1680,8 +1629,3 @@ void __init devinet_init(void) rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr); } -EXPORT_SYMBOL(in_dev_finish_destroy); -EXPORT_SYMBOL(inet_select_addr); -EXPORT_SYMBOL(inetdev_by_index); -EXPORT_SYMBOL(register_inetaddr_notifier); -EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 12f7287e902d..14ca1f1c3fb0 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -422,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info) icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return; - x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); if (!x) return; NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", @@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x) } err = crypto_aead_setauthsize( - aead, aalg_desc->uinfo.auth.icv_truncbits / 8); + aead, x->aalg->alg_trunc_len / 8); if (err) goto free_key; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index aa00398be80e..4f0ed458c883 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -34,6 +34,7 @@ #include <linux/skbuff.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/slab.h> #include <net/ip.h> #include <net/protocol.h> @@ -125,7 +126,7 @@ void fib_select_default(struct net *net, #endif tb = fib_get_table(net, table); if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - tb->tb_select_default(tb, flp, res); + fib_table_select_default(tb, flp, res); } static void fib_flush(struct net *net) @@ -139,7 +140,7 @@ static void fib_flush(struct net *net) for (h = 0; h < FIB_TABLE_HASHSZ; h++) { head = &net->ipv4.fib_table_hash[h]; hlist_for_each_entry(tb, node, head, tb_hlist) - flushed += tb->tb_flush(tb); + flushed += fib_table_flush(tb); } if (flushed) @@ -162,7 +163,7 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr) #endif local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || local_table->tb_lookup(local_table, &fl, &res)) + if (!local_table || fib_table_lookup(local_table, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; @@ -200,7 +201,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net, local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; - if (!local_table->tb_lookup(local_table, &fl, &res)) { + if (!fib_table_lookup(local_table, &fl, &res)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; fib_res_put(&res); @@ -241,16 +242,19 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, .iif = oif }; struct fib_result res; - int no_addr, rpf; + int no_addr, rpf, accept_local; int ret; struct net *net; - no_addr = rpf = 0; + no_addr = rpf = accept_local = 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; rpf = IN_DEV_RPFILTER(in_dev); + accept_local = IN_DEV_ACCEPT_LOCAL(in_dev); + if (mark && !IN_DEV_SRC_VMARK(in_dev)) + fl.mark = 0; } rcu_read_unlock(); @@ -260,8 +264,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, net = dev_net(dev); if (fib_lookup(net, &fl, &res)) goto last_resort; - if (res.type != RTN_UNICAST) - goto e_inval_res; + if (res.type != RTN_UNICAST) { + if (res.type != RTN_LOCAL || !accept_local) + goto e_inval_res; + } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH @@ -476,13 +482,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) if (cmd == SIOCDELRT) { tb = fib_get_table(net, cfg.fc_table); if (tb) - err = tb->tb_delete(tb, &cfg); + err = fib_table_delete(tb, &cfg); else err = -ESRCH; } else { tb = fib_new_table(net, cfg.fc_table); if (tb) - err = tb->tb_insert(tb, &cfg); + err = fib_table_insert(tb, &cfg); else err = -ENOBUFS; } @@ -597,7 +603,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar goto errout; } - err = tb->tb_delete(tb, &cfg); + err = fib_table_delete(tb, &cfg); errout: return err; } @@ -619,7 +625,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar goto errout; } - err = tb->tb_insert(tb, &cfg); + err = fib_table_insert(tb, &cfg); errout: return err; } @@ -650,7 +656,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (dumped) memset(&cb->args[2], 0, sizeof(cb->args) - 2 * sizeof(cb->args[0])); - if (tb->tb_dump(tb, skb, cb) < 0) + if (fib_table_dump(tb, skb, cb) < 0) goto out; dumped = 1; next: @@ -704,9 +710,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad cfg.fc_scope = RT_SCOPE_HOST; if (cmd == RTM_NEWROUTE) - tb->tb_insert(tb, &cfg); + fib_table_insert(tb, &cfg); else - tb->tb_delete(tb, &cfg); + fib_table_delete(tb, &cfg); } void fib_add_ifaddr(struct in_ifaddr *ifa) @@ -835,7 +841,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) local_bh_disable(); frn->tb_id = tb->tb_id; - frn->err = tb->tb_lookup(tb, &fl, &res); + frn->err = fib_table_lookup(tb, &fl, &res); if (!frn->err) { frn->prefixlen = res.prefixlen; @@ -878,7 +884,7 @@ static void nl_fib_input(struct sk_buff *skb) netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); } -static int nl_fib_lookup_init(struct net *net) +static int __net_init nl_fib_lookup_init(struct net *net) { struct sock *sk; sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0, @@ -895,11 +901,11 @@ static void nl_fib_lookup_exit(struct net *net) net->ipv4.fibnl = NULL; } -static void fib_disable_ip(struct net_device *dev, int force) +static void fib_disable_ip(struct net_device *dev, int force, int delay) { if (fib_sync_down_dev(dev, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev), 0); + rt_cache_flush(dev_net(dev), delay); arp_ifdown(dev); } @@ -922,7 +928,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, /* Last address was deleted from this interface. Disable IP. */ - fib_disable_ip(dev, 1); + fib_disable_ip(dev, 1, 0); } else { rt_cache_flush(dev_net(dev), -1); } @@ -937,7 +943,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo struct in_device *in_dev = __in_dev_get_rtnl(dev); if (event == NETDEV_UNREGISTER) { - fib_disable_ip(dev, 2); + fib_disable_ip(dev, 2, -1); return NOTIFY_DONE; } @@ -955,12 +961,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo rt_cache_flush(dev_net(dev), -1); break; case NETDEV_DOWN: - fib_disable_ip(dev, 0); + fib_disable_ip(dev, 0, 0); break; case NETDEV_CHANGEMTU: case NETDEV_CHANGE: rt_cache_flush(dev_net(dev), 0); break; + case NETDEV_UNREGISTER_BATCH: + rt_cache_flush_batch(); + break; } return NOTIFY_DONE; } @@ -996,7 +1005,7 @@ fail: return err; } -static void __net_exit ip_fib_net_exit(struct net *net) +static void ip_fib_net_exit(struct net *net) { unsigned int i; @@ -1012,7 +1021,7 @@ static void __net_exit ip_fib_net_exit(struct net *net) head = &net->ipv4.fib_table_hash[i]; hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) { hlist_del(node); - tb->tb_flush(tb); + fib_table_flush(tb); kfree(tb); } } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index ecd39454235c..4ed7e0dea1bc 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -32,6 +32,7 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> @@ -242,8 +243,8 @@ fn_new_zone(struct fn_hash *table, int z) return fz; } -static int -fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +int fib_table_lookup(struct fib_table *tb, + const struct flowi *flp, struct fib_result *res) { int err; struct fn_zone *fz; @@ -274,8 +275,8 @@ out: return err; } -static void -fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +void fib_table_select_default(struct fib_table *tb, + const struct flowi *flp, struct fib_result *res) { int order, last_idx; struct hlist_node *node; @@ -366,7 +367,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } -static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fib_node *new_f = NULL; @@ -544,8 +545,7 @@ out: return err; } - -static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *)tb->tb_data; struct fib_node *f; @@ -662,7 +662,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } -static int fn_hash_flush(struct fib_table *tb) +int fib_table_flush(struct fib_table *tb) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; @@ -743,7 +743,8 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, return skb->len; } -static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { int m, s_m; struct fn_zone *fz; @@ -787,12 +788,7 @@ struct fib_table *fib_hash_table(u32 id) tb->tb_id = id; tb->tb_default = -1; - tb->tb_lookup = fn_hash_lookup; - tb->tb_insert = fn_hash_insert; - tb->tb_delete = fn_hash_delete; - tb->tb_flush = fn_hash_flush; - tb->tb_select_default = fn_hash_select_default; - tb->tb_dump = fn_hash_dump; + memset(tb->tb_data, 0, sizeof(struct fn_hash)); return tb; } diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 92d9d97ec5e3..ca2d07b1c706 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -94,7 +94,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) goto errout; - err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result); + err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); if (err > 0) err = -EAGAIN; errout: @@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops) { int err; - err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT); + err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0); if (err < 0) return err; err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0); @@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net) int err; struct fib_rules_ops *ops; - ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL); - if (ops == NULL) - return -ENOMEM; - INIT_LIST_HEAD(&ops->rules_list); - ops->fro_net = net; - - fib_rules_register(ops); + ops = fib_rules_register(&fib4_rules_ops_template, net); + if (IS_ERR(ops)) + return PTR_ERR(ops); err = fib_default_rules_init(ops); if (err < 0) @@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net) fail: /* also cleans all rules already added */ fib_rules_unregister(ops); - kfree(ops); return err; } void __net_exit fib4_rules_exit(struct net *net) { fib_rules_unregister(net->ipv4.rules_ops); - kfree(net->ipv4.rules_ops); } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 9b096d6ff3f2..20f09c5b31e8 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -32,6 +32,7 @@ #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/arp.h> #include <net/ip.h> @@ -62,8 +63,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock); #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) -#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ -for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) +#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ +for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -72,7 +73,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ for (nhsel=0; nhsel < 1; nhsel++) -#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \ +#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ for (nhsel=0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -145,9 +146,9 @@ void free_fib_info(struct fib_info *fi) return; } change_nexthops(fi) { - if (nh->nh_dev) - dev_put(nh->nh_dev); - nh->nh_dev = NULL; + if (nexthop_nh->nh_dev) + dev_put(nexthop_nh->nh_dev); + nexthop_nh->nh_dev = NULL; } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); @@ -162,9 +163,9 @@ void fib_release_info(struct fib_info *fi) if (fi->fib_prefsrc) hlist_del(&fi->fib_lhash); change_nexthops(fi) { - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hlist_del(&nh->nh_hash); + hlist_del(&nexthop_nh->nh_hash); } endfor_nexthops(fi) fi->fib_dead = 1; fib_info_put(fi); @@ -228,7 +229,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) head = &fib_info_hash[hash]; hlist_for_each_entry(fi, node, head, fib_hash) { - if (fi->fib_net != nfi->fib_net) + if (!net_eq(fi->fib_net, nfi->fib_net)) continue; if (fi->fib_nhs != nfi->fib_nhs) continue; @@ -395,19 +396,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh, if (!rtnh_ok(rtnh, remaining)) return -EINVAL; - nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; - nh->nh_oif = rtnh->rtnh_ifindex; - nh->nh_weight = rtnh->rtnh_hops + 1; + nexthop_nh->nh_flags = + (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags; + nexthop_nh->nh_oif = rtnh->rtnh_ifindex; + nexthop_nh->nh_weight = rtnh->rtnh_hops + 1; attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { struct nlattr *nla, *attrs = rtnh_attrs(rtnh); nla = nla_find(attrs, attrlen, RTA_GATEWAY); - nh->nh_gw = nla ? nla_get_be32(nla) : 0; + nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; #ifdef CONFIG_NET_CLS_ROUTE nla = nla_find(attrs, attrlen, RTA_FLOW); - nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; + nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; #endif } @@ -527,10 +529,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (nh->nh_gw) { struct fib_result res; -#ifdef CONFIG_IP_ROUTE_PERVASIVE - if (nh->nh_flags&RTNH_F_PERVASIVE) - return 0; -#endif if (nh->nh_flags&RTNH_F_ONLINK) { struct net_device *dev; @@ -738,7 +736,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi->fib_nhs = nhs; change_nexthops(fi) { - nh->nh_parent = fi; + nexthop_nh->nh_parent = fi; } endfor_nexthops(fi) if (cfg->fc_mx) { @@ -808,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(cfg, fi, nh)) != 0) + if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) goto failure; } endfor_nexthops(fi) } @@ -843,11 +841,11 @@ link_it: struct hlist_head *head; unsigned int hash; - if (!nh->nh_dev) + if (!nexthop_nh->nh_dev) continue; - hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex); head = &fib_info_devhash[hash]; - hlist_add_head(&nh->nh_hash, head); + hlist_add_head(&nexthop_nh->nh_hash, head); } endfor_nexthops(fi) spin_unlock_bh(&fib_info_lock); return fi; @@ -1047,7 +1045,7 @@ int fib_sync_down_addr(struct net *net, __be32 local) return 0; hlist_for_each_entry(fi, node, head, fib_lhash) { - if (fi->fib_net != net) + if (!net_eq(fi->fib_net, net)) continue; if (fi->fib_prefsrc == local) { fi->fib_flags |= RTNH_F_DEAD; @@ -1080,21 +1078,21 @@ int fib_sync_down_dev(struct net_device *dev, int force) prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) + if (nexthop_nh->nh_flags&RTNH_F_DEAD) dead++; - else if (nh->nh_dev == dev && - nh->nh_scope != scope) { - nh->nh_flags |= RTNH_F_DEAD; + else if (nexthop_nh->nh_dev == dev && + nexthop_nh->nh_scope != scope) { + nexthop_nh->nh_flags |= RTNH_F_DEAD; #ifdef CONFIG_IP_ROUTE_MULTIPATH spin_lock_bh(&fib_multipath_lock); - fi->fib_power -= nh->nh_power; - nh->nh_power = 0; + fi->fib_power -= nexthop_nh->nh_power; + nexthop_nh->nh_power = 0; spin_unlock_bh(&fib_multipath_lock); #endif dead++; } #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (force > 1 && nh->nh_dev == dev) { + if (force > 1 && nexthop_nh->nh_dev == dev) { dead = fi->fib_nhs; break; } @@ -1144,18 +1142,20 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { alive++; continue; } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + if (nexthop_nh->nh_dev == NULL || + !(nexthop_nh->nh_dev->flags&IFF_UP)) continue; - if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) + if (nexthop_nh->nh_dev != dev || + !__in_dev_get_rtnl(dev)) continue; alive++; spin_lock_bh(&fib_multipath_lock); - nh->nh_power = 0; - nh->nh_flags &= ~RTNH_F_DEAD; + nexthop_nh->nh_power = 0; + nexthop_nh->nh_flags &= ~RTNH_F_DEAD; spin_unlock_bh(&fib_multipath_lock); } endfor_nexthops(fi) @@ -1182,9 +1182,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { - power += nh->nh_weight; - nh->nh_power = nh->nh_weight; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + power += nexthop_nh->nh_weight; + nexthop_nh->nh_power = nexthop_nh->nh_weight; } } endfor_nexthops(fi); fi->fib_power = power; @@ -1204,9 +1204,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) w = jiffies % fi->fib_power; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { - if ((w -= nh->nh_power) <= 0) { - nh->nh_power--; + if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && + nexthop_nh->nh_power) { + if ((w -= nexthop_nh->nh_power) <= 0) { + nexthop_nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; spin_unlock_bh(&fib_multipath_lock); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 291bdf50a21f..59a838795e3e 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -71,6 +71,7 @@ #include <linux/netlink.h> #include <linux/init.h> #include <linux/list.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -961,7 +962,9 @@ fib_find_node(struct trie *t, u32 key) struct node *n; pos = 0; - n = rcu_dereference(t->trie); + n = rcu_dereference_check(t->trie, + rcu_read_lock_held() || + lockdep_rtnl_is_held()); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -1174,7 +1177,7 @@ done: /* * Caller must hold RTNL. */ -static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg) +int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; struct fib_alias *fa, *new_fa; @@ -1373,8 +1376,8 @@ static int check_leaf(struct trie *t, struct leaf *l, return 1; } -static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, - struct fib_result *res) +int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int ret; @@ -1595,7 +1598,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l) /* * Caller must hold RTNL. */ -static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg) +int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) { struct trie *t = (struct trie *) tb->tb_data; u32 key, mask; @@ -1786,7 +1789,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index) /* * Caller must hold RTNL. */ -static int fn_trie_flush(struct fib_table *tb) +int fib_table_flush(struct fib_table *tb) { struct trie *t = (struct trie *) tb->tb_data; struct leaf *l, *ll = NULL; @@ -1807,9 +1810,9 @@ static int fn_trie_flush(struct fib_table *tb) return found; } -static void fn_trie_select_default(struct fib_table *tb, - const struct flowi *flp, - struct fib_result *res) +void fib_table_select_default(struct fib_table *tb, + const struct flowi *flp, + struct fib_result *res) { struct trie *t = (struct trie *) tb->tb_data; int order, last_idx; @@ -1952,8 +1955,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb, return skb->len; } -static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, - struct netlink_callback *cb) +int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) { struct leaf *l; struct trie *t = (struct trie *) tb->tb_data; @@ -2020,12 +2023,6 @@ struct fib_table *fib_hash_table(u32 id) tb->tb_id = id; tb->tb_default = -1; - tb->tb_lookup = fn_trie_lookup; - tb->tb_insert = fn_trie_insert; - tb->tb_delete = fn_trie_delete; - tb->tb_flush = fn_trie_flush; - tb->tb_select_default = fn_trie_select_default; - tb->tb_dump = fn_trie_dump; t = (struct trie *) tb->tb_data; memset(t, 0, sizeof(*t)); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 5bc13fe816d1..ac4dec132735 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -74,6 +74,7 @@ #include <linux/netdevice.h> #include <linux/string.h> #include <linux/netfilter_ipv4.h> +#include <linux/slab.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ -114,7 +115,7 @@ struct icmp_bxm { /* An array of errno for error messages from dest unreach. */ /* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ -struct icmp_err icmp_err_convert[] = { +const struct icmp_err icmp_err_convert[] = { { .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ .fatal = 0, @@ -501,15 +502,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) if (!(rt->rt_flags & RTCF_LOCAL)) { struct net_device *dev = NULL; + rcu_read_lock(); if (rt->fl.iif && net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr) - dev = dev_get_by_index(net, rt->fl.iif); + dev = dev_get_by_index_rcu(net, rt->fl.iif); - if (dev) { + if (dev) saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK); - dev_put(dev); - } else + else saddr = 0; + rcu_read_unlock(); } tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | @@ -1165,6 +1167,10 @@ static int __net_init icmp_sk_init(struct net *net) sk->sk_sndbuf = (2 * ((64 * 1024) + sizeof(struct sk_buff))); + /* + * Speedup sock_wfree() + */ + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d41e5de79a82..15d3eeda92f5 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -71,6 +71,7 @@ */ #include <linux/module.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <asm/system.h> #include <linux/types.h> @@ -946,7 +947,6 @@ int igmp_rcv(struct sk_buff *skb) break; case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: - case IGMPV3_HOST_MEMBERSHIP_REPORT: /* Is it our report looped back? */ if (skb_rtable(skb)->fl.iif == 0) break; @@ -960,6 +960,7 @@ int igmp_rcv(struct sk_buff *skb) in_dev_put(in_dev); return pim_rcv_v1(skb); #endif + case IGMPV3_HOST_MEMBERSHIP_REPORT: case IGMP_DVMRP: case IGMP_TRACE: case IGMP_HOST_LEAVE_MESSAGE: @@ -1799,7 +1800,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) iml->next = inet->mc_list; iml->sflist = NULL; iml->sfmode = MCAST_EXCLUDE; - inet->mc_list = iml; + rcu_assign_pointer(inet->mc_list, iml); ip_mc_inc_group(in_dev, addr); err = 0; done: @@ -1807,24 +1808,46 @@ done: return err; } +static void ip_sf_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_sf_socklist *psf; + + psf = container_of(rp, struct ip_sf_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(psf); +} + static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, struct in_device *in_dev) { + struct ip_sf_socklist *psf = iml->sflist; int err; - if (iml->sflist == NULL) { + if (psf == NULL) { /* any-source empty exclude case */ return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, iml->sfmode, 0, NULL, 0); } err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, - iml->sfmode, iml->sflist->sl_count, - iml->sflist->sl_addr, 0); - sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); - iml->sflist = NULL; + iml->sfmode, psf->sl_count, psf->sl_addr, 0); + rcu_assign_pointer(iml->sflist, NULL); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc); + call_rcu(&psf->rcu, ip_sf_socklist_reclaim); return err; } + +static void ip_mc_socklist_reclaim(struct rcu_head *rp) +{ + struct ip_mc_socklist *iml; + + iml = container_of(rp, struct ip_mc_socklist, rcu); + /* sk_omem_alloc should have been decreased by the caller*/ + kfree(iml); +} + + /* * Ask a socket to leave a group. */ @@ -1854,12 +1877,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) (void) ip_mc_leave_src(sk, iml, in_dev); - *imlp = iml->next; + rcu_assign_pointer(*imlp, iml->next); if (in_dev) ip_mc_dec_group(in_dev, group); rtnl_unlock(); - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); return 0; } if (!in_dev) @@ -1899,8 +1924,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct err = -EADDRNOTAVAIL; for (pmc=inet->mc_list; pmc; pmc=pmc->next) { - if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr - && pmc->multi.imr_ifindex == imr.imr_ifindex) + if ((pmc->multi.imr_multiaddr.s_addr == + imr.imr_multiaddr.s_addr) && + (pmc->multi.imr_ifindex == imr.imr_ifindex)) break; } if (!pmc) { /* must have a prior join */ @@ -1973,9 +1999,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct if (psl) { for (i=0; i<psl->sl_count; i++) newpsl->sl_addr[i] = psl->sl_addr[i]; - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } - pmc->sflist = psl = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); + psl = newpsl; } rv = 1; /* > 0 for insert logic below if sl_count is 0 */ for (i=0; i<psl->sl_count; i++) { @@ -2071,11 +2100,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) if (psl) { (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, psl->sl_count, psl->sl_addr, 0); - sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc); + call_rcu(&psl->rcu, ip_sf_socklist_reclaim); } else (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, 0, NULL, 0); - pmc->sflist = newpsl; + rcu_assign_pointer(pmc->sflist, newpsl); pmc->sfmode = msf->imsf_fmode; err = 0; done: @@ -2208,30 +2239,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif) struct ip_mc_socklist *pmc; struct ip_sf_socklist *psl; int i; + int ret; + ret = 1; if (!ipv4_is_multicast(loc_addr)) - return 1; + goto out; - for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + rcu_read_lock(); + for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) { if (pmc->multi.imr_multiaddr.s_addr == loc_addr && pmc->multi.imr_ifindex == dif) break; } + ret = inet->mc_all; if (!pmc) - return inet->mc_all; + goto unlock; psl = pmc->sflist; + ret = (pmc->sfmode == MCAST_EXCLUDE); if (!psl) - return pmc->sfmode == MCAST_EXCLUDE; + goto unlock; for (i=0; i<psl->sl_count; i++) { if (psl->sl_addr[i] == rmt_addr) break; } + ret = 0; if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) - return 0; + goto unlock; if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) - return 0; - return 1; + goto unlock; + ret = 1; +unlock: + rcu_read_unlock(); +out: + return ret; } /* @@ -2250,7 +2291,7 @@ void ip_mc_drop_socket(struct sock *sk) rtnl_lock(); while ((iml = inet->mc_list) != NULL) { struct in_device *in_dev; - inet->mc_list = iml->next; + rcu_assign_pointer(inet->mc_list, iml->next); in_dev = inetdev_by_index(net, iml->multi.imr_ifindex); (void) ip_mc_leave_src(sk, iml, in_dev); @@ -2258,7 +2299,9 @@ void ip_mc_drop_socket(struct sock *sk) ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); in_dev_put(in_dev); } - sock_kfree_s(sk, iml, sizeof(*iml)); + /* decrease mem now to avoid the memleak warning */ + atomic_sub(sizeof(*iml), &sk->sk_omem_alloc); + call_rcu(&iml->rcu, ip_mc_socklist_reclaim); } rtnl_unlock(); } @@ -2311,9 +2354,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); state->in_dev = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *in_dev; - in_dev = in_dev_get(state->dev); + + in_dev = __in_dev_get_rcu(state->dev); if (!in_dev) continue; read_lock(&in_dev->mc_list_lock); @@ -2323,7 +2367,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) break; } read_unlock(&in_dev->mc_list_lock); - in_dev_put(in_dev); } return im; } @@ -2333,16 +2376,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); im = im->next; while (!im) { - if (likely(state->in_dev != NULL)) { + if (likely(state->in_dev != NULL)) read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->in_dev = NULL; break; } - state->in_dev = in_dev_get(state->dev); + state->in_dev = __in_dev_get_rcu(state->dev); if (!state->in_dev) continue; read_lock(&state->in_dev->mc_list_lock); @@ -2361,9 +2403,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(dev_base_lock) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2379,16 +2421,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mc_seq_stop(struct seq_file *seq, void *v) - __releases(dev_base_lock) + __releases(rcu) { struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); if (likely(state->in_dev != NULL)) { read_unlock(&state->in_dev->mc_list_lock); - in_dev_put(state->in_dev); state->in_dev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mc_seq_show(struct seq_file *seq, void *v) @@ -2462,9 +2503,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) state->idev = NULL; state->im = NULL; - for_each_netdev(net, state->dev) { + for_each_netdev_rcu(net, state->dev) { struct in_device *idev; - idev = in_dev_get(state->dev); + idev = __in_dev_get_rcu(state->dev); if (unlikely(idev == NULL)) continue; read_lock(&idev->mc_list_lock); @@ -2480,7 +2521,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) spin_unlock_bh(&im->lock); } read_unlock(&idev->mc_list_lock); - in_dev_put(idev); } return psf; } @@ -2494,16 +2534,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l spin_unlock_bh(&state->im->lock); state->im = state->im->next; while (!state->im) { - if (likely(state->idev != NULL)) { + if (likely(state->idev != NULL)) read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); - } - state->dev = next_net_device(state->dev); + + state->dev = next_net_device_rcu(state->dev); if (!state->dev) { state->idev = NULL; goto out; } - state->idev = in_dev_get(state->dev); + state->idev = __in_dev_get_rcu(state->dev); if (!state->idev) continue; read_lock(&state->idev->mc_list_lock); @@ -2528,8 +2567,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) } static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) { - read_lock(&dev_base_lock); + rcu_read_lock(); return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } @@ -2545,6 +2585,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) + __releases(rcu) { struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); if (likely(state->im != NULL)) { @@ -2553,11 +2594,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) } if (likely(state->idev != NULL)) { read_unlock(&state->idev->mc_list_lock); - in_dev_put(state->idev); state->idev = NULL; } state->dev = NULL; - read_unlock(&dev_base_lock); + rcu_read_unlock(); } static int igmp_mcf_seq_show(struct seq_file *seq, void *v) @@ -2605,7 +2645,7 @@ static const struct file_operations igmp_mcf_seq_fops = { .release = seq_release_net, }; -static int igmp_net_init(struct net *net) +static int __net_init igmp_net_init(struct net *net) { struct proc_dir_entry *pde; @@ -2623,7 +2663,7 @@ out_igmp: return -ENOMEM; } -static void igmp_net_exit(struct net *net) +static void __net_exit igmp_net_exit(struct net *net) { proc_net_remove(net, "mcfilter"); proc_net_remove(net, "igmp"); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 537731b3bcb3..8da6429269dd 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -112,7 +112,7 @@ again: hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == rover) { + if (net_eq(ib_net(tb), net) && tb->port == rover) { if (tb->fastreuse > 0 && sk->sk_reuse && sk->sk_state != TCP_LISTEN && @@ -158,7 +158,7 @@ have_snum: hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (ib_net(tb) == net && tb->port == snum) + if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } tb = NULL; @@ -358,6 +358,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = ((opt && opt->srr) ? opt->faddr : @@ -367,7 +368,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, + { .sport = inet_sk(sk)->inet_sport, .dport = ireq->rmt_port } } }; struct net *net = sock_net(sk); @@ -528,9 +529,11 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, syn_ack_recalc(req, thresh, max_retries, queue->rskq_defer_accept, &expire, &resend); + if (req->rsk_ops->syn_ack_timeout) + req->rsk_ops->syn_ack_timeout(parent, req); if (!expire && (!resend || - !req->rsk_ops->rtx_syn_ack(parent, req) || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || inet_rsk(req)->acked)) { unsigned long timeo; @@ -574,9 +577,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; - inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port); - inet_sk(newsk)->sport = inet_rsk(req)->loc_port; + inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; @@ -607,8 +610,8 @@ void inet_csk_destroy_sock(struct sock *sk) /* It cannot be in hash table! */ WARN_ON(!sk_unhashed(sk)); - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash); + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -643,8 +646,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * after validation is complete. */ sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); + if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); sk->sk_prot->hash(sk); @@ -720,8 +723,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->inet_daddr; + sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index a706a47f4dbb..e5fa2ddce320 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/fcntl.h> #include <linux/random.h> +#include <linux/slab.h> #include <linux/cache.h> #include <linux/init.h> #include <linux/time.h> @@ -116,10 +117,10 @@ static int inet_csk_diag_fill(struct sock *sk, r->id.idiag_cookie[0] = (u32)(unsigned long)sk; r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); - r->id.idiag_sport = inet->sport; - r->id.idiag_dport = inet->dport; - r->id.idiag_src[0] = inet->rcv_saddr; - r->id.idiag_dst[0] = inet->daddr; + r->id.idiag_sport = inet->inet_sport; + r->id.idiag_dport = inet->inet_dport; + r->id.idiag_src[0] = inet->inet_rcv_saddr; + r->id.idiag_dst[0] = inet->inet_daddr; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { @@ -368,7 +369,7 @@ static int inet_diag_bc_run(const void *bc, int len, yes = entry->sport >= op[1].no; break; case INET_DIAG_BC_S_LE: - yes = entry->dport <= op[1].no; + yes = entry->sport <= op[1].no; break; case INET_DIAG_BC_D_GE: yes = entry->dport >= op[1].no; @@ -504,11 +505,11 @@ static int inet_csk_diag_dump(struct sock *sk, } else #endif { - entry.saddr = &inet->rcv_saddr; - entry.daddr = &inet->daddr; + entry.saddr = &inet->inet_rcv_saddr; + entry.daddr = &inet->inet_daddr; } - entry.sport = inet->num; - entry.dport = ntohs(inet->dport); + entry.sport = inet->inet_num; + entry.dport = ntohs(inet->inet_dport); entry.userlocks = sk->sk_userlocks; if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) @@ -584,7 +585,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, if (tmo < 0) tmo = 0; - r->id.idiag_sport = inet->sport; + r->id.idiag_sport = inet->inet_sport; r->id.idiag_dport = ireq->rmt_port; r->id.idiag_src[0] = ireq->loc_addr; r->id.idiag_dst[0] = ireq->rmt_addr; @@ -639,7 +640,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { bc = (struct rtattr *)(r + 1); - entry.sport = inet->num; + entry.sport = inet->inet_num; entry.userlocks = sk->sk_userlocks; } @@ -732,7 +733,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; } - if (r->id.idiag_sport != inet->sport && + if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_listen; @@ -774,7 +775,7 @@ skip_listen_ht: if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) goto unlock; - for (i = s_i; i < hashinfo->ehash_size; i++) { + for (i = s_i; i <= hashinfo->ehash_mask; i++) { struct inet_ehash_bucket *head = &hashinfo->ehash[i]; spinlock_t *lock = inet_ehash_lockp(hashinfo, i); struct sock *sk; @@ -797,10 +798,10 @@ skip_listen_ht: goto next_normal; if (!(r->idiag_states & (1 << sk->sk_state))) goto next_normal; - if (r->id.idiag_sport != inet->sport && + if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next_normal; - if (r->id.idiag_dport != inet->dport && + if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next_normal; if (inet_csk_diag_dump(sk, skb, cb) < 0) { diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index eaf3e2c8646a..a2ca6aed763b 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -19,6 +19,7 @@ #include <linux/random.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> +#include <linux/slab.h> #include <net/inet_frag.h> diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 625cc5f64c94..2b79377b468d 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, atomic_inc(&hashinfo->bsockets); - inet_sk(sk)->num = snum; + inet_sk(sk)->inet_num = snum; sk_add_bind_node(sk, &tb->owners); tb->num_owners++; inet_csk(sk)->icsk_bind_hash = tb; @@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, static void __inet_put_port(struct sock *sk) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num, + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, hashinfo->bhash_size); struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; struct inet_bind_bucket *tb; @@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk) __sk_del_bind_node(sk); tb->num_owners--; inet_csk(sk)->icsk_bind_hash = NULL; - inet_sk(sk)->num = 0; + inet_sk(sk)->inet_num = 0; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); spin_unlock(&head->lock); } @@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port); void __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num, + const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; @@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net, int score = -1; struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->num == hnum && + if (net_eq(sock_net(sk), net) && inet->inet_num == hnum && !ipv6_only_sock(sk)) { - __be32 rcv_saddr = inet->rcv_saddr; + __be32 rcv_saddr = inet->inet_rcv_saddr; score = sk->sk_family == PF_INET ? 1 : 0; if (rcv_saddr) { if (rcv_saddr != daddr) @@ -209,7 +209,7 @@ struct sock * __inet_lookup_established(struct net *net, * have wildcards anyways. */ unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & (hashinfo->ehash_size - 1); + unsigned int slot = hash & hashinfo->ehash_mask; struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; rcu_read_lock(); @@ -273,18 +273,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); - __be32 daddr = inet->rcv_saddr; - __be32 saddr = inet->daddr; + __be32 daddr = inet->inet_rcv_saddr; + __be32 saddr = inet->inet_daddr; int dif = sk->sk_bound_dev_if; INET_ADDR_COOKIE(acookie, saddr, daddr) - const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport); + const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); struct net *net = sock_net(sk); - unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport); + unsigned int hash = inet_ehashfn(net, daddr, lport, + saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); spinlock_t *lock = inet_ehash_lockp(hinfo, hash); struct sock *sk2; const struct hlist_nulls_node *node; struct inet_timewait_sock *tw; + int twrefcnt = 0; spin_lock(lock); @@ -312,25 +314,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, unique: /* Must record num and sport now. Otherwise we will see * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); + inet->inet_num = lport; + inet->inet_sport = htons(lport); sk->sk_hash = hash; WARN_ON(!sk_unhashed(sk)); __sk_nulls_add_node_rcu(sk, &head->chain); + if (tw) { + twrefcnt = inet_twsk_unhash(tw); + NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); + } spin_unlock(lock); + if (twrefcnt) + inet_twsk_put(tw); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); if (twp) { *twp = tw; - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); } else if (tw) { /* Silly. Should hash-dance instead... */ inet_twsk_deschedule(tw, death_row); - NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED); inet_twsk_put(tw); } - return 0; not_unique: @@ -341,16 +346,18 @@ not_unique: static inline u32 inet_sk_port_offset(const struct sock *sk) { const struct inet_sock *inet = inet_sk(sk); - return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); + return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr, + inet->inet_daddr, + inet->inet_dport); } -void __inet_hash_nolisten(struct sock *sk) +int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct hlist_nulls_head *list; spinlock_t *lock; struct inet_ehash_bucket *head; + int twrefcnt = 0; WARN_ON(!sk_unhashed(sk)); @@ -361,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk) spin_lock(lock); __sk_nulls_add_node_rcu(sk, list); + if (tw) { + WARN_ON(sk->sk_hash != tw->tw_hash); + twrefcnt = inet_twsk_unhash(tw); + } spin_unlock(lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + return twrefcnt; } EXPORT_SYMBOL_GPL(__inet_hash_nolisten); @@ -372,7 +384,7 @@ static void __inet_hash(struct sock *sk) struct inet_listen_hashbucket *ilb; if (sk->sk_state != TCP_LISTEN) { - __inet_hash_nolisten(sk); + __inet_hash_nolisten(sk, NULL); return; } @@ -421,14 +433,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u32 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, struct inet_timewait_sock **), - void (*hash)(struct sock *sk)) + int (*hash)(struct sock *sk, struct inet_timewait_sock *twp)) { struct inet_hashinfo *hinfo = death_row->hashinfo; - const unsigned short snum = inet_sk(sk)->num; + const unsigned short snum = inet_sk(sk)->inet_num; struct inet_bind_hashbucket *head; struct inet_bind_bucket *tb; int ret; struct net *net = sock_net(sk); + int twrefcnt = 1; if (!snum) { int i, remaining, low, high, port; @@ -452,7 +465,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, * unique enough. */ inet_bind_bucket_for_each(tb, node, &head->chain) { - if (ib_net(tb) == net && tb->port == port) { + if (net_eq(ib_net(tb), net) && + tb->port == port) { if (tb->fastreuse >= 0) goto next_port; WARN_ON(hlist_empty(&tb->owners)); @@ -485,14 +499,19 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - hash(sk); + inet_sk(sk)->inet_sport = htons(port); + twrefcnt += hash(sk, tw); } + if (tw) + twrefcnt += inet_twsk_bind_unhash(tw, hinfo); spin_unlock(&head->lock); if (tw) { inet_twsk_deschedule(tw, death_row); - inet_twsk_put(tw); + while (twrefcnt) { + twrefcnt--; + inet_twsk_put(tw); + } } ret = 0; @@ -503,7 +522,7 @@ ok: tb = inet_csk(sk)->icsk_bind_hash; spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - hash(sk); + hash(sk, NULL); spin_unlock_bh(&head->lock); return 0; } else { diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c index 6a667dae315e..47038cb6c138 100644 --- a/net/ipv4/inet_lro.c +++ b/net/ipv4/inet_lro.c @@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph, if (iph->ihl != IPH_LEN_WO_OPTIONS) return -1; - if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack - || tcph->rst || tcph->syn || tcph->fin) + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || + tcph->rst || tcph->syn || tcph->fin) return -1; if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) return -1; - if (tcph->doff != TCPH_LEN_WO_OPTIONS - && tcph->doff != TCPH_LEN_W_TIMESTAMP) + if (tcph->doff != TCPH_LEN_WO_OPTIONS && + tcph->doff != TCPH_LEN_W_TIMESTAMP) return -1; /* check tcp options (only timestamp allowed) */ @@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, struct iphdr *iph, struct tcphdr *tcph) { - if ((lro_desc->iph->saddr != iph->saddr) - || (lro_desc->iph->daddr != iph->daddr) - || (lro_desc->tcph->source != tcph->source) - || (lro_desc->tcph->dest != tcph->dest)) + if ((lro_desc->iph->saddr != iph->saddr) || + (lro_desc->iph->daddr != iph->daddr) || + (lro_desc->tcph->source != tcph->source) || + (lro_desc->tcph->dest != tcph->dest)) return -1; return 0; } @@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, u64 flags; int vlan_hdr_len = 0; - if (!lro_mgr->get_skb_header - || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, - &flags, priv)) + if (!lro_mgr->get_skb_header || + lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph, + &flags, priv)) goto out; if (!(flags & LRO_IPV4) || !(flags & LRO_TCP)) @@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, if (!lro_desc) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; if (!lro_desc->active) { /* start new lro session */ @@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, int hdr_len = LRO_MAX_PG_HLEN; int vlan_hdr_len = 0; - if (!lro_mgr->get_frag_header - || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, - (void *)&tcph, &flags, priv)) { + if (!lro_mgr->get_frag_header || + lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph, + (void *)&tcph, &flags, priv)) { mac_hdr = page_address(frags->page) + frags->page_offset; goto out1; } @@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr, if (!skb) goto out; - if ((skb->protocol == htons(ETH_P_8021Q)) - && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) + if ((skb->protocol == htons(ETH_P_8021Q)) && + !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID)) vlan_hdr_len = VLAN_HLEN; iph = (void *)(skb->data + vlan_hdr_len); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 13f0781f35cd..c5af909cf701 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -10,44 +10,92 @@ #include <linux/kernel.h> #include <linux/kmemcheck.h> +#include <linux/slab.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> + +/** + * inet_twsk_unhash - unhash a timewait socket from established hash + * @tw: timewait socket + * + * unhash a timewait socket from established hash, if hashed. + * ehash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_unhash(struct inet_timewait_sock *tw) +{ + if (hlist_nulls_unhashed(&tw->tw_node)) + return 0; + + hlist_nulls_del_rcu(&tw->tw_node); + sk_nulls_node_init(&tw->tw_node); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + +/** + * inet_twsk_bind_unhash - unhash a timewait socket from bind hash + * @tw: timewait socket + * @hashinfo: hashinfo pointer + * + * unhash a timewait socket from bind hash, if hashed. + * bind hash lock must be held by caller. + * Returns 1 if caller should call inet_twsk_put() after lock release. + */ +int inet_twsk_bind_unhash(struct inet_timewait_sock *tw, + struct inet_hashinfo *hashinfo) +{ + struct inet_bind_bucket *tb = tw->tw_tb; + + if (!tb) + return 0; + + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + /* + * We cannot call inet_twsk_put() ourself under lock, + * caller must call it for us. + */ + return 1; +} + /* Must be called with locally disabled BHs. */ static void __inet_twsk_kill(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_hashbucket *bhead; - struct inet_bind_bucket *tb; + int refcnt; /* Unlink from established hashes. */ spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); spin_lock(lock); - if (hlist_nulls_unhashed(&tw->tw_node)) { - spin_unlock(lock); - return; - } - hlist_nulls_del_rcu(&tw->tw_node); - sk_nulls_node_init(&tw->tw_node); + refcnt = inet_twsk_unhash(tw); spin_unlock(lock); /* Disassociate with bind bucket. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; + spin_lock(&bhead->lock); - tb = tw->tw_tb; - __hlist_del(&tw->tw_bind_node); - tw->tw_tb = NULL; - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + refcnt += inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); + #ifdef SOCK_REFCNT_DEBUG if (atomic_read(&tw->tw_refcnt) != 1) { printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n", tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt)); } #endif - inet_twsk_put(tw); + while (refcnt) { + inet_twsk_put(tw); + refcnt--; + } } static noinline void inet_twsk_free(struct inet_timewait_sock *tw) @@ -86,7 +134,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ - bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num, + bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; @@ -101,16 +149,24 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, * Should be done before removing sk from established chain * because readers are lockless and search established first. */ - atomic_inc(&tw->tw_refcnt); inet_twsk_add_node_rcu(tw, &ehead->twchain); /* Step 3: Remove SK from established hash. */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + /* + * Notes : + * - We initially set tw_refcnt to 0 in inet_twsk_alloc() + * - We add one reference for the bhash link + * - We add one reference for the ehash link + * - We want this refcnt update done before allowing other + * threads to find this tw in ehash chain. + */ + atomic_add(1 + 1 + 1, &tw->tw_refcnt); + spin_unlock(lock); } - EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) @@ -124,14 +180,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat kmemcheck_annotate_bitfield(tw, flags); /* Give us an identity. */ - tw->tw_daddr = inet->daddr; - tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_daddr = inet->inet_daddr; + tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; - tw->tw_num = inet->num; + tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; - tw->tw_sport = inet->sport; - tw->tw_dport = inet->dport; + tw->tw_sport = inet->inet_sport; + tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_hash = sk->sk_hash; @@ -139,14 +195,18 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; twsk_net_set(tw, hold_net(sock_net(sk))); - atomic_set(&tw->tw_refcnt, 1); + /* + * Because we use RCU lookups, we should not set tw_refcnt + * to a non null value before everything is setup for this + * timewait socket. + */ + atomic_set(&tw->tw_refcnt, 0); inet_twsk_dead_node_init(tw); __module_get(tw->tw_prot->owner); } return tw; } - EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* Returns non-zero if quota exceeded. */ @@ -225,7 +285,6 @@ void inet_twdr_hangman(unsigned long data) out: spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twdr_hangman); void inet_twdr_twkill_work(struct work_struct *work) @@ -256,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work) spin_unlock_bh(&twdr->death_lock); } } - EXPORT_SYMBOL_GPL(inet_twdr_twkill_work); /* These are always called from BH context. See callers in @@ -276,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw, spin_unlock(&twdr->death_lock); __inet_twsk_kill(tw, twdr->hashinfo); } - EXPORT_SYMBOL(inet_twsk_deschedule); void inet_twsk_schedule(struct inet_timewait_sock *tw, @@ -357,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw, mod_timer(&twdr->tw_timer, jiffies + twdr->period); spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twsk_schedule); void inet_twdr_twcal_tick(unsigned long data) @@ -418,40 +474,48 @@ out: #endif spin_unlock(&twdr->death_lock); } - EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); -void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, +void inet_twsk_purge(struct inet_hashinfo *hashinfo, struct inet_timewait_death_row *twdr, int family) { struct inet_timewait_sock *tw; struct sock *sk; struct hlist_nulls_node *node; - int h; + unsigned int slot; - local_bh_disable(); - for (h = 0; h < (hashinfo->ehash_size); h++) { - struct inet_ehash_bucket *head = - inet_ehash_bucket(hashinfo, h); - spinlock_t *lock = inet_ehash_lockp(hashinfo, h); + for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; +restart_rcu: + rcu_read_lock(); restart: - spin_lock(lock); - sk_nulls_for_each(sk, node, &head->twchain) { - + sk_nulls_for_each_rcu(sk, node, &head->twchain) { tw = inet_twsk(sk); - if (!net_eq(twsk_net(tw), net) || - tw->tw_family != family) + if ((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count)) continue; - atomic_inc(&tw->tw_refcnt); - spin_unlock(lock); + if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt))) + continue; + + if (unlikely((tw->tw_family != family) || + atomic_read(&twsk_net(tw)->count))) { + inet_twsk_put(tw); + goto restart; + } + + rcu_read_unlock(); inet_twsk_deschedule(tw, twdr); inet_twsk_put(tw); - - goto restart; + goto restart_rcu; } - spin_unlock(lock); + /* If the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot) + goto restart; + rcu_read_unlock(); } - local_bh_enable(); } EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index b1fbe18feb5a..6bcfe52a9c87 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -67,9 +67,6 @@ * ip_id_count: idlock */ -/* Exported for inet_getid inline function. */ -DEFINE_SPINLOCK(inet_peer_idlock); - static struct kmem_cache *peer_cachep __read_mostly; #define node_height(x) x->avl_height @@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create) n->v4daddr = daddr; atomic_set(&n->refcnt, 1); atomic_set(&n->rid, 0); - n->ip_id_count = secure_ip_id(daddr); + atomic_set(&n->ip_id_count, secure_ip_id(daddr)); n->tcp_ts_stamp = 0; write_lock_bh(&peer_pool_lock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index a2991bc8e32e..af10942b326c 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -25,6 +25,7 @@ #include <linux/ip.h> #include <linux/icmp.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/tcp.h> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d3fe10be7219..75347ea70ea0 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -32,6 +32,9 @@ #include <linux/netdevice.h> #include <linux/jhash.h> #include <linux/random.h> +#include <linux/slab.h> +#include <net/route.h> +#include <net/dst.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -205,11 +208,35 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) { - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - dev_put(head->dev); + rcu_read_lock(); + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out_rcu_unlock; + + /* + * Only search router table for the head fragment, + * when defraging timeout at PRE_ROUTING HOOK. + */ + if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { + const struct iphdr *iph = ip_hdr(head); + int err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (unlikely(err)) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; + } + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); } out: spin_unlock(&qp->q.lock); @@ -603,7 +630,6 @@ static int zero; static struct ctl_table ip4_frags_ns_ctl_table[] = { { - .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, .procname = "ipfrag_high_thresh", .data = &init_net.ipv4.frags.high_thresh, .maxlen = sizeof(int), @@ -611,7 +637,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, .procname = "ipfrag_low_thresh", .data = &init_net.ipv4.frags.low_thresh, .maxlen = sizeof(int), @@ -619,26 +644,22 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_IPFRAG_TIME, .procname = "ipfrag_time", .data = &init_net.ipv4.frags.timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { } }; static struct ctl_table ip4_frags_ctl_table[] = { { - .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, .procname = "ipfrag_secret_interval", .data = &ip4_frags.secret_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { .procname = "ipfrag_max_dist", @@ -651,13 +672,13 @@ static struct ctl_table ip4_frags_ctl_table[] = { { } }; -static int ip4_frags_ns_ctl_register(struct net *net) +static int __net_init ip4_frags_ns_ctl_register(struct net *net) { struct ctl_table *table; struct ctl_table_header *hdr; table = ip4_frags_ns_ctl_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -675,13 +696,13 @@ static int ip4_frags_ns_ctl_register(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; } -static void ip4_frags_ns_ctl_unregister(struct net *net) +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) { struct ctl_table *table; @@ -709,7 +730,7 @@ static inline void ip4_frags_ctl_register(void) } #endif -static int ipv4_frags_init_net(struct net *net) +static int __net_init ipv4_frags_init_net(struct net *net) { /* * Fragment cache limits. We will commit 256K at one time. Should we @@ -731,7 +752,7 @@ static int ipv4_frags_init_net(struct net *net) return ip4_frags_ns_ctl_register(net); } -static void ipv4_frags_exit_net(struct net *net) +static void __net_exit ipv4_frags_exit_net(struct net *net) { ip4_frags_ns_ctl_unregister(net); inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 143333852624..fe381d12ecdd 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -125,7 +126,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); #define HASH_SIZE 16 -static int ipgre_net_id; +static int ipgre_net_id __read_mostly; struct ipgre_net { struct ip_tunnel *tunnels[4][HASH_SIZE]; @@ -156,8 +157,13 @@ struct ipgre_net { #define tunnels_r tunnels[2] #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] +/* + * Locking : hash tables are protected by RCU and a spinlock + */ +static DEFINE_SPINLOCK(ipgre_lock); -static DEFINE_RWLOCK(ipgre_lock); +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) /* Given src, dst and key, find appropriate for input tunnel. */ @@ -175,7 +181,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, ARPHRD_ETHER : ARPHRD_IPGRE; int score, cand_score = 4; - for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) { if (local != t->parms.iph.saddr || remote != t->parms.iph.daddr || key != t->parms.i_key || @@ -200,7 +206,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_r[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) { if (remote != t->parms.iph.daddr || key != t->parms.i_key || !(t->dev->flags & IFF_UP)) @@ -224,7 +230,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_l[h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) { if ((local != t->parms.iph.saddr && (local != t->parms.iph.daddr || !ipv4_is_multicast(local))) || @@ -250,7 +256,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, } } - for (t = ign->tunnels_wc[h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) { if (t->parms.i_key != key || !(t->dev->flags & IFF_UP)) continue; @@ -276,8 +282,9 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, if (cand != NULL) return cand; - if (ign->fb_tunnel_dev->flags & IFF_UP) - return netdev_priv(ign->fb_tunnel_dev); + dev = ign->fb_tunnel_dev; + if (dev->flags & IFF_UP) + return netdev_priv(dev); return NULL; } @@ -311,10 +318,10 @@ static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { struct ip_tunnel **tp = ipgre_bucket(ign, t); + spin_lock_bh(&ipgre_lock); t->next = *tp; - write_lock_bh(&ipgre_lock); - *tp = t; - write_unlock_bh(&ipgre_lock); + rcu_assign_pointer(*tp, t); + spin_unlock_bh(&ipgre_lock); } static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) @@ -323,9 +330,9 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { if (t == *tp) { - write_lock_bh(&ipgre_lock); + spin_lock_bh(&ipgre_lock); *tp = t->next; - write_unlock_bh(&ipgre_lock); + spin_unlock_bh(&ipgre_lock); break; } } @@ -476,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) break; } - read_lock(&ipgre_lock); + rcu_read_lock(); t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr, flags & GRE_KEY ? *(((__be32 *)p) + (grehlen / 4) - 1) : 0, @@ -494,7 +501,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info) t->err_count = 1; t->err_time = jiffies; out: - read_unlock(&ipgre_lock); + rcu_read_unlock(); return; } @@ -573,7 +580,7 @@ static int ipgre_rcv(struct sk_buff *skb) gre_proto = *(__be16 *)(h + 2); - read_lock(&ipgre_lock); + rcu_read_lock(); if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { @@ -647,13 +654,13 @@ static int ipgre_rcv(struct sk_buff *skb) ipgre_ecn_decapsulate(iph, skb); netif_rx(skb); - read_unlock(&ipgre_lock); + rcu_read_unlock(); return(0); } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: - read_unlock(&ipgre_lock); + rcu_read_unlock(); drop_nolock: kfree_skb(skb); return(0); @@ -662,7 +669,8 @@ drop_nolock: static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->dev->stats; + struct net_device_stats *stats = &dev->stats; + struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -786,7 +794,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { - icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } @@ -803,14 +811,16 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev tunnel->err_count = 0; } - max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len; if (skb_headroom(skb) < max_headroom || skb_shared(skb)|| (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (max_headroom > dev->needed_headroom) + dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); - stats->tx_dropped++; + txq->tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -1137,12 +1147,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (saddr) memcpy(&iph->saddr, saddr, 4); - - if (daddr) { + if (daddr) memcpy(&iph->daddr, daddr, 4); - return t->hlen; - } - if (iph->daddr && !ipv4_is_multicast(iph->daddr)) + if (iph->daddr) return t->hlen; return -t->hlen; @@ -1283,33 +1290,27 @@ static const struct net_protocol ipgre_protocol = { .netns_ok = 1, }; -static void ipgre_destroy_tunnels(struct ipgre_net *ign) +static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) { int prio; for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = ign->tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); + struct ip_tunnel *t = ign->tunnels[prio][h]; + + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = t->next; + } } } } -static int ipgre_init_net(struct net *net) +static int __net_init ipgre_init_net(struct net *net) { + struct ipgre_net *ign = net_generic(net, ipgre_net_id); int err; - struct ipgre_net *ign; - - err = -ENOMEM; - ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL); - if (ign == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipgre_net_id, ign); - if (err < 0) - goto err_assign; ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", ipgre_tunnel_setup); @@ -1330,27 +1331,26 @@ static int ipgre_init_net(struct net *net) err_reg_dev: free_netdev(ign->fb_tunnel_dev); err_alloc_dev: - /* nothing */ -err_assign: - kfree(ign); -err_alloc: return err; } -static void ipgre_exit_net(struct net *net) +static void __net_exit ipgre_exit_net(struct net *net) { struct ipgre_net *ign; + LIST_HEAD(list); ign = net_generic(net, ipgre_net_id); rtnl_lock(); - ipgre_destroy_tunnels(ign); + ipgre_destroy_tunnels(ign, &list); + unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ign); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, .exit = ipgre_exit_net, + .id = &ipgre_net_id, + .size = sizeof(struct ipgre_net), }; static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -1471,7 +1471,7 @@ static void ipgre_tap_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[], +static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel *nt; @@ -1665,15 +1665,16 @@ static int __init ipgre_init(void) printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); - if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + err = register_pernet_device(&ipgre_net_ops); + if (err < 0) + return err; + + err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); - return -EAGAIN; + goto add_proto_failed; } - err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops); - if (err < 0) - goto gen_device_failed; - err = rtnl_link_register(&ipgre_link_ops); if (err < 0) goto rtnl_link_failed; @@ -1688,9 +1689,9 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); -gen_device_failed: inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); +add_proto_failed: + unregister_pernet_device(&ipgre_net_ops); goto out; } @@ -1698,9 +1699,9 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops); if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); + unregister_pernet_device(&ipgre_net_ops); } module_init(ipgre_init); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 6c98b43badf4..f8ab7a380d4a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -119,6 +119,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> +#include <linux/slab.h> #include <linux/net.h> #include <linux/socket.h> @@ -161,10 +162,10 @@ int ip_call_ra_chain(struct sk_buff *skb) /* If socket is bound to an interface, only report * the packet if it came from that interface. */ - if (sk && inet_sk(sk)->num == protocol && + if (sk && inet_sk(sk)->inet_num == protocol && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) && - sock_net(sk) == dev_net(dev)) { + net_eq(sock_net(sk), dev_net(dev))) { if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) { read_unlock(&ip_ra_lock); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 94bf105ef3c9..4c09a31fd140 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -11,6 +11,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/types.h> #include <asm/uaccess.h> #include <linux/skbuff.h> diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index f9895180f481..c65f18e0936e 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -51,6 +51,7 @@ #include <linux/string.h> #include <linux/errno.h> #include <linux/highmem.h> +#include <linux/slab.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -254,7 +255,7 @@ int ip_mc_output(struct sk_buff *skb) */ if (rt->rt_flags&RTCF_MULTICAST) { - if ((!sk || inet_sk(sk)->mc_loop) + if (sk_mc_loop(sk) #ifdef CONFIG_IP_MROUTE /* Small optimization: do not loopback not local frames, which returned after forwarding; they will be dropped @@ -264,9 +265,11 @@ int ip_mc_output(struct sk_buff *skb) This check is duplicated in ip_mr_input at the moment. */ - && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) + && + ((rt->rt_flags & RTCF_LOCAL) || + !(IPCB(skb)->flags & IPSKB_FORWARDED)) #endif - ) { + ) { struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); if (newskb) NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, @@ -329,7 +332,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) __be32 daddr; /* Use correct destination address if we have options. */ - daddr = inet->daddr; + daddr = inet->inet_daddr; if(opt && opt->srr) daddr = opt->faddr; @@ -338,13 +341,13 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok) .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = daddr, - .saddr = inet->saddr, + .saddr = inet->inet_saddr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet->sport, - .dport = inet->dport } } }; + { .sport = inet->inet_sport, + .dport = inet->inet_dport } } }; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times @@ -379,7 +382,7 @@ packet_routed: if (opt && opt->optlen) { iph->ihl += opt->optlen >> 2; - ip_options_build(skb, opt, inet->daddr, rt, 0); + ip_options_build(skb, opt, inet->inet_daddr, rt, 0); } ip_select_ident_more(iph, &rt->u.dst, sk, @@ -501,8 +504,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) if (skb->sk) { frag->sk = skb->sk; frag->destructor = sock_wfree; - truesizes += frag->truesize; } + truesizes += frag->truesize; } /* Everything is OK. Generate! */ @@ -846,7 +849,8 @@ int ip_append_data(struct sock *sk, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + length > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, + mtu-exthdrlen); return -EMSGSIZE; } @@ -1100,7 +1104,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; if (inet->cork.length + size > 0xFFFF - fragheaderlen) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu); return -EMSGSIZE; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e982b5c1ee17..1e64dabbd232 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -23,6 +23,7 @@ #include <linux/icmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -245,7 +246,7 @@ int ip_ra_control(struct sock *sk, unsigned char on, { struct ip_ra_chain *ra, *new_ra, **rap; - if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW) return -EINVAL; new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; @@ -451,7 +452,8 @@ static int do_ip_setsockopt(struct sock *sk, int level, (1<<IP_TTL) | (1<<IP_HDRINCL) | (1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) | (1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) | - (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) || + (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) | + (1<<IP_MINTTL))) || optname == IP_MULTICAST_TTL || optname == IP_MULTICAST_ALL || optname == IP_MULTICAST_LOOP || @@ -480,7 +482,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, case IP_OPTIONS: { struct ip_options *opt = NULL; - if (optlen > 40 || optlen < 0) + if (optlen > 40) goto e_inval; err = ip_options_get_from_user(sock_net(sk), &opt, optval, optlen); @@ -492,7 +494,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && - inet->daddr != LOOPBACK4_IPV6)) { + inet->inet_daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) icsk->icsk_ext_hdr_len -= inet->opt->optlen; @@ -575,7 +577,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, inet->hdrincl = val ? 1 : 0; break; case IP_MTU_DISCOVER: - if (val < 0 || val > 3) + if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE) goto e_inval; inet->pmtudisc = val; break; @@ -936,6 +938,14 @@ mc_msf_out: inet->transparent = !!val; break; + case IP_MINTTL: + if (optlen < 1) + goto e_inval; + if (val < 0 || val > 255) + goto e_inval; + inet->min_ttl = val; + break; + default: err = -ENOPROTOOPT; break; @@ -1180,8 +1190,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (inet->cmsg_flags & IP_CMSG_PKTINFO) { struct in_pktinfo info; - info.ipi_addr.s_addr = inet->rcv_saddr; - info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_addr.s_addr = inet->inet_rcv_saddr; + info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr; info.ipi_ifindex = inet->mc_index; put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); } @@ -1198,6 +1208,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_TRANSPARENT: val = inet->transparent; break; + case IP_MINTTL: + val = inet->min_ttl; + break; default: release_sock(sk); return -ENOPROTOOPT; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 38fbf04150ae..629067571f02 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -25,6 +25,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) { + struct net *net = dev_net(skb->dev); __be32 spi; struct iphdr *iph = (struct iphdr *)skb->data; struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); @@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) return; spi = htonl(ntohs(ipch->cpi)); - x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr, + x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET); if (!x) return; @@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info) /* We always hold one tunnel user reference to indicate a tunnel */ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { + struct net *net = xs_net(x); struct xfrm_state *t; - t = xfrm_state_alloc(&init_net); + t = xfrm_state_alloc(net); if (t == NULL) goto out; @@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t->props.mode = x->props.mode; t->props.saddr.a4 = x->props.saddr.a4; t->props.flags = x->props.flags; + memcpy(&t->mark, &x->mark, sizeof(t->mark)); if (xfrm_init_state(t)) goto error; @@ -82,10 +85,12 @@ error: */ static int ipcomp_tunnel_attach(struct xfrm_state *x) { + struct net *net = xs_net(x); int err = 0; struct xfrm_state *t; + u32 mark = x->mark.v & x->mark.m; - t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4, + t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4, x->props.saddr.a4, IPPROTO_IPIP, AF_INET); if (!t) { t = ipcomp_tunnel_create(x); @@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x) if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); if (err) - goto error_tunnel; + goto out; } err = 0; out: return err; - -error_tunnel: - ipcomp_destroy(x); - goto out; } static const struct xfrm_type ipcomp_type = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index f8d04c256454..067ce9e043dc 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -53,6 +53,7 @@ #include <linux/root_dev.h> #include <linux/delay.h> #include <linux/nfs_fs.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/arp.h> #include <net/ip.h> @@ -187,6 +188,16 @@ struct ic_device { static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ static struct net_device *ic_dev __initdata = NULL; /* Selected device */ +static bool __init ic_device_match(struct net_device *dev) +{ + if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) + return true; + return false; +} + static int __init ic_open_devs(void) { struct ic_device *d, **last; @@ -207,10 +218,7 @@ static int __init ic_open_devs(void) for_each_netdev(&init_net, dev) { if (dev->flags & IFF_LOOPBACK) continue; - if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : - (!(dev->flags & IFF_LOOPBACK) && - (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && - strncmp(dev->name, "dummy", 5))) { + if (ic_device_match(dev)) { int able = 0; if (dev->mtu >= 364) able |= IC_BOOTP; @@ -228,7 +236,7 @@ static int __init ic_open_devs(void) } if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { rtnl_unlock(); - return -1; + return -ENOMEM; } d->dev = dev; *last = d; @@ -253,7 +261,7 @@ static int __init ic_open_devs(void) printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); else printk(KERN_ERR "IP-Config: No network devices available.\n"); - return -1; + return -ENODEV; } return 0; } @@ -1172,10 +1180,9 @@ static int __init ic_dynamic(void) schedule_timeout_uninterruptible(1); #ifdef IPCONFIG_DHCP /* DHCP isn't done until we get a DHCPACK. */ - if ((ic_got_reply & IC_BOOTP) - && (ic_proto_enabled & IC_USE_DHCP) - && ic_dhcp_msgtype != DHCPACK) - { + if ((ic_got_reply & IC_BOOTP) && + (ic_proto_enabled & IC_USE_DHCP) && + ic_dhcp_msgtype != DHCPACK) { ic_got_reply = 0; printk(","); continue; @@ -1304,6 +1311,32 @@ __be32 __init root_nfs_parse_addr(char *name) return addr; } +#define DEVICE_WAIT_MAX 12 /* 12 seconds */ + +static int __init wait_for_devices(void) +{ + int i; + + msleep(CONF_PRE_OPEN); + for (i = 0; i < DEVICE_WAIT_MAX; i++) { + struct net_device *dev; + int found = 0; + + rtnl_lock(); + for_each_netdev(&init_net, dev) { + if (ic_device_match(dev)) { + found = 1; + break; + } + } + rtnl_unlock(); + if (found) + return 0; + ssleep(1); + } + return -ENODEV; +} + /* * IP Autoconfig dispatcher. */ @@ -1314,6 +1347,7 @@ static int __init ip_auto_config(void) #ifdef IPCONFIG_DYNAMIC int retries = CONF_OPEN_RETRIES; #endif + int err; #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); @@ -1326,12 +1360,15 @@ static int __init ip_auto_config(void) #ifdef IPCONFIG_DYNAMIC try_try_again: #endif - /* Give hardware a chance to settle */ - msleep(CONF_PRE_OPEN); + /* Wait for devices to appear */ + err = wait_for_devices(); + if (err) + return err; /* Setup all network devices */ - if (ic_open_devs() < 0) - return -1; + err = ic_open_devs(); + if (err) + return err; /* Give drivers a chance to settle */ ssleep(CONF_POST_OPEN); @@ -1344,9 +1381,9 @@ static int __init ip_auto_config(void) */ if (ic_myaddr == NONE || #ifdef CONFIG_ROOT_NFS - (root_server_addr == NONE - && ic_servaddr == NONE - && ROOT_DEV == Root_NFS) || + (root_server_addr == NONE && + ic_servaddr == NONE && + ROOT_DEV == Root_NFS) || #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC @@ -1447,7 +1484,7 @@ late_initcall(ip_auto_config); /* * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel - * command line parameter. See Documentation/filesystems/nfsroot.txt. + * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt. */ static int __init ic_proto_name(char *name) { diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ae40ed1ba560..0b27b14dcc9d 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -95,6 +95,7 @@ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <asm/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> @@ -119,7 +120,7 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static int ipip_net_id; +static int ipip_net_id __read_mostly; struct ipip_net { struct ip_tunnel *tunnels_r_l[HASH_SIZE]; struct ip_tunnel *tunnels_r[HASH_SIZE]; @@ -130,11 +131,16 @@ struct ipip_net { struct net_device *fb_tunnel_dev; }; -static void ipip_fb_tunnel_init(struct net_device *dev); static void ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); -static DEFINE_RWLOCK(ipip_lock); +/* + * Locking : hash tables are protected by RCU and a spinlock + */ +static DEFINE_SPINLOCK(ipip_lock); + +#define for_each_ip_tunnel_rcu(start) \ + for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) @@ -144,20 +150,21 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) { + for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1]) if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; - } - for (t = ipn->tunnels_r[h0]; t; t = t->next) { + + for_each_ip_tunnel_rcu(ipn->tunnels_r[h0]) if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) return t; - } - for (t = ipn->tunnels_l[h1]; t; t = t->next) { + + for_each_ip_tunnel_rcu(ipn->tunnels_l[h1]) if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) return t; - } - if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + + t = rcu_dereference(ipn->tunnels_wc[0]); + if (t && (t->dev->flags&IFF_UP)) return t; return NULL; } @@ -193,9 +200,9 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { if (t == *tp) { - write_lock_bh(&ipip_lock); + spin_lock_bh(&ipip_lock); *tp = t->next; - write_unlock_bh(&ipip_lock); + spin_unlock_bh(&ipip_lock); break; } } @@ -205,10 +212,10 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { struct ip_tunnel **tp = ipip_bucket(ipn, t); + spin_lock_bh(&ipip_lock); t->next = *tp; - write_lock_bh(&ipip_lock); - *tp = t; - write_unlock_bh(&ipip_lock); + rcu_assign_pointer(*tp, t); + spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -267,9 +274,9 @@ static void ipip_tunnel_uninit(struct net_device *dev) struct ipip_net *ipn = net_generic(net, ipip_net_id); if (dev == ipn->fb_tunnel_dev) { - write_lock_bh(&ipip_lock); + spin_lock_bh(&ipip_lock); ipn->tunnels_wc[0] = NULL; - write_unlock_bh(&ipip_lock); + spin_unlock_bh(&ipip_lock); } else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); @@ -318,7 +325,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) err = -ENOENT; - read_lock(&ipip_lock); + rcu_read_lock(); t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr); if (t == NULL || t->parms.iph.daddr == 0) goto out; @@ -333,7 +340,7 @@ static int ipip_err(struct sk_buff *skb, u32 info) t->err_count = 1; t->err_time = jiffies; out: - read_unlock(&ipip_lock); + rcu_read_unlock(); return err; } @@ -351,11 +358,11 @@ static int ipip_rcv(struct sk_buff *skb) struct ip_tunnel *tunnel; const struct iphdr *iph = ip_hdr(skb); - read_lock(&ipip_lock); + rcu_read_lock(); if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr)) != NULL) { if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { - read_unlock(&ipip_lock); + rcu_read_unlock(); kfree_skb(skb); return 0; } @@ -374,10 +381,10 @@ static int ipip_rcv(struct sk_buff *skb) nf_reset(skb); ipip_ecn_decapsulate(iph, skb); netif_rx(skb); - read_unlock(&ipip_lock); + rcu_read_unlock(); return 0; } - read_unlock(&ipip_lock); + rcu_read_unlock(); return -1; } @@ -390,7 +397,8 @@ static int ipip_rcv(struct sk_buff *skb) static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &tunnel->dev->stats; + struct net_device_stats *stats = &dev->stats; + struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; @@ -480,7 +488,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - stats->tx_dropped++; + txq->tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -722,7 +730,7 @@ static void ipip_tunnel_init(struct net_device *dev) ipip_tunnel_bind_dev(dev); } -static void ipip_fb_tunnel_init(struct net_device *dev) +static void __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -748,33 +756,27 @@ static struct xfrm_tunnel ipip_handler = { static const char banner[] __initconst = KERN_INFO "IPv4 over IPv4 tunneling driver\n"; -static void ipip_destroy_tunnels(struct ipip_net *ipn) +static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) { int prio; for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t; - while ((t = ipn->tunnels[prio][h]) != NULL) - unregister_netdevice(t->dev); + struct ip_tunnel *t = ipn->tunnels[prio][h]; + + while (t != NULL) { + unregister_netdevice_queue(t->dev, head); + t = t->next; + } } } } -static int ipip_init_net(struct net *net) +static int __net_init ipip_init_net(struct net *net) { + struct ipip_net *ipn = net_generic(net, ipip_net_id); int err; - struct ipip_net *ipn; - - err = -ENOMEM; - ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL); - if (ipn == NULL) - goto err_alloc; - - err = net_assign_generic(net, ipip_net_id, ipn); - if (err < 0) - goto err_assign; ipn->tunnels[0] = ipn->tunnels_wc; ipn->tunnels[1] = ipn->tunnels_l; @@ -801,27 +803,26 @@ err_reg_dev: free_netdev(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ -err_assign: - kfree(ipn); -err_alloc: return err; } -static void ipip_exit_net(struct net *net) +static void __net_exit ipip_exit_net(struct net *net) { - struct ipip_net *ipn; + struct ipip_net *ipn = net_generic(net, ipip_net_id); + LIST_HEAD(list); - ipn = net_generic(net, ipip_net_id); rtnl_lock(); - ipip_destroy_tunnels(ipn); - unregister_netdevice(ipn->fb_tunnel_dev); + ipip_destroy_tunnels(ipn, &list); + unregister_netdevice_queue(ipn->fb_tunnel_dev, &list); + unregister_netdevice_many(&list); rtnl_unlock(); - kfree(ipn); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, .exit = ipip_exit_net, + .id = &ipip_net_id, + .size = sizeof(struct ipip_net), }; static int __init ipip_init(void) @@ -830,15 +831,14 @@ static int __init ipip_init(void) printk(banner); - if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) { + err = register_pernet_device(&ipip_net_ops); + if (err < 0) + return err; + err = xfrm4_tunnel_register(&ipip_handler, AF_INET); + if (err < 0) { + unregister_pernet_device(&ipip_net_ops); printk(KERN_INFO "ipip init: can't register tunnel\n"); - return -EAGAIN; } - - err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops); - if (err) - xfrm4_tunnel_deregister(&ipip_handler, AF_INET); - return err; } @@ -847,7 +847,7 @@ static void __exit ipip_fini(void) if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET)) printk(KERN_INFO "ipip close: can't deregister tunnel\n"); - unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops); + unregister_pernet_device(&ipip_net_ops); } module_init(ipip_init); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 99508d66a642..9d4f6d1340a4 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -47,6 +47,7 @@ #include <linux/mroute.h> #include <linux/init.h> #include <linux/if_ether.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -275,7 +276,8 @@ failure: * @notify: Set to 1, if the caller is a notifier_call */ -static int vif_delete(struct net *net, int vifi, int notify) +static int vif_delete(struct net *net, int vifi, int notify, + struct list_head *head) { struct vif_device *v; struct net_device *dev; @@ -319,7 +321,7 @@ static int vif_delete(struct net *net, int vifi, int notify) } if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) - unregister_netdevice(dev); + unregister_netdevice_queue(dev, head); dev_put(dev); return 0; @@ -469,8 +471,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) return err; } break; + + case VIFF_USE_IFINDEX: case 0: - dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); + if (vifc->vifc_flags == VIFF_USE_IFINDEX) { + dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); + if (dev && dev->ip_ptr == NULL) { + dev_put(dev); + return -EADDRNOTAVAIL; + } + } else + dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); + if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -791,6 +803,9 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) int line; struct mfc_cache *uc, *c, **cp; + if (mfc->mfcc_parent >= MAXVIFS) + return -ENFILE; + line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); for (cp = &net->ipv4.mfc_cache_array[line]; @@ -862,14 +877,16 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock) static void mroute_clean_tables(struct net *net) { int i; + LIST_HEAD(list); /* * Shut down all active vif entries */ for (i = 0; i < net->ipv4.maxvif; i++) { if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC)) - vif_delete(net, i, 0); + vif_delete(net, i, 0, &list); } + unregister_netdevice_many(&list); /* * Wipe the cache @@ -948,7 +965,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi switch (optname) { case MRT_INIT: if (sk->sk_type != SOCK_RAW || - inet_sk(sk)->num != IPPROTO_IGMP) + inet_sk(sk)->inet_num != IPPROTO_IGMP) return -EOPNOTSUPP; if (optlen != sizeof(int)) return -ENOPROTOOPT; @@ -985,7 +1002,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (optname == MRT_ADD_VIF) { ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk); } else { - ret = vif_delete(net, vif.vifc_vifi, 0); + ret = vif_delete(net, vif.vifc_vifi, 0, NULL); } rtnl_unlock(); return ret; @@ -1148,17 +1165,16 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v struct net *net = dev_net(dev); struct vif_device *v; int ct; - - if (!net_eq(dev_net(dev), net)) - return NOTIFY_DONE; + LIST_HEAD(list); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; v = &net->ipv4.vif_table[0]; for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) { if (v->dev == dev) - vif_delete(net, ct, 1); + vif_delete(net, ct, 1, &list); } + unregister_netdevice_many(&list); return NOTIFY_DONE; } @@ -1601,17 +1617,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) int ct; struct rtnexthop *nhp; struct net *net = mfc_net(c); - struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev; u8 *b = skb_tail_pointer(skb); struct rtattr *mp_head; - if (dev) - RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + /* If cache is unresolved, don't try to parse IIF and OIF */ + if (c->mfc_parent > MAXVIFS) + return -ENOENT; + + if (VIF_EXISTS(net, c->mfc_parent)) + RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex); mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0)); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { - if (c->mfc_un.res.ttls[ct] < 255) { + if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) { if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) goto rtattr_failure; nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 1725dc0ef688..82fb43c5c59e 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -4,6 +4,7 @@ #include <linux/netfilter_ipv4.h> #include <linux/ip.h> #include <linux/skbuff.h> +#include <linux/gfp.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> @@ -155,10 +156,10 @@ static int nf_ip_reroute(struct sk_buff *skb, if (entry->hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); - if (!(iph->tos == rt_info->tos - && skb->mark == rt_info->mark - && iph->daddr == rt_info->daddr - && iph->saddr == rt_info->saddr)) + if (!(iph->tos == rt_info->tos && + skb->mark == rt_info->mark && + iph->daddr == rt_info->daddr && + iph->saddr == rt_info->saddr)) return ip_route_me_harder(skb, RTN_UNSPEC); } return 0; @@ -248,9 +249,9 @@ module_exit(ipv4_netfilter_fini); #ifdef CONFIG_SYSCTL struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "netfilter", }, { } }; EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path); diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 27774c99d888..f07d77f65751 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -27,6 +27,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); @@ -58,6 +59,12 @@ do { \ #define ARP_NF_ASSERT(x) #endif +void *arpt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(arpt, ARPT); +} +EXPORT_SYMBOL_GPL(arpt_alloc_initial_table); + static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, const char *hdr_addr, int len) { @@ -226,7 +233,14 @@ arpt_error(struct sk_buff *skb, const struct xt_target_param *par) return NF_DROP; } -static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +static inline const struct arpt_entry_target * +arpt_get_target_c(const struct arpt_entry *e) +{ + return arpt_get_target((struct arpt_entry *)e); +} + +static inline struct arpt_entry * +get_entry(const void *base, unsigned int offset) { return (struct arpt_entry *)(base + offset); } @@ -273,7 +287,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { - struct arpt_entry_target *t; + const struct arpt_entry_target *t; int hdr_len; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { @@ -285,7 +299,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, (2 * skb->dev->addr_len); ADD_COUNTER(e->counters, hdr_len, 1); - t = arpt_get_target(e); + t = arpt_get_target_c(e); /* Standard target? */ if (!t->u.kernel.target->target) { @@ -351,7 +365,7 @@ static inline bool unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct xt_table_info *newinfo, +static int mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -372,7 +386,7 @@ static int mark_source_chains(struct xt_table_info *newinfo, for (;;) { const struct arpt_standard_target *t - = (void *)arpt_get_target(e); + = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { @@ -384,11 +398,11 @@ static int mark_source_chains(struct xt_table_info *newinfo, |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct arpt_entry) - && (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) - && t->verdict < 0 - && unconditional(&e->arp)) || visited) { + if ((e->target_offset == sizeof(struct arpt_entry) && + (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) && + t->verdict < 0 && unconditional(&e->arp)) || + visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -427,8 +441,8 @@ static int mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0 - && newpos >= 0) { + ARPT_STANDARD_TARGET) == 0 && + newpos >= 0) { if (newpos > newinfo->size - sizeof(struct arpt_entry)) { duprintf("mark_source_chains: " @@ -456,7 +470,7 @@ static int mark_source_chains(struct xt_table_info *newinfo, return 1; } -static inline int check_entry(struct arpt_entry *e, const char *name) +static inline int check_entry(const struct arpt_entry *e, const char *name) { const struct arpt_entry_target *t; @@ -468,7 +482,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name) if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) return -EINVAL; - t = arpt_get_target(e); + t = arpt_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; @@ -498,8 +512,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) } static inline int -find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { struct arpt_entry_target *t; struct xt_target *target; @@ -524,8 +537,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size, ret = check_target(e, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); @@ -533,14 +544,14 @@ out: return ret; } -static bool check_underflow(struct arpt_entry *e) +static bool check_underflow(const struct arpt_entry *e) { const struct arpt_entry_target *t; unsigned int verdict; if (!unconditional(&e->arp)) return false; - t = arpt_get_target(e); + t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct arpt_standard_target *)t)->verdict; @@ -550,17 +561,16 @@ static bool check_underflow(struct arpt_entry *e) static inline int check_entry_size_and_hooks(struct arpt_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int valid_hooks, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; - if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 - || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -592,19 +602,14 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +static inline void cleanup_entry(struct arpt_entry *e) { struct xt_tgdtor_param par; struct arpt_entry_target *t; - if (i && (*i)-- == 0) - return 1; - t = arpt_get_target(e); par.target = t->u.kernel.target; par.targinfo = t->data; @@ -612,26 +617,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - return 0; } /* Checks and translates the user-supplied table segment (held in * newinfo). */ -static int translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +static int translate_table(struct xt_table_info *newinfo, void *entry0, + const struct arpt_replace *repl) { + struct arpt_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { @@ -643,52 +642,63 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, valid_hooks, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + break; + ++i; + } duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) return ret; - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_ARP_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) { + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - ARPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter); + } return ret; } @@ -701,30 +711,10 @@ static int translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int add_entry_to_counter(const struct arpt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int set_entry_to_counter(const struct arpt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct arpt_entry *iter; unsigned int cpu; unsigned int i; unsigned int curcpu; @@ -740,32 +730,32 @@ static void get_counters(const struct xt_table_info *t, curcpu = smp_processor_id(); i = 0; - ARPT_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[curcpu], t->size) { + SET_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; xt_info_wrlock(cpu); - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + ADD_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } xt_info_wrunlock(cpu); } local_bh_enable(); } -static struct xt_counters *alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -783,11 +773,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table) } static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct arpt_entry *e; + const struct arpt_entry *e; struct xt_counters *counters; struct xt_table_info *private = table->private; int ret = 0; @@ -807,7 +797,7 @@ static int copy_entries_to_user(unsigned int total_size, /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ - struct arpt_entry_target *t; + const struct arpt_entry_target *t; e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -818,7 +808,7 @@ static int copy_entries_to_user(unsigned int total_size, goto free_counters; } - t = arpt_get_target(e); + t = arpt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset + offsetof(struct arpt_entry_target, u.user.name), @@ -835,7 +825,7 @@ static int copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -844,7 +834,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -853,18 +843,18 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static int compat_calc_entry(struct arpt_entry *e, +static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct arpt_entry_target *t; + const struct arpt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); entry_offset = (void *)e - base; - t = arpt_get_target(e); + t = arpt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off); @@ -885,7 +875,9 @@ static int compat_calc_entry(struct arpt_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct arpt_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -894,13 +886,17 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { char name[ARPT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -925,10 +921,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; @@ -959,7 +955,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int get_entries(struct net *net, struct arpt_get_entries __user *uptr, - int *len) + const int *len) { int ret; struct arpt_get_entries get; @@ -1010,6 +1006,7 @@ static int __do_replace(struct net *net, const char *name, struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; + struct arpt_entry *iter; ret = 0; counters = vmalloc_node(num_counters * sizeof(struct xt_counters), @@ -1053,8 +1050,8 @@ static int __do_replace(struct net *net, const char *name, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter); xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, @@ -1073,12 +1070,14 @@ static int __do_replace(struct net *net, const char *name, return ret; } -static int do_replace(struct net *net, void __user *user, unsigned int len) +static int do_replace(struct net *net, const void __user *user, + unsigned int len) { int ret; struct arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct arpt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1099,9 +1098,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1114,27 +1111,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -static int do_add_counters(struct net *net, void __user *user, unsigned int len, - int compat) +static int do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) { unsigned int i, curcpu; struct xt_counters_info tmp; @@ -1147,6 +1132,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; + struct arpt_entry *iter; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1204,11 +1190,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); - ARPT_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } xt_info_wrunlock(curcpu); unlock_up_free: local_bh_enable(); @@ -1221,28 +1206,22 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len, } #ifdef CONFIG_COMPAT -static inline int -compat_release_entry(struct compat_arpt_entry *e, unsigned int *i) +static inline void compat_release_entry(struct compat_arpt_entry *e) { struct arpt_entry_target *t; - if (i && (*i)-- == 0) - return 1; - t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static inline int check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { struct arpt_entry_target *t; @@ -1251,8 +1230,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, int ret, off, h; duprintf("check_compat_entry_size_and_hooks %p\n", e); - if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 - || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1302,8 +1281,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; release_target: @@ -1347,19 +1324,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, return ret; } -static inline int compat_check_entry(struct arpt_entry *e, const char *name, - unsigned int *i) -{ - int ret; - - ret = check_target(e, name); - if (ret) - return ret; - - (*i)++; - return 0; -} - static int translate_compat_table(const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, @@ -1372,8 +1336,10 @@ static int translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_arpt_entry *iter0; + struct arpt_entry *iter1; unsigned int size; - int ret; + int ret = 0; info = *pinfo; entry0 = *pentry0; @@ -1390,13 +1356,17 @@ static int translate_compat_table(const char *name, j = 0; xt_compat_lock(NFPROTO_ARP); /* Walk through entries, checking offsets. */ - ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1435,9 +1405,12 @@ static int translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(NFPROTO_ARP); xt_compat_unlock(NFPROTO_ARP); if (ret) @@ -1448,13 +1421,32 @@ static int translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = check_target(iter1, name); + if (ret != 0) + break; + ++i; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1); + } xt_free_table_info(newinfo); return ret; } @@ -1472,7 +1464,11 @@ static int translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(NFPROTO_ARP); @@ -1499,6 +1495,7 @@ static int compat_do_replace(struct net *net, void __user *user, struct compat_arpt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct arpt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1536,7 +1533,8 @@ static int compat_do_replace(struct net *net, void __user *user, return 0; free_newinfo_untrans: - ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1570,7 +1568,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { struct arpt_entry_target *t; struct compat_arpt_entry __user *ce; @@ -1578,14 +1576,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, compat_uint_t origsize; int ret; - ret = -EFAULT; origsize = *size; ce = (struct compat_arpt_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct arpt_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_arpt_entry); *size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry); @@ -1595,18 +1591,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, t = arpt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int compat_copy_entries_to_user(unsigned int total_size, @@ -1620,6 +1610,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, int ret = 0; void *loc_cpu_entry; unsigned int i = 0; + struct arpt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1629,9 +1620,12 @@ static int compat_copy_entries_to_user(unsigned int total_size, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; } @@ -1799,12 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); - + ret = translate_table(newinfo, loc_cpu_entry, repl); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) goto out_free; @@ -1827,13 +1816,14 @@ void arpt_unregister_table(struct xt_table *table) struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct arpt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, - cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c index 97337601827a..79ca5e70d497 100644 --- a/net/ipv4/netfilter/arptable_filter.c +++ b/net/ipv4/netfilter/arptable_filter.c @@ -6,7 +6,9 @@ */ #include <linux/module.h> +#include <linux/netfilter/x_tables.h> #include <linux/netfilter_arp/arp_tables.h> +#include <linux/slab.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("David S. Miller <davem@redhat.com>"); @@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table"); #define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ (1 << NF_ARP_FORWARD)) -static const struct -{ - struct arpt_replace repl; - struct arpt_standard entries[3]; - struct arpt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error), - .hook_entry = { - [NF_ARP_IN] = 0, - [NF_ARP_OUT] = sizeof(struct arpt_standard), - [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), - }, - .underflow = { - [NF_ARP_IN] = 0, - [NF_ARP_OUT] = sizeof(struct arpt_standard), - [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), - }, - }, - .entries = { - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_IN */ - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_OUT */ - ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_FORWARD */ - }, - .term = ARPT_ERROR_INIT, -}; - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_ARP, + .priority = NF_IP_PRI_FILTER, }; /* The work comes in here from netfilter.c */ -static unsigned int arpt_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +static unsigned int +arptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return arpt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.arptable_filter); -} + const struct net *net = dev_net((in != NULL) ? in : out); -static unsigned int arpt_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return arpt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.arptable_filter); + return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter); } -static struct nf_hook_ops arpt_ops[] __read_mostly = { - { - .hook = arpt_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_IN, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = arpt_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_OUT, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = arpt_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_ARP, - .hooknum = NF_ARP_FORWARD, - .priority = NF_IP_PRI_FILTER, - }, -}; +static struct nf_hook_ops *arpfilter_ops __read_mostly; static int __net_init arptable_filter_net_init(struct net *net) { - /* Register table */ + struct arpt_replace *repl; + + repl = arpt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; net->ipv4.arptable_filter = - arpt_register_table(net, &packet_filter, &initial_table.repl); + arpt_register_table(net, &packet_filter, repl); + kfree(repl); if (IS_ERR(net->ipv4.arptable_filter)) return PTR_ERR(net->ipv4.arptable_filter); return 0; @@ -125,9 +71,11 @@ static int __init arptable_filter_init(void) if (ret < 0) return ret; - ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); - if (ret < 0) + arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook); + if (IS_ERR(arpfilter_ops)) { + ret = PTR_ERR(arpfilter_ops); goto cleanup_table; + } return ret; cleanup_table: @@ -137,7 +85,7 @@ cleanup_table: static void __exit arptable_filter_fini(void) { - nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops)); + xt_hook_unlink(&packet_filter, arpfilter_ops); unregister_pernet_subsys(&arptable_filter_net_ops); } diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c index c156db215987..e2787048aa0a 100644 --- a/net/ipv4/netfilter/ip_queue.c +++ b/net/ipv4/netfilter/ip_queue.c @@ -26,6 +26,7 @@ #include <linux/security.h> #include <linux/net.h> #include <linux/mutex.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/route.h> @@ -497,10 +498,9 @@ ipq_rcv_nl_event(struct notifier_block *this, { struct netlink_notify *n = ptr; - if (event == NETLINK_URELEASE && - n->protocol == NETLINK_FIREWALL && n->pid) { + if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) { write_lock_bh(&queue_lock); - if ((n->net == &init_net) && (n->pid == peer_pid)) + if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid)) __ipq_reset(); write_unlock_bh(&queue_lock); } @@ -516,14 +516,13 @@ static struct ctl_table_header *ipq_sysctl_header; static ctl_table ipq_table[] = { { - .ctl_name = NET_IPQ_QMAX, .procname = NET_IPQ_QMAX_NAME, .data = &queue_maxlen, .maxlen = sizeof(queue_maxlen), .mode = 0644, .proc_handler = proc_dointvec }, - { .ctl_name = 0 } + { } }; #endif @@ -622,7 +621,7 @@ cleanup_netlink_notifier: static void __exit ip_queue_fini(void) { nf_unregister_queue_handlers(&nfqh); - synchronize_net(); + ipq_flush(NULL, 0); #ifdef CONFIG_SYSCTL diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cde755d5eeab..b29c66df8d1f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -28,6 +28,7 @@ #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <net/netfilter/nf_log.h> +#include "../../netfilter/xt_repldata.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); @@ -66,6 +67,12 @@ do { \ #define inline #endif +void *ipt_alloc_initial_table(const struct xt_table *info) +{ + return xt_alloc_initial_table(ipt, IPT); +} +EXPORT_SYMBOL_GPL(ipt_alloc_initial_table); + /* We keep a set of rules for each CPU, so we can avoid write-locking them in the softirq when updating the counters and therefore @@ -89,9 +96,9 @@ ip_packet_match(const struct iphdr *ip, #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg))) if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, - IPT_INV_SRCIP) - || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, - IPT_INV_DSTIP)) { + IPT_INV_SRCIP) || + FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { dprintf("Source or dest mismatch.\n"); dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n", @@ -122,8 +129,8 @@ ip_packet_match(const struct iphdr *ip, } /* Check specific protocol */ - if (ipinfo->proto - && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { + if (ipinfo->proto && + FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { dprintf("Packet protocol %hi does not match %hi.%s\n", ip->protocol, ipinfo->proto, ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); @@ -169,7 +176,7 @@ ipt_error(struct sk_buff *skb, const struct xt_target_param *par) /* Performance critical - called for every packet */ static inline bool -do_match(struct ipt_entry_match *m, const struct sk_buff *skb, +do_match(const struct ipt_entry_match *m, const struct sk_buff *skb, struct xt_match_param *par) { par->match = m->u.kernel.match; @@ -184,7 +191,7 @@ do_match(struct ipt_entry_match *m, const struct sk_buff *skb, /* Performance critical */ static inline struct ipt_entry * -get_entry(void *base, unsigned int offset) +get_entry(const void *base, unsigned int offset) { return (struct ipt_entry *)(base + offset); } @@ -199,6 +206,13 @@ static inline bool unconditional(const struct ipt_ip *ip) #undef FWINV } +/* for const-correctness */ +static inline const struct ipt_entry_target * +ipt_get_target_c(const struct ipt_entry *e) +{ + return ipt_get_target((struct ipt_entry *)e); +} + #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \ defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE) static const char *const hooknames[] = { @@ -233,11 +247,11 @@ static struct nf_loginfo trace_loginfo = { /* Mildly perf critical (only if packet tracing is on) */ static inline int -get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, +get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - struct ipt_standard_target *t = (void *)ipt_get_target(s); + const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ @@ -246,11 +260,11 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, } else if (s == e) { (*rulenum)++; - if (s->target_offset == sizeof(struct ipt_entry) - && strcmp(t->target.u.kernel.target->name, - IPT_STANDARD_TARGET) == 0 - && t->verdict < 0 - && unconditional(&s->ip)) { + if (s->target_offset == sizeof(struct ipt_entry) && + strcmp(t->target.u.kernel.target->name, + IPT_STANDARD_TARGET) == 0 && + t->verdict < 0 && + unconditional(&s->ip)) { /* Tail of chains: STANDARD target (return/policy) */ *comment = *chainname == hookname ? comments[NF_IP_TRACE_COMMENT_POLICY] @@ -263,17 +277,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e, return 0; } -static void trace_packet(struct sk_buff *skb, +static void trace_packet(const struct sk_buff *skb, unsigned int hook, const struct net_device *in, const struct net_device *out, const char *tablename, - struct xt_table_info *private, - struct ipt_entry *e) + const struct xt_table_info *private, + const struct ipt_entry *e) { - void *table_base; + const void *table_base; const struct ipt_entry *root; const char *hookname, *chainname, *comment; + const struct ipt_entry *iter; unsigned int rulenum = 0; table_base = private->entries[smp_processor_id()]; @@ -282,10 +297,10 @@ static void trace_packet(struct sk_buff *skb, hookname = chainname = hooknames[hook]; comment = comments[NF_IP_TRACE_COMMENT_RULE]; - IPT_ENTRY_ITERATE(root, - private->size - private->hook_entry[hook], - get_chainname_rulenum, - e, hookname, &chainname, &comment, &rulenum); + xt_entry_foreach(iter, root, private->size - private->hook_entry[hook]) + if (get_chainname_rulenum(iter, e, hookname, + &chainname, &comment, &rulenum) != 0) + break; nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo, "TRACE: %s:%s:%s:%u ", @@ -315,9 +330,9 @@ ipt_do_table(struct sk_buff *skb, /* Initializing verdict to NF_DROP keeps gcc happy. */ unsigned int verdict = NF_DROP; const char *indev, *outdev; - void *table_base; + const void *table_base; struct ipt_entry *e, *back; - struct xt_table_info *private; + const struct xt_table_info *private; struct xt_match_param mtpar; struct xt_target_param tgpar; @@ -350,17 +365,22 @@ ipt_do_table(struct sk_buff *skb, back = get_entry(table_base, private->underflow[hook]); do { - struct ipt_entry_target *t; + const struct ipt_entry_target *t; + const struct xt_entry_match *ematch; IP_NF_ASSERT(e); IP_NF_ASSERT(back); if (!ip_packet_match(ip, indev, outdev, - &e->ip, mtpar.fragoff) || - IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) { + &e->ip, mtpar.fragoff)) { + no_match: e = ipt_next_entry(e); continue; } + xt_ematch_foreach(ematch, e) + if (do_match(ematch, skb, &mtpar) != 0) + goto no_match; + ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); t = ipt_get_target(e); @@ -388,8 +408,8 @@ ipt_do_table(struct sk_buff *skb, back = get_entry(table_base, back->comefrom); continue; } - if (table_base + v != ipt_next_entry(e) - && !(e->ip.flags & IPT_F_GOTO)) { + if (table_base + v != ipt_next_entry(e) && + !(e->ip.flags & IPT_F_GOTO)) { /* Save old back ptr in next entry */ struct ipt_entry *next = ipt_next_entry(e); next->comefrom = (void *)back - table_base; @@ -443,7 +463,7 @@ ipt_do_table(struct sk_buff *skb, /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct xt_table_info *newinfo, +mark_source_chains(const struct xt_table_info *newinfo, unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -461,8 +481,8 @@ mark_source_chains(struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - struct ipt_standard_target *t - = (void *)ipt_get_target(e); + const struct ipt_standard_target *t + = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); if (e->comefrom & (1 << NF_INET_NUMHOOKS)) { @@ -473,11 +493,11 @@ mark_source_chains(struct xt_table_info *newinfo, e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS)); /* Unconditional return/END. */ - if ((e->target_offset == sizeof(struct ipt_entry) - && (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) - && t->verdict < 0 - && unconditional(&e->ip)) || visited) { + if ((e->target_offset == sizeof(struct ipt_entry) && + (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0) && + t->verdict < 0 && unconditional(&e->ip)) || + visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, @@ -524,8 +544,8 @@ mark_source_chains(struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0 - && newpos >= 0) { + IPT_STANDARD_TARGET) == 0 && + newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ipt_entry)) { duprintf("mark_source_chains: " @@ -552,27 +572,23 @@ mark_source_chains(struct xt_table_info *newinfo, return 1; } -static int -cleanup_match(struct ipt_entry_match *m, unsigned int *i) +static void cleanup_match(struct ipt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; - if (i && (*i)-- == 0) - return 1; - + par.net = net; par.match = m->u.kernel.match; par.matchinfo = m->data; par.family = NFPROTO_IPV4; if (par.match->destroy != NULL) par.match->destroy(&par); module_put(par.match->me); - return 0; } static int -check_entry(struct ipt_entry *e, const char *name) +check_entry(const struct ipt_entry *e, const char *name) { - struct ipt_entry_target *t; + const struct ipt_entry_target *t; if (!ip_checkentry(&e->ip)) { duprintf("ip_tables: ip check failed %p %s.\n", e, name); @@ -583,7 +599,7 @@ check_entry(struct ipt_entry *e, const char *name) e->next_offset) return -EINVAL; - t = ipt_get_target(e); + t = ipt_get_target_c(e); if (e->target_offset + t->u.target_size > e->next_offset) return -EINVAL; @@ -591,8 +607,7 @@ check_entry(struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; int ret; @@ -607,13 +622,11 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, par.match->name); return ret; } - ++*i; return 0; } static int -find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, - unsigned int *i) +find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; @@ -627,7 +640,7 @@ find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par, } m->u.kernel.match = match; - ret = check_match(m, par, i); + ret = check_match(m, par); if (ret) goto err; @@ -637,10 +650,11 @@ err: return ret; } -static int check_target(struct ipt_entry *e, const char *name) +static int check_target(struct ipt_entry *e, struct net *net, const char *name) { struct ipt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { + .net = net, .table = name, .entryinfo = e, .target = t->u.kernel.target, @@ -661,27 +675,32 @@ static int check_target(struct ipt_entry *e, const char *name) } static int -find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, - unsigned int *i) +find_check_entry(struct ipt_entry *e, struct net *net, const char *name, + unsigned int size) { struct ipt_entry_target *t; struct xt_target *target; int ret; unsigned int j; struct xt_mtchk_param mtpar; + struct xt_entry_match *ematch; ret = check_entry(e, name); if (ret) return ret; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; - ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j); - if (ret != 0) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = find_check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } t = ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, @@ -695,27 +714,29 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size, } t->u.kernel.target = target; - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto err; - - (*i)++; return 0; err: module_put(t->u.kernel.target->me); cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } -static bool check_underflow(struct ipt_entry *e) +static bool check_underflow(const struct ipt_entry *e) { const struct ipt_entry_target *t; unsigned int verdict; if (!unconditional(&e->ip)) return false; - t = ipt_get_target(e); + t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; verdict = ((struct ipt_standard_target *)t)->verdict; @@ -726,17 +747,16 @@ static bool check_underflow(struct ipt_entry *e) static int check_entry_size_and_hooks(struct ipt_entry *e, struct xt_table_info *newinfo, - unsigned char *base, - unsigned char *limit, + const unsigned char *base, + const unsigned char *limit, const unsigned int *hook_entries, const unsigned int *underflows, - unsigned int valid_hooks, - unsigned int *i) + unsigned int valid_hooks) { unsigned int h; - if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 - || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { duprintf("Bad offset %p\n", e); return -EINVAL; } @@ -768,50 +788,42 @@ check_entry_size_and_hooks(struct ipt_entry *e, /* Clear counters and comefrom */ e->counters = ((struct xt_counters) { 0, 0 }); e->comefrom = 0; - - (*i)++; return 0; } -static int -cleanup_entry(struct ipt_entry *e, unsigned int *i) +static void +cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; struct ipt_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_match *ematch; /* Cleanup all matches */ - IPT_MATCH_ITERATE(e, cleanup_match, NULL); + xt_ematch_foreach(ematch, e) + cleanup_match(ematch, net); t = ipt_get_target(e); + par.net = net; par.target = t->u.kernel.target; par.targinfo = t->data; par.family = NFPROTO_IPV4; if (par.target->destroy != NULL) par.target->destroy(&par); module_put(par.target->me); - return 0; } /* Checks and translates the user-supplied table segment (held in newinfo) */ static int -translate_table(const char *name, - unsigned int valid_hooks, - struct xt_table_info *newinfo, - void *entry0, - unsigned int size, - unsigned int number, - const unsigned int *hook_entries, - const unsigned int *underflows) +translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0, + const struct ipt_replace *repl) { + struct ipt_entry *iter; unsigned int i; - int ret; + int ret = 0; - newinfo->size = size; - newinfo->number = number; + newinfo->size = repl->size; + newinfo->number = repl->num_entries; /* Init all hooks to impossible value. */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { @@ -822,49 +834,58 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - check_entry_size_and_hooks, - newinfo, - entry0, - entry0 + size, - hook_entries, underflows, valid_hooks, &i); - if (ret != 0) - return ret; + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = check_entry_size_and_hooks(iter, newinfo, entry0, + entry0 + repl->size, + repl->hook_entry, + repl->underflow, + repl->valid_hooks); + if (ret != 0) + return ret; + ++i; + } - if (i != number) { + if (i != repl->num_entries) { duprintf("translate_table: %u not %u entries\n", - i, number); + i, repl->num_entries); return -EINVAL; } /* Check hooks all assigned */ for (i = 0; i < NF_INET_NUMHOOKS; i++) { /* Only hooks which are valid */ - if (!(valid_hooks & (1 << i))) + if (!(repl->valid_hooks & (1 << i))) continue; if (newinfo->hook_entry[i] == 0xFFFFFFFF) { duprintf("Invalid hook entry %u %u\n", - i, hook_entries[i]); + i, repl->hook_entry[i]); return -EINVAL; } if (newinfo->underflow[i] == 0xFFFFFFFF) { duprintf("Invalid underflow %u %u\n", - i, underflows[i]); + i, repl->underflow[i]); return -EINVAL; } } - if (!mark_source_chains(newinfo, valid_hooks, entry0)) + if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, - find_check_entry, name, size, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + ret = find_check_entry(iter, net, repl->name, repl->size); + if (ret != 0) + break; + ++i; + } if (ret != 0) { - IPT_ENTRY_ITERATE(entry0, newinfo->size, - cleanup_entry, &i); + xt_entry_foreach(iter, entry0, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter, net); + } return ret; } @@ -877,33 +898,11 @@ translate_table(const char *name, return ret; } -/* Gets counters. */ -static inline int -add_entry_to_counter(const struct ipt_entry *e, - struct xt_counters total[], - unsigned int *i) -{ - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - -static inline int -set_entry_to_counter(const struct ipt_entry *e, - struct ipt_counters total[], - unsigned int *i) -{ - SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); - - (*i)++; - return 0; -} - static void get_counters(const struct xt_table_info *t, struct xt_counters counters[]) { + struct ipt_entry *iter; unsigned int cpu; unsigned int i; unsigned int curcpu; @@ -919,32 +918,32 @@ get_counters(const struct xt_table_info *t, curcpu = smp_processor_id(); i = 0; - IPT_ENTRY_ITERATE(t->entries[curcpu], - t->size, - set_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[curcpu], t->size) { + SET_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; + } for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; xt_info_wrlock(cpu); - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_entry_to_counter, - counters, - &i); + xt_entry_foreach(iter, t->entries[cpu], t->size) { + ADD_COUNTER(counters[i], iter->counters.bcnt, + iter->counters.pcnt); + ++i; /* macro does multi eval of i */ + } xt_info_wrunlock(cpu); } local_bh_enable(); } -static struct xt_counters * alloc_counters(struct xt_table *table) +static struct xt_counters *alloc_counters(const struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - struct xt_table_info *private = table->private; + const struct xt_table_info *private = table->private; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -962,11 +961,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table) static int copy_entries_to_user(unsigned int total_size, - struct xt_table *table, + const struct xt_table *table, void __user *userptr) { unsigned int off, num; - struct ipt_entry *e; + const struct ipt_entry *e; struct xt_counters *counters; const struct xt_table_info *private = table->private; int ret = 0; @@ -1018,7 +1017,7 @@ copy_entries_to_user(unsigned int total_size, } } - t = ipt_get_target(e); + t = ipt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset + offsetof(struct ipt_entry_target, u.user.name), @@ -1035,7 +1034,7 @@ copy_entries_to_user(unsigned int total_size, } #ifdef CONFIG_COMPAT -static void compat_standard_from_user(void *dst, void *src) +static void compat_standard_from_user(void *dst, const void *src) { int v = *(compat_int_t *)src; @@ -1044,7 +1043,7 @@ static void compat_standard_from_user(void *dst, void *src) memcpy(dst, &v, sizeof(v)); } -static int compat_standard_to_user(void __user *dst, void *src) +static int compat_standard_to_user(void __user *dst, const void *src) { compat_int_t cv = *(int *)src; @@ -1053,25 +1052,20 @@ static int compat_standard_to_user(void __user *dst, void *src) return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0; } -static inline int -compat_calc_match(struct ipt_entry_match *m, int *size) -{ - *size += xt_compat_match_offset(m->u.kernel.match); - return 0; -} - -static int compat_calc_entry(struct ipt_entry *e, +static int compat_calc_entry(const struct ipt_entry *e, const struct xt_table_info *info, - void *base, struct xt_table_info *newinfo) + const void *base, struct xt_table_info *newinfo) { - struct ipt_entry_target *t; + const struct xt_entry_match *ematch; + const struct ipt_entry_target *t; unsigned int entry_offset; int off, i, ret; off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - base; - IPT_MATCH_ITERATE(e, compat_calc_match, &off); - t = ipt_get_target(e); + xt_ematch_foreach(ematch, e) + off += xt_compat_match_offset(ematch->u.kernel.match); + t = ipt_get_target_c(e); off += xt_compat_target_offset(t->u.kernel.target); newinfo->size -= off; ret = xt_compat_add_offset(AF_INET, entry_offset, off); @@ -1092,7 +1086,9 @@ static int compat_calc_entry(struct ipt_entry *e, static int compat_table_info(const struct xt_table_info *info, struct xt_table_info *newinfo) { + struct ipt_entry *iter; void *loc_cpu_entry; + int ret; if (!newinfo || !info) return -EINVAL; @@ -1101,13 +1097,17 @@ static int compat_table_info(const struct xt_table_info *info, memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); newinfo->initial_entries = 0; loc_cpu_entry = info->entries[raw_smp_processor_id()]; - return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size, - compat_calc_entry, info, loc_cpu_entry, - newinfo); + xt_entry_foreach(iter, loc_cpu_entry, info->size) { + ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); + if (ret != 0) + return ret; + } + return 0; } #endif -static int get_info(struct net *net, void __user *user, int *len, int compat) +static int get_info(struct net *net, void __user *user, + const int *len, int compat) { char name[IPT_TABLE_MAXNAMELEN]; struct xt_table *t; @@ -1132,10 +1132,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; @@ -1167,7 +1167,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) } static int -get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len) +get_entries(struct net *net, struct ipt_get_entries __user *uptr, + const int *len) { int ret; struct ipt_get_entries get; @@ -1215,6 +1216,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, struct xt_table_info *oldinfo; struct xt_counters *counters; void *loc_cpu_old_entry; + struct ipt_entry *iter; ret = 0; counters = vmalloc(num_counters * sizeof(struct xt_counters)); @@ -1257,8 +1259,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, /* Decrease module usage counts and free resource */ loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, - NULL); + xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size) + cleanup_entry(iter, net); + xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, sizeof(struct xt_counters) * num_counters) != 0) @@ -1277,12 +1280,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, } static int -do_replace(struct net *net, void __user *user, unsigned int len) +do_replace(struct net *net, const void __user *user, unsigned int len) { int ret; struct ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ipt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1303,9 +1307,7 @@ do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, - tmp.hook_entry, tmp.underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, &tmp); if (ret != 0) goto free_newinfo; @@ -1318,27 +1320,16 @@ do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -static int -do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) +do_add_counters(struct net *net, const void __user *user, + unsigned int len, int compat) { unsigned int i, curcpu; struct xt_counters_info tmp; @@ -1351,6 +1342,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat const struct xt_table_info *private; int ret = 0; void *loc_cpu_entry; + struct ipt_entry *iter; #ifdef CONFIG_COMPAT struct compat_xt_counters_info compat_tmp; @@ -1408,11 +1400,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat curcpu = smp_processor_id(); loc_cpu_entry = private->entries[curcpu]; xt_info_wrlock(curcpu); - IPT_ENTRY_ITERATE(loc_cpu_entry, - private->size, - add_counter_to_entry, - paddc, - &i); + xt_entry_foreach(iter, loc_cpu_entry, private->size) { + ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt); + ++i; + } xt_info_wrunlock(curcpu); unlock_up_free: local_bh_enable(); @@ -1440,45 +1431,40 @@ struct compat_ipt_replace { static int compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, - unsigned int *i) + unsigned int i) { struct ipt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; - int ret; + const struct xt_entry_match *ematch; + int ret = 0; - ret = -EFAULT; origsize = *size; ce = (struct compat_ipt_entry __user *)*dstptr; - if (copy_to_user(ce, e, sizeof(struct ipt_entry))) - goto out; - - if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i]))) - goto out; + if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || + copy_to_user(&ce->counters, &counters[i], + sizeof(counters[i])) != 0) + return -EFAULT; *dstptr += sizeof(struct compat_ipt_entry); *size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); - ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size); + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_to_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } target_offset = e->target_offset - (origsize - *size); - if (ret) - goto out; t = ipt_get_target(e); ret = xt_compat_target_to_user(t, dstptr, size); if (ret) - goto out; - ret = -EFAULT; + return ret; next_offset = e->next_offset - (origsize - *size); - if (put_user(target_offset, &ce->target_offset)) - goto out; - if (put_user(next_offset, &ce->next_offset)) - goto out; - - (*i)++; + if (put_user(target_offset, &ce->target_offset) != 0 || + put_user(next_offset, &ce->next_offset) != 0) + return -EFAULT; return 0; -out: - return ret; } static int @@ -1486,7 +1472,7 @@ compat_find_calc_match(struct ipt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, - int *size, unsigned int *i) + int *size) { struct xt_match *match; @@ -1500,47 +1486,32 @@ compat_find_calc_match(struct ipt_entry_match *m, } m->u.kernel.match = match; *size += xt_compat_match_offset(match); - - (*i)++; - return 0; -} - -static int -compat_release_match(struct ipt_entry_match *m, unsigned int *i) -{ - if (i && (*i)-- == 0) - return 1; - - module_put(m->u.kernel.match->me); return 0; } -static int -compat_release_entry(struct compat_ipt_entry *e, unsigned int *i) +static void compat_release_entry(struct compat_ipt_entry *e) { struct ipt_entry_target *t; - - if (i && (*i)-- == 0) - return 1; + struct xt_entry_match *ematch; /* Cleanup all matches */ - COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL); + xt_ematch_foreach(ematch, e) + module_put(ematch->u.kernel.match->me); t = compat_ipt_get_target(e); module_put(t->u.kernel.target->me); - return 0; } static int check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, struct xt_table_info *newinfo, unsigned int *size, - unsigned char *base, - unsigned char *limit, - unsigned int *hook_entries, - unsigned int *underflows, - unsigned int *i, + const unsigned char *base, + const unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, const char *name) { + struct xt_entry_match *ematch; struct ipt_entry_target *t; struct xt_target *target; unsigned int entry_offset; @@ -1548,8 +1519,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, int ret, off, h; duprintf("check_compat_entry_size_and_hooks %p\n", e); - if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 - || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { + if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 || + (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) { duprintf("Bad offset %p, limit = %p\n", e, limit); return -EINVAL; } @@ -1569,10 +1540,13 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); entry_offset = (void *)e - (void *)base; j = 0; - ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name, - &e->ip, e->comefrom, &off, &j); - if (ret != 0) - goto release_matches; + xt_ematch_foreach(ematch, e) { + ret = compat_find_calc_match(ematch, name, + &e->ip, e->comefrom, &off); + if (ret != 0) + goto release_matches; + ++j; + } t = compat_ipt_get_target(e); target = try_then_request_module(xt_find_target(AF_INET, @@ -1604,14 +1578,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, /* Clear counters and comefrom */ memset(&e->counters, 0, sizeof(e->counters)); e->comefrom = 0; - - (*i)++; return 0; out: module_put(t->u.kernel.target->me); release_matches: - IPT_MATCH_ITERATE(e, compat_release_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + module_put(ematch->u.kernel.match->me); + } return ret; } @@ -1625,6 +1601,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, struct ipt_entry *de; unsigned int origsize; int ret, h; + struct xt_entry_match *ematch; ret = 0; origsize = *size; @@ -1635,10 +1612,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, *dstptr += sizeof(struct ipt_entry); *size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry); - ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user, - dstptr, size); - if (ret) - return ret; + xt_ematch_foreach(ematch, e) { + ret = xt_compat_match_from_user(ematch, dstptr, size); + if (ret != 0) + return ret; + } de->target_offset = e->target_offset - (origsize - *size); t = compat_ipt_get_target(e); target = t->u.kernel.target; @@ -1655,36 +1633,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, } static int -compat_check_entry(struct ipt_entry *e, const char *name, - unsigned int *i) +compat_check_entry(struct ipt_entry *e, struct net *net, const char *name) { + struct xt_entry_match *ematch; struct xt_mtchk_param mtpar; unsigned int j; - int ret; + int ret = 0; j = 0; + mtpar.net = net; mtpar.table = name; mtpar.entryinfo = &e->ip; mtpar.hook_mask = e->comefrom; mtpar.family = NFPROTO_IPV4; - ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j); - if (ret) - goto cleanup_matches; + xt_ematch_foreach(ematch, e) { + ret = check_match(ematch, &mtpar); + if (ret != 0) + goto cleanup_matches; + ++j; + } - ret = check_target(e, name); + ret = check_target(e, net, name); if (ret) goto cleanup_matches; - - (*i)++; return 0; cleanup_matches: - IPT_MATCH_ITERATE(e, cleanup_match, &j); + xt_ematch_foreach(ematch, e) { + if (j-- == 0) + break; + cleanup_match(ematch, net); + } return ret; } static int -translate_compat_table(const char *name, +translate_compat_table(struct net *net, + const char *name, unsigned int valid_hooks, struct xt_table_info **pinfo, void **pentry0, @@ -1696,6 +1681,8 @@ translate_compat_table(const char *name, unsigned int i, j; struct xt_table_info *newinfo, *info; void *pos, *entry0, *entry1; + struct compat_ipt_entry *iter0; + struct ipt_entry *iter1; unsigned int size; int ret; @@ -1714,13 +1701,17 @@ translate_compat_table(const char *name, j = 0; xt_compat_lock(AF_INET); /* Walk through entries, checking offsets. */ - ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, - check_compat_entry_size_and_hooks, - info, &size, entry0, - entry0 + total_size, - hook_entries, underflows, &j, name); - if (ret != 0) - goto out_unlock; + xt_entry_foreach(iter0, entry0, total_size) { + ret = check_compat_entry_size_and_hooks(iter0, info, &size, + entry0, + entry0 + total_size, + hook_entries, + underflows, + name); + if (ret != 0) + goto out_unlock; + ++j; + } ret = -EINVAL; if (j != number) { @@ -1759,9 +1750,12 @@ translate_compat_table(const char *name, entry1 = newinfo->entries[raw_smp_processor_id()]; pos = entry1; size = total_size; - ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, - compat_copy_entry_from_user, - &pos, &size, name, newinfo, entry1); + xt_entry_foreach(iter0, entry0, total_size) { + ret = compat_copy_entry_from_user(iter0, &pos, &size, + name, newinfo, entry1); + if (ret != 0) + break; + } xt_compat_flush_offsets(AF_INET); xt_compat_unlock(AF_INET); if (ret) @@ -1772,13 +1766,32 @@ translate_compat_table(const char *name, goto free_newinfo; i = 0; - ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry, - name, &i); + xt_entry_foreach(iter1, entry1, newinfo->size) { + ret = compat_check_entry(iter1, net, name); + if (ret != 0) + break; + ++i; + } if (ret) { + /* + * The first i matches need cleanup_entry (calls ->destroy) + * because they had called ->check already. The other j-i + * entries need only release. + */ + int skip = i; j -= i; - COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i, - compat_release_entry, &j); - IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i); + xt_entry_foreach(iter0, entry0, newinfo->size) { + if (skip-- > 0) + continue; + if (j-- == 0) + break; + compat_release_entry(iter0); + } + xt_entry_foreach(iter1, entry1, newinfo->size) { + if (i-- == 0) + break; + cleanup_entry(iter1, net); + } xt_free_table_info(newinfo); return ret; } @@ -1796,7 +1809,11 @@ translate_compat_table(const char *name, free_newinfo: xt_free_table_info(newinfo); out: - COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j); + xt_entry_foreach(iter0, entry0, total_size) { + if (j-- == 0) + break; + compat_release_entry(iter0); + } return ret; out_unlock: xt_compat_flush_offsets(AF_INET); @@ -1811,6 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) struct compat_ipt_replace tmp; struct xt_table_info *newinfo; void *loc_cpu_entry; + struct ipt_entry *iter; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1833,7 +1851,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) goto free_newinfo; } - ret = translate_compat_table(tmp.name, tmp.valid_hooks, + ret = translate_compat_table(net, tmp.name, tmp.valid_hooks, &newinfo, &loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); @@ -1849,7 +1867,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len) return 0; free_newinfo_untrans: - IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, newinfo->size) + cleanup_entry(iter, net); free_newinfo: xt_free_table_info(newinfo); return ret; @@ -1898,6 +1917,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, int ret = 0; const void *loc_cpu_entry; unsigned int i = 0; + struct ipt_entry *iter; counters = alloc_counters(table); if (IS_ERR(counters)) @@ -1910,9 +1930,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table, loc_cpu_entry = private->entries[raw_smp_processor_id()]; pos = userptr; size = total_size; - ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size, - compat_copy_entry_to_user, - &pos, &size, counters, &i); + xt_entry_foreach(iter, loc_cpu_entry, total_size) { + ret = compat_copy_entry_to_user(iter, &pos, + &size, counters, i++); + if (ret != 0) + break; + } vfree(counters); return ret; @@ -2086,11 +2109,7 @@ struct xt_table *ipt_register_table(struct net *net, loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; memcpy(loc_cpu_entry, repl->entries, repl->size); - ret = translate_table(table->name, table->valid_hooks, - newinfo, loc_cpu_entry, repl->size, - repl->num_entries, - repl->hook_entry, - repl->underflow); + ret = translate_table(net, newinfo, loc_cpu_entry, repl); if (ret != 0) goto out_free; @@ -2108,17 +2127,19 @@ out: return ERR_PTR(ret); } -void ipt_unregister_table(struct xt_table *table) +void ipt_unregister_table(struct net *net, struct xt_table *table) { struct xt_table_info *private; void *loc_cpu_entry; struct module *table_owner = table->me; + struct ipt_entry *iter; private = xt_unregister_table(table); /* Decrease module usage counts and free resources */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; - IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL); + xt_entry_foreach(iter, loc_cpu_entry, private->size) + cleanup_entry(iter, net); if (private->number > private->initial_entries) module_put(table_owner); xt_free_table_info(private); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 2e4f98b85524..ab828400ed71 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -14,6 +14,7 @@ #include <linux/jhash.h> #include <linux/bitops.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/udp.h> @@ -303,9 +304,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par) /* special case: ICMP error handling. conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ - if (ip_hdr(skb)->protocol == IPPROTO_ICMP - && (ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY)) + if (ip_hdr(skb)->protocol == IPPROTO_ICMP && + (ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)) return XT_CONTINUE; /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, @@ -362,8 +363,8 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par) return false; } - if (e->ip.dmsk.s_addr != htonl(0xffffffff) - || e->ip.dst.s_addr == 0) { + if (e->ip.dmsk.s_addr != htonl(0xffffffff) || + e->ip.dst.s_addr == 0) { printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); return false; } @@ -495,14 +496,14 @@ arp_mangle(unsigned int hook, struct clusterip_config *c; /* we don't care about non-ethernet and non-ipv4 ARP */ - if (arp->ar_hrd != htons(ARPHRD_ETHER) - || arp->ar_pro != htons(ETH_P_IP) - || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; /* we only want to mangle arp requests and replies */ - if (arp->ar_op != htons(ARPOP_REPLY) - && arp->ar_op != htons(ARPOP_REQUEST)) + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); @@ -560,8 +561,7 @@ struct clusterip_seq_position { static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { - const struct proc_dir_entry *pde = s->private; - struct clusterip_config *c = pde->data; + struct clusterip_config *c = s->private; unsigned int weight; u_int32_t local_nodes; struct clusterip_seq_position *idx; @@ -632,10 +632,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *sf = file->private_data; - struct proc_dir_entry *pde = PDE(inode); - struct clusterip_config *c = pde->data; + struct clusterip_config *c = PDE(inode)->data; - sf->private = pde; + sf->private = c; clusterip_config_get(c); } @@ -645,8 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file) static int clusterip_proc_release(struct inode *inode, struct file *file) { - struct proc_dir_entry *pde = PDE(inode); - struct clusterip_config *c = pde->data; + struct clusterip_config *c = PDE(inode)->data; int ret; ret = seq_release(inode, file); @@ -660,10 +658,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file) static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { + struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data; #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; - const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - struct clusterip_config *c = pde->data; unsigned long nodenum; if (copy_from_user(buffer, input, PROC_WRITELEN)) diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c index f7e2fa0974dc..ea5cea2415c1 100644 --- a/net/ipv4/netfilter/ipt_ECN.c +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo) struct tcphdr _tcph, *tcph; __be16 oldval; - /* Not enought header? */ + /* Not enough header? */ tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph); if (!tcph) return false; @@ -85,8 +85,8 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par) if (!set_ect_ip(skb, einfo)) return NF_DROP; - if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) - && ip_hdr(skb)->protocol == IPPROTO_TCP) + if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) && + ip_hdr(skb)->protocol == IPPROTO_TCP) if (!set_ect_tcp(skb, einfo)) return NF_DROP; @@ -108,8 +108,8 @@ static bool ecn_tg_check(const struct xt_tgchk_param *par) einfo->ip_ect); return false; } - if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) - && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) && + (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) { printk(KERN_WARNING "ECN: cannot use TCP operations on a " "non-tcp rule\n"); return false; diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index acc44c69eb68..ee128efa1c8d 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -74,8 +74,8 @@ static void dump_packet(const struct nf_loginfo *info, if (ntohs(ih->frag_off) & IP_OFFSET) printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); - if ((logflags & IPT_LOG_IPOPT) - && ih->ihl * 4 > sizeof(struct iphdr)) { + if ((logflags & IPT_LOG_IPOPT) && + ih->ihl * 4 > sizeof(struct iphdr)) { const unsigned char *op; unsigned char _opt[4 * 15 - sizeof(struct iphdr)]; unsigned int i, optsize; @@ -146,8 +146,8 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 11 "URGP=65535 " */ printk("URGP=%u ", ntohs(th->urg_ptr)); - if ((logflags & IPT_LOG_TCPOPT) - && th->doff * 4 > sizeof(struct tcphdr)) { + if ((logflags & IPT_LOG_TCPOPT) && + th->doff * 4 > sizeof(struct tcphdr)) { unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; const unsigned char *op; unsigned int i, optsize; @@ -238,9 +238,9 @@ static void dump_packet(const struct nf_loginfo *info, printk("TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ - if (ich->type <= NR_ICMP_TYPES - && required_len[ich->type] - && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + if (ich->type <= NR_ICMP_TYPES && + required_len[ich->type] && + skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { printk("INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; @@ -276,8 +276,8 @@ static void dump_packet(const struct nf_loginfo *info, } /* Max length: 10 "MTU=65535 " */ - if (ich->type == ICMP_DEST_UNREACH - && ich->code == ICMP_FRAG_NEEDED) + if (ich->type == ICMP_DEST_UNREACH && + ich->code == ICMP_FRAG_NEEDED) printk("MTU=%u ", ntohs(ich->un.frag.mtu)); } break; @@ -407,8 +407,8 @@ ipt_log_packet(u_int8_t pf, if (in && !out) { /* MAC logging for input chain only. */ printk("MAC="); - if (skb->dev && skb->dev->hard_header_len - && skb->mac_header != skb->network_header) { + if (skb->dev && skb->dev->hard_header_len && + skb->mac_header != skb->network_header) { int i; const unsigned char *p = skb_mac_header(skb); for (i = 0; i < skb->dev->hard_header_len; i++,p++) diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index dada0863946d..650b54042b01 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par) ct = nf_ct_get(skb, &ctinfo); nat = nfct_nat(ct); - NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED - || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || + ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); /* Source address is 0.0.0.0 - locally generated packet that is * probably not supposed to be masqueraded. diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index c93ae44bff2a..a0e8bcf04159 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -12,6 +12,7 @@ #include <linux/module.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <linux/ip.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -184,8 +185,8 @@ static bool reject_tg_check(const struct xt_tgchk_param *par) return false; } else if (rejinfo->with == IPT_TCP_RESET) { /* Must specify that it's a TCP packet */ - if (e->ip.proto != IPPROTO_TCP - || (e->ip.invflags & XT_INV_PROTO)) { + if (e->ip.proto != IPPROTO_TCP || + (e->ip.invflags & XT_INV_PROTO)) { printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n"); return false; } diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c index d32cc4bb328a..0dbe697f164f 100644 --- a/net/ipv4/netfilter/ipt_ULOG.c +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -33,6 +33,7 @@ #include <linux/module.h> #include <linux/spinlock.h> #include <linux/socket.h> +#include <linux/slab.h> #include <linux/skbuff.h> #include <linux/kernel.h> #include <linux/timer.h> @@ -226,9 +227,9 @@ static void ipt_ulog_packet(unsigned int hooknum, else *(pm->prefix) = '\0'; - if (in && in->hard_header_len > 0 - && skb->mac_header != skb->network_header - && in->hard_header_len <= ULOG_MAC_LEN) { + if (in && in->hard_header_len > 0 && + skb->mac_header != skb->network_header && + in->hard_header_len <= ULOG_MAC_LEN) { memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len); pm->mac_len = in->hard_header_len; } else @@ -338,7 +339,7 @@ struct compat_ipt_ulog_info { char prefix[ULOG_PREFIX_LEN]; }; -static void ulog_tg_compat_from_user(void *dst, void *src) +static void ulog_tg_compat_from_user(void *dst, const void *src) { const struct compat_ipt_ulog_info *cl = src; struct ipt_ulog_info l = { @@ -351,7 +352,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src) memcpy(dst, &l, sizeof(l)); } -static int ulog_tg_compat_to_user(void __user *dst, void *src) +static int ulog_tg_compat_to_user(void __user *dst, const void *src) { const struct ipt_ulog_info *l = src; struct compat_ipt_ulog_info cl = { diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c index 6289b64144c6..2a1e56b71908 100644 --- a/net/ipv4/netfilter/ipt_ecn.c +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -96,8 +96,8 @@ static bool ecn_mt_check(const struct xt_mtchk_param *par) if (info->invert & IPT_ECN_OP_MATCH_MASK) return false; - if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) - && ip->proto != IPPROTO_TCP) { + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) && + ip->proto != IPPROTO_TCP) { printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" " non-tcp packets\n"); return false; diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c index df566cbd68e5..55392466daa4 100644 --- a/net/ipv4/netfilter/iptable_filter.c +++ b/net/ipv4/netfilter/iptable_filter.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); @@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT)) -static struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "filter", - .valid_hooks = FILTER_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_filter = { .name = "filter", .valid_hooks = FILTER_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_FILTER, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_filter); -} - static unsigned int -ipt_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_filter_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_filter); -} + const struct net *net; -static unsigned int -ipt_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_filter); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter); } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = ipt_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_FILTER, - }, - { - .hook = ipt_local_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_FILTER, - }, -}; +static struct nf_hook_ops *filter_ops __read_mostly; /* Default to forward because I got too much mail already. */ static int forward = NF_ACCEPT; @@ -128,9 +57,18 @@ module_param(forward, bool, 0000); static int __net_init iptable_filter_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_filter); + if (repl == NULL) + return -ENOMEM; + /* Entry 1 is the FORWARD hook */ + ((struct ipt_standard *)repl->entries)[1].target.verdict = + -forward - 1; + net->ipv4.iptable_filter = - ipt_register_table(net, &packet_filter, &initial_table.repl); + ipt_register_table(net, &packet_filter, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_filter)) return PTR_ERR(net->ipv4.iptable_filter); return 0; @@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net) static void __net_exit iptable_filter_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_filter); + ipt_unregister_table(net, net->ipv4.iptable_filter); } static struct pernet_operations iptable_filter_net_ops = { @@ -155,17 +93,16 @@ static int __init iptable_filter_init(void) return -EINVAL; } - /* Entry 1 is the FORWARD hook */ - initial_table.entries[1].target.verdict = -forward - 1; - ret = register_pernet_subsys(&iptable_filter_net_ops); if (ret < 0) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook); + if (IS_ERR(filter_ops)) { + ret = PTR_ERR(filter_ops); goto cleanup_table; + } return ret; @@ -176,7 +113,7 @@ static int __init iptable_filter_init(void) static void __exit iptable_filter_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_filter, filter_ops); unregister_pernet_subsys(&iptable_filter_net_ops); } diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c index 036047f9b0f2..294a2a32f293 100644 --- a/net/ipv4/netfilter/iptable_mangle.c +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -12,6 +12,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> @@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table"); (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) -/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */ -static const struct -{ - struct ipt_replace repl; - struct ipt_standard entries[5]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "mangle", - .valid_hooks = MANGLE_VALID_HOOKS, - .num_entries = 6, - .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard), - [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_MANGLE, }; -/* The work comes in here from netfilter.c. */ -static unsigned int -ipt_pre_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_post_routing_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - -static unsigned int -ipt_forward_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_mangle); -} - static unsigned int -ipt_local_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ipt_mangle_out(struct sk_buff *skb, const struct net_device *out) { unsigned int ret; const struct iphdr *iph; @@ -130,8 +46,8 @@ ipt_local_hook(unsigned int hook, u_int32_t mark; /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) + if (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; /* Save things which could affect route */ @@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook, daddr = iph->daddr; tos = iph->tos; - ret = ipt_do_table(skb, hook, in, out, + ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, dev_net(out)->ipv4.iptable_mangle); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { @@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook, return ret; } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_pre_routing_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_forward_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_local_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_MANGLE, - }, - { - .hook = ipt_post_routing_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_MANGLE, - }, -}; +/* The work comes in here from netfilter.c. */ +static unsigned int +iptable_mangle_hook(unsigned int hook, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (hook == NF_INET_LOCAL_OUT) + return ipt_mangle_out(skb, out); + if (hook == NF_INET_POST_ROUTING) + return ipt_do_table(skb, hook, in, out, + dev_net(out)->ipv4.iptable_mangle); + /* PREROUTING/INPUT/FORWARD: */ + return ipt_do_table(skb, hook, in, out, + dev_net(in)->ipv4.iptable_mangle); +} + +static struct nf_hook_ops *mangle_ops __read_mostly; static int __net_init iptable_mangle_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_mangler); + if (repl == NULL) + return -ENOMEM; net->ipv4.iptable_mangle = - ipt_register_table(net, &packet_mangler, &initial_table.repl); + ipt_register_table(net, &packet_mangler, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_mangle)) return PTR_ERR(net->ipv4.iptable_mangle); return 0; @@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net) static void __net_exit iptable_mangle_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_mangle); + ipt_unregister_table(net, net->ipv4.iptable_mangle); } static struct pernet_operations iptable_mangle_net_ops = { @@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook); + if (IS_ERR(mangle_ops)) { + ret = PTR_ERR(mangle_ops); goto cleanup_table; + } return ret; @@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void) static void __exit iptable_mangle_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_mangler, mangle_ops); unregister_pernet_subsys(&iptable_mangle_net_ops); } diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c index 993edc23be09..07fb710cd722 100644 --- a/net/ipv4/netfilter/iptable_raw.c +++ b/net/ipv4/netfilter/iptable_raw.c @@ -5,94 +5,49 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> #define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT)) -static const struct -{ - struct ipt_replace repl; - struct ipt_standard entries[2]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "raw", - .valid_hooks = RAW_VALID_HOOKS, - .num_entries = 3, - .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table packet_raw = { .name = "raw", .valid_hooks = RAW_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_RAW, }; /* The work comes in here from netfilter.c. */ static unsigned int -ipt_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_raw_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_raw); -} + const struct net *net; -static unsigned int -ipt_local_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* root is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) || - ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* root is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_raw); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw); } -/* 'raw' is the very first table. */ -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP_PRI_RAW, - .owner = THIS_MODULE, - }, - { - .hook = ipt_local_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_RAW, - .owner = THIS_MODULE, - }, -}; +static struct nf_hook_ops *rawtable_ops __read_mostly; static int __net_init iptable_raw_net_init(struct net *net) { - /* Register table */ + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&packet_raw); + if (repl == NULL) + return -ENOMEM; net->ipv4.iptable_raw = - ipt_register_table(net, &packet_raw, &initial_table.repl); + ipt_register_table(net, &packet_raw, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_raw)) return PTR_ERR(net->ipv4.iptable_raw); return 0; @@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net) static void __net_exit iptable_raw_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_raw); + ipt_unregister_table(net, net->ipv4.iptable_raw); } static struct pernet_operations iptable_raw_net_ops = { @@ -117,9 +72,11 @@ static int __init iptable_raw_init(void) return ret; /* Register hooks */ - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook); + if (IS_ERR(rawtable_ops)) { + ret = PTR_ERR(rawtable_ops); goto cleanup_table; + } return ret; @@ -130,7 +87,7 @@ static int __init iptable_raw_init(void) static void __exit iptable_raw_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&packet_raw, rawtable_ops); unregister_pernet_subsys(&iptable_raw_net_ops); } diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c index 99eb76c65d25..be45bdc4c602 100644 --- a/net/ipv4/netfilter/iptable_security.c +++ b/net/ipv4/netfilter/iptable_security.c @@ -17,6 +17,7 @@ */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> +#include <linux/slab.h> #include <net/ip.h> MODULE_LICENSE("GPL"); @@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules"); (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) -static const struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} initial_table __net_initdata = { - .repl = { - .name = "security", - .valid_hooks = SECURITY_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - .underflow = { - [NF_INET_LOCAL_IN] = 0, - [NF_INET_FORWARD] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2, - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */ - IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table security_table = { .name = "security", .valid_hooks = SECURITY_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, + .priority = NF_IP_PRI_SECURITY, }; static unsigned int -ipt_local_in_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_security); -} - -static unsigned int -ipt_forward_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) +iptable_security_hook(unsigned int hook, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) { - return ipt_do_table(skb, hook, in, out, - dev_net(in)->ipv4.iptable_security); -} + const struct net *net; -static unsigned int -ipt_local_out_hook(unsigned int hook, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - /* Somebody is playing with raw sockets. */ - if (skb->len < sizeof(struct iphdr) - || ip_hdrlen(skb) < sizeof(struct iphdr)) + if (hook == NF_INET_LOCAL_OUT && + (skb->len < sizeof(struct iphdr) || + ip_hdrlen(skb) < sizeof(struct iphdr))) + /* Somebody is playing with raw sockets. */ return NF_ACCEPT; - return ipt_do_table(skb, hook, in, out, - dev_net(out)->ipv4.iptable_security); + + net = dev_net((in != NULL) ? in : out); + return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security); } -static struct nf_hook_ops ipt_ops[] __read_mostly = { - { - .hook = ipt_local_in_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_SECURITY, - }, - { - .hook = ipt_forward_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_FORWARD, - .priority = NF_IP_PRI_SECURITY, - }, - { - .hook = ipt_local_out_hook, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP_PRI_SECURITY, - }, -}; +static struct nf_hook_ops *sectbl_ops __read_mostly; static int __net_init iptable_security_net_init(struct net *net) { - net->ipv4.iptable_security = - ipt_register_table(net, &security_table, &initial_table.repl); + struct ipt_replace *repl; + repl = ipt_alloc_initial_table(&security_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.iptable_security = + ipt_register_table(net, &security_table, repl); + kfree(repl); if (IS_ERR(net->ipv4.iptable_security)) return PTR_ERR(net->ipv4.iptable_security); @@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net) static void __net_exit iptable_security_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.iptable_security); + ipt_unregister_table(net, net->ipv4.iptable_security); } static struct pernet_operations iptable_security_net_ops = { @@ -154,9 +90,11 @@ static int __init iptable_security_init(void) if (ret < 0) return ret; - ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); - if (ret < 0) + sectbl_ops = xt_hook_link(&security_table, iptable_security_hook); + if (IS_ERR(sectbl_ops)) { + ret = PTR_ERR(sectbl_ops); goto cleanup_table; + } return ret; @@ -167,7 +105,7 @@ cleanup_table: static void __exit iptable_security_fini(void) { - nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops)); + xt_hook_unlink(&security_table, sectbl_ops); unregister_pernet_subsys(&iptable_security_net_ops); } diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index aa95bb82ee6c..2bb1f87051c4 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -22,6 +22,7 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/nf_nat_helper.h> @@ -195,7 +196,6 @@ static int log_invalid_proto_max = 255; static ctl_table ip_ct_sysctl_table[] = { { - .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, .procname = "ip_conntrack_max", .data = &nf_conntrack_max, .maxlen = sizeof(int), @@ -203,7 +203,6 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, .procname = "ip_conntrack_count", .data = &init_net.ct.count, .maxlen = sizeof(int), @@ -211,15 +210,13 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, .procname = "ip_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, .procname = "ip_conntrack_checksum", .data = &init_net.ct.sysctl_checksum, .maxlen = sizeof(int), @@ -227,19 +224,15 @@ static ctl_table ip_ct_sysctl_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, .procname = "ip_conntrack_log_invalid", .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &log_invalid_proto_min, .extra2 = &log_invalid_proto_max, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */ @@ -255,10 +248,10 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) struct nf_conntrack_tuple tuple; memset(&tuple, 0, sizeof(tuple)); - tuple.src.u3.ip = inet->rcv_saddr; - tuple.src.u.tcp.port = inet->sport; - tuple.dst.u3.ip = inet->daddr; - tuple.dst.u.tcp.port = inet->dport; + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; tuple.src.l3num = PF_INET; tuple.dst.protonum = sk->sk_protocol; @@ -274,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -EINVAL; } - h = nf_conntrack_find_get(sock_net(sk), &tuple); + h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple); if (h) { struct sockaddr_in sin; struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 8668a3defda6..2fb7b76da94f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); if (!is_a_nulls(n)) @@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(head->next); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= net->ct.htable_size) return NULL; } head = rcu_dereference(net->ct.hash[st->bucket].first); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index d71ba7677344..7404bde95994 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -18,6 +18,7 @@ #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_log.h> static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ; @@ -54,8 +55,8 @@ static const u_int8_t invmap[] = { static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig) { - if (orig->dst.u.icmp.type >= sizeof(invmap) - || !invmap[orig->dst.u.icmp.type]) + if (orig->dst.u.icmp.type >= sizeof(invmap) || + !invmap[orig->dst.u.icmp.type]) return false; tuple->src.u.icmp.id = orig->src.u.icmp.id; @@ -101,8 +102,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, [ICMP_ADDRESS] = 1 }; - if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) - || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { + if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) || + !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) { /* Can't create a new ICMP `conn' with this. */ pr_debug("icmp: can't create new conn with type %u\n", ct->tuplehash[0].tuple.dst.u.icmp.type); @@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb, /* Returns conntrack if it dealt with ICMP, and filled in skb fields */ static int -icmp_error_message(struct net *net, struct sk_buff *skb, +icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, enum ip_conntrack_info *ctinfo, unsigned int hooknum) { struct nf_conntrack_tuple innertuple, origtuple; const struct nf_conntrack_l4proto *innerproto; const struct nf_conntrack_tuple_hash *h; + u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; NF_CT_ASSERT(skb->nfct == NULL); @@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb, *ctinfo = IP_CT_RELATED; - h = nf_conntrack_find_get(net, &innertuple); + h = nf_conntrack_find_get(net, zone, &innertuple); if (!h) { pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT; @@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb, /* Small and modified version of icmp_rcv */ static int -icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, +icmp_error(struct net *net, struct nf_conn *tmpl, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum) { const struct icmphdr *icmph; @@ -201,14 +204,14 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff, } /* Need to track icmp error message? */ - if (icmph->type != ICMP_DEST_UNREACH - && icmph->type != ICMP_SOURCE_QUENCH - && icmph->type != ICMP_TIME_EXCEEDED - && icmph->type != ICMP_PARAMETERPROB - && icmph->type != ICMP_REDIRECT) + if (icmph->type != ICMP_DEST_UNREACH && + icmph->type != ICMP_SOURCE_QUENCH && + icmph->type != ICMP_TIME_EXCEEDED && + icmph->type != ICMP_PARAMETERPROB && + icmph->type != ICMP_REDIRECT) return NF_ACCEPT; - return icmp_error_message(net, skb, ctinfo, hooknum); + return icmp_error_message(net, tmpl, skb, ctinfo, hooknum); } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) @@ -238,17 +241,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = { static int icmp_nlattr_to_tuple(struct nlattr *tb[], struct nf_conntrack_tuple *tuple) { - if (!tb[CTA_PROTO_ICMP_TYPE] - || !tb[CTA_PROTO_ICMP_CODE] - || !tb[CTA_PROTO_ICMP_ID]) + if (!tb[CTA_PROTO_ICMP_TYPE] || + !tb[CTA_PROTO_ICMP_CODE] || + !tb[CTA_PROTO_ICMP_ID]) return -EINVAL; tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]); tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]); tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]); - if (tuple->dst.u.icmp.type >= sizeof(invmap) - || !invmap[tuple->dst.u.icmp.type]) + if (tuple->dst.u.icmp.type >= sizeof(invmap) || + !invmap[tuple->dst.u.icmp.type]) return -EINVAL; return 0; @@ -270,9 +273,7 @@ static struct ctl_table icmp_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table icmp_compat_sysctl_table[] = { @@ -283,9 +284,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index fa2d6b6fc3e5..cb763ae9ed90 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -14,8 +14,13 @@ #include <net/route.h> #include <net/ip.h> +#include <linux/netfilter_bridge.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/ipv4/nf_defrag_ipv4.h> +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) +#include <net/netfilter/nf_conntrack.h> +#endif +#include <net/netfilter/nf_conntrack_zones.h> /* Returns new sk_buff, or NULL */ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) @@ -34,6 +39,27 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user) return err; } +static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); +#endif + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP_DEFRAG_CONNTRACK_IN + zone; + else + return IP_DEFRAG_CONNTRACK_OUT + zone; +} + static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, @@ -44,16 +70,14 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum, #if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE) /* Previously seen (loopback)? Ignore. Do this before fragment check. */ - if (skb->nfct) + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) return NF_ACCEPT; #endif #endif /* Gather fragments. */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (nf_ct_ipv4_gather_frags(skb, - hooknum == NF_INET_PRE_ROUTING ? - IP_DEFRAG_CONNTRACK_IN : - IP_DEFRAG_CONNTRACK_OUT)) + enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb); + if (nf_ct_ipv4_gather_frags(skb, user)) return NF_STOLEN; } return NF_ACCEPT; diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index fe1a64479dd0..4f8bddb760c9 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> +#include <linux/gfp.h> #include <net/checksum.h> #include <net/icmp.h> #include <net/ip.h> @@ -30,14 +31,12 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> +#include <net/netfilter/nf_conntrack_zones.h> static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; -/* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size __read_mostly; - #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; @@ -72,15 +71,16 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) { unsigned int hash; /* Original src, to ensure we map it consistently if poss. */ hash = jhash_3words((__force u32)tuple->src.u3.ip, - (__force u32)tuple->src.u.all, + (__force u32)tuple->src.u.all ^ zone, tuple->dst.protonum, 0); - return ((u64)hash * nf_nat_htable_size) >> 32; + return ((u64)hash * net->ipv4.nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -142,12 +142,12 @@ same_src(const struct nf_conn *ct, /* Only called for SRC manip */ static int -find_appropriate_src(struct net *net, +find_appropriate_src(struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(tuple); + unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; const struct hlist_node *n; @@ -155,7 +155,7 @@ find_appropriate_src(struct net *net, rcu_read_lock(); hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) { ct = nat->ct; - if (same_src(ct, tuple)) { + if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) { /* Copy source part from reply tuple. */ nf_ct_invert_tuplepr(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); @@ -178,7 +178,7 @@ find_appropriate_src(struct net *net, the ip with the lowest src-ip/dst-ip/proto usage. */ static void -find_best_ips_proto(struct nf_conntrack_tuple *tuple, +find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) @@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple, maxip = ntohl(range->max_ip); j = jhash_2words((__force u32)tuple->src.u3.ip, range->flags & IP_NAT_RANGE_PERSISTENT ? - 0 : (__force u32)tuple->dst.u3.ip, 0); + 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0); j = ((u64)j * (maxip - minip + 1)) >> 32; *var_ipp = htonl(minip + j); } @@ -232,6 +232,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, { struct net *net = nf_ct_net(ct); const struct nf_nat_protocol *proto; + u16 zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, and that same mapping gives a unique tuple within the given @@ -242,7 +243,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, manips not an issue. */ if (maniptype == IP_NAT_MANIP_SRC && !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { - if (find_appropriate_src(net, orig_tuple, tuple, range)) { + if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; @@ -252,7 +253,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, /* 2) Select the least-used IP/proto combination in the given range. */ *tuple = *orig_tuple; - find_best_ips_proto(tuple, range, ct, maniptype); + find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into the range to make a unique tuple. */ @@ -330,7 +331,8 @@ nf_nat_setup_info(struct nf_conn *ct, if (have_to_hash) { unsigned int srchash; - srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + srchash = hash_by_src(net, nf_ct_zone(ct), + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); @@ -679,8 +681,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + /* Leave them the same for the moment. */ + net->ipv4.nat_htable_size = net->ct.htable_size; + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, + &net->ipv4.nat_vmalloced, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -703,7 +707,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - nf_nat_htable_size); + net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { @@ -724,9 +728,6 @@ static int __init nf_nat_init(void) return ret; } - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - ret = register_pernet_subsys(&nf_nat_net_ops); if (ret < 0) goto cleanup_extend; diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index a1d5d58a58bf..86e0e84ff0a0 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp"); /* FIXME: Time out? --RR */ -static int -mangle_rfc959_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) +static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type, + char *buffer, size_t buflen, + __be32 addr, u16 port) { - char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; - - sprintf(buffer, "%u,%u,%u,%u,%u,%u", - NIPQUAD(newip), port>>8, port&0xFF); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); - - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_eprt_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - char buffer[sizeof("|1|255.255.255.255|65535|")]; - - sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); - - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); -} - -/* |1|132.235.1.2|6275| */ -static int -mangle_epsv_packet(struct sk_buff *skb, - __be32 newip, - u_int16_t port, - unsigned int matchoff, - unsigned int matchlen, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo) -{ - char buffer[sizeof("|||65535|")]; - - sprintf(buffer, "|||%u|", port); - - pr_debug("calling nf_nat_mangle_tcp_packet\n"); + switch (type) { + case NF_CT_FTP_PORT: + case NF_CT_FTP_PASV: + return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&addr)[0], + ((unsigned char *)&addr)[1], + ((unsigned char *)&addr)[2], + ((unsigned char *)&addr)[3], + port >> 8, + port & 0xFF); + case NF_CT_FTP_EPRT: + return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port); + case NF_CT_FTP_EPSV: + return snprintf(buffer, buflen, "|||%u|", port); + } - return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, - matchlen, buffer, strlen(buffer)); + return 0; } -static int (*mangle[])(struct sk_buff *, __be32, u_int16_t, - unsigned int, unsigned int, struct nf_conn *, - enum ip_conntrack_info) -= { - [NF_CT_FTP_PORT] = mangle_rfc959_packet, - [NF_CT_FTP_PASV] = mangle_rfc959_packet, - [NF_CT_FTP_EPRT] = mangle_eprt_packet, - [NF_CT_FTP_EPSV] = mangle_epsv_packet -}; - /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ static unsigned int nf_nat_ftp(struct sk_buff *skb, @@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, u_int16_t port; int dir = CTINFO2DIR(ctinfo); struct nf_conn *ct = exp->master; + char buffer[sizeof("|1|255.255.255.255|65535|")]; + unsigned int buflen; pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); @@ -132,11 +87,21 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, if (port == 0) return NF_DROP; - if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) { - nf_ct_unexpect_related(exp); - return NF_DROP; - } + buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port); + if (!buflen) + goto out; + + pr_debug("calling nf_nat_mangle_tcp_packet\n"); + + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff, + matchlen, buffer, buflen)) + goto out; + return NF_ACCEPT; + +out: + nf_ct_unexpect_related(exp); + return NF_DROP; } static void __exit nf_nat_ftp_fini(void) diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index f9520fa3aba9..4a0c6b548eee 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. */ #include <linux/module.h> +#include <linux/gfp.h> #include <linux/kmod.h> #include <linux/types.h> #include <linux/timer.h> @@ -41,18 +42,14 @@ adjust_tcp_sequence(u32 seq, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - int dir; - struct nf_nat_seq *this_way, *other_way; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way = &nat->seq[dir]; - pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq); - - dir = CTINFO2DIR(ctinfo); - - this_way = &nat->seq[dir]; - other_way = &nat->seq[!dir]; + pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", + seq, sizediff); - pr_debug("nf_nat_resize_packet: Seq_offset before: "); + pr_debug("adjust_tcp_sequence: Seq_offset before: "); DUMP_OFFSET(this_way); spin_lock_bh(&nf_nat_seqofs_lock); @@ -63,13 +60,13 @@ adjust_tcp_sequence(u32 seq, * retransmit */ if (this_way->offset_before == this_way->offset_after || before(this_way->correction_pos, seq)) { - this_way->correction_pos = seq; - this_way->offset_before = this_way->offset_after; - this_way->offset_after += sizediff; + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += sizediff; } spin_unlock_bh(&nf_nat_seqofs_lock); - pr_debug("nf_nat_resize_packet: Seq_offset after: "); + pr_debug("adjust_tcp_sequence: Seq_offset after: "); DUMP_OFFSET(this_way); } @@ -145,6 +142,17 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) return 1; } +void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, + __be32 seq, s16 off) +{ + if (!off) + return; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo); + nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); +} +EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); + /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -153,14 +161,13 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) * skb enlargement, ... * * */ -int -nf_nat_mangle_tcp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) +int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) { struct rtable *rt = skb_rtable(skb); struct iphdr *iph; @@ -206,16 +213,13 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, inet_proto_csum_replace2(&tcph->check, skb, htons(oldlen), htons(datalen), 1); - if (rep_len != match_len) { - set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); - adjust_tcp_sequence(ntohl(tcph->seq), - (int)rep_len - (int)match_len, - ct, ctinfo); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); - } + if (adjust && rep_len != match_len) + nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, + (int)rep_len - (int)match_len); + return 1; } -EXPORT_SYMBOL(nf_nat_mangle_tcp_packet); +EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); /* Generic function for mangling variable-length address changes inside * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index 9eb171056c63..4c060038d29f 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -25,6 +25,7 @@ #include <net/netfilter/nf_nat_rule.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_zones.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> @@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct, pr_debug("trying to unexpect other dir: "); nf_ct_dump_tuple_ip(&t); - other_exp = nf_ct_expect_find_get(net, &t); + other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t); if (other_exp) { nf_ct_unexpect_related(other_exp); nf_ct_expect_put(other_exp); diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index 9e81e0dfb4ec..26de2c1f7fab 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -15,6 +15,7 @@ #include <linux/kmod.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> +#include <linux/slab.h> #include <net/checksum.h> #include <net/route.h> #include <linux/bitops.h> @@ -28,36 +29,6 @@ (1 << NF_INET_POST_ROUTING) | \ (1 << NF_INET_LOCAL_OUT)) -static const struct -{ - struct ipt_replace repl; - struct ipt_standard entries[3]; - struct ipt_error term; -} nat_initial_table __net_initdata = { - .repl = { - .name = "nat", - .valid_hooks = NAT_VALID_HOOKS, - .num_entries = 4, - .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), - .hook_entry = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 - }, - .underflow = { - [NF_INET_PRE_ROUTING] = 0, - [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard), - [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 - }, - }, - .entries = { - IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */ - IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */ - }, - .term = IPT_ERROR_INIT, /* ERROR */ -}; - static const struct xt_table nat_table = { .name = "nat", .valid_hooks = NAT_VALID_HOOKS, @@ -186,8 +157,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = { static int __net_init nf_nat_rule_net_init(struct net *net) { - net->ipv4.nat_table = ipt_register_table(net, &nat_table, - &nat_initial_table.repl); + struct ipt_replace *repl; + + repl = ipt_alloc_initial_table(&nat_table); + if (repl == NULL) + return -ENOMEM; + net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl); + kfree(repl); if (IS_ERR(net->ipv4.nat_table)) return PTR_ERR(net->ipv4.nat_table); return 0; @@ -195,7 +171,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net) static void __net_exit nf_nat_rule_net_exit(struct net *net) { - ipt_unregister_table(net->ipv4.nat_table); + ipt_unregister_table(net, net->ipv4.nat_table); } static struct pernet_operations nf_nat_rule_net_ops = { diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 07d61a57613c..11b538deaaec 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -1,4 +1,4 @@ -/* SIP extension for UDP NAT alteration. +/* SIP extension for NAT alteration. * * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> * based on RR's ip_nat_ftp.c and other modules. @@ -15,6 +15,7 @@ #include <linux/ip.h> #include <net/ip.h> #include <linux/udp.h> +#include <linux/tcp.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> @@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper"); MODULE_ALIAS("ip_nat_sip"); -static unsigned int mangle_packet(struct sk_buff *skb, +static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, const char *buffer, unsigned int buflen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - - if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen, - buffer, buflen)) - return 0; + struct tcphdr *th; + unsigned int baseoff; + + if (nf_ct_protonum(ct) == IPPROTO_TCP) { + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + baseoff = ip_hdrlen(skb) + th->doff * 4; + matchoff += dataoff - baseoff; + + if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen, false)) + return 0; + } else { + baseoff = ip_hdrlen(skb) + sizeof(struct udphdr); + matchoff += dataoff - baseoff; + + if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, + matchoff, matchlen, + buffer, buflen)) + return 0; + } /* Reload data pointer and adjust datalen value */ - *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr); + *dptr = skb->data + dataoff; *datalen += buflen - matchlen; return 1; } -static int map_addr(struct sk_buff *skb, +static int map_addr(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, union nf_inet_addr *addr, __be16 port) @@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb, buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport)); - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } -static int map_sip_addr(struct sk_buff *skb, +static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, enum sip_header_types type) { @@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb, if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL, &matchoff, &matchlen, &addr, &port) <= 0) return 1; - return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port); + return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, + &addr, port); } -static unsigned int ip_nat_sip(struct sk_buff *skb, +static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - unsigned int dataoff, matchoff, matchlen; + unsigned int coff, matchoff, matchlen; + enum sip_header_types hdr; union nf_inet_addr addr; __be16 port; int request, in_header; @@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, if (ct_sip_parse_request(ct, *dptr, *datalen, &matchoff, &matchlen, &addr, &port) > 0 && - !map_addr(skb, dptr, datalen, matchoff, matchlen, + !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; request = 1; } else request = 0; + if (nf_ct_protonum(ct) == IPPROTO_TCP) + hdr = SIP_HDR_VIA_TCP; + else + hdr = SIP_HDR_VIA_UDP; + /* Translate topmost Via header and parameters */ if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, - SIP_HDR_VIA, NULL, &matchoff, &matchlen, + hdr, NULL, &matchoff, &matchlen, &addr, &port) > 0) { unsigned int matchend, poff, plen, buflen, n; char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; @@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, goto next; } - if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; @@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { buflen = sprintf(buffer, "%pI4", &ct->tuplehash[!dir].tuple.dst.u3.ip); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } @@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { buflen = sprintf(buffer, "%pI4", &ct->tuplehash[!dir].tuple.src.u3.ip); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } @@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) { __be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port; buflen = sprintf(buffer, "%u", ntohs(p)); - if (!mangle_packet(skb, dptr, datalen, poff, plen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + poff, plen, buffer, buflen)) return NF_DROP; } } next: /* Translate Contact headers */ - dataoff = 0; + coff = 0; in_header = 0; - while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen, + while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen, SIP_HDR_CONTACT, &in_header, &matchoff, &matchlen, &addr, &port) > 0) { - if (!map_addr(skb, dptr, datalen, matchoff, matchlen, + if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, &addr, port)) return NF_DROP; } - if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) || - !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO)) + if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) || + !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO)) return NF_DROP; + return NF_ACCEPT; } +static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = nf_ct_get(skb, &ctinfo); + const struct tcphdr *th; + + if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0) + return; + + th = (struct tcphdr *)(skb->data + ip_hdrlen(skb)); + nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off); +} + /* Handles expected signalling connections and media streams */ static void ip_nat_sip_expected(struct nf_conn *ct, struct nf_conntrack_expect *exp) @@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct, } } -static unsigned int ip_nat_sip_expect(struct sk_buff *skb, +static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen, struct nf_conntrack_expect *exp, unsigned int matchoff, @@ -279,8 +318,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, if (exp->tuple.dst.u3.ip != exp->saved_ip || exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) { buflen = sprintf(buffer, "%pI4:%u", &newip, port); - if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, - buffer, buflen)) + if (!mangle_packet(skb, dataoff, dptr, datalen, + matchoff, matchlen, buffer, buflen)) goto err; } return NF_ACCEPT; @@ -290,7 +329,7 @@ err: return NF_DROP; } -static int mangle_content_len(struct sk_buff *skb, +static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff, const char **dptr, unsigned int *datalen) { enum ip_conntrack_info ctinfo; @@ -312,12 +351,13 @@ static int mangle_content_len(struct sk_buff *skb, return 0; buflen = sprintf(buffer, "%u", c_len); - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen); } -static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, unsigned int *datalen, +static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, enum sdp_header_types type, enum sdp_header_types term, char *buffer, int buflen) @@ -326,16 +366,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr, struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchlen, matchoff; - if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term, + if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term, &matchoff, &matchlen) <= 0) return -ENOENT; - return mangle_packet(skb, dptr, datalen, matchoff, matchlen, + return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen) ? 0 : -EINVAL; } -static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, - unsigned int *datalen, +static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, enum sdp_header_types type, enum sdp_header_types term, const union nf_inet_addr *addr) @@ -344,16 +384,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr, unsigned int buflen; buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term, + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term, buffer, buflen)) return 0; - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } -static unsigned int ip_nat_sdp_port(struct sk_buff *skb, - const char **dptr, - unsigned int *datalen, +static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, unsigned int matchoff, unsigned int matchlen, u_int16_t port) @@ -362,16 +401,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int buflen; buflen = sprintf(buffer, "%u", port); - if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen, + if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen, buffer, buflen)) return 0; - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } -static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, - unsigned int dataoff, - unsigned int *datalen, +static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, + unsigned int sdpoff, const union nf_inet_addr *addr) { char buffer[sizeof("nnn.nnn.nnn.nnn")]; @@ -379,12 +418,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, /* Mangle session description owner and contact addresses */ buflen = sprintf(buffer, "%pI4", &addr->ip); - if (mangle_sdp_packet(skb, dptr, dataoff, datalen, + if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA, buffer, buflen)) return 0; - switch (mangle_sdp_packet(skb, dptr, dataoff, datalen, + switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA, buffer, buflen)) { case 0: @@ -401,14 +440,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr, return 0; } - return mangle_content_len(skb, dptr, datalen); + return mangle_content_len(skb, dataoff, dptr, datalen); } /* So, this packet has hit the connection tracking matching code. Mangle it, and change the expectation to match the new version. */ -static unsigned int ip_nat_sdp_media(struct sk_buff *skb, - const char **dptr, - unsigned int *datalen, +static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, + const char **dptr, unsigned int *datalen, struct nf_conntrack_expect *rtp_exp, struct nf_conntrack_expect *rtcp_exp, unsigned int mediaoff, @@ -456,7 +494,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, /* Update media port. */ if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port && - !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port)) + !ip_nat_sdp_port(skb, dataoff, dptr, datalen, + mediaoff, medialen, port)) goto err2; return NF_ACCEPT; @@ -471,6 +510,7 @@ err1: static void __exit nf_nat_sip_fini(void) { rcu_assign_pointer(nf_nat_sip_hook, NULL); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL); rcu_assign_pointer(nf_nat_sip_expect_hook, NULL); rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL); rcu_assign_pointer(nf_nat_sdp_port_hook, NULL); @@ -482,12 +522,14 @@ static void __exit nf_nat_sip_fini(void) static int __init nf_nat_sip_init(void) { BUG_ON(nf_nat_sip_hook != NULL); + BUG_ON(nf_nat_sip_seq_adjust_hook != NULL); BUG_ON(nf_nat_sip_expect_hook != NULL); BUG_ON(nf_nat_sdp_addr_hook != NULL); BUG_ON(nf_nat_sdp_port_hook != NULL); BUG_ON(nf_nat_sdp_session_hook != NULL); BUG_ON(nf_nat_sdp_media_hook != NULL); rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip); + rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust); rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect); rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr); rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port); diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index d9521f6f9ed0..4d85b6e55f29 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -43,6 +43,7 @@ #include <linux/moduleparam.h> #include <linux/types.h> #include <linux/kernel.h> +#include <linux/slab.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/udp.h> @@ -1038,7 +1039,7 @@ static int snmp_parse_mangle(unsigned char *msg, unsigned int cls, con, tag, vers, pdutype; struct asn1_ctx ctx; struct asn1_octstr comm; - struct snmp_object **obj; + struct snmp_object *obj; if (debug > 1) hex_dump(msg, len); @@ -1148,43 +1149,34 @@ static int snmp_parse_mangle(unsigned char *msg, if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) return 0; - obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); - if (obj == NULL) { - if (net_ratelimit()) - printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); - return 0; - } - while (!asn1_eoc_decode(&ctx, eoc)) { unsigned int i; - if (!snmp_object_decode(&ctx, obj)) { - if (*obj) { - kfree((*obj)->id); - kfree(*obj); + if (!snmp_object_decode(&ctx, &obj)) { + if (obj) { + kfree(obj->id); + kfree(obj); } - kfree(obj); return 0; } if (debug > 1) { printk(KERN_DEBUG "bsalg: object: "); - for (i = 0; i < (*obj)->id_len; i++) { + for (i = 0; i < obj->id_len; i++) { if (i > 0) printk("."); - printk("%lu", (*obj)->id[i]); + printk("%lu", obj->id[i]); } - printk(": type=%u\n", (*obj)->type); + printk(": type=%u\n", obj->type); } - if ((*obj)->type == SNMP_IPADDR) + if (obj->type == SNMP_IPADDR) mangle_address(ctx.begin, ctx.pointer - 4 , map, check); - kfree((*obj)->id); - kfree(*obj); + kfree(obj->id); + kfree(obj); } - kfree(obj); if (!asn1_eoc_decode(&ctx, eoc)) return 0; diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c index 5f41d017ddd8..c39c9cf6bee6 100644 --- a/net/ipv4/netfilter/nf_nat_standalone.c +++ b/net/ipv4/netfilter/nf_nat_standalone.c @@ -7,6 +7,7 @@ */ #include <linux/types.h> #include <linux/icmp.h> +#include <linux/gfp.h> #include <linux/ip.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -197,11 +198,11 @@ nf_nat_out(unsigned int hooknum, (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - if (ct->tuplehash[dir].tuple.src.u3.ip != - ct->tuplehash[!dir].tuple.dst.u3.ip - || ct->tuplehash[dir].tuple.src.u.all != - ct->tuplehash[!dir].tuple.dst.u.all - ) + if ((ct->tuplehash[dir].tuple.src.u3.ip != + ct->tuplehash[!dir].tuple.dst.u3.ip) || + (ct->tuplehash[dir].tuple.src.u.all != + ct->tuplehash[!dir].tuple.dst.u.all) + ) return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; } #endif diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index f25542c48b7d..4f1f337f4337 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_SENTINEL }; -static struct { - char *name; +static const struct { + const char *name; int index; } icmpmibmap[] = { { "DestUnreachs", ICMP_DEST_UNREACH }, @@ -249,6 +249,8 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED), SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED), SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK), + SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP), + SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP), SNMP_MIB_SENTINEL }; @@ -280,7 +282,7 @@ static void icmpmsg_put(struct seq_file *seq) count = 0; for (i = 0; i < ICMPMSG_MIB_MAX; i++) { - val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i); + val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i); if (val) { type[count] = i; vals[count++] = val; @@ -307,18 +309,18 @@ static void icmp_put(struct seq_file *seq) for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu", - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) net->mib.icmpmsg_statistics, + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, icmpmibmap[i].index)); seq_printf(seq, " %lu %lu", - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), + snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); for (i=0; icmpmibmap[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **) net->mib.icmpmsg_statistics, + snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, icmpmibmap[i].index | 0x100)); } @@ -341,7 +343,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.ip_statistics, + snmp_fold_field((void __percpu **)net->mib.ip_statistics, snmp4_ipstats_list[i].entry)); icmp_put(seq); /* RFC 2011 compatibility */ @@ -356,11 +358,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v) /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", - snmp_fold_field((void **)net->mib.tcp_statistics, + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); else seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.tcp_statistics, + snmp_fold_field((void __percpu **)net->mib.tcp_statistics, snmp4_tcp_list[i].entry)); } @@ -371,7 +373,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdp:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.udp_statistics, + snmp_fold_field((void __percpu **)net->mib.udp_statistics, snmp4_udp_list[i].entry)); /* the UDP and UDP-Lite MIBs are the same */ @@ -382,7 +384,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nUdpLite:"); for (i = 0; snmp4_udp_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.udplite_statistics, + snmp_fold_field((void __percpu **)net->mib.udplite_statistics, snmp4_udp_list[i].entry)); seq_putc(seq, '\n'); @@ -419,7 +421,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nTcpExt:"); for (i = 0; snmp4_net_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.net_statistics, + snmp_fold_field((void __percpu **)net->mib.net_statistics, snmp4_net_list[i].entry)); seq_puts(seq, "\nIpExt:"); @@ -429,7 +431,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) seq_puts(seq, "\nIpExt:"); for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++) seq_printf(seq, " %lu", - snmp_fold_field((void **)net->mib.ip_statistics, + snmp_fold_field((void __percpu **)net->mib.ip_statistics, snmp4_ipextstats_list[i].entry)); seq_putc(seq, '\n'); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ab996f9c0fe0..cc6f097fbd5f 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -60,7 +60,6 @@ #include <net/net_namespace.h> #include <net/dst.h> #include <net/sock.h> -#include <linux/gfp.h> #include <linux/ip.h> #include <linux/net.h> #include <net/ip.h> @@ -87,7 +86,7 @@ void raw_hash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; struct hlist_head *head; - head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)]; + head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)]; write_lock_bh(&h->lock); sk_add_node(sk, head); @@ -115,9 +114,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk, sk_for_each_from(sk, node) { struct inet_sock *inet = inet_sk(sk); - if (net_eq(sock_net(sk), net) && inet->num == num && - !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + if (net_eq(sock_net(sk), net) && inet->inet_num == num && + !(inet->inet_daddr && inet->inet_daddr != raddr) && + !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } @@ -292,7 +291,6 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) /* Charge it to the socket. */ if (sock_queue_rcv_skb(sk, skb) < 0) { - atomic_inc(&sk->sk_drops); kfree_skb(skb); return NET_RX_DROP; } @@ -327,7 +325,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, int err; if (length > rt->u.dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, rt->u.dst.dev->mtu); return -EMSGSIZE; } @@ -500,10 +498,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, err = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; - daddr = inet->daddr; + daddr = inet->inet_daddr; } - ipc.addr = inet->saddr; + ipc.addr = inet->inet_saddr; ipc.opt = NULL; ipc.shtx.flags = 0; ipc.oif = sk->sk_bound_dev_if; @@ -645,9 +643,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) - inet->saddr = 0; /* Use device */ + inet->inet_saddr = 0; /* Use device */ sk_dst_reset(sk); ret = 0; out: return ret; @@ -692,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (err) goto done; - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); /* Copy the address. */ if (sin) { @@ -717,7 +715,7 @@ static int raw_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); - if (inet_sk(sk)->num == IPPROTO_ICMP) + if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; } @@ -754,7 +752,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { if (optname == ICMP_FILTER) { - if (inet_sk(sk)->num != IPPROTO_ICMP) + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_seticmpfilter(sk, optval, optlen); @@ -784,7 +782,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { if (optname == ICMP_FILTER) { - if (inet_sk(sk)->num != IPPROTO_ICMP) + if (inet_sk(sk)->inet_num != IPPROTO_ICMP) return -EOPNOTSUPP; else return raw_geticmpfilter(sk, optval, optlen); @@ -943,10 +941,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop); static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) { struct inet_sock *inet = inet_sk(sp); - __be32 dest = inet->daddr, - src = inet->rcv_saddr; + __be32 dest = inet->inet_daddr, + src = inet->inet_rcv_saddr; __u16 destp = 0, - srcp = inet->num; + srcp = inet->inet_num; seq_printf(seq, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n", diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 5b1050a5d874..cb562fdd9b9a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -90,6 +90,7 @@ #include <linux/jhash.h> #include <linux/rcupdate.h> #include <linux/times.h> +#include <linux/slab.h> #include <net/dst.h> #include <net/net_namespace.h> #include <net/protocol.h> @@ -146,7 +147,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(struct dst_ops *ops); -static void rt_emergency_hash_rebuild(struct net *net); static struct dst_ops ipv4_dst_ops = { @@ -287,12 +287,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq) if (!rt_hash_table[st->bucket].chain) continue; rcu_read_lock_bh(); - r = rcu_dereference(rt_hash_table[st->bucket].chain); + r = rcu_dereference_bh(rt_hash_table[st->bucket].chain); while (r) { if (dev_net(r->u.dst.dev) == seq_file_net(seq) && r->rt_genid == st->genid) return r; - r = rcu_dereference(r->u.dst.rt_next); + r = rcu_dereference_bh(r->u.dst.rt_next); } rcu_read_unlock_bh(); } @@ -314,7 +314,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq, rcu_read_lock_bh(); r = rt_hash_table[st->bucket].chain; } - return rcu_dereference(r); + return rcu_dereference_bh(r); } static struct rtable *rt_cache_get_next(struct seq_file *seq, @@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = { }; #ifdef CONFIG_NET_CLS_ROUTE -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; +static int rt_acct_proc_show(struct seq_file *m, void *v) +{ + struct ip_rt_acct *dst, *src; + unsigned int i, j; + + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; + + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } } - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src; + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} - src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); } + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int __net_init ip_rt_do_proc_init(struct net *net) @@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net) goto err2; #ifdef CONFIG_NET_CLS_ROUTE - pde = create_proc_read_entry("rt_acct", 0, net->proc_net, - ip_rt_acct_read, NULL); + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif @@ -588,7 +586,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net) { remove_proc_entry("rt_cache", net->proc_net_stat); remove_proc_entry("rt_cache", net->proc_net); +#ifdef CONFIG_NET_CLS_ROUTE remove_proc_entry("rt_acct", net->proc_net); +#endif } static struct pernet_operations ip_rt_proc_ops __net_initdata = { @@ -703,7 +703,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); + return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); } static inline int rt_is_expired(struct rtable *rth) @@ -780,11 +780,30 @@ static void rt_do_flush(int process_context) #define FRACT_BITS 3 #define ONE (1UL << FRACT_BITS) +/* + * Given a hash chain and an item in this hash chain, + * find if a previous entry has the same hash_inputs + * (but differs on tos, mark or oif) + * Returns 0 if an alias is found. + * Returns ONE if rth has no alias before itself. + */ +static int has_noalias(const struct rtable *head, const struct rtable *rth) +{ + const struct rtable *aux = head; + + while (aux != rth) { + if (compare_hash_inputs(&aux->fl, &rth->fl)) + return 0; + aux = aux->u.dst.rt_next; + } + return ONE; +} + static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, *aux, **rthp; + struct rtable *rth, **rthp; unsigned long samples = 0; unsigned long sum = 0, sum2 = 0; unsigned long delta; @@ -835,15 +854,7 @@ nofree: * attributes don't unfairly skew * the length computation */ - for (aux = rt_hash_table[i].chain;;) { - if (aux == rth) { - length += ONE; - break; - } - if (compare_hash_inputs(&aux->fl, &rth->fl)) - break; - aux = aux->u.dst.rt_next; - } + length += has_noalias(rt_hash_table[i].chain, rth); continue; } } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) @@ -902,6 +913,12 @@ void rt_cache_flush(struct net *net, int delay) rt_do_flush(!in_softirq()); } +/* Flush previous cache invalidated entries from the cache */ +void rt_cache_flush_batch(void) +{ + rt_do_flush(!in_softirq()); +} + /* * We change rt_genid and let gc do the cleanup */ @@ -916,10 +933,8 @@ static void rt_secret_rebuild_oneshot(struct net *net) { del_timer_sync(&net->ipv4.rt_secret_timer); rt_cache_invalidate(net); - if (ip_rt_secret_interval) { - net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; - add_timer(&net->ipv4.rt_secret_timer); - } + if (ip_rt_secret_interval) + mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } static void rt_emergency_hash_rebuild(struct net *net) @@ -1067,8 +1082,23 @@ work_done: out: return 0; } +/* + * Returns number of entries in a hash chain that have different hash_inputs + */ +static int slow_chain_length(const struct rtable *head) +{ + int length = 0; + const struct rtable *rth = head; + + while (rth) { + length += has_noalias(head, rth); + rth = rth->u.dst.rt_next; + } + return length >> FRACT_BITS; +} + static int rt_intern_hash(unsigned hash, struct rtable *rt, - struct rtable **rp, struct sk_buff *skb) + struct rtable **rp, struct sk_buff *skb, int ifindex) { struct rtable *rth, **rthp; unsigned long now; @@ -1179,14 +1209,20 @@ restart: rt_free(cand); } } else { - if (chain_length > rt_chain_length_max) { + if (chain_length > rt_chain_length_max && + slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) { struct net *net = dev_net(rt->u.dst.dev); int num = ++net->ipv4.current_rt_cache_rebuild_count; - if (!rt_caching(dev_net(rt->u.dst.dev))) { + if (!rt_caching(net)) { printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", rt->u.dst.dev->name, num); } - rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + rt_emergency_hash_rebuild(net); + spin_unlock_bh(rt_hash_lock_addr(hash)); + + hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, + ifindex, rt_genid(net)); + goto restart; } } @@ -1346,9 +1382,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; net = dev_net(dev); - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) - || ipv4_is_zeronet(new_gw)) + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) goto reject_redirect; if (!rt_caching(net)) @@ -1411,7 +1447,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, dev_hold(rt->u.dst.dev); if (rt->idev) in_dev_hold(rt->idev); - rt->u.dst.obsolete = 0; + rt->u.dst.obsolete = -1; rt->u.dst.lastuse = jiffies; rt->u.dst.path = &rt->u.dst; rt->u.dst.neighbour = NULL; @@ -1447,7 +1483,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt, NULL)) + if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif)) ip_rt_put(rt); goto do_next; } @@ -1476,11 +1512,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) struct dst_entry *ret = dst; if (rt) { - if (dst->obsolete) { + if (dst->obsolete > 0) { ip_rt_put(rt); ret = NULL; } else if ((rt->rt_flags & RTCF_REDIRECTED) || - rt->u.dst.expires) { + (rt->u.dst.expires && + time_after_eq(jiffies, rt->u.dst.expires))) { unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src, rt->fl.oif, rt_genid(dev_net(dst->dev))); @@ -1628,9 +1665,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, __be32 daddr = iph->daddr; unsigned short est_mtu = 0; - if (ipv4_config.no_pmtu_disc) - return 0; - for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], @@ -1699,7 +1733,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { - return NULL; + if (rt_is_expired((struct rtable *)dst)) + return NULL; + return dst; } static void ipv4_dst_destroy(struct dst_entry *dst) @@ -1861,7 +1897,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (!rth) goto e_nobufs; - rth->u.dst.output= ip_rt_bug; + rth->u.dst.output = ip_rt_bug; + rth->u.dst.obsolete = -1; atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; @@ -1900,7 +1937,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex); e_nobufs: in_dev_put(in_dev); @@ -1987,8 +2024,13 @@ static int __mkroute_input(struct sk_buff *skb, if (skb->protocol != htons(ETH_P_IP)) { /* Not IP (i.e. ARP). Do not create route, if it is * invalid for proxy arp. DNAT routes are always valid. + * + * Proxy arp feature have been extended to allow, ARP + * replies back to the same interface, to support + * Private VLAN switch technologies. See arp.c. */ - if (out_dev == in_dev) { + if (out_dev == in_dev && + IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { err = -EINVAL; goto cleanup; } @@ -2022,6 +2064,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->fl.oif = 0; rth->rt_spec_dst= spec_dst; + rth->u.dst.obsolete = -1; rth->u.dst.input = ip_forward; rth->u.dst.output = ip_output; rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev)); @@ -2061,7 +2104,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, NULL, skb); + return rt_intern_hash(hash, rth, NULL, skb, fl->iif); } /* @@ -2186,6 +2229,7 @@ local_input: goto e_nobufs; rth->u.dst.output= ip_rt_bug; + rth->u.dst.obsolete = -1; rth->rt_genid = rt_genid(net); atomic_set(&rth->u.dst.__refcnt, 1); @@ -2217,7 +2261,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, NULL, skb); + err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); goto done; no_route: @@ -2314,10 +2358,11 @@ skip_cache: ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); @@ -2411,6 +2456,7 @@ static int __mkroute_output(struct rtable **result, rth->rt_spec_dst= fl->fl4_src; rth->u.dst.output=ip_output; + rth->u.dst.obsolete = -1; rth->rt_genid = rt_genid(dev_net(dev_out)); RT_CACHE_STAT_INC(out_slow_tot); @@ -2462,7 +2508,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp, NULL); + err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif); } return err; @@ -2514,9 +2560,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 - && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (oldflp->oif == 0 && + (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) @@ -2685,8 +2731,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); - for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; - rth = rcu_dereference(rth->u.dst.rt_next)) { + for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth; + rth = rcu_dereference_bh(rth->u.dst.rt_next)) { if (rth->fl.fl4_dst == flp->fl4_dst && rth->fl.fl4_src == flp->fl4_src && rth->fl.iif == 0 && @@ -2855,7 +2901,7 @@ static int rt_fill_info(struct net *net, error = rt->u.dst.error; expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; if (rt->peer) { - id = rt->peer->ip_id_count; + id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; tsage = get_seconds() - rt->peer->tcp_ts_stamp; @@ -3004,8 +3050,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) if (!rt_hash_table[h].chain) continue; rcu_read_lock_bh(); - for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; - rt = rcu_dereference(rt->u.dst.rt_next), idx++) { + for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt; + rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) { if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) continue; if (rt_is_expired(rt)) @@ -3056,23 +3102,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, return -EINVAL; } -static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - int delay; - struct net *net; - if (newlen != sizeof(int)) - return -EINVAL; - if (get_user(delay, (int __user *)newval)) - return -EFAULT; - net = (struct net *)table->extra1; - rt_cache_flush(net, delay); - return 0; -} - static void rt_secret_reschedule(int old) { struct net *net; @@ -3085,22 +3114,20 @@ static void rt_secret_reschedule(int old) rtnl_lock(); for_each_net(net) { int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); + long time; if (!new) continue; if (deleted) { - long time = net->ipv4.rt_secret_timer.expires - jiffies; + time = net->ipv4.rt_secret_timer.expires - jiffies; if (time <= 0 || (time += diff) <= 0) time = 0; - - net->ipv4.rt_secret_timer.expires = time; } else - net->ipv4.rt_secret_timer.expires = new; + time = new; - net->ipv4.rt_secret_timer.expires += jiffies; - add_timer(&net->ipv4.rt_secret_timer); + mod_timer(&net->ipv4.rt_secret_timer, jiffies + time); } rtnl_unlock(); } @@ -3117,23 +3144,8 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, return ret; } -static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - int old = ip_rt_secret_interval; - int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); - - rt_secret_reschedule(old); - - return ret; -} - static ctl_table ipv4_route_table[] = { { - .ctl_name = NET_IPV4_ROUTE_GC_THRESH, .procname = "gc_thresh", .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), @@ -3141,7 +3153,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, .procname = "max_size", .data = &ip_rt_max_size, .maxlen = sizeof(int), @@ -3151,43 +3162,34 @@ static ctl_table ipv4_route_table[] = { { /* Deprecated. Use gc_min_interval_ms */ - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, .procname = "gc_min_interval", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, .procname = "gc_min_interval_ms", .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, .procname = "gc_timeout", .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, .procname = "gc_interval", .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, .procname = "redirect_load", .data = &ip_rt_redirect_load, .maxlen = sizeof(int), @@ -3195,7 +3197,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, .procname = "redirect_number", .data = &ip_rt_redirect_number, .maxlen = sizeof(int), @@ -3203,7 +3204,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, .procname = "redirect_silence", .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), @@ -3211,7 +3211,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_COST, .procname = "error_cost", .data = &ip_rt_error_cost, .maxlen = sizeof(int), @@ -3219,7 +3218,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, .procname = "error_burst", .data = &ip_rt_error_burst, .maxlen = sizeof(int), @@ -3227,7 +3225,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, .procname = "gc_elasticity", .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), @@ -3235,16 +3232,13 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, .procname = "mtu_expires", .data = &ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, .procname = "min_pmtu", .data = &ip_rt_min_pmtu, .maxlen = sizeof(int), @@ -3252,7 +3246,6 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, .procname = "min_adv_mss", .data = &ip_rt_min_advmss, .maxlen = sizeof(int), @@ -3260,50 +3253,46 @@ static ctl_table ipv4_route_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, .procname = "secret_interval", .data = &ip_rt_secret_interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipv4_sysctl_rt_secret_interval, - .strategy = ipv4_sysctl_rt_secret_interval_strategy, }, - { .ctl_name = 0 } + { } }; static struct ctl_table empty[1]; static struct ctl_table ipv4_skeleton[] = { - { .procname = "route", .ctl_name = NET_IPV4_ROUTE, + { .procname = "route", .mode = 0555, .child = ipv4_route_table}, - { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, + { .procname = "neigh", .mode = 0555, .child = empty}, { } }; static __net_initdata struct ctl_path ipv4_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; static struct ctl_table ipv4_route_flush_table[] = { { - .ctl_name = NET_IPV4_ROUTE_FLUSH, .procname = "flush", .maxlen = sizeof(int), .mode = 0200, .proc_handler = ipv4_sysctl_rtcache_flush, - .strategy = ipv4_sysctl_rtcache_flush_strategy, }, - { .ctl_name = 0 }, + { }, }; static __net_initdata struct ctl_path ipv4_route_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, - { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, + { .procname = "net", }, + { .procname = "ipv4", }, + { .procname = "route", }, { }, }; @@ -3312,7 +3301,7 @@ static __net_init int sysctl_route_net_init(struct net *net) struct ctl_table *tbl; tbl = ipv4_route_flush_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; @@ -3380,7 +3369,7 @@ static __net_initdata struct pernet_operations rt_secret_timer_ops = { #ifdef CONFIG_NET_CLS_ROUTE -struct ip_rt_acct *ip_rt_acct __read_mostly; +struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; #endif /* CONFIG_NET_CLS_ROUTE */ static __initdata unsigned long rhash_entries; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a6e0e077ac33..5c24db4a3c91 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { + struct tcp_options_received tcp_opt; + u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, int mss; struct rtable *rt; __u8 rcv_wscale; - struct tcp_options_received tcp_opt; if (!sysctl_tcp_syncookies || !th->ack) goto out; @@ -278,7 +279,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, 0); + tcp_parse_options(skb, &tcp_opt, &hash_location, 0); if (tcp_opt.saw_tstamp) cookie_check_timestamp(&tcp_opt); @@ -333,7 +334,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * no easy way to do this. */ { - struct flowi fl = { .nl_u = { .ip4_u = + struct flowi fl = { .mark = sk->sk_mark, + .nl_u = { .ip4_u = { .daddr = ((opt && opt->srr) ? opt->faddr : ireq->rmt_addr), @@ -356,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, tcp_select_initial_window(tcp_full_space(sk), req->mss, &req->rcv_wnd, &req->window_clamp, - ireq->wscale_ok, &rcv_wscale); + ireq->wscale_ok, &rcv_wscale, + dst_metric(&rt->u.dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 2dcf04d9b005..1cd5c15174b8 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include <linux/inetdevice.h> #include <linux/seqlock.h> #include <linux/init.h> +#include <linux/slab.h> #include <net/snmp.h> #include <net/icmp.h> #include <net/ip.h> @@ -63,34 +64,6 @@ static int ipv4_local_port_range(ctl_table *table, int write, return ret; } -/* Validate changes from sysctl interface. */ -static int ipv4_sysctl_local_port_range(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - int ret; - int range[2]; - ctl_table tmp = { - .data = &range, - .maxlen = sizeof(range), - .mode = table->mode, - .extra1 = &ip_local_port_range_min, - .extra2 = &ip_local_port_range_max, - }; - - inet_get_local_port_range(range, range + 1); - ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen); - if (ret == 0 && newval && newlen) { - if (range[1] < range[0]) - ret = -EINVAL; - else - set_local_port_range(range); - } - return ret; -} - - static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -109,25 +82,6 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, return ret; } -static int sysctl_tcp_congestion_control(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, size_t newlen) -{ - char val[TCP_CA_NAME_MAX]; - ctl_table tbl = { - .data = val, - .maxlen = TCP_CA_NAME_MAX, - }; - int ret; - - tcp_get_default_congestion_control(val); - ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); - if (ret == 1 && newval && newlen) - ret = tcp_set_default_congestion_control(val); - return ret; -} - static int proc_tcp_available_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, @@ -165,32 +119,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return ret; } -static int strategy_allowed_congestion_control(ctl_table *table, - void __user *oldval, - size_t __user *oldlenp, - void __user *newval, - size_t newlen) -{ - ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; - int ret; - - tbl.data = kmalloc(tbl.maxlen, GFP_USER); - if (!tbl.data) - return -ENOMEM; - - tcp_get_available_congestion_control(tbl.data, tbl.maxlen); - ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen); - if (ret == 1 && newval && newlen) - ret = tcp_set_allowed_congestion_control(tbl.data); - kfree(tbl.data); - - return ret; - -} - static struct ctl_table ipv4_table[] = { { - .ctl_name = NET_IPV4_TCP_TIMESTAMPS, .procname = "tcp_timestamps", .data = &sysctl_tcp_timestamps, .maxlen = sizeof(int), @@ -198,7 +128,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_WINDOW_SCALING, .procname = "tcp_window_scaling", .data = &sysctl_tcp_window_scaling, .maxlen = sizeof(int), @@ -206,7 +135,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_SACK, .procname = "tcp_sack", .data = &sysctl_tcp_sack, .maxlen = sizeof(int), @@ -214,7 +142,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE, .procname = "tcp_retrans_collapse", .data = &sysctl_tcp_retrans_collapse, .maxlen = sizeof(int), @@ -222,17 +149,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_DEFAULT_TTL, .procname = "ip_default_ttl", .data = &sysctl_ip_default_ttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = ipv4_doint_and_flush, - .strategy = ipv4_doint_and_flush_strategy, .extra2 = &init_net, }, { - .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", .data = &ipv4_config.no_pmtu_disc, .maxlen = sizeof(int), @@ -240,7 +164,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_NONLOCAL_BIND, .procname = "ip_nonlocal_bind", .data = &sysctl_ip_nonlocal_bind, .maxlen = sizeof(int), @@ -248,7 +171,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_SYN_RETRIES, .procname = "tcp_syn_retries", .data = &sysctl_tcp_syn_retries, .maxlen = sizeof(int), @@ -256,7 +178,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_SYNACK_RETRIES, .procname = "tcp_synack_retries", .data = &sysctl_tcp_synack_retries, .maxlen = sizeof(int), @@ -264,7 +185,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_ORPHANS, .procname = "tcp_max_orphans", .data = &sysctl_tcp_max_orphans, .maxlen = sizeof(int), @@ -272,7 +192,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_TW_BUCKETS, .procname = "tcp_max_tw_buckets", .data = &tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), @@ -280,7 +199,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_DYNADDR, .procname = "ip_dynaddr", .data = &sysctl_ip_dynaddr, .maxlen = sizeof(int), @@ -288,16 +206,13 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, .procname = "tcp_keepalive_time", .data = &sysctl_tcp_keepalive_time, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES, .procname = "tcp_keepalive_probes", .data = &sysctl_tcp_keepalive_probes, .maxlen = sizeof(int), @@ -305,26 +220,21 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL, .procname = "tcp_keepalive_intvl", .data = &sysctl_tcp_keepalive_intvl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_TCP_RETRIES1, .procname = "tcp_retries1", .data = &sysctl_tcp_retries1, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra2 = &tcp_retr1_max }, { - .ctl_name = NET_IPV4_TCP_RETRIES2, .procname = "tcp_retries2", .data = &sysctl_tcp_retries2, .maxlen = sizeof(int), @@ -332,17 +242,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT, .procname = "tcp_fin_timeout", .data = &sysctl_tcp_fin_timeout, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, #ifdef CONFIG_SYN_COOKIES { - .ctl_name = NET_TCP_SYNCOOKIES, .procname = "tcp_syncookies", .data = &sysctl_tcp_syncookies, .maxlen = sizeof(int), @@ -351,7 +258,6 @@ static struct ctl_table ipv4_table[] = { }, #endif { - .ctl_name = NET_TCP_TW_RECYCLE, .procname = "tcp_tw_recycle", .data = &tcp_death_row.sysctl_tw_recycle, .maxlen = sizeof(int), @@ -359,7 +265,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ABORT_ON_OVERFLOW, .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, .maxlen = sizeof(int), @@ -367,7 +272,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_STDURG, .procname = "tcp_stdurg", .data = &sysctl_tcp_stdurg, .maxlen = sizeof(int), @@ -375,7 +279,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_RFC1337, .procname = "tcp_rfc1337", .data = &sysctl_tcp_rfc1337, .maxlen = sizeof(int), @@ -383,7 +286,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MAX_SYN_BACKLOG, .procname = "tcp_max_syn_backlog", .data = &sysctl_max_syn_backlog, .maxlen = sizeof(int), @@ -391,17 +293,14 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, .procname = "ip_local_port_range", .data = &sysctl_local_ports.range, .maxlen = sizeof(sysctl_local_ports.range), .mode = 0644, .proc_handler = ipv4_local_port_range, - .strategy = ipv4_sysctl_local_port_range, }, #ifdef CONFIG_IP_MULTICAST { - .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS, .procname = "igmp_max_memberships", .data = &sysctl_igmp_max_memberships, .maxlen = sizeof(int), @@ -411,7 +310,6 @@ static struct ctl_table ipv4_table[] = { #endif { - .ctl_name = NET_IPV4_IGMP_MAX_MSF, .procname = "igmp_max_msf", .data = &sysctl_igmp_max_msf, .maxlen = sizeof(int), @@ -419,7 +317,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_INET_PEER_THRESHOLD, .procname = "inet_peer_threshold", .data = &inet_peer_threshold, .maxlen = sizeof(int), @@ -427,43 +324,34 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_INET_PEER_MINTTL, .procname = "inet_peer_minttl", .data = &inet_peer_minttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_MAXTTL, .procname = "inet_peer_maxttl", .data = &inet_peer_maxttl, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME, .procname = "inet_peer_gc_mintime", .data = &inet_peer_gc_mintime, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME, .procname = "inet_peer_gc_maxtime", .data = &inet_peer_gc_maxtime, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, - .strategy = sysctl_jiffies }, { - .ctl_name = NET_TCP_ORPHAN_RETRIES, .procname = "tcp_orphan_retries", .data = &sysctl_tcp_orphan_retries, .maxlen = sizeof(int), @@ -471,7 +359,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FACK, .procname = "tcp_fack", .data = &sysctl_tcp_fack, .maxlen = sizeof(int), @@ -479,7 +366,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_REORDERING, .procname = "tcp_reordering", .data = &sysctl_tcp_reordering, .maxlen = sizeof(int), @@ -487,7 +373,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ECN, .procname = "tcp_ecn", .data = &sysctl_tcp_ecn, .maxlen = sizeof(int), @@ -495,7 +380,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_DSACK, .procname = "tcp_dsack", .data = &sysctl_tcp_dsack, .maxlen = sizeof(int), @@ -503,7 +387,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_MEM, .procname = "tcp_mem", .data = &sysctl_tcp_mem, .maxlen = sizeof(sysctl_tcp_mem), @@ -511,7 +394,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_WMEM, .procname = "tcp_wmem", .data = &sysctl_tcp_wmem, .maxlen = sizeof(sysctl_tcp_wmem), @@ -519,7 +401,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_RMEM, .procname = "tcp_rmem", .data = &sysctl_tcp_rmem, .maxlen = sizeof(sysctl_tcp_rmem), @@ -527,7 +408,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_APP_WIN, .procname = "tcp_app_win", .data = &sysctl_tcp_app_win, .maxlen = sizeof(int), @@ -535,7 +415,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_ADV_WIN_SCALE, .procname = "tcp_adv_win_scale", .data = &sysctl_tcp_adv_win_scale, .maxlen = sizeof(int), @@ -543,7 +422,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_TW_REUSE, .procname = "tcp_tw_reuse", .data = &sysctl_tcp_tw_reuse, .maxlen = sizeof(int), @@ -551,7 +429,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FRTO, .procname = "tcp_frto", .data = &sysctl_tcp_frto, .maxlen = sizeof(int), @@ -559,7 +436,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_FRTO_RESPONSE, .procname = "tcp_frto_response", .data = &sysctl_tcp_frto_response, .maxlen = sizeof(int), @@ -567,7 +443,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_LOW_LATENCY, .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, .maxlen = sizeof(int), @@ -575,7 +450,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, .maxlen = sizeof(int), @@ -583,7 +457,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_MODERATE_RCVBUF, .procname = "tcp_moderate_rcvbuf", .data = &sysctl_tcp_moderate_rcvbuf, .maxlen = sizeof(int), @@ -591,7 +464,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_TSO_WIN_DIVISOR, .procname = "tcp_tso_win_divisor", .data = &sysctl_tcp_tso_win_divisor, .maxlen = sizeof(int), @@ -599,15 +471,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_CONG_CONTROL, .procname = "tcp_congestion_control", .mode = 0644, .maxlen = TCP_CA_NAME_MAX, .proc_handler = proc_tcp_congestion_control, - .strategy = sysctl_tcp_congestion_control, }, { - .ctl_name = NET_TCP_ABC, .procname = "tcp_abc", .data = &sysctl_tcp_abc, .maxlen = sizeof(int), @@ -615,7 +484,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_MTU_PROBING, .procname = "tcp_mtu_probing", .data = &sysctl_tcp_mtu_probing, .maxlen = sizeof(int), @@ -623,7 +491,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_TCP_BASE_MSS, .procname = "tcp_base_mss", .data = &sysctl_tcp_base_mss, .maxlen = sizeof(int), @@ -631,7 +498,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, .procname = "tcp_workaround_signed_windows", .data = &sysctl_tcp_workaround_signed_windows, .maxlen = sizeof(int), @@ -640,7 +506,6 @@ static struct ctl_table ipv4_table[] = { }, #ifdef CONFIG_NET_DMA { - .ctl_name = NET_TCP_DMA_COPYBREAK, .procname = "tcp_dma_copybreak", .data = &sysctl_tcp_dma_copybreak, .maxlen = sizeof(int), @@ -649,7 +514,6 @@ static struct ctl_table ipv4_table[] = { }, #endif { - .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, .procname = "tcp_slow_start_after_idle", .data = &sysctl_tcp_slow_start_after_idle, .maxlen = sizeof(int), @@ -658,7 +522,6 @@ static struct ctl_table ipv4_table[] = { }, #ifdef CONFIG_NETLABEL { - .ctl_name = NET_CIPSOV4_CACHE_ENABLE, .procname = "cipso_cache_enable", .data = &cipso_v4_cache_enabled, .maxlen = sizeof(int), @@ -666,7 +529,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE, .procname = "cipso_cache_bucket_size", .data = &cipso_v4_cache_bucketsize, .maxlen = sizeof(int), @@ -674,7 +536,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_RBM_OPTFMT, .procname = "cipso_rbm_optfmt", .data = &cipso_v4_rbm_optfmt, .maxlen = sizeof(int), @@ -682,7 +543,6 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = NET_CIPSOV4_RBM_STRICTVALID, .procname = "cipso_rbm_strictvalid", .data = &cipso_v4_rbm_strictvalid, .maxlen = sizeof(int), @@ -697,15 +557,12 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_tcp_available_congestion_control, }, { - .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL, .procname = "tcp_allowed_congestion_control", .maxlen = TCP_CA_BUF_MAX, .mode = 0644, .proc_handler = proc_allowed_congestion_control, - .strategy = strategy_allowed_congestion_control, }, { - .ctl_name = NET_TCP_MAX_SSTHRESH, .procname = "tcp_max_ssthresh", .data = &sysctl_tcp_max_ssthresh, .maxlen = sizeof(int), @@ -713,41 +570,55 @@ static struct ctl_table ipv4_table[] = { .proc_handler = proc_dointvec, }, { - .ctl_name = CTL_UNNUMBERED, + .procname = "tcp_cookie_size", + .data = &sysctl_tcp_cookie_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_linear_timeouts", + .data = &sysctl_tcp_thin_linear_timeouts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "tcp_thin_dupack", + .data = &sysctl_tcp_thin_dupack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .procname = "udp_mem", .data = &sysctl_udp_mem, .maxlen = sizeof(sysctl_udp_mem), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, { - .ctl_name = CTL_UNNUMBERED, .procname = "udp_rmem_min", .data = &sysctl_udp_rmem_min, .maxlen = sizeof(sysctl_udp_rmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, { - .ctl_name = CTL_UNNUMBERED, .procname = "udp_wmem_min", .data = &sysctl_udp_wmem_min, .maxlen = sizeof(sysctl_udp_wmem_min), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .strategy = sysctl_intvec, .extra1 = &zero }, - { .ctl_name = 0 } + { } }; static struct ctl_table ipv4_net_table[] = { { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, .procname = "icmp_echo_ignore_all", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_all, .maxlen = sizeof(int), @@ -755,7 +626,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, .procname = "icmp_echo_ignore_broadcasts", .data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts, .maxlen = sizeof(int), @@ -763,7 +633,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, .procname = "icmp_ignore_bogus_error_responses", .data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses, .maxlen = sizeof(int), @@ -771,7 +640,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, .procname = "icmp_errors_use_inbound_ifaddr", .data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr, .maxlen = sizeof(int), @@ -779,16 +647,13 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = NET_IPV4_ICMP_RATELIMIT, .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_ms_jiffies, - .strategy = sysctl_ms_jiffies }, { - .ctl_name = NET_IPV4_ICMP_RATEMASK, .procname = "icmp_ratemask", .data = &init_net.ipv4.sysctl_icmp_ratemask, .maxlen = sizeof(int), @@ -796,7 +661,6 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec }, { - .ctl_name = CTL_UNNUMBERED, .procname = "rt_cache_rebuild_count", .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count, .maxlen = sizeof(int), @@ -807,8 +671,8 @@ static struct ctl_table ipv4_net_table[] = { }; struct ctl_path net_ipv4_ctl_path[] = { - { .procname = "net", .ctl_name = CTL_NET, }, - { .procname = "ipv4", .ctl_name = NET_IPV4, }, + { .procname = "net", }, + { .procname = "ipv4", }, { }, }; EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); @@ -818,7 +682,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) struct ctl_table *table; table = ipv4_net_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL); if (table == NULL) goto err_alloc; @@ -849,7 +713,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) return 0; err_reg: - if (net != &init_net) + if (!net_eq(net, &init_net)) kfree(table); err_alloc: return -ENOMEM; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f1813bc71088..0f8caf64caa3 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -264,6 +264,8 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/crypto.h> +#include <linux/time.h> +#include <linux/slab.h> #include <net/icmp.h> #include <net/tcp.h> @@ -428,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_seq == tp->copied_seq && !sock_flag(sk, SOCK_URGINLINE) && tp->urg_data) - target--; + target++; /* Potential race condition. If read of tp below will * escape above sk->sk_state, we can be illegally awaken @@ -535,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tp->nonagle &= ~TCP_NAGLE_PUSH; } -static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, - struct sk_buff *skb) +static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; @@ -545,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, static inline void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle) { - struct tcp_sock *tp = tcp_sk(sk); - if (tcp_send_head(sk)) { - struct sk_buff *skb = tcp_write_queue_tail(sk); + struct tcp_sock *tp = tcp_sk(sk); + if (!(flags & MSG_MORE) || forced_push(tp)) - tcp_mark_push(tp, skb); - tcp_mark_urg(tp, flags, skb); + tcp_mark_push(tp, tcp_write_queue_tail(sk)); + + tcp_mark_urg(tp, flags); __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); } @@ -876,12 +877,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk) +static inline int select_size(struct sock *sk, int sg) { struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; - if (sk->sk_route_caps & NETIF_F_SG) { + if (sg) { if (sk_can_gso(sk)) tmp = 0; else { @@ -905,7 +906,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, struct sk_buff *skb; int iovlen, flags; int mss_now, size_goal; - int err, copied; + int sg, err, copied; long timeo; lock_sock(sk); @@ -933,6 +934,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto out_err; + sg = sk->sk_route_caps & NETIF_F_SG; + while (--iovlen >= 0) { int seglen = iov->iov_len; unsigned char __user *from = iov->iov_base; @@ -958,8 +961,9 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_skb(sk, select_size(sk), - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, + select_size(sk, sg), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -996,9 +1000,7 @@ new_segment: /* We can extend the last page * fragment. */ merge = 1; - } else if (i == MAX_SKB_FRAGS || - (!i && - !(sk->sk_route_caps & NETIF_F_SG))) { + } else if (i == MAX_SKB_FRAGS || !sg) { /* Need to add new fragment and cannot * do this because interface is non-SG, * or because all the page slots are @@ -1253,6 +1255,39 @@ static void tcp_prequeue_process(struct sock *sk) tp->ucopy.memory = 0; } +#ifdef CONFIG_NET_DMA +static void tcp_service_net_dma(struct sock *sk, bool wait) +{ + dma_cookie_t done, used; + dma_cookie_t last_issued; + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->ucopy.dma_chan) + return; + + last_issued = tp->ucopy.dma_cookie; + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + do { + if (dma_async_memcpy_complete(tp->ucopy.dma_chan, + last_issued, &done, + &used) == DMA_SUCCESS) { + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + break; + } else { + struct sk_buff *skb; + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + } while (wait); +} +#endif + static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; @@ -1334,6 +1369,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_eat_skb(sk, skb, 0); if (!desc->count) break; + tp->copied_seq = seq; } tp->copied_seq = seq; @@ -1545,6 +1581,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Set realtime policy in scheduler __ */ } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); +#endif if (copied >= target) { /* Do not sleep, just process backlog. */ release_sock(sk); @@ -1553,6 +1593,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sk_wait_data(sk, &timeo); #ifdef CONFIG_NET_DMA + tcp_service_net_dma(sk, false); /* Don't block */ tp->ucopy.wakeup = 0; #endif @@ -1632,6 +1673,9 @@ do_prequeue: copied = -EFAULT; break; } + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + if ((offset + used) == skb->len) copied_early = 1; @@ -1701,27 +1745,9 @@ skip_copy: } #ifdef CONFIG_NET_DMA - if (tp->ucopy.dma_chan) { - dma_cookie_t done, used; - - dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); - - while (dma_async_memcpy_complete(tp->ucopy.dma_chan, - tp->ucopy.dma_cookie, &done, - &used) == DMA_IN_PROGRESS) { - /* do partial cleanup of sk_async_wait_queue */ - while ((skb = skb_peek(&sk->sk_async_wait_queue)) && - (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { - __skb_dequeue(&sk->sk_async_wait_queue); - kfree_skb(skb); - } - } + tcp_service_net_dma(sk, true); /* Wait for queue to drain */ + tp->ucopy.dma_chan = NULL; - /* Safe to free early-copied skbs now */ - __skb_queue_purge(&sk->sk_async_wait_queue); - tp->ucopy.dma_chan = NULL; - } if (tp->ucopy.pinned_list) { dma_unpin_iovec_pages(tp->ucopy.pinned_list); tp->ucopy.pinned_list = NULL; @@ -2042,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_async_wait_queue); #endif - inet->dport = 0; + inet->inet_dport = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); @@ -2059,6 +2085,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_cnt = 0; tp->bytes_acked = 0; + tp->window_clamp = 0; tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); @@ -2066,7 +2093,7 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - WARN_ON(inet->num && !icsk->icsk_bind_hash); + WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -2083,8 +2110,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int val; int err = 0; - /* This is a string value all the others are int's */ - if (optname == TCP_CONGESTION) { + /* These are data/string values, all the others are ints */ + switch (optname) { + case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) @@ -2101,6 +2129,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = NULL; + + if (sizeof(ctd) > optlen) + return -EINVAL; + if (copy_from_user(&ctd, optval, sizeof(ctd))) + return -EFAULT; + + if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || + ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) + return -EINVAL; + + if (ctd.tcpct_cookie_desired == 0) { + /* default to global value */ + } else if ((0x1 & ctd.tcpct_cookie_desired) || + ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || + ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { + return -EINVAL; + } + + if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { + /* Supercedes all other values */ + lock_sock(sk); + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + tp->rx_opt.cookie_in_always = 0; /* false */ + tp->rx_opt.cookie_out_never = 1; /* true */ + release_sock(sk); + return err; + } + + /* Allocate ancillary memory before locking. + */ + if (ctd.tcpct_used > 0 || + (tp->cookie_values == NULL && + (sysctl_tcp_cookie_size > 0 || + ctd.tcpct_cookie_desired > 0 || + ctd.tcpct_s_data_desired > 0))) { + cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, + GFP_KERNEL); + if (cvp == NULL) + return -ENOMEM; + } + lock_sock(sk); + tp->rx_opt.cookie_in_always = + (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); + tp->rx_opt.cookie_out_never = 0; /* false */ + + if (tp->cookie_values != NULL) { + if (cvp != NULL) { + /* Changed values are recorded by a changed + * pointer, ensuring the cookie will differ, + * without separately hashing each value later. + */ + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + kref_init(&cvp->kref); + tp->cookie_values = cvp; + } else { + cvp = tp->cookie_values; + } + } + if (cvp != NULL) { + cvp->cookie_desired = ctd.tcpct_cookie_desired; + + if (ctd.tcpct_used > 0) { + memcpy(cvp->s_data_payload, ctd.tcpct_value, + ctd.tcpct_used); + cvp->s_data_desired = ctd.tcpct_used; + cvp->s_data_constant = 1; /* true */ + } else { + /* No constant payload data. */ + cvp->s_data_desired = ctd.tcpct_s_data_desired; + cvp->s_data_constant = 0; /* false */ + } + } + release_sock(sk); + return err; + } + default: + /* fallthru */ + break; + }; if (optlen < sizeof(int)) return -EINVAL; @@ -2139,6 +2254,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level, } break; + case TCP_THIN_LINEAR_TIMEOUTS: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_lto = val; + break; + + case TCP_THIN_DUPACK: + if (val < 0 || val > 1) + err = -EINVAL; + else + tp->thin_dupack = val; + break; + case TCP_CORK: /* When set indicates to always queue non-full frames. * Later the user clears this option and we transmit @@ -2425,6 +2554,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level, if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; + + case TCP_COOKIE_TRANSACTIONS: { + struct tcp_cookie_transactions ctd; + struct tcp_cookie_values *cvp = tp->cookie_values; + + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(ctd)) + return -EINVAL; + + memset(&ctd, 0, sizeof(ctd)); + ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? + TCP_COOKIE_IN_ALWAYS : 0) + | (tp->rx_opt.cookie_out_never ? + TCP_COOKIE_OUT_NEVER : 0); + + if (cvp != NULL) { + ctd.tcpct_flags |= (cvp->s_data_in ? + TCP_S_DATA_IN : 0) + | (cvp->s_data_out ? + TCP_S_DATA_OUT : 0); + + ctd.tcpct_cookie_desired = cvp->cookie_desired; + ctd.tcpct_s_data_desired = cvp->s_data_desired; + + memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], + cvp->cookie_pair_size); + ctd.tcpct_used = cvp->cookie_pair_size; + } + + if (put_user(sizeof(ctd), optlen)) + return -EFAULT; + if (copy_to_user(optval, &ctd, sizeof(ctd))) + return -EFAULT; + return 0; + } default: return -ENOPROTOOPT; } @@ -2662,10 +2827,10 @@ EXPORT_SYMBOL(tcp_gro_complete); #ifdef CONFIG_TCP_MD5SIG static unsigned long tcp_md5sig_users; -static struct tcp_md5sig_pool **tcp_md5sig_pool; +static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool; static DEFINE_SPINLOCK(tcp_md5sig_pool_lock); -static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) +static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool) { int cpu; for_each_possible_cpu(cpu) { @@ -2682,7 +2847,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) void tcp_free_md5sig_pool(void) { - struct tcp_md5sig_pool **pool = NULL; + struct tcp_md5sig_pool * __percpu *pool = NULL; spin_lock_bh(&tcp_md5sig_pool_lock); if (--tcp_md5sig_users == 0) { @@ -2696,10 +2861,11 @@ void tcp_free_md5sig_pool(void) EXPORT_SYMBOL(tcp_free_md5sig_pool); -static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk) +static struct tcp_md5sig_pool * __percpu * +__tcp_alloc_md5sig_pool(struct sock *sk) { int cpu; - struct tcp_md5sig_pool **pool; + struct tcp_md5sig_pool * __percpu *pool; pool = alloc_percpu(struct tcp_md5sig_pool *); if (!pool) @@ -2726,9 +2892,9 @@ out_free: return NULL; } -struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk) +struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk) { - struct tcp_md5sig_pool **pool; + struct tcp_md5sig_pool * __percpu *pool; int alloc = 0; retry: @@ -2747,7 +2913,9 @@ retry: if (alloc) { /* we cannot hold spinlock here because this may sleep. */ - struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk); + struct tcp_md5sig_pool * __percpu *p; + + p = __tcp_alloc_md5sig_pool(sk); spin_lock_bh(&tcp_md5sig_pool_lock); if (!p) { tcp_md5sig_users--; @@ -2771,7 +2939,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) { - struct tcp_md5sig_pool **p; + struct tcp_md5sig_pool * __percpu *p; spin_lock_bh(&tcp_md5sig_pool_lock); p = tcp_md5sig_pool; if (p) @@ -2847,6 +3015,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif +/** + * Each Responder maintains up to two secret values concurrently for + * efficient secret rollover. Each secret value has 4 states: + * + * Generating. (tcp_secret_generating != tcp_secret_primary) + * Generates new Responder-Cookies, but not yet used for primary + * verification. This is a short-term state, typically lasting only + * one round trip time (RTT). + * + * Primary. (tcp_secret_generating == tcp_secret_primary) + * Used both for generation and primary verification. + * + * Retiring. (tcp_secret_retiring != tcp_secret_secondary) + * Used for verification, until the first failure that can be + * verified by the newer Generating secret. At that time, this + * cookie's state is changed to Secondary, and the Generating + * cookie's state is changed to Primary. This is a short-term state, + * typically lasting only one round trip time (RTT). + * + * Secondary. (tcp_secret_retiring == tcp_secret_secondary) + * Used for secondary verification, after primary verification + * failures. This state lasts no more than twice the Maximum Segment + * Lifetime (2MSL). Then, the secret is discarded. + */ +struct tcp_cookie_secret { + /* The secret is divided into two parts. The digest part is the + * equivalent of previously hashing a secret and saving the state, + * and serves as an initialization vector (IV). The message part + * serves as the trailing secret. + */ + u32 secrets[COOKIE_WORKSPACE_WORDS]; + unsigned long expires; +}; + +#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) +#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) +#define TCP_SECRET_LIFE (HZ * 600) + +static struct tcp_cookie_secret tcp_secret_one; +static struct tcp_cookie_secret tcp_secret_two; + +/* Essentially a circular list, without dynamic allocation. */ +static struct tcp_cookie_secret *tcp_secret_generating; +static struct tcp_cookie_secret *tcp_secret_primary; +static struct tcp_cookie_secret *tcp_secret_retiring; +static struct tcp_cookie_secret *tcp_secret_secondary; + +static DEFINE_SPINLOCK(tcp_secret_locker); + +/* Select a pseudo-random word in the cookie workspace. + */ +static inline u32 tcp_cookie_work(const u32 *ws, const int n) +{ + return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; +} + +/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. + * Called in softirq context. + * Returns: 0 for success. + */ +int tcp_cookie_generator(u32 *bakery) +{ + unsigned long jiffy = jiffies; + + if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { + spin_lock_bh(&tcp_secret_locker); + if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { + /* refreshed by another */ + memcpy(bakery, + &tcp_secret_generating->secrets[0], + COOKIE_WORKSPACE_WORDS); + } else { + /* still needs refreshing */ + get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); + + /* The first time, paranoia assumes that the + * randomization function isn't as strong. But, + * this secret initialization is delayed until + * the last possible moment (packet arrival). + * Although that time is observable, it is + * unpredictably variable. Mash in the most + * volatile clock bits available, and expire the + * secret extra quickly. + */ + if (unlikely(tcp_secret_primary->expires == + tcp_secret_secondary->expires)) { + struct timespec tv; + + getnstimeofday(&tv); + bakery[COOKIE_DIGEST_WORDS+0] ^= + (u32)tv.tv_nsec; + + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_1MSL + + (0x0f & tcp_cookie_work(bakery, 0)); + } else { + tcp_secret_secondary->expires = jiffy + + TCP_SECRET_LIFE + + (0xff & tcp_cookie_work(bakery, 1)); + tcp_secret_primary->expires = jiffy + + TCP_SECRET_2MSL + + (0x1f & tcp_cookie_work(bakery, 2)); + } + memcpy(&tcp_secret_secondary->secrets[0], + bakery, COOKIE_WORKSPACE_WORDS); + + rcu_assign_pointer(tcp_secret_generating, + tcp_secret_secondary); + rcu_assign_pointer(tcp_secret_retiring, + tcp_secret_primary); + /* + * Neither call_rcu() nor synchronize_rcu() needed. + * Retiring data is not freed. It is replaced after + * further (locked) pointer updates, and a quiet time + * (minimum 1MSL, maximum LIFE - 2MSL). + */ + } + spin_unlock_bh(&tcp_secret_locker); + } else { + rcu_read_lock_bh(); + memcpy(bakery, + &rcu_dereference(tcp_secret_generating)->secrets[0], + COOKIE_WORKSPACE_WORDS); + rcu_read_unlock_bh(); + } + return 0; +} +EXPORT_SYMBOL(tcp_cookie_generator); + void tcp_done(struct sock *sk) { if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) @@ -2881,6 +3178,7 @@ void __init tcp_init(void) struct sk_buff *skb = NULL; unsigned long nr_pages, limit; int order, i, max_share; + unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -2903,11 +3201,10 @@ void __init tcp_init(void) (totalram_pages >= 128 * 1024) ? 13 : 15, 0, - &tcp_hashinfo.ehash_size, NULL, + &tcp_hashinfo.ehash_mask, thash_entries ? 0 : 512 * 1024); - tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; - for (i = 0; i < tcp_hashinfo.ehash_size; i++) { + for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) { INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); } @@ -2916,7 +3213,7 @@ void __init tcp_init(void) tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), - tcp_hashinfo.ehash_size, + tcp_hashinfo.ehash_mask + 1, (totalram_pages >= 128 * 1024) ? 13 : 15, 0, @@ -2971,10 +3268,19 @@ void __init tcp_init(void) sysctl_tcp_rmem[2] = max(87380, max_share); printk(KERN_INFO "TCP: Hash tables configured " - "(established %d bind %d)\n", - tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); + "(established %u bind %u)\n", + tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); + + memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); + memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); + tcp_secret_one.expires = jiffy; /* past due */ + tcp_secret_two.expires = jiffy; /* past due */ + tcp_secret_generating = &tcp_secret_one; + tcp_secret_primary = &tcp_secret_one; + tcp_secret_retiring = &tcp_secret_two; + tcp_secret_secondary = &tcp_secret_two; } EXPORT_SYMBOL(tcp_close); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 6428b342b164..0ec9bd0ae94f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -10,6 +10,7 @@ #include <linux/mm.h> #include <linux/types.h> #include <linux/list.h> +#include <linux/gfp.h> #include <net/tcp.h> int sysctl_tcp_max_ssthresh = 0; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index fcbcd4ff6c5f..939edb3b8e4d 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, r->idiag_rqueue = sk->sk_ack_backlog; r->idiag_wqueue = sk->sk_max_ack_backlog; } else { - r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); r->idiag_wqueue = tp->write_seq - tp->snd_una; } if (info != NULL) diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 26d5c7fc7de5..7c94a4955416 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt) if (icsk->icsk_ca_state == TCP_CA_Open) { if (ca->maxRTT < ca->minRTT) ca->maxRTT = ca->minRTT; - if (ca->maxRTT < srtt - && srtt <= ca->maxRTT + msecs_to_jiffies(20)) + if (ca->maxRTT < srtt && + srtt <= ca->maxRTT + msecs_to_jiffies(20)) ca->maxRTT = srtt; } } @@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt ca->packetcount += pkts_acked; - if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) - && now - ca->lasttime >= ca->minRTT - && ca->minRTT > 0) { + if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && + now - ca->lasttime >= ca->minRTT && + ca->minRTT > 0) { __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); if (htcp_ccount(ca) <= 3) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d86784be7ab3..f240f57b2199 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -62,6 +62,7 @@ */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/module.h> #include <linux/sysctl.h> #include <linux/kernel.h> @@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; +int sysctl_tcp_thin_dupack __read_mostly; + int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; int sysctl_tcp_abc __read_mostly; @@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) * "len" is invariant segment length, including TCP header. */ len += skb->data - skb_transport_header(skb); - if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. * This observation (if it is correct 8)) allows @@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk) unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); hint = min(hint, tp->rcv_wnd / 2); - hint = min(hint, TCP_MIN_RCVMSS); + hint = min(hint, TCP_MSS_DEFAULT); hint = max(hint, TCP_MIN_MSS); inet_csk(sk)->icsk_ack.rcv_mss = hint; @@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp) * they differ. Since neither occurs due to loss, TCP should really * ignore them. */ -static inline int tcp_dupack_heurestics(struct tcp_sock *tp) +static inline int tcp_dupack_heuristics(struct tcp_sock *tp) { return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1; } @@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk) return 1; /* Not-A-Trick#2 : Classic rule... */ - if (tcp_dupack_heurestics(tp) > tp->reordering) + if (tcp_dupack_heuristics(tp) > tp->reordering) return 1; /* Trick#3 : when we use RFC2988 timer restart, fast @@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk) return 1; } + /* If a thin stream is detected, retransmit after first + * received dupack. Employ only if SACK is supported in order + * to avoid possible corner-case series of spurious retransmissions + * Use only if there are no unsent data. + */ + if ((tp->thin_dupack || sysctl_tcp_thin_dupack) && + tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 && + tcp_is_sack(tp) && !tcp_send_head(sk)) + return 1; + return 0; } @@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) int err; unsigned int mss; + if (packets == 0) + return; + WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; @@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk) } } +/* We can clear retrans_stamp when there are no retransmissions in the + * window. It would seem that it is trivially available for us in + * tp->retrans_out, however, that kind of assumptions doesn't consider + * what will happen if errors occur when sending retransmission for the + * second time. ...It could the that such segment has only + * TCPCB_EVER_RETRANS set at the present time. It seems that checking + * the head skb is enough except for some reneging corner cases that + * are not worth the effort. + * + * Main reason for all this complexity is the fact that connection dying + * time now depends on the validity of the retrans_stamp, in particular, + * that successive retransmissions of a segment must not advance + * retrans_stamp under any conditions. + */ +static int tcp_any_retrans_done(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if (tp->retrans_out) + return 1; + + skb = tcp_write_queue_head(sk); + if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS)) + return 1; + + return 0; +} + /* Undo during fast recovery after partial ACK. */ static int tcp_try_undo_partial(struct sock *sk, int acked) @@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) /* Plain luck! Hole if filled with delayed * packet, rather than with a retransmit. */ - if (tp->retrans_out == 0) + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); @@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); int state = TCP_CA_Open; - if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) tcp_verify_left_out(tp); - if (!tp->frto_counter && tp->retrans_out == 0) + if (!tp->frto_counter && !tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) @@ -3698,7 +3743,7 @@ old_ack: * the fast version below fails. */ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, - int estab) + u8 **hvpp, int estab) { unsigned char *ptr; struct tcphdr *th = tcp_hdr(skb); @@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - } + case TCPOPT_COOKIE: + /* This option is variable length. + */ + switch (opsize) { + case TCPOLEN_COOKIE_BASE: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_PAIR: + /* not yet implemented */ + break; + case TCPOLEN_COOKIE_MIN+0: + case TCPOLEN_COOKIE_MIN+2: + case TCPOLEN_COOKIE_MIN+4: + case TCPOLEN_COOKIE_MIN+6: + case TCPOLEN_COOKIE_MAX: + /* 16-bit multiple */ + opt_rx->cookie_plus = opsize; + *hvpp = ptr; + default: + /* ignore option */ + break; + }; + break; + }; ptr += opsize-2; length -= opsize; @@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th) * If it is wrong it falls back on tcp_parse_options(). */ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) + struct tcp_sock *tp, u8 **hvpp) { - if (th->doff == sizeof(struct tcphdr) >> 2) { + /* In the spirit of fast parsing, compare doff directly to constant + * values. Because equality is used, short doff can be ignored here. + */ + if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; return 0; } else if (tp->rx_opt.tstamp_ok && - th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { if (tcp_parse_aligned_timestamp(tp, th)) return 1; } - tcp_parse_options(skb, &tp->rx_opt, 1); + tcp_parse_options(skb, &tp->rx_opt, hvpp, 1); return 1; } @@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) struct tcp_sock *tp = tcp_sk(sk); /* More than one full frame received... */ - if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). Or... */ - && __tcp_select_window(sk) >= tp->rcv_wnd) || + __tcp_select_window(sk) >= tp->rcv_wnd) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* We have out of order data. */ @@ -5070,10 +5141,12 @@ out: static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, int syn_inerr) { + u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. */ - if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp, &hash_location) && + tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5361,11 +5434,13 @@ discard: static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { - struct tcp_sock *tp = tcp_sk(sk); + u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_cookie_values *cvp = tp->cookie_values; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, 0); + tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0); if (th->ack) { /* rfc793: @@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * Change state from SYN-SENT only after copied_seq * is initialized. */ tp->copied_seq = tp->rcv_nxt; + + if (cvp != NULL && + cvp->cookie_pair_size > 0 && + tp->rx_opt.cookie_plus > 0) { + int cookie_size = tp->rx_opt.cookie_plus + - TCPOLEN_COOKIE_BASE; + int cookie_pair_size = cookie_size + + cvp->cookie_desired; + + /* A cookie extension option was sent and returned. + * Note that each incoming SYNACK replaces the + * Responder cookie. The initial exchange is most + * fragile, as protection against spoofing relies + * entirely upon the sequence and timestamp (above). + * This replacement strategy allows the correct pair to + * pass through, while any others will be filtered via + * Responder verification later. + */ + if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { + memcpy(&cvp->cookie_pair[cvp->cookie_desired], + hash_location, cookie_size); + cvp->cookie_pair_size = cookie_pair_size; + } + } + smp_mb(); tcp_set_state(sk, TCP_ESTABLISHED); @@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_ack considers this ACK as duplicate * and does not calculate rtt. - * Fix it at least with timestamps. + * Force it here. */ - if (tp->rx_opt.saw_tstamp && - tp->rx_opt.rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0); + tcp_ack_update_rtt(sk, 0, 0); if (tp->rx_opt.tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 7cda24b53f61..3c23e70885f4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -60,6 +60,7 @@ #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> +#include <linux/slab.h> #include <net/net_namespace.h> #include <net/icmp.h> @@ -165,10 +166,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) nexthop = inet->opt->faddr; } - tmp = ip_route_connect(&rt, nexthop, inet->saddr, + tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, IPPROTO_TCP, - inet->sport, usin->sin_port, sk, 1); + inet->inet_sport, usin->sin_port, sk, 1); if (tmp < 0) { if (tmp == -ENETUNREACH) IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); @@ -183,11 +184,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->opt || !inet->opt->srr) daddr = rt->rt_dst; - if (!inet->saddr) - inet->saddr = rt->rt_src; - inet->rcv_saddr = inet->saddr; + if (!inet->inet_saddr) + inet->inet_saddr = rt->rt_src; + inet->inet_rcv_saddr = inet->inet_saddr; - if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; @@ -204,20 +205,20 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * when trying new connection. */ if (peer != NULL && - peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) { + (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) { tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; tp->rx_opt.ts_recent = peer->tcp_ts; } } - inet->dport = usin->sin_port; - inet->daddr = daddr; + inet->inet_dport = usin->sin_port; + inet->inet_daddr = daddr; inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; - tp->rx_opt.mss_clamp = 536; + tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; /* Socket identity is still unknown (sport may be zero). * However we set state to SYN-SENT and not releasing socket @@ -230,7 +231,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) goto failure; err = ip_route_newports(&rt, IPPROTO_TCP, - inet->sport, inet->dport, sk); + inet->inet_sport, inet->inet_dport, sk); if (err) goto failure; @@ -239,12 +240,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) sk_setup_caps(sk, &rt->u.dst); if (!tp->write_seq) - tp->write_seq = secure_tcp_sequence_number(inet->saddr, - inet->daddr, - inet->sport, + tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, usin->sin_port); - inet->id = tp->write_seq ^ jiffies; + inet->inet_id = tp->write_seq ^ jiffies; err = tcp_connect(sk); rt = NULL; @@ -261,7 +262,7 @@ failure: tcp_set_state(sk, TCP_CLOSE); ip_rt_put(rt); sk->sk_route_caps = 0; - inet->dport = 0; + inet->inet_dport = 0; return err; } @@ -370,6 +371,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (sk->sk_state == TCP_CLOSE) goto out; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto out; + } + icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); @@ -520,12 +526,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) struct tcphdr *th = tcp_hdr(skb); if (skb->ip_summed == CHECKSUM_PARTIAL) { - th->check = ~tcp_v4_check(len, inet->saddr, - inet->daddr, 0); + th->check = ~tcp_v4_check(len, inet->inet_saddr, + inet->inet_daddr, 0); skb->csum_start = skb_transport_header(skb) - skb->head; skb->csum_offset = offsetof(struct tcphdr, check); } else { - th->check = tcp_v4_check(len, inet->saddr, inet->daddr, + th->check = tcp_v4_check(len, inet->inet_saddr, + inet->inet_daddr, csum_partial(th, th->doff << 2, skb->csum)); @@ -741,8 +748,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, * This still operates on a request_sock only, not on a big * socket. */ -static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, - struct dst_entry *dst) +static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req, + struct request_values *rvp) { const struct inet_request_sock *ireq = inet_rsk(req); int err = -1; @@ -752,7 +760,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req); + skb = tcp_make_synack(sk, dst, req, rvp); if (skb) { struct tcphdr *th = tcp_hdr(skb); @@ -773,9 +781,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req, return err; } -static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, + struct request_values *rvp) { - return __tcp_v4_send_synack(sk, req, NULL); + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); + return tcp_v4_send_synack(sk, NULL, req, rvp); } /* @@ -848,7 +858,7 @@ static struct tcp_md5sig_key * struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, struct sock *addr_sk) { - return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr); + return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr); } EXPORT_SYMBOL(tcp_v4_md5_lookup); @@ -923,7 +933,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add); static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk, u8 *newkey, u8 newkeylen) { - return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr, + return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr, newkey, newkeylen); } @@ -1089,8 +1099,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, __be32 saddr, daddr; if (sk) { - saddr = inet_sk(sk)->saddr; - daddr = inet_sk(sk)->daddr; + saddr = inet_sk(sk)->inet_saddr; + daddr = inet_sk(sk)->inet_daddr; } else if (req) { saddr = inet_rsk(req)->loc_addr; daddr = inet_rsk(req)->rmt_addr; @@ -1189,10 +1199,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb) struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_v4_send_synack, + .rtx_syn_ack = tcp_v4_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, + .syn_ack_timeout = tcp_syn_ack_timeout, }; #ifdef CONFIG_TCP_MD5SIG @@ -1210,13 +1221,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct inet_request_sock *ireq; + struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; + u8 *hash_location; struct request_sock *req; + struct inet_request_sock *ireq; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; __be32 saddr = ip_hdr(skb)->saddr; __be32 daddr = ip_hdr(skb)->daddr; __u32 isn = TCP_SKB_CB(skb)->when; - struct dst_entry *dst = NULL; #ifdef CONFIG_SYN_COOKIES int want_cookie = 0; #else @@ -1257,16 +1271,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) #endif tcp_clear_options(&tmp_opt); - tmp_opt.mss_clamp = 536; - tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + tmp_opt.mss_clamp = TCP_MSS_DEFAULT; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); + + if (tmp_opt.cookie_plus > 0 && + tmp_opt.saw_tstamp && + !tp->rx_opt.cookie_out_never && + (sysctl_tcp_cookie_size > 0 || + (tp->cookie_values != NULL && + tp->cookie_values->cookie_desired > 0))) { + u8 *c; + u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; + int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; + + if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) + goto drop_and_release; + + /* Secret recipe starts with IP addresses */ + *mess++ ^= daddr; + *mess++ ^= saddr; - tcp_parse_options(skb, &tmp_opt, 0); + /* plus variable length Initiator Cookie */ + c = (u8 *)mess; + while (l-- > 0) + *c++ ^= *hash_location++; + +#ifdef CONFIG_SYN_COOKIES + want_cookie = 0; /* not our kind of cookie */ +#endif + tmp_ext.cookie_out_never = 0; /* false */ + tmp_ext.cookie_plus = tmp_opt.cookie_plus; + } else if (!tp->rx_opt.cookie_in_always) { + /* redundant indications, but ensure initialization. */ + tmp_ext.cookie_out_never = 1; /* true */ + tmp_ext.cookie_plus = 0; + } else { + goto drop_and_release; + } + tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; - tcp_openreq_init(req, &tmp_opt, skb); ireq = inet_rsk(req); @@ -1304,7 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) (dst = inet_csk_route_req(sk, req)) != NULL && (peer = rt_get_peer((struct rtable *)dst)) != NULL && peer->v4daddr == saddr) { - if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL && + if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL && (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); @@ -1333,7 +1381,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) } tcp_rsk(req)->snt_isn = isn; - if (__tcp_v4_send_synack(sk, req, dst) || want_cookie) + if (tcp_v4_send_synack(sk, dst, req, + (struct request_values *)&tmp_ext) || + want_cookie) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); @@ -1380,9 +1430,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->daddr = ireq->rmt_addr; - newinet->rcv_saddr = ireq->loc_addr; - newinet->saddr = ireq->loc_addr; + newinet->inet_daddr = ireq->rmt_addr; + newinet->inet_rcv_saddr = ireq->loc_addr; + newinet->inet_saddr = ireq->loc_addr; newinet->opt = ireq->opt; ireq->opt = NULL; newinet->mc_index = inet_iif(skb); @@ -1390,7 +1440,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; - newinet->id = newtp->write_seq ^ jiffies; + newinet->inet_id = newtp->write_seq ^ jiffies; tcp_mtup_init(newsk); tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1403,7 +1453,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_TCP_MD5SIG /* Copy over the MD5 key from the original socket */ - if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) { + key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr); + if (key != NULL) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1412,13 +1463,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC); if (newkey != NULL) - tcp_v4_md5_do_add(newsk, newinet->daddr, + tcp_v4_md5_do_add(newsk, newinet->inet_daddr, newkey, key->keylen); newsk->sk_route_caps &= ~NETIF_F_GSO_MASK; } #endif - __inet_hash_nolisten(newsk); + __inet_hash_nolisten(newsk, NULL); __inet_inherit_port(sk, newsk); return newsk; @@ -1610,6 +1661,11 @@ process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; + if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { + NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); + goto discard_and_relse; + } + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; nf_reset(skb); @@ -1634,8 +1690,11 @@ process: if (!tcp_prequeue(sk, skb)) ret = tcp_v4_do_rcv(sk, skb); } - } else - sk_add_backlog(sk, skb); + } else if (unlikely(sk_add_backlog(sk, skb))) { + bh_unlock_sock(sk); + NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP); + goto discard_and_relse; + } bh_unlock_sock(sk); sock_put(sk); @@ -1711,8 +1770,8 @@ int tcp_v4_remember_stamp(struct sock *sk) struct inet_peer *peer = NULL; int release_it = 0; - if (!rt || rt->rt_dst != inet->daddr) { - peer = inet_getpeer(inet->daddr, 1); + if (!rt || rt->rt_dst != inet->inet_daddr) { + peer = inet_getpeer(inet->inet_daddr, 1); release_it = 1; } else { if (!rt->peer) @@ -1722,9 +1781,9 @@ int tcp_v4_remember_stamp(struct sock *sk) if (peer) { if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { - peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; peer->tcp_ts = tp->rx_opt.ts_recent; } if (release_it) @@ -1743,9 +1802,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 || - (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() && - peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) { - peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp; + ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL && + peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; peer->tcp_ts = tcptw->tw_ts_recent; } inet_putpeer(peer); @@ -1810,7 +1869,7 @@ static int tcp_v4_init_sock(struct sock *sk) */ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; - tp->mss_cache = 536; + tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = sysctl_tcp_reordering; icsk->icsk_ca_ops = &tcp_init_congestion_ops; @@ -1826,6 +1885,19 @@ static int tcp_v4_init_sock(struct sock *sk) tp->af_specific = &tcp_sock_ipv4_specific; #endif + /* TCP Cookie Transactions */ + if (sysctl_tcp_cookie_size > 0) { + /* Default, cookies without s_data_payload. */ + tp->cookie_values = + kzalloc(sizeof(*tp->cookie_values), + sk->sk_allocation); + if (tp->cookie_values != NULL) + kref_init(&tp->cookie_values->kref); + } + /* Presumed zeroed, in order of appearance: + * cookie_in_always, cookie_out_never, + * s_data_constant, s_data_in, s_data_out + */ sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; @@ -1879,6 +1951,13 @@ void tcp_v4_destroy_sock(struct sock *sk) sk->sk_sndmsg_page = NULL; } + /* TCP Cookie Transactions */ + if (tp->cookie_values != NULL) { + kref_put(&tp->cookie_values->kref, + tcp_cookie_values_release); + tp->cookie_values = NULL; + } + percpu_counter_dec(&tcp_sockets_allocated); } @@ -2000,7 +2079,7 @@ static void *established_get_first(struct seq_file *seq) struct net *net = seq_file_net(seq); void *rc = NULL; - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) { + for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; struct inet_timewait_sock *tw; @@ -2061,10 +2140,10 @@ get_tw: st->state = TCP_SEQ_STATE_ESTABLISHED; /* Look for next non empty bucket */ - while (++st->bucket < tcp_hashinfo.ehash_size && + while (++st->bucket <= tcp_hashinfo.ehash_mask && empty_bucket(st)) ; - if (st->bucket >= tcp_hashinfo.ehash_size) + if (st->bucket > tcp_hashinfo.ehash_mask) return NULL; spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); @@ -2225,7 +2304,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req, " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n", i, ireq->loc_addr, - ntohs(inet_sk(sk)->sport), + ntohs(inet_sk(sk)->inet_sport), ireq->rmt_addr, ntohs(ireq->rmt_port), TCP_SYN_RECV, @@ -2248,10 +2327,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); - __be32 dest = inet->daddr; - __be32 src = inet->rcv_saddr; - __u16 destp = ntohs(inet->dport); - __u16 srcp = ntohs(inet->sport); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); + int rx_queue; if (icsk->icsk_pending == ICSK_TIME_RETRANS) { timer_active = 1; @@ -2267,12 +2347,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) timer_expires = jiffies; } + if (sk->sk_state == TCP_LISTEN) + rx_queue = sk->sk_ack_backlog; + else + /* + * because we dont lock socket, we might find a transient negative value + */ + rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0); + seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n", i, src, srcp, dest, destp, sk->sk_state, tp->write_seq - tp->snd_una, - sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog : - (tp->rcv_nxt - tp->copied_seq), + rx_queue, timer_active, jiffies_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, @@ -2354,12 +2441,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = { }, }; -static int tcp4_proc_init_net(struct net *net) +static int __net_init tcp4_proc_init_net(struct net *net) { return tcp_proc_register(net, &tcp4_seq_afinfo); } -static void tcp4_proc_exit_net(struct net *net) +static void __net_exit tcp4_proc_exit_net(struct net *net) { tcp_proc_unregister(net, &tcp4_seq_afinfo); } @@ -2463,12 +2550,17 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); - inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); +} + +static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) +{ + inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { - .init = tcp_sk_init, - .exit = tcp_sk_exit, + .init = tcp_sk_init, + .exit = tcp_sk_exit, + .exit_batch = tcp_sk_exit_batch, }; void __init tcp_v4_init(void) diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index ce3c41ff50b2..de870377fbba 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk) goto out; /* we can't calc remote HZ with no different!! */ - if (tp->rx_opt.rcv_tsval == lp->remote_ref_time - || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time || + tp->rx_opt.rcv_tsecr == lp->local_ref_time) goto out; m = HZ * (tp->rx_opt.rcv_tsval - diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4c03598ed924..5fabff9ac6d6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,19 +20,14 @@ #include <linux/mm.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> -#ifdef CONFIG_SYSCTL -#define SYNC_INIT 0 /* let the user enable it */ -#else -#define SYNC_INIT 1 -#endif - -int sysctl_tcp_syncookies __read_mostly = SYNC_INIT; +int sysctl_tcp_syncookies __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_syncookies); int sysctl_tcp_abort_on_overflow __read_mostly; @@ -96,13 +91,14 @@ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { - struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); struct tcp_options_received tmp_opt; + u8 *hash_location; + struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); int paws_reject = 0; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = tcptw->tw_ts_recent; @@ -389,14 +385,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); - struct tcp_sock *newtp; + struct tcp_sock *newtp = tcp_sk(newsk); + struct tcp_sock *oldtp = tcp_sk(sk); + struct tcp_cookie_values *oldcvp = oldtp->cookie_values; + + /* TCP Cookie Transactions require space for the cookie pair, + * as it differs for each connection. There is no need to + * copy any s_data_payload stored at the original socket. + * Failure will prevent resuming the connection. + * + * Presumed copied, in order of appearance: + * cookie_in_always, cookie_out_never + */ + if (oldcvp != NULL) { + struct tcp_cookie_values *newcvp = + kzalloc(sizeof(*newtp->cookie_values), + GFP_ATOMIC); + + if (newcvp != NULL) { + kref_init(&newcvp->kref); + newcvp->cookie_desired = + oldcvp->cookie_desired; + newtp->cookie_values = newcvp; + } else { + /* Not Yet Implemented */ + newtp->cookie_values = NULL; + } + } /* Now setup tcp_sock */ - newtp = tcp_sk(newsk); newtp->pred_flags = 0; - newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; - newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1; - newtp->snd_up = treq->snt_isn + 1; + + newtp->rcv_wup = newtp->copied_seq = + newtp->rcv_nxt = treq->rcv_isn + 1; + + newtp->snd_sml = newtp->snd_una = + newtp->snd_nxt = newtp->snd_up = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); tcp_prequeue_init(newtp); @@ -429,8 +454,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = treq->snt_isn + 1; - newtp->pushed_seq = newtp->write_seq; + newtp->write_seq = newtp->pushed_seq = + treq->snt_isn + 1 + tcp_s_data_size(oldtp); newtp->rx_opt.saw_tstamp = 0; @@ -476,7 +501,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, if (newtp->af_specific->md5_lookup(sk, newsk)) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif - if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); @@ -495,15 +520,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct request_sock **prev) { + struct tcp_options_received tmp_opt; + u8 *hash_location; + struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; - struct tcp_options_received tmp_opt; - struct sock *child; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, 0); + tcp_parse_options(skb, &tmp_opt, &hash_location, 0); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -537,7 +563,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. */ - req->rsk_ops->rtx_syn_ack(sk, req); + req->rsk_ops->rtx_syn_ack(sk, req, NULL); return NULL; } @@ -596,7 +622,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * Invalid ACK: reset will be sent by listening socket */ if ((flg & TCP_FLAG_ACK) && - (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) + (TCP_SKB_CB(skb)->ack_seq != + tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which @@ -702,7 +729,7 @@ int tcp_child_process(struct sock *parent, struct sock *child, * in main socket hash table and lock on listening * socket does not protect us more. */ - sk_add_backlog(child, skb); + __sk_add_backlog(child, skb); } bh_unlock_sock(child); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index fcd278a7080e..0dda86e72ad8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -37,6 +37,7 @@ #include <net/tcp.h> #include <linux/compiler.h> +#include <linux/gfp.h> #include <linux/module.h> /* People can turn this off for buggy TCP's found in printers etc. */ @@ -59,6 +60,10 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ +EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); + + /* Account for new data that has been sent to the network. */ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -179,7 +184,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) */ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, - int wscale_ok, __u8 *rcv_wscale) + int wscale_ok, __u8 *rcv_wscale, + __u32 init_rcv_wnd) { unsigned int space = (__space < 0 ? 0 : __space); @@ -228,7 +234,13 @@ void tcp_select_initial_window(int __space, __u32 mss, init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd * mss) + /* when initializing use the value from init_rcv_wnd + * rather than the default from above + */ + if (init_rcv_wnd && + (*rcv_wnd > init_rcv_wnd * mss)) + *rcv_wnd = init_rcv_wnd * mss; + else if (*rcv_wnd > init_cwnd * mss) *rcv_wnd = init_cwnd * mss; } @@ -362,15 +374,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) +#define OPTION_COOKIE_EXTENSION (1 << 4) struct tcp_out_options { u8 options; /* bit field of OPTION_* */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 hash_size; /* bytes in hash_location */ u16 mss; /* 0 to disable */ __u32 tsval, tsecr; /* need to include OPTION_TS */ + __u8 *hash_location; /* temporary pointer, overloaded */ }; +/* The sysctl int routines are generic, so check consistency here. + */ +static u8 tcp_cookie_size_check(u8 desired) +{ + if (desired > 0) { + /* previously specified */ + return desired; + } + if (sysctl_tcp_cookie_size <= 0) { + /* no default specified */ + return 0; + } + if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) { + /* value too small, specify minimum */ + return TCP_COOKIE_MIN; + } + if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) { + /* value too large, specify maximum */ + return TCP_COOKIE_MAX; + } + if (0x1 & sysctl_tcp_cookie_size) { + /* 8-bit multiple, illegal, fix it */ + return (u8)(sysctl_tcp_cookie_size + 0x1); + } + return (u8)sysctl_tcp_cookie_size; +} + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -385,17 +427,34 @@ struct tcp_out_options { * (but it may well be that other scenarios fail similarly). */ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, - const struct tcp_out_options *opts, - __u8 **md5_hash) { - if (unlikely(OPTION_MD5 & opts->options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *)ptr; + struct tcp_out_options *opts) +{ + u8 options = opts->options; /* mungable copy */ + + /* Having both authentication and cookies for security is redundant, + * and there's certainly not enough room. Instead, the cookie-less + * extension variant is proposed. + * + * Consider the pessimal case with authentication. The options + * could look like: + * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_MD5 & options)) { + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + *ptr++ = htonl((TCPOPT_COOKIE << 24) | + (TCPOLEN_COOKIE_BASE << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } else { + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | + TCPOLEN_MD5SIG); + } + options &= ~OPTION_COOKIE_EXTENSION; + /* overload cookie hash location */ + opts->hash_location = (__u8 *)ptr; ptr += 4; - } else { - *md5_hash = NULL; } if (unlikely(opts->mss)) { @@ -404,12 +463,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, opts->mss); } - if (likely(OPTION_TS & opts->options)) { - if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { + if (likely(OPTION_TS & options)) { + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + options &= ~OPTION_SACK_ADVERTISE; } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -420,15 +480,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - if (unlikely(OPTION_SACK_ADVERTISE & opts->options && - !(OPTION_TS & opts->options))) { + /* Specification requires after timestamp, so do it now. + * + * Consider the pessimal case without authentication. The options + * could look like: + * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 + */ + if (unlikely(OPTION_COOKIE_EXTENSION & options)) { + __u8 *cookie_copy = opts->hash_location; + u8 cookie_size = opts->hash_size; + + /* 8-bit multiple handled in tcp_cookie_size_check() above, + * and elsewhere. + */ + if (0x2 & cookie_size) { + __u8 *p = (__u8 *)ptr; + + /* 16-bit multiple */ + *p++ = TCPOPT_COOKIE; + *p++ = TCPOLEN_COOKIE_BASE + cookie_size; + *p++ = *cookie_copy++; + *p++ = *cookie_copy++; + ptr++; + cookie_size -= 2; + } else { + /* 32-bit multiple */ + *ptr++ = htonl(((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_COOKIE << 8) | + TCPOLEN_COOKIE_BASE) + + cookie_size); + } + + if (cookie_size > 0) { + memcpy(ptr, cookie_copy, cookie_size); + ptr += (cookie_size / 4); + } + } + + if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); } - if (unlikely(OPTION_WSCALE & opts->options)) { + if (unlikely(OPTION_WSCALE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | @@ -463,13 +560,17 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - unsigned size = 0; + struct tcp_cookie_values *cvp = tp->cookie_values; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? + tcp_cookie_size_check(cvp->cookie_desired) : + 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; } #else *md5 = NULL; @@ -485,26 +586,72 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, * SACKs don't matter, we never delay an ACK when we have any of those * going out. */ opts->mss = tcp_advertise_mss(sk); - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = tp->rx_opt.ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(sysctl_tcp_window_scaling)) { opts->ws = tp->rx_opt.rcv_wscale; opts->options |= OPTION_WSCALE; - size += TCPOLEN_WSCALE_ALIGNED; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(sysctl_tcp_sack)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!(OPTION_TS & opts->options))) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Note that timestamps are required by the specification. + * + * Odd numbers of bytes are prohibited by the specification, ensuring + * that the cookie is 16-bit aligned, and the resulting cookie pair is + * 32-bit aligned. + */ + if (*md5 == NULL && + (OPTION_TS & opts->options) && + cookie_size > 0) { + int need = TCPOLEN_COOKIE_BASE + cookie_size; + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + + if (need > remaining) { + /* try shrinking cookie to fit */ + cookie_size -= 2; + need -= 4; + } + } + while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { + cookie_size -= 4; + need -= 4; + } + if (TCP_COOKIE_MIN <= cookie_size) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; + opts->hash_size = cookie_size; + + /* Remember for future incarnations. */ + cvp->cookie_desired = cookie_size; + + if (cvp->cookie_desired != cvp->cookie_pair_size) { + /* Currently use random bytes as a nonce, + * assuming these are completely unpredictable + * by hostile users of the same system. + */ + get_random_bytes(&cvp->cookie_pair[0], + cookie_size); + cvp->cookie_pair_size = cookie_size; + } + + remaining -= need; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } /* Set up TCP options for SYN-ACKs. */ @@ -512,48 +659,77 @@ static unsigned tcp_synack_options(struct sock *sk, struct request_sock *req, unsigned mss, struct sk_buff *skb, struct tcp_out_options *opts, - struct tcp_md5sig_key **md5) { - unsigned size = 0; + struct tcp_md5sig_key **md5, + struct tcp_extend_values *xvp) +{ struct inet_request_sock *ireq = inet_rsk(req); - char doing_ts; + unsigned remaining = MAX_TCP_OPTION_SPACE; + u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? + xvp->cookie_plus : + 0; + bool doing_ts = ireq->tstamp_ok; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); if (*md5) { opts->options |= OPTION_MD5; - size += TCPOLEN_MD5SIG_ALIGNED; + remaining -= TCPOLEN_MD5SIG_ALIGNED; + + /* We can't fit any SACK blocks in a packet with MD5 + TS + * options. There was discussion about disabling SACK + * rather than TS in order to fit in better with old, + * buggy kernels, but that was deemed to be unnecessary. + */ + doing_ts &= !ireq->sack_ok; } #else *md5 = NULL; #endif - /* we can't fit any SACK blocks in a packet with MD5 + TS - options. There was discussion about disabling SACK rather than TS in - order to fit in better with old, buggy kernels, but that was deemed - to be unnecessary. */ - doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); - + /* We always send an MSS option. */ opts->mss = mss; - size += TCPOLEN_MSS_ALIGNED; + remaining -= TCPOLEN_MSS_ALIGNED; if (likely(ireq->wscale_ok)) { opts->ws = ireq->rcv_wscale; opts->options |= OPTION_WSCALE; - size += TCPOLEN_WSCALE_ALIGNED; + remaining -= TCPOLEN_WSCALE_ALIGNED; } if (likely(doing_ts)) { opts->options |= OPTION_TS; opts->tsval = TCP_SKB_CB(skb)->when; opts->tsecr = req->ts_recent; - size += TCPOLEN_TSTAMP_ALIGNED; + remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { opts->options |= OPTION_SACK_ADVERTISE; if (unlikely(!doing_ts)) - size += TCPOLEN_SACKPERM_ALIGNED; + remaining -= TCPOLEN_SACKPERM_ALIGNED; } - return size; + /* Similar rationale to tcp_syn_options() applies here, too. + * If the <SYN> options fit, the same options should fit now! + */ + if (*md5 == NULL && + doing_ts && + cookie_plus > TCPOLEN_COOKIE_BASE) { + int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ + + if (0x2 & need) { + /* 32-bit multiple */ + need += 2; /* NOPs */ + } + if (need <= remaining) { + opts->options |= OPTION_COOKIE_EXTENSION; + opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; + remaining -= need; + } else { + /* There's no error return, so flag it. */ + xvp->cookie_out_never = 1; /* true */ + opts->hash_size = 0; + } + } + return MAX_TCP_OPTION_SPACE - remaining; } /* Compute TCP options for ESTABLISHED sockets. This is not the @@ -619,7 +795,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_out_options opts; unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; struct tcphdr *th; int err; @@ -661,8 +836,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Build TCP header and checksum it. */ th = tcp_hdr(skb); - th->source = inet->sport; - th->dest = inet->dport; + th->source = inet->inet_sport; + th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(tp->rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | @@ -690,7 +865,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, } } - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); @@ -698,7 +873,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, /* Calculate the MD5 hash, as we have all we need now */ if (md5) { sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - tp->af_specific->calc_md5_hash(md5_hash_location, + tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, NULL, skb); } #endif @@ -1627,11 +1802,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle) { - struct sk_buff *skb = tcp_send_head(sk); - - if (!skb) - return; - /* If we are closed, the bytes will have to remain here. * In time closedown will finish, we empty the write queue and * all will be happy. @@ -1918,8 +2088,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) - && TCP_SKB_CB(skb)->seq != tp->snd_una) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && + TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; if (skb->len > cur_mss) { @@ -2219,19 +2389,24 @@ int tcp_send_synack(struct sock *sk) /* Prepare a SYN-ACK. */ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) + struct request_sock *req, + struct request_values *rvp) { + struct tcp_out_options opts; + struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; - int tcp_header_size; - struct tcp_out_options opts; struct sk_buff *skb; struct tcp_md5sig_key *md5; - __u8 *md5_hash_location; + int tcp_header_size; int mss; + int s_data_desired = 0; - skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) + s_data_desired = cvp->s_data_desired; + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC); if (skb == NULL) return NULL; @@ -2254,7 +2429,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); ireq->rcv_wscale = rcv_wscale; } @@ -2266,8 +2442,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5) + - sizeof(struct tcphdr); + skb, &opts, &md5, xvp) + + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2284,19 +2460,54 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, */ tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + + if (OPTION_COOKIE_EXTENSION & opts.options) { + if (s_data_desired) { + u8 *buf = skb_put(skb, s_data_desired); + + /* copy data directly from the listening socket. */ + memcpy(buf, cvp->s_data_payload, s_data_desired); + TCP_SKB_CB(skb)->end_seq += s_data_desired; + } + + if (opts.hash_size > 0) { + __u32 workspace[SHA_WORKSPACE_WORDS]; + u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; + u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; + + /* Secret recipe depends on the Timestamp, (future) + * Sequence and Acknowledgment Numbers, Initiator + * Cookie, and others handled by IP variant caller. + */ + *tail-- ^= opts.tsval; + *tail-- ^= tcp_rsk(req)->rcv_isn + 1; + *tail-- ^= TCP_SKB_CB(skb)->seq + 1; + + /* recommended */ + *tail-- ^= ((th->dest << 16) | th->source); + *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ + + sha_transform((__u32 *)&xvp->cookie_bakery[0], + (char *)mess, + &workspace[0]); + opts.hash_location = + (__u8 *)&xvp->cookie_bakery[0]; + } + } + th->seq = htonl(TCP_SKB_CB(skb)->seq); th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ th->window = htons(min(req->rcv_wnd, 65535U)); - tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + tcp_options_write((__be32 *)(th + 1), tp, &opts); th->doff = (tcp_header_size >> 2); TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { - tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, + tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, md5, NULL, req, skb); } #endif @@ -2342,7 +2553,8 @@ static void tcp_connect_init(struct sock *sk) &tp->rcv_wnd, &tp->window_clamp, sysctl_tcp_window_scaling, - &rcv_wscale); + &rcv_wscale, + dst_metric(dst, RTAX_INITRWND)); tp->rx_opt.rcv_wscale = rcv_wscale; tp->rcv_ssthresh = tp->rcv_wnd; diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index 59f5b5e7c566..f8efada580e8 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -22,6 +22,7 @@ #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/tcp.h> +#include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/module.h> #include <linux/ktime.h> @@ -39,9 +40,9 @@ static int port __read_mostly = 0; MODULE_PARM_DESC(port, "Port to match (0=all)"); module_param(port, int, 0); -static int bufsize __read_mostly = 4096; +static unsigned int bufsize __read_mostly = 4096; MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)"); -module_param(bufsize, int, 0); +module_param(bufsize, uint, 0); static int full __read_mostly; MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)"); @@ -75,12 +76,12 @@ static struct { static inline int tcp_probe_used(void) { - return (tcp_probe.head - tcp_probe.tail) % bufsize; + return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1); } static inline int tcp_probe_avail(void) { - return bufsize - tcp_probe_used(); + return bufsize - tcp_probe_used() - 1; } /* @@ -94,8 +95,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct inet_sock *inet = inet_sk(sk); /* Only update if port matches */ - if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port) - && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { + if ((port == 0 || ntohs(inet->inet_dport) == port || + ntohs(inet->inet_sport) == port) && + (full || tp->snd_cwnd != tcp_probe.lastcwnd)) { spin_lock(&tcp_probe.lock); /* If log fills, just silently drop */ @@ -103,10 +105,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, struct tcp_log *p = tcp_probe.log + tcp_probe.head; p->tstamp = ktime_get(); - p->saddr = inet->saddr; - p->sport = inet->sport; - p->daddr = inet->daddr; - p->dport = inet->dport; + p->saddr = inet->inet_saddr; + p->sport = inet->inet_sport; + p->daddr = inet->inet_daddr; + p->dport = inet->inet_dport; p->length = skb->len; p->snd_nxt = tp->snd_nxt; p->snd_una = tp->snd_una; @@ -115,7 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb, p->ssthresh = tcp_current_ssthresh(sk); p->srtt = tp->srtt >> 3; - tcp_probe.head = (tcp_probe.head + 1) % bufsize; + tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1); } tcp_probe.lastcwnd = tp->snd_cwnd; spin_unlock(&tcp_probe.lock); @@ -148,7 +150,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file) static int tcpprobe_sprint(char *tbuf, int n) { const struct tcp_log *p - = tcp_probe.log + tcp_probe.tail % bufsize; + = tcp_probe.log + tcp_probe.tail; struct timespec tv = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start)); @@ -191,7 +193,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf, width = tcpprobe_sprint(tbuf, sizeof(tbuf)); if (cnt + width < len) - tcp_probe.tail = (tcp_probe.tail + 1) % bufsize; + tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1); spin_unlock_bh(&tcp_probe.lock); @@ -221,9 +223,10 @@ static __init int tcpprobe_init(void) init_waitqueue_head(&tcp_probe.wait); spin_lock_init(&tcp_probe.lock); - if (bufsize < 0) + if (bufsize == 0) return -EINVAL; + bufsize = roundup_pow_of_two(bufsize); tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL); if (!tcp_probe.log) goto err0; @@ -235,7 +238,7 @@ static __init int tcpprobe_init(void) if (ret) goto err1; - pr_info("TCP probe registered (port=%d)\n", port); + pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize); return 0; err1: proc_net_remove(&init_net, procname); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index cdb2ca7684d4..8a0ab2977f1f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -19,6 +19,7 @@ */ #include <linux/module.h> +#include <linux/gfp.h> #include <net/tcp.h> int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES; @@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL; int sysctl_tcp_retries1 __read_mostly = TCP_RETR1; int sysctl_tcp_retries2 __read_mostly = TCP_RETR2; int sysctl_tcp_orphan_retries __read_mostly; +int sysctl_tcp_thin_linear_timeouts __read_mostly; static void tcp_write_timer(unsigned long); static void tcp_delack_timer(unsigned long); @@ -132,6 +134,35 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) } } +/* This function calculates a "timeout" which is equivalent to the timeout of a + * TCP connection after "boundary" unsuccessful, exponentially backed-off + * retransmissions with an initial RTO of TCP_RTO_MIN. + */ +static bool retransmits_timed_out(struct sock *sk, + unsigned int boundary) +{ + unsigned int timeout, linear_backoff_thresh; + unsigned int start_ts; + + if (!inet_csk(sk)->icsk_retransmits) + return false; + + if (unlikely(!tcp_sk(sk)->retrans_stamp)) + start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when; + else + start_ts = tcp_sk(sk)->retrans_stamp; + + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + else + timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + + return (tcp_time_stamp - start_ts) >= timeout; +} + /* A write timeout has occurred. Process the after effects. */ static int tcp_write_timeout(struct sock *sk) { @@ -141,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { if (icsk->icsk_retransmits) - dst_negative_advice(&sk->sk_dst_cache); + dst_negative_advice(&sk->sk_dst_cache, sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(&sk->sk_dst_cache); + dst_negative_advice(&sk->sk_dst_cache, sk); } retry_until = sysctl_tcp_retries2; @@ -303,15 +334,15 @@ void tcp_retransmit_timer(struct sock *sk) struct inet_sock *inet = inet_sk(sk); if (sk->sk_family == AF_INET) { LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &inet->daddr, ntohs(inet->dport), - inet->num, tp->snd_una, tp->snd_nxt); + &inet->inet_daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) else if (sk->sk_family == AF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n", - &np->daddr, ntohs(inet->dport), - inet->num, tp->snd_una, tp->snd_nxt); + &np->daddr, ntohs(inet->inet_dport), + inet->inet_num, tp->snd_una, tp->snd_nxt); } #endif #endif @@ -386,7 +417,25 @@ void tcp_retransmit_timer(struct sock *sk) icsk->icsk_retransmits++; out_reset_timer: - icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is + * used to reset timer, set to 0. Recalculate 'icsk_rto' as this + * might be increased if the stream oscillates between thin and thick, + * thus the old value might already be too high compared to the value + * set by 'tcp_set_rto' in tcp_input.c which resets the rto without + * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating + * exponential backoff behaviour to avoid continue hammering + * linear-timeout retransmissions into a black hole + */ + if (sk->sk_state == TCP_ESTABLISHED && + (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) && + tcp_stream_is_thin(tp) && + icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) { + icsk->icsk_backoff = 0; + icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX); + } else { + /* Use normal (exponential) backoff */ + icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); + } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) __sk_dst_reset(sk); @@ -445,6 +494,12 @@ static void tcp_synack_timer(struct sock *sk) TCP_TIMEOUT_INIT, TCP_RTO_MAX); } +void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req) +{ + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS); +} +EXPORT_SYMBOL(tcp_syn_ack_timeout); + void tcp_set_keepalive(struct sock *sk, int val) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index e9bbff746488..b612acf76183 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) * every other rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - if (veno->inc - && tp->snd_cwnd < - tp->snd_cwnd_clamp) { + if (veno->inc && + tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; veno->inc = 0; } else diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 66b6821b984e..a0f240358892 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { - if (queue > TCP_YEAH_ALPHA - && tp->snd_cwnd > yeah->reno_count) { + if (queue > TCP_YEAH_ALPHA && + tp->snd_cwnd > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , tp->snd_cwnd >> TCP_YEAH_EPSILON); diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 3959e0ca456a..3b3813cc80b9 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -8,6 +8,7 @@ #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> +#include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0fa9f70e4b19..954bbfb39dff 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -95,6 +95,7 @@ #include <linux/mm.h> #include <linux/inet.h> #include <linux/netdevice.h> +#include <linux/slab.h> #include <net/tcp_states.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> @@ -106,7 +107,7 @@ #include <net/xfrm.h> #include "udp_impl.h" -struct udp_table udp_table; +struct udp_table udp_table __read_mostly; EXPORT_SYMBOL(udp_table); int sysctl_udp_mem[3] __read_mostly; @@ -121,28 +122,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); -#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) +#define MAX_UDP_PORTS 65536 +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2), + unsigned int log) { struct sock *sk2; struct hlist_nulls_node *node; sk_nulls_for_each(sk2, node, &hslot->head) - if (net_eq(sock_net(sk2), net) && - sk2 != sk && - (bitmap || sk2->sk_hash == num) && - (!sk2->sk_reuse || !sk->sk_reuse) && - (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if - || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (bitmap || udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && (*saddr_comp)(sk, sk2)) { if (bitmap) - __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, + __set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap); else return 1; @@ -150,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, return 0; } +/* + * Note: we still hold spinlock of primary hash chain, so no other writer + * can insert/delete a socket with local_port == num + */ +static int udp_lib_lport_inuse2(struct net *net, __u16 num, + struct udp_hslot *hslot2, + struct sock *sk, + int (*saddr_comp)(const struct sock *sk1, + const struct sock *sk2)) +{ + struct sock *sk2; + struct hlist_nulls_node *node; + int res = 0; + + spin_lock(&hslot2->lock); + udp_portaddr_for_each_entry(sk2, node, &hslot2->head) + if (net_eq(sock_net(sk2), net) && + sk2 != sk && + (udp_sk(sk2)->udp_port_hash == num) && + (!sk2->sk_reuse || !sk->sk_reuse) && + (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (*saddr_comp)(sk, sk2)) { + res = 1; + break; + } + spin_unlock(&hslot2->lock); + return res; +} + /** * udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6 * * @sk: socket struct in question * @snum: port number to look up * @saddr_comp: AF-dependent comparison of bound local IP addresses + * @hash2_nulladdr: AF-dependant hash value in secondary hash chains, + * with NULL address */ int udp_lib_get_port(struct sock *sk, unsigned short snum, int (*saddr_comp)(const struct sock *sk1, - const struct sock *sk2)) + const struct sock *sk2), + unsigned int hash2_nulladdr) { - struct udp_hslot *hslot; + struct udp_hslot *hslot, *hslot2; struct udp_table *udptable = sk->sk_prot->h.udp_table; int error = 1; struct net *net = sock_net(sk); @@ -180,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, /* * force rand to be an odd multiple of UDP_HTABLE_SIZE */ - rand = (rand | 1) * UDP_HTABLE_SIZE; - for (last = first + UDP_HTABLE_SIZE; first != last; first++) { - hslot = &udptable->hash[udp_hashfn(net, first)]; + rand = (rand | 1) * (udptable->mask + 1); + last = first + udptable->mask + 1; + do { + hslot = udp_hashslot(udptable, net, first); bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, - saddr_comp); + saddr_comp, udptable->log); snum = first; /* @@ -196,25 +233,59 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, */ do { if (low <= snum && snum <= high && - !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) + !test_bit(snum >> udptable->log, bitmap)) goto found; snum += rand; } while (snum != first); spin_unlock_bh(&hslot->lock); - } + } while (++first != last); goto fail; } else { - hslot = &udptable->hash[udp_hashfn(net, snum)]; + hslot = udp_hashslot(udptable, net, snum); spin_lock_bh(&hslot->lock); - if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) + if (hslot->count > 10) { + int exist; + unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum; + + slot2 &= udptable->mask; + hash2_nulladdr &= udptable->mask; + + hslot2 = udp_hashslot2(udptable, slot2); + if (hslot->count < hslot2->count) + goto scan_primary_hash; + + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + if (!exist && (hash2_nulladdr != slot2)) { + hslot2 = udp_hashslot2(udptable, hash2_nulladdr); + exist = udp_lib_lport_inuse2(net, snum, hslot2, + sk, saddr_comp); + } + if (exist) + goto fail_unlock; + else + goto found; + } +scan_primary_hash: + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, + saddr_comp, 0)) goto fail_unlock; } found: - inet_sk(sk)->num = snum; - sk->sk_hash = snum; + inet_sk(sk)->inet_num = snum; + udp_sk(sk)->udp_port_hash = snum; + udp_sk(sk)->udp_portaddr_hash ^= snum; if (sk_unhashed(sk)) { sk_nulls_add_node_rcu(sk, &hslot->head); + hslot->count++; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); + spin_lock(&hslot2->lock); + hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node, + &hslot2->head); + hslot2->count++; + spin_unlock(&hslot2->lock); } error = 0; fail_unlock: @@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2) struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2); return (!ipv6_only_sock(sk2) && - (!inet1->rcv_saddr || !inet2->rcv_saddr || - inet1->rcv_saddr == inet2->rcv_saddr)); + (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr || + inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)); +} + +static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr, + unsigned int port) +{ + return jhash_1word(saddr, net_hash_mix(net)) ^ port; } int udp_v4_get_port(struct sock *sk, unsigned short snum) { - return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal); + unsigned int hash2_nulladdr = + udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum); + unsigned int hash2_partial = + udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0); + + /* precompute partial secondary hash */ + udp_sk(sk)->udp_portaddr_hash = hash2_partial; + return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr); } static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, @@ -244,23 +328,61 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, { int score = -1; - if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum && + if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum && !ipv6_only_sock(sk)) { struct inet_sock *inet = inet_sk(sk); score = (sk->sk_family == PF_INET ? 1 : 0); - if (inet->rcv_saddr) { - if (inet->rcv_saddr != daddr) + if (inet->inet_rcv_saddr) { + if (inet->inet_rcv_saddr != daddr) + return -1; + score += 2; + } + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) + return -1; + score += 2; + } + if (inet->inet_dport) { + if (inet->inet_dport != sport) + return -1; + score += 2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) return -1; score += 2; } - if (inet->daddr) { - if (inet->daddr != saddr) + } + return score; +} + +/* + * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num) + */ +#define SCORE2_MAX (1 + 2 + 2 + 2) +static inline int compute_score2(struct sock *sk, struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif) +{ + int score = -1; + + if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->inet_rcv_saddr != daddr) + return -1; + if (inet->inet_num != hnum) + return -1; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (inet->inet_daddr) { + if (inet->inet_daddr != saddr) return -1; score += 2; } - if (inet->dport) { - if (inet->dport != sport) + if (inet->inet_dport) { + if (inet->inet_dport != sport) return -1; score += 2; } @@ -273,6 +395,51 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr, return score; } + +/* called with read_rcu_lock() */ +static struct sock *udp4_lib_lookup2(struct net *net, + __be32 saddr, __be16 sport, + __be32 daddr, unsigned int hnum, int dif, + struct udp_hslot *hslot2, unsigned int slot2) +{ + struct sock *sk, *result; + struct hlist_nulls_node *node; + int score, badness; + +begin: + result = NULL; + badness = -1; + udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) { + score = compute_score2(sk, net, saddr, sport, + daddr, hnum, dif); + if (score > badness) { + result = sk; + badness = score; + if (score == SCORE2_MAX) + goto exact_match; + } + } + /* + * if the nulls value we got at the end of this lookup is + * not the expected one, we must restart lookup. + * We probably met an item that was moved to another chain. + */ + if (get_nulls_value(node) != slot2) + goto begin; + + if (result) { +exact_match: + if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt))) + result = NULL; + else if (unlikely(compute_score2(result, net, saddr, sport, + daddr, hnum, dif) < badness)) { + sock_put(result); + goto begin; + } + } + return result; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ @@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, struct sock *sk, *result; struct hlist_nulls_node *node; unsigned short hnum = ntohs(dport); - unsigned int hash = udp_hashfn(net, hnum); - struct udp_hslot *hslot = &udptable->hash[hash]; + unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask); + struct udp_hslot *hslot2, *hslot = &udptable->hash[slot]; int score, badness; rcu_read_lock(); + if (hslot->count > 10) { + hash2 = udp4_portaddr_hash(net, daddr, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, saddr, sport, + daddr, hnum, dif, + hslot2, slot2); + if (!result) { + hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum); + slot2 = hash2 & udptable->mask; + hslot2 = &udptable->hash2[slot2]; + if (hslot->count < hslot2->count) + goto begin; + + result = udp4_lib_lookup2(net, INADDR_ANY, sport, + daddr, hnum, dif, + hslot2, slot2); + } + rcu_read_unlock(); + return result; + } begin: result = NULL; badness = -1; @@ -304,7 +495,7 @@ begin: * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ - if (get_nulls_value(node) != hash) + if (get_nulls_value(node) != slot) goto begin; if (result) { @@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk, sk_nulls_for_each_from(s, node) { struct inet_sock *inet = inet_sk(s); - if (!net_eq(sock_net(s), net) || - s->sk_hash != hnum || - (inet->daddr && inet->daddr != rmt_addr) || - (inet->dport != rmt_port && inet->dport) || - (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || - ipv6_only_sock(s) || + if (!net_eq(sock_net(s), net) || + udp_sk(s)->udp_port_hash != hnum || + (inet->inet_daddr && inet->inet_daddr != rmt_addr) || + (inet->inet_dport != rmt_port && inet->inet_dport) || + (inet->inet_rcv_saddr && + inet->inet_rcv_saddr != loc_addr) || + ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) @@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; - daddr = inet->daddr; - dport = inet->dport; + daddr = inet->inet_daddr; + dport = inet->inet_dport; /* Open fast path for connected socket. Route will not be used, if at least one option is set. */ connected = 1; } - ipc.addr = inet->saddr; + ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; err = sock_tx_timestamp(msg, sk, &ipc.shtx); @@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, .proto = sk->sk_protocol, .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet->sport, + { .sport = inet->inet_sport, .dport = dport } } }; struct net *net = sock_net(sk); @@ -748,7 +940,7 @@ back_from_confirm: inet->cork.fl.fl4_dst = daddr; inet->cork.fl.fl_ip_dport = dport; inet->cork.fl.fl4_src = saddr; - inet->cork.fl.fl_ip_sport = inet->sport; + inet->cork.fl.fl_ip_sport = inet->inet_sport; up->pending = AF_INET; do_append_data: @@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk) udp_lib_checksum_complete(skb)) { UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); + atomic_inc(&sk->sk_drops); __skb_unlink(skb, rcvq); __skb_queue_tail(&list_kill, skb); } @@ -925,7 +1118,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; struct sk_buff *skb; - unsigned int ulen, copied; + unsigned int ulen; int peeked; int err; int is_udplite = IS_UDPLITE(sk); @@ -946,10 +1139,9 @@ try_again: goto out; ulen = skb->len - sizeof(struct udphdr); - copied = len; - if (copied > ulen) - copied = ulen; - else if (copied < ulen) + if (len > ulen) + len = ulen; + else if (len < ulen) msg->msg_flags |= MSG_TRUNC; /* @@ -958,14 +1150,14 @@ try_again: * coverage checksum (UDP-Lite), do it before the copy. */ - if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) { + if (len < ulen || UDP_SKB_CB(skb)->partial_cov) { if (udp_lib_checksum_complete(skb)) goto csum_copy_err; } if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + msg->msg_iov, len); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), @@ -982,7 +1174,7 @@ try_again: UDP_INC_STATS_USER(sock_net(sk), UDP_MIB_INDATAGRAMS, is_udplite); - sock_recv_timestamp(msg, sk, skb); + sock_recv_ts_and_drops(msg, sk, skb); /* Copy the address. */ if (sin) { @@ -994,7 +1186,7 @@ try_again: if (inet->cmsg_flags) ip_cmsg_recv(msg, skb); - err = copied; + err = len; if (flags & MSG_TRUNC) err = ulen; @@ -1023,15 +1215,15 @@ int udp_disconnect(struct sock *sk, int flags) */ sk->sk_state = TCP_CLOSE; - inet->daddr = 0; - inet->dport = 0; + inet->inet_daddr = 0; + inet->inet_dport = 0; sk->sk_bound_dev_if = 0; if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { sk->sk_prot->unhash(sk); - inet->sport = 0; + inet->inet_sport = 0; } sk_dst_reset(sk); return 0; @@ -1042,13 +1234,22 @@ void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { struct udp_table *udptable = sk->sk_prot->h.udp_table; - unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash); - struct udp_hslot *hslot = &udptable->hash[hash]; + struct udp_hslot *hslot, *hslot2; + + hslot = udp_hashslot(udptable, sock_net(sk), + udp_sk(sk)->udp_port_hash); + hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); spin_lock_bh(&hslot->lock); if (sk_nulls_del_node_init_rcu(sk)) { - inet_sk(sk)->num = 0; + hslot->count--; + inet_sk(sk)->inet_num = 0; sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + + spin_lock(&hslot2->lock); + hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node); + hslot2->count--; + spin_unlock(&hslot2->lock); } spin_unlock_bh(&hslot->lock); } @@ -1057,25 +1258,22 @@ EXPORT_SYMBOL(udp_lib_unhash); static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { - int is_udplite = IS_UDPLITE(sk); - int rc; + int rc = sock_queue_rcv_skb(sk, skb); + + if (rc < 0) { + int is_udplite = IS_UDPLITE(sk); - if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) { /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) { + if (rc == -ENOMEM) UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); - atomic_inc(&sk->sk_drops); - } - goto drop; + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + kfree_skb(skb); + return -1; } return 0; -drop: - UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb(skb); - return -1; } /* returns: @@ -1174,61 +1372,98 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) bh_lock_sock(sk); if (!sock_owned_by_user(sk)) rc = __udp_queue_rcv_skb(sk, skb); - else - sk_add_backlog(sk, skb); + else if (sk_add_backlog(sk, skb)) { + bh_unlock_sock(sk); + goto drop; + } bh_unlock_sock(sk); return rc; drop: UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); kfree_skb(skb); return -1; } + +static void flush_stack(struct sock **stack, unsigned int count, + struct sk_buff *skb, unsigned int final) +{ + unsigned int i; + struct sk_buff *skb1 = NULL; + struct sock *sk; + + for (i = 0; i < count; i++) { + sk = stack[i]; + if (likely(skb1 == NULL)) + skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC); + + if (!skb1) { + atomic_inc(&sk->sk_drops); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS, + IS_UDPLITE(sk)); + UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, + IS_UDPLITE(sk)); + } + + if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0) + skb1 = NULL; + } + if (unlikely(skb1)) + kfree_skb(skb1); +} + /* * Multicasts and broadcasts go to each listener. * - * Note: called only from the BH handler context, - * so we don't need to lock the hashes. + * Note: called only from the BH handler context. */ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb, struct udphdr *uh, __be32 saddr, __be32 daddr, struct udp_table *udptable) { - struct sock *sk; - struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))]; + struct sock *sk, *stack[256 / sizeof(struct sock *)]; + struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest)); int dif; + unsigned int i, count = 0; spin_lock(&hslot->lock); sk = sk_nulls_head(&hslot->head); dif = skb->dev->ifindex; sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif); - if (sk) { - struct sock *sknext = NULL; + while (sk) { + stack[count++] = sk; + sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, + daddr, uh->source, saddr, dif); + if (unlikely(count == ARRAY_SIZE(stack))) { + if (!sk) + break; + flush_stack(stack, count, skb, ~0); + count = 0; + } + } + /* + * before releasing chain lock, we must take a reference on sockets + */ + for (i = 0; i < count; i++) + sock_hold(stack[i]); - do { - struct sk_buff *skb1 = skb; - - sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest, - daddr, uh->source, saddr, - dif); - if (sknext) - skb1 = skb_clone(skb, GFP_ATOMIC); - - if (skb1) { - int ret = udp_queue_rcv_skb(sk, skb1); - if (ret > 0) - /* we should probably re-process instead - * of dropping packets here. */ - kfree_skb(skb1); - } - sk = sknext; - } while (sknext); - } else - consume_skb(skb); spin_unlock(&hslot->lock); + + /* + * do the slow work with no lock held + */ + if (count) { + flush_stack(stack, count, skb, count - 1); + + for (i = 0; i < count; i++) + sock_put(stack[i]); + } else { + kfree_skb(skb); + } return 0; } @@ -1620,9 +1855,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start) struct udp_iter_state *state = seq->private; struct net *net = seq_file_net(seq); - for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + for (state->bucket = start; state->bucket <= state->udp_table->mask; + ++state->bucket) { struct hlist_nulls_node *node; struct udp_hslot *hslot = &state->udp_table->hash[state->bucket]; + + if (hlist_nulls_empty(&hslot->head)) + continue; + spin_lock_bh(&hslot->lock); sk_nulls_for_each(sk, node, &hslot->head) { if (!net_eq(sock_net(sk), net)) @@ -1647,7 +1887,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) } while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family)); if (!sk) { - if (state->bucket < UDP_HTABLE_SIZE) + if (state->bucket <= state->udp_table->mask) spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); return udp_get_first(seq, state->bucket + 1); } @@ -1667,7 +1907,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) static void *udp_seq_start(struct seq_file *seq, loff_t *pos) { struct udp_iter_state *state = seq->private; - state->bucket = UDP_HTABLE_SIZE; + state->bucket = MAX_UDP_PORTS; return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1689,7 +1929,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v) { struct udp_iter_state *state = seq->private; - if (state->bucket < UDP_HTABLE_SIZE) + if (state->bucket <= state->udp_table->mask) spin_unlock_bh(&state->udp_table->hash[state->bucket].lock); } @@ -1744,12 +1984,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, int bucket, int *len) { struct inet_sock *inet = inet_sk(sp); - __be32 dest = inet->daddr; - __be32 src = inet->rcv_saddr; - __u16 destp = ntohs(inet->dport); - __u16 srcp = ntohs(inet->sport); + __be32 dest = inet->inet_daddr; + __be32 src = inet->inet_rcv_saddr; + __u16 destp = ntohs(inet->inet_dport); + __u16 srcp = ntohs(inet->inet_sport); - seq_printf(f, "%4d: %08X:%04X %08X:%04X" + seq_printf(f, "%5d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n", bucket, src, srcp, dest, destp, sp->sk_state, sk_wmem_alloc_get(sp), @@ -1789,12 +2029,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = { }, }; -static int udp4_proc_init_net(struct net *net) +static int __net_init udp4_proc_init_net(struct net *net) { return udp_proc_register(net, &udp4_seq_afinfo); } -static void udp4_proc_exit_net(struct net *net) +static void __net_exit udp4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udp4_seq_afinfo); } @@ -1815,21 +2055,60 @@ void udp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -void __init udp_table_init(struct udp_table *table) +static __initdata unsigned long uhash_entries; +static int __init set_uhash_entries(char *str) { - int i; + if (!str) + return 0; + uhash_entries = simple_strtoul(str, &str, 0); + if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN) + uhash_entries = UDP_HTABLE_SIZE_MIN; + return 1; +} +__setup("uhash_entries=", set_uhash_entries); - for (i = 0; i < UDP_HTABLE_SIZE; i++) { +void __init udp_table_init(struct udp_table *table, const char *name) +{ + unsigned int i; + + if (!CONFIG_BASE_SMALL) + table->hash = alloc_large_system_hash(name, + 2 * sizeof(struct udp_hslot), + uhash_entries, + 21, /* one slot per 2 MB */ + 0, + &table->log, + &table->mask, + 64 * 1024); + /* + * Make sure hash table has the minimum size + */ + if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) { + table->hash = kmalloc(UDP_HTABLE_SIZE_MIN * + 2 * sizeof(struct udp_hslot), GFP_KERNEL); + if (!table->hash) + panic(name); + table->log = ilog2(UDP_HTABLE_SIZE_MIN); + table->mask = UDP_HTABLE_SIZE_MIN - 1; + } + table->hash2 = table->hash + (table->mask + 1); + for (i = 0; i <= table->mask; i++) { INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i); + table->hash[i].count = 0; spin_lock_init(&table->hash[i].lock); } + for (i = 0; i <= table->mask; i++) { + INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i); + table->hash2[i].count = 0; + spin_lock_init(&table->hash2[i].lock); + } } void __init udp_init(void) { unsigned long nr_pages, limit; - udp_table_init(&udp_table); + udp_table_init(&udp_table, "UDP"); /* Set the pressure threshold up by the same strategy of TCP. It is a * fraction of global memory that is up to 1/2 at 256 MB, decreasing * toward zero with the amount of memory, with a floor of 128 pages. diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 95248d7f75ec..6610bf76369f 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -12,7 +12,7 @@ */ #include "udp_impl.h" -struct udp_table udplite_table; +struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); static int udplite_rcv(struct sk_buff *skb) @@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = { .protocol = IPPROTO_UDPLITE, .prot = &udplite_prot, .ops = &inet_dgram_ops, - .capability = -1, .no_check = 0, /* must checksum (RFC 3828) */ .flags = INET_PROTOSW_PERMANENT, }; @@ -82,12 +81,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = { }, }; -static int udplite4_proc_init_net(struct net *net) +static int __net_init udplite4_proc_init_net(struct net *net) { return udp_proc_register(net, &udplite4_seq_afinfo); } -static void udplite4_proc_exit_net(struct net *net) +static void __net_exit udplite4_proc_exit_net(struct net *net) { udp_proc_unregister(net, &udplite4_seq_afinfo); } @@ -110,7 +109,7 @@ static inline int udplite4_proc_init(void) void __init udplite4_register(void) { - udp_table_init(&udplite_table); + udp_table_init(&udplite_table, "UDP-Lite"); if (proto_register(&udplite_prot, 1)) goto out_register_err; diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index f9f922a0ba88..c791bb63203f 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -9,6 +9,7 @@ * */ +#include <linux/slab.h> #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 3444f3b34eca..6f368413eb0e 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -4,6 +4,7 @@ * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> */ +#include <linux/gfp.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 74fb2eb833ec..e4a1483fba77 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -15,7 +15,6 @@ #include <net/xfrm.h> #include <net/ip.h> -static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, @@ -92,11 +91,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst, return 0; } -static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev) +static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, + struct flowi *fl) { struct rtable *rt = (struct rtable *)xdst->route; - xdst->u.rt.fl = rt->fl; + xdst->u.rt.fl = *fl; xdst->u.dst.dev = dev; dev_hold(dev); @@ -190,8 +190,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse) static inline int xfrm4_garbage_collect(struct dst_ops *ops) { - xfrm4_policy_afinfo.garbage_collect(&init_net); - return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); + struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); + + xfrm4_policy_afinfo.garbage_collect(net); + return (atomic_read(&ops->entries) > ops->gc_thresh * 2); } static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -267,9 +269,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { #ifdef CONFIG_SYSCTL static struct ctl_table xfrm4_policy_table[] = { { - .ctl_name = CTL_UNNUMBERED, .procname = "xfrm4_gc_thresh", - .data = &xfrm4_dst_ops.gc_thresh, + .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, @@ -296,8 +297,6 @@ static void __exit xfrm4_policy_fini(void) void __init xfrm4_init(int rt_max_size) { - xfrm4_state_init(); - xfrm4_policy_init(); /* * Select a default value for the gc_thresh based on the main route * table hash size. It seems to me the worst case scenario is when @@ -309,6 +308,9 @@ void __init xfrm4_init(int rt_max_size) * and start cleaning when were 1/2 full */ xfrm4_dst_ops.gc_thresh = rt_max_size/2; + + xfrm4_state_init(); + xfrm4_policy_init(); #ifdef CONFIG_SYSCTL sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path, xfrm4_policy_table); |
