summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/Kconfig6
-rw-r--r--net/ipv4/af_inet.c126
-rw-r--r--net/ipv4/ah4.c300
-rw-r--r--net/ipv4/arp.c59
-rw-r--r--net/ipv4/cipso_ipv4.c3
-rw-r--r--net/ipv4/datagram.c18
-rw-r--r--net/ipv4/devinet.c306
-rw-r--r--net/ipv4/esp4.c4
-rw-r--r--net/ipv4/fib_frontend.c57
-rw-r--r--net/ipv4/fib_hash.c26
-rw-r--r--net/ipv4/fib_rules.c16
-rw-r--r--net/ipv4/fib_semantics.c85
-rw-r--r--net/ipv4/fib_trie.c31
-rw-r--r--net/ipv4/icmp.c16
-rw-r--r--net/ipv4/igmp.c140
-rw-r--r--net/ipv4/inet_connection_sock.c29
-rw-r--r--net/ipv4/inet_diag.c31
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/inet_hashtables.c73
-rw-r--r--net/ipv4/inet_lro.c36
-rw-r--r--net/ipv4/inet_timewait_sock.c154
-rw-r--r--net/ipv4/inetpeer.c5
-rw-r--r--net/ipv4/ip_forward.c1
-rw-r--r--net/ipv4/ip_fragment.c53
-rw-r--r--net/ipv4/ip_gre.c119
-rw-r--r--net/ipv4/ip_input.c5
-rw-r--r--net/ipv4/ip_options.c1
-rw-r--r--net/ipv4/ip_output.c26
-rw-r--r--net/ipv4/ip_sockglue.c27
-rw-r--r--net/ipv4/ipcomp.c17
-rw-r--r--net/ipv4/ipconfig.c73
-rw-r--r--net/ipv4/ipip.c116
-rw-r--r--net/ipv4/ipmr.c47
-rw-r--r--net/ipv4/netfilter.c15
-rw-r--r--net/ipv4/netfilter/arp_tables.c412
-rw-r--r--net/ipv4/netfilter/arptable_filter.c96
-rw-r--r--net/ipv4/netfilter/ip_queue.c11
-rw-r--r--net/ipv4/netfilter/ip_tables.c607
-rw-r--r--net/ipv4/netfilter/ipt_CLUSTERIP.c35
-rw-r--r--net/ipv4/netfilter/ipt_ECN.c10
-rw-r--r--net/ipv4/netfilter/ipt_LOG.c22
-rw-r--r--net/ipv4/netfilter/ipt_MASQUERADE.c4
-rw-r--r--net/ipv4/netfilter/ipt_REJECT.c5
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c11
-rw-r--r--net/ipv4/netfilter/ipt_ecn.c4
-rw-r--r--net/ipv4/netfilter/iptable_filter.c125
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c171
-rw-r--r--net/ipv4/netfilter/iptable_raw.c97
-rw-r--r--net/ipv4/netfilter/iptable_security.c118
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c23
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c4
-rw-r--r--net/ipv4/netfilter/nf_conntrack_proto_icmp.c47
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c34
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c41
-rw-r--r--net/ipv4/netfilter/nf_nat_ftp.c105
-rw-r--r--net/ipv4/netfilter/nf_nat_helper.c62
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c3
-rw-r--r--net/ipv4/netfilter/nf_nat_rule.c42
-rw-r--r--net/ipv4/netfilter/nf_nat_sip.c154
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c32
-rw-r--r--net/ipv4/netfilter/nf_nat_standalone.c11
-rw-r--r--net/ipv4/proc.c34
-rw-r--r--net/ipv4/raw.c34
-rw-r--r--net/ipv4/route.c305
-rw-r--r--net/ipv4/syncookies.c11
-rw-r--r--net/ipv4/sysctl_net_ipv4.c190
-rw-r--r--net/ipv4/tcp.c416
-rw-r--r--net/ipv4/tcp_cong.c1
-rw-r--r--net/ipv4/tcp_diag.c2
-rw-r--r--net/ipv4/tcp_htcp.c10
-rw-r--r--net/ipv4/tcp_input.c142
-rw-r--r--net/ipv4/tcp_ipv4.c232
-rw-r--r--net/ipv4/tcp_lp.c4
-rw-r--r--net/ipv4/tcp_minisocks.c73
-rw-r--r--net/ipv4/tcp_output.c336
-rw-r--r--net/ipv4/tcp_probe.c33
-rw-r--r--net/ipv4/tcp_timer.c69
-rw-r--r--net/ipv4/tcp_veno.c5
-rw-r--r--net/ipv4/tcp_yeah.c4
-rw-r--r--net/ipv4/tunnel4.c1
-rw-r--r--net/ipv4/udp.c509
-rw-r--r--net/ipv4/udplite.c9
-rw-r--r--net/ipv4/xfrm4_input.c1
-rw-r--r--net/ipv4/xfrm4_mode_tunnel.c1
-rw-r--r--net/ipv4/xfrm4_policy.c20
85 files changed, 3887 insertions, 2863 deletions
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 70491d9035eb..0c94a1ac2946 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -166,7 +166,7 @@ config IP_PNP_DHCP
If unsure, say Y. Note that if you want to use DHCP, a DHCP server
must be operating on your network. Read
- <file:Documentation/filesystems/nfsroot.txt> for details.
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
config IP_PNP_BOOTP
bool "IP: BOOTP support"
@@ -181,7 +181,7 @@ config IP_PNP_BOOTP
does BOOTP itself, providing all necessary information on the kernel
command line, you can say N here. If unsure, say Y. Note that if you
want to use BOOTP, a BOOTP server must be operating on your network.
- Read <file:Documentation/filesystems/nfsroot.txt> for details.
+ Read <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
config IP_PNP_RARP
bool "IP: RARP support"
@@ -194,7 +194,7 @@ config IP_PNP_RARP
older protocol which is being obsoleted by BOOTP and DHCP), say Y
here. Note that if you want to use RARP, a RARP server must be
operating on your network. Read
- <file:Documentation/filesystems/nfsroot.txt> for details.
+ <file:Documentation/filesystems/nfs/nfsroot.txt> for details.
# not yet ready..
# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 57737b8d1711..f71357422380 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -86,6 +86,7 @@
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -174,12 +175,12 @@ static int inet_autobind(struct sock *sk)
/* We may need to bind the socket. */
lock_sock(sk);
inet = inet_sk(sk);
- if (!inet->num) {
+ if (!inet->inet_num) {
if (sk->sk_prot->get_port(sk, 0)) {
release_sock(sk);
return -EAGAIN;
}
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
}
release_sock(sk);
return 0;
@@ -262,7 +263,8 @@ static inline int inet_netns_ok(struct net *net, int protocol)
* Create an inet socket.
*/
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
struct inet_protosw *answer;
@@ -325,7 +327,7 @@ lookup_protocol:
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
goto out_rcu_unlock;
err = -EAFNOSUPPORT;
@@ -354,7 +356,7 @@ lookup_protocol:
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) {
- inet->num = protocol;
+ inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
@@ -364,7 +366,7 @@ lookup_protocol:
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->id = 0;
+ inet->inet_id = 0;
sock_init_data(sock, sk);
@@ -381,13 +383,13 @@ lookup_protocol:
sk_refcnt_debug_inc(sk);
- if (inet->num) {
+ if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
@@ -494,27 +496,27 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
/* Check these errors (active socket, double bind). */
err = -EINVAL;
- if (sk->sk_state != TCP_CLOSE || inet->num)
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) {
- inet->saddr = inet->rcv_saddr = 0;
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
- if (inet->rcv_saddr)
+ if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->sport = htons(inet->num);
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
@@ -529,10 +531,12 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
{
struct sock *sk = sock->sk;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC)
return sk->sk_prot->disconnect(sk, flags);
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
}
@@ -572,6 +576,9 @@ int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
int err;
long timeo;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
lock_sock(sk);
if (uaddr->sa_family == AF_UNSPEC) {
@@ -685,21 +692,21 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
sin->sin_family = AF_INET;
if (peer) {
- if (!inet->dport ||
+ if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
peer == 1))
return -ENOTCONN;
- sin->sin_port = inet->dport;
- sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
} else {
- __be32 addr = inet->rcv_saddr;
+ __be32 addr = inet->inet_rcv_saddr;
if (!addr)
- addr = inet->saddr;
- sin->sin_port = inet->sport;
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
@@ -714,7 +721,7 @@ int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
@@ -728,7 +735,7 @@ static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
struct sock *sk = sock->sk;
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
if (sk->sk_prot->sendpage)
@@ -931,7 +938,7 @@ static const struct proto_ops inet_sockraw_ops = {
#endif
};
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
@@ -947,7 +954,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
@@ -958,7 +964,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
@@ -969,7 +974,6 @@ static struct inet_protosw inetsw_array[] =
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
@@ -1059,9 +1063,9 @@ static int inet_sk_reselect_saddr(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
int err;
struct rtable *rt;
- __be32 old_saddr = inet->saddr;
+ __be32 old_saddr = inet->inet_saddr;
__be32 new_saddr;
- __be32 daddr = inet->daddr;
+ __be32 daddr = inet->inet_daddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
@@ -1071,7 +1075,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
RT_CONN_FLAGS(sk),
sk->sk_bound_dev_if,
sk->sk_protocol,
- inet->sport, inet->dport, sk, 0);
+ inet->inet_sport, inet->inet_dport, sk, 0);
if (err)
return err;
@@ -1087,7 +1091,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
__func__, &old_saddr, &new_saddr);
}
- inet->saddr = inet->rcv_saddr = new_saddr;
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
/*
* XXX The only one ugly spot where we need to
@@ -1113,7 +1117,7 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- daddr = inet->daddr;
+ daddr = inet->inet_daddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
{
@@ -1123,7 +1127,7 @@ int inet_sk_rebuild_header(struct sock *sk)
.nl_u = {
.ip4_u = {
.daddr = daddr,
- .saddr = inet->saddr,
+ .saddr = inet->inet_saddr,
.tos = RT_CONN_FLAGS(sk),
},
},
@@ -1131,8 +1135,8 @@ int inet_sk_rebuild_header(struct sock *sk)
.flags = inet_sk_flowi_flags(sk),
.uli_u = {
.ports = {
- .sport = inet->sport,
- .dport = inet->dport,
+ .sport = inet->inet_sport,
+ .dport = inet->inet_dport,
},
},
};
@@ -1387,7 +1391,7 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
}
EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
-unsigned long snmp_fold_field(void *mib[], int offt)
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
{
unsigned long res = 0;
int i;
@@ -1400,7 +1404,7 @@ unsigned long snmp_fold_field(void *mib[], int offt)
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void *ptr[2], size_t mibsize)
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
{
BUG_ON(ptr == NULL);
ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
@@ -1418,7 +1422,7 @@ err0:
}
EXPORT_SYMBOL_GPL(snmp_mib_init);
-void snmp_mib_free(void *ptr[2])
+void snmp_mib_free(void __percpu *ptr[2])
{
BUG_ON(ptr == NULL);
free_percpu(ptr[0]);
@@ -1462,25 +1466,25 @@ static const struct net_protocol icmp_protocol = {
static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void **)net->mib.tcp_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
sizeof(struct tcp_mib)) < 0)
goto err_tcp_mib;
- if (snmp_mib_init((void **)net->mib.ip_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
sizeof(struct ipstats_mib)) < 0)
goto err_ip_mib;
- if (snmp_mib_init((void **)net->mib.net_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
sizeof(struct linux_mib)) < 0)
goto err_net_mib;
- if (snmp_mib_init((void **)net->mib.udp_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
sizeof(struct udp_mib)) < 0)
goto err_udp_mib;
- if (snmp_mib_init((void **)net->mib.udplite_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
sizeof(struct udp_mib)) < 0)
goto err_udplite_mib;
- if (snmp_mib_init((void **)net->mib.icmp_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
sizeof(struct icmp_mib)) < 0)
goto err_icmp_mib;
- if (snmp_mib_init((void **)net->mib.icmpmsg_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
sizeof(struct icmpmsg_mib)) < 0)
goto err_icmpmsg_mib;
@@ -1488,30 +1492,30 @@ static __net_init int ipv4_mib_init_net(struct net *net)
return 0;
err_icmpmsg_mib:
- snmp_mib_free((void **)net->mib.icmp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void **)net->mib.udplite_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
err_udplite_mib:
- snmp_mib_free((void **)net->mib.udp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udp_statistics);
err_udp_mib:
- snmp_mib_free((void **)net->mib.net_statistics);
+ snmp_mib_free((void __percpu **)net->mib.net_statistics);
err_net_mib:
- snmp_mib_free((void **)net->mib.ip_statistics);
+ snmp_mib_free((void __percpu **)net->mib.ip_statistics);
err_ip_mib:
- snmp_mib_free((void **)net->mib.tcp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
err_tcp_mib:
return -ENOMEM;
}
static __net_exit void ipv4_mib_exit_net(struct net *net)
{
- snmp_mib_free((void **)net->mib.icmpmsg_statistics);
- snmp_mib_free((void **)net->mib.icmp_statistics);
- snmp_mib_free((void **)net->mib.udplite_statistics);
- snmp_mib_free((void **)net->mib.udp_statistics);
- snmp_mib_free((void **)net->mib.net_statistics);
- snmp_mib_free((void **)net->mib.ip_statistics);
- snmp_mib_free((void **)net->mib.tcp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+ snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
}
static __net_initdata struct pernet_operations ipv4_mib_ops = {
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c
index 5c662703eb1e..880a5ec6dce0 100644
--- a/net/ipv4/ah4.c
+++ b/net/ipv4/ah4.c
@@ -1,14 +1,73 @@
+#include <crypto/hash.h>
#include <linux/err.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/xfrm.h>
#include <net/ah.h>
#include <linux/crypto.h>
#include <linux/pfkeyv2.h>
-#include <linux/spinlock.h>
+#include <linux/scatterlist.h>
#include <net/icmp.h>
#include <net/protocol.h>
+struct ah_skb_cb {
+ struct xfrm_skb_cb xfrm;
+ void *tmp;
+};
+
+#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+
+static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
+ unsigned int size)
+{
+ unsigned int len;
+
+ len = size + crypto_ahash_digestsize(ahash) +
+ (crypto_ahash_alignmask(ahash) &
+ ~(crypto_tfm_ctx_alignment() - 1));
+
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+
+ len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash);
+ len = ALIGN(len, __alignof__(struct scatterlist));
+
+ len += sizeof(struct scatterlist) * nfrags;
+
+ return kmalloc(len, GFP_ATOMIC);
+}
+
+static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset)
+{
+ return tmp + offset;
+}
+
+static inline u8 *ah_tmp_icv(struct crypto_ahash *ahash, void *tmp,
+ unsigned int offset)
+{
+ return PTR_ALIGN((u8 *)tmp + offset, crypto_ahash_alignmask(ahash) + 1);
+}
+
+static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash,
+ u8 *icv)
+{
+ struct ahash_request *req;
+
+ req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash),
+ crypto_tfm_ctx_alignment());
+
+ ahash_request_set_tfm(req, ahash);
+
+ return req;
+}
+
+static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash,
+ struct ahash_request *req)
+{
+ return (void *)ALIGN((unsigned long)(req + 1) +
+ crypto_ahash_reqsize(ahash),
+ __alignof__(struct scatterlist));
+}
/* Clear mutable options and find final destination to substitute
* into IP header for icv calculation. Options are already checked
@@ -54,20 +113,72 @@ static int ip_clear_mutable_options(struct iphdr *iph, __be32 *daddr)
return 0;
}
+static void ah_output_done(struct crypto_async_request *base, int err)
+{
+ u8 *icv;
+ struct iphdr *iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = skb_dst(skb)->xfrm;
+ struct ah_data *ahp = x->data;
+ struct iphdr *top_iph = ip_hdr(skb);
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+
+ iph = AH_SKB_CB(skb)->tmp;
+ icv = ah_tmp_icv(ahp->ahash, iph, ihl);
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
+
+ top_iph->tos = iph->tos;
+ top_iph->ttl = iph->ttl;
+ top_iph->frag_off = iph->frag_off;
+ if (top_iph->ihl != 5) {
+ top_iph->daddr = iph->daddr;
+ memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
+ }
+
+ err = ah->nexthdr;
+
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_output_resume(skb, err);
+}
+
static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
{
int err;
+ int nfrags;
+ int ihl;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
struct iphdr *iph, *top_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- union {
- struct iphdr iph;
- char buf[60];
- } tmp_iph;
+
+ ahp = x->data;
+ ahash = ahp->ahash;
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
skb_push(skb, -skb_network_offset(skb));
+ ah = ip_auth_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ err = -ENOMEM;
+ iph = ah_alloc_tmp(ahash, nfrags, ihl);
+ if (!iph)
+ goto out;
+
+ icv = ah_tmp_icv(ahash, iph, ihl);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
+
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
+
top_iph = ip_hdr(skb);
- iph = &tmp_iph.iph;
iph->tos = top_iph->tos;
iph->ttl = top_iph->ttl;
@@ -78,10 +189,9 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
err = ip_clear_mutable_options(top_iph, &top_iph->daddr);
if (err)
- goto error;
+ goto out_free;
}
- ah = ip_auth_hdr(skb);
ah->nexthdr = *skb_mac_header(skb);
*skb_mac_header(skb) = IPPROTO_AH;
@@ -91,20 +201,31 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
top_iph->ttl = 0;
top_iph->check = 0;
- ahp = x->data;
ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2;
ah->reserved = 0;
ah->spi = x->id.spi;
ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output);
- spin_lock_bh(&x->lock);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- memcpy(ah->auth_data, ahp->work_icv, ahp->icv_trunc_len);
- spin_unlock_bh(&x->lock);
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
- if (err)
- goto error;
+ ahash_request_set_crypt(req, sg, icv, skb->len);
+ ahash_request_set_callback(req, 0, ah_output_done, skb);
+
+ AH_SKB_CB(skb)->tmp = iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
+ }
+
+ memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
top_iph->tos = iph->tos;
top_iph->ttl = iph->ttl;
@@ -114,28 +235,67 @@ static int ah_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr));
}
- err = 0;
-
-error:
+out_free:
+ kfree(iph);
+out:
return err;
}
+static void ah_input_done(struct crypto_async_request *base, int err)
+{
+ u8 *auth_data;
+ u8 *icv;
+ struct iphdr *work_iph;
+ struct sk_buff *skb = base->data;
+ struct xfrm_state *x = xfrm_input_state(skb);
+ struct ah_data *ahp = x->data;
+ struct ip_auth_hdr *ah = ip_auth_hdr(skb);
+ int ihl = ip_hdrlen(skb);
+ int ah_hlen = (ah->hdrlen + 2) << 2;
+
+ work_iph = AH_SKB_CB(skb)->tmp;
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(ahp->ahash, auth_data, ahp->icv_trunc_len);
+
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
+ if (err)
+ goto out;
+
+ skb->network_header += ah_hlen;
+ memcpy(skb_network_header(skb), work_iph, ihl);
+ __skb_pull(skb, ah_hlen + ihl);
+ skb_set_transport_header(skb, -ihl);
+
+ err = ah->nexthdr;
+out:
+ kfree(AH_SKB_CB(skb)->tmp);
+ xfrm_input_resume(skb, err);
+}
+
static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
{
int ah_hlen;
int ihl;
int nexthdr;
- int err = -EINVAL;
- struct iphdr *iph;
+ int nfrags;
+ u8 *auth_data;
+ u8 *icv;
+ struct sk_buff *trailer;
+ struct crypto_ahash *ahash;
+ struct ahash_request *req;
+ struct scatterlist *sg;
+ struct iphdr *iph, *work_iph;
struct ip_auth_hdr *ah;
struct ah_data *ahp;
- char work_buf[60];
+ int err = -ENOMEM;
if (!pskb_may_pull(skb, sizeof(*ah)))
goto out;
ah = (struct ip_auth_hdr *)skb->data;
ahp = x->data;
+ ahash = ahp->ahash;
+
nexthdr = ah->nexthdr;
ah_hlen = (ah->hdrlen + 2) << 2;
@@ -156,9 +316,24 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
ah = (struct ip_auth_hdr *)skb->data;
iph = ip_hdr(skb);
+ ihl = ip_hdrlen(skb);
+
+ if ((err = skb_cow_data(skb, 0, &trailer)) < 0)
+ goto out;
+ nfrags = err;
+
+ work_iph = ah_alloc_tmp(ahash, nfrags, ihl + ahp->icv_trunc_len);
+ if (!work_iph)
+ goto out;
+
+ auth_data = ah_tmp_auth(work_iph, ihl);
+ icv = ah_tmp_icv(ahash, auth_data, ahp->icv_trunc_len);
+ req = ah_tmp_req(ahash, icv);
+ sg = ah_req_sg(ahash, req);
- ihl = skb->data - skb_network_header(skb);
- memcpy(work_buf, iph, ihl);
+ memcpy(work_iph, iph, ihl);
+ memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
+ memset(ah->auth_data, 0, ahp->icv_trunc_len);
iph->ttl = 0;
iph->tos = 0;
@@ -166,35 +341,44 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb)
iph->check = 0;
if (ihl > sizeof(*iph)) {
__be32 dummy;
- if (ip_clear_mutable_options(iph, &dummy))
- goto out;
+ err = ip_clear_mutable_options(iph, &dummy);
+ if (err)
+ goto out_free;
}
- spin_lock(&x->lock);
- {
- u8 auth_data[MAX_AH_AUTH_LEN];
+ skb_push(skb, ihl);
- memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len);
- skb_push(skb, ihl);
- err = ah_mac_digest(ahp, skb, ah->auth_data);
- if (err)
- goto unlock;
- if (memcmp(ahp->work_icv, auth_data, ahp->icv_trunc_len))
- err = -EBADMSG;
+ sg_init_table(sg, nfrags);
+ skb_to_sgvec(skb, sg, 0, skb->len);
+
+ ahash_request_set_crypt(req, sg, icv, skb->len);
+ ahash_request_set_callback(req, 0, ah_input_done, skb);
+
+ AH_SKB_CB(skb)->tmp = work_iph;
+
+ err = crypto_ahash_digest(req);
+ if (err) {
+ if (err == -EINPROGRESS)
+ goto out;
+
+ if (err == -EBUSY)
+ err = NET_XMIT_DROP;
+ goto out_free;
}
-unlock:
- spin_unlock(&x->lock);
+ err = memcmp(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG: 0;
if (err)
- goto out;
+ goto out_free;
skb->network_header += ah_hlen;
- memcpy(skb_network_header(skb), work_buf, ihl);
- skb->transport_header = skb->network_header;
+ memcpy(skb_network_header(skb), work_iph, ihl);
__skb_pull(skb, ah_hlen + ihl);
+ skb_set_transport_header(skb, -ihl);
- return nexthdr;
+ err = nexthdr;
+out_free:
+ kfree (work_iph);
out:
return err;
}
@@ -210,7 +394,7 @@ static void ah4_err(struct sk_buff *skb, u32 info)
icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return;
- x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET);
if (!x)
return;
printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n",
@@ -222,7 +406,7 @@ static int ah_init_state(struct xfrm_state *x)
{
struct ah_data *ahp = NULL;
struct xfrm_algo_desc *aalg_desc;
- struct crypto_hash *tfm;
+ struct crypto_ahash *ahash;
if (!x->aalg)
goto error;
@@ -231,44 +415,40 @@ static int ah_init_state(struct xfrm_state *x)
goto error;
ahp = kzalloc(sizeof(*ahp), GFP_KERNEL);
- if (ahp == NULL)
+ if (!ahp)
return -ENOMEM;
- tfm = crypto_alloc_hash(x->aalg->alg_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm))
+ ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0);
+ if (IS_ERR(ahash))
goto error;
- ahp->tfm = tfm;
- if (crypto_hash_setkey(tfm, x->aalg->alg_key,
- (x->aalg->alg_key_len + 7) / 8))
+ ahp->ahash = ahash;
+ if (crypto_ahash_setkey(ahash, x->aalg->alg_key,
+ (x->aalg->alg_key_len + 7) / 8))
goto error;
/*
* Lookup the algorithm description maintained by xfrm_algo,
* verify crypto transform properties, and store information
* we need for AH processing. This lookup cannot fail here
- * after a successful crypto_alloc_hash().
+ * after a successful crypto_alloc_ahash().
*/
aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0);
BUG_ON(!aalg_desc);
if (aalg_desc->uinfo.auth.icv_fullbits/8 !=
- crypto_hash_digestsize(tfm)) {
+ crypto_ahash_digestsize(ahash)) {
printk(KERN_INFO "AH: %s digestsize %u != %hu\n",
- x->aalg->alg_name, crypto_hash_digestsize(tfm),
+ x->aalg->alg_name, crypto_ahash_digestsize(ahash),
aalg_desc->uinfo.auth.icv_fullbits/8);
goto error;
}
ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8;
- ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8;
+ ahp->icv_trunc_len = x->aalg->alg_trunc_len/8;
BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN);
- ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL);
- if (!ahp->work_icv)
- goto error;
-
x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) +
ahp->icv_trunc_len);
if (x->props.mode == XFRM_MODE_TUNNEL)
@@ -279,8 +459,7 @@ static int ah_init_state(struct xfrm_state *x)
error:
if (ahp) {
- kfree(ahp->work_icv);
- crypto_free_hash(ahp->tfm);
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
return -EINVAL;
@@ -293,8 +472,7 @@ static void ah_destroy(struct xfrm_state *x)
if (!ahp)
return;
- kfree(ahp->work_icv);
- crypto_free_hash(ahp->tfm);
+ crypto_free_ahash(ahp->ahash);
kfree(ahp);
}
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4e80f336c0cf..6e747065c202 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -70,6 +70,7 @@
* bonding can change the skb before
* sending (e.g. insert 8021q tag).
* Harald Welte : convert to make use of jenkins hash
+ * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support.
*/
#include <linux/module.h>
@@ -97,6 +98,7 @@
#include <linux/net.h>
#include <linux/rcupdate.h>
#include <linux/jhash.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -524,12 +526,15 @@ int arp_bind_neighbour(struct dst_entry *dst)
/*
* Check if we can use proxy ARP for this path
*/
-
-static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
+static inline int arp_fwd_proxy(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt)
{
struct in_device *out_dev;
int imi, omi = -1;
+ if (rt->u.dst.dev == dev)
+ return 0;
+
if (!IN_DEV_PROXY_ARP(in_dev))
return 0;
@@ -548,6 +553,43 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt)
}
/*
+ * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev)
+ *
+ * RFC3069 supports proxy arp replies back to the same interface. This
+ * is done to support (ethernet) switch features, like RFC 3069, where
+ * the individual ports are not allowed to communicate with each
+ * other, BUT they are allowed to talk to the upstream router. As
+ * described in RFC 3069, it is possible to allow these hosts to
+ * communicate through the upstream router, by proxy_arp'ing.
+ *
+ * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation"
+ *
+ * This technology is known by different names:
+ * In RFC 3069 it is called VLAN Aggregation.
+ * Cisco and Allied Telesyn call it Private VLAN.
+ * Hewlett-Packard call it Source-Port filtering or port-isolation.
+ * Ericsson call it MAC-Forced Forwarding (RFC Draft).
+ *
+ */
+static inline int arp_fwd_pvlan(struct in_device *in_dev,
+ struct net_device *dev, struct rtable *rt,
+ __be32 sip, __be32 tip)
+{
+ /* Private VLAN is only concerned about the same ethernet segment */
+ if (rt->u.dst.dev != dev)
+ return 0;
+
+ /* Don't reply on self probes (often done by windowz boxes)*/
+ if (sip == tip)
+ return 0;
+
+ if (IN_DEV_PROXY_ARP_PVLAN(in_dev))
+ return 1;
+ else
+ return 0;
+}
+
+/*
* Interface to link layer: send routine and receive handler.
*/
@@ -833,8 +875,11 @@ static int arp_process(struct sk_buff *skb)
}
goto out;
} else if (IN_DEV_FORWARD(in_dev)) {
- if (addr_type == RTN_UNICAST && rt->u.dst.dev != dev &&
- (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
+ if (addr_type == RTN_UNICAST &&
+ (arp_fwd_proxy(in_dev, dev, rt) ||
+ arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
+ pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
+ {
n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
if (n)
neigh_release(n);
@@ -863,7 +908,8 @@ static int arp_process(struct sk_buff *skb)
devices (strip is candidate)
*/
if (n == NULL &&
- arp->ar_op == htons(ARPOP_REPLY) &&
+ (arp->ar_op == htons(ARPOP_REPLY) ||
+ (arp->ar_op == htons(ARPOP_REQUEST) && tip == sip)) &&
inet_addr_type(net, sip) == RTN_UNICAST)
n = __neigh_lookup(&arp_tbl, &sip, dev, 1);
}
@@ -1239,8 +1285,7 @@ void __init arp_init(void)
dev_add_pack(&arp_packet_type);
arp_proc_init();
#ifdef CONFIG_SYSCTL
- neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ neigh_sysctl_register(NULL, &arp_tbl.parms, "ipv4", NULL);
#endif
register_netdevice_notifier(&arp_netdev_notifier);
}
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 039cc1ffe977..c97cd9ff697e 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -44,6 +44,7 @@
#include <linux/string.h>
#include <linux/jhash.h>
#include <linux/audit.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -2017,7 +2018,7 @@ req_setattr_failure:
* values on failure.
*
*/
-int cipso_v4_delopt(struct ip_options **opt_ptr)
+static int cipso_v4_delopt(struct ip_options **opt_ptr)
{
int hdr_delta = 0;
struct ip_options *opt = *opt_ptr;
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 5e6c5a0f3fde..fb2465811b48 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -39,7 +39,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_dst_reset(sk);
oif = sk->sk_bound_dev_if;
- saddr = inet->saddr;
+ saddr = inet->inet_saddr;
if (ipv4_is_multicast(usin->sin_addr.s_addr)) {
if (!oif)
oif = inet->mc_index;
@@ -49,7 +49,7 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr,
RT_CONN_FLAGS(sk), oif,
sk->sk_protocol,
- inet->sport, usin->sin_port, sk, 1);
+ inet->inet_sport, usin->sin_port, sk, 1);
if (err) {
if (err == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -60,14 +60,14 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
ip_rt_put(rt);
return -EACCES;
}
- if (!inet->saddr)
- inet->saddr = rt->rt_src; /* Update source address */
- if (!inet->rcv_saddr)
- inet->rcv_saddr = rt->rt_src;
- inet->daddr = rt->rt_dst;
- inet->dport = usin->sin_port;
+ if (!inet->inet_saddr)
+ inet->inet_saddr = rt->rt_src; /* Update source address */
+ if (!inet->inet_rcv_saddr)
+ inet->inet_rcv_saddr = rt->rt_src;
+ inet->inet_daddr = rt->rt_dst;
+ inet->inet_dport = usin->sin_port;
sk->sk_state = TCP_ESTABLISHED;
- inet->id = jiffies;
+ inet->inet_id = jiffies;
sk_dst_set(sk, &rt->u.dst);
return(0);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 5df2f6a0b0f0..90e3d6379a42 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -50,6 +50,7 @@
#include <linux/notifier.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
+#include <linux/slab.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
@@ -64,20 +65,20 @@
static struct ipv4_devconf ipv4_devconf = {
.data = {
- [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
},
};
static struct ipv4_devconf ipv4_devconf_dflt = {
.data = {
- [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,
- [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,
- [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SEND_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SECURE_REDIRECTS - 1] = 1,
+ [IPV4_DEVCONF_SHARED_MEDIA - 1] = 1,
+ [IPV4_DEVCONF_ACCEPT_SOURCE_ROUTE - 1] = 1,
},
};
@@ -140,11 +141,11 @@ void in_dev_finish_destroy(struct in_device *idev)
#endif
dev_put(dev);
if (!idev->dead)
- printk("Freeing alive in_device %p\n", idev);
- else {
+ pr_err("Freeing alive in_device %p\n", idev);
+ else
kfree(idev);
- }
}
+EXPORT_SYMBOL(in_dev_finish_destroy);
static struct in_device *inetdev_init(struct net_device *dev)
{
@@ -159,7 +160,8 @@ static struct in_device *inetdev_init(struct net_device *dev)
sizeof(in_dev->cnf));
in_dev->cnf.sysctl = NULL;
in_dev->dev = dev;
- if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)
+ in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl);
+ if (!in_dev->arp_parms)
goto out_kfree;
if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))
dev_disable_lro(dev);
@@ -405,13 +407,15 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
{
struct net_device *dev;
struct in_device *in_dev = NULL;
- read_lock(&dev_base_lock);
- dev = __dev_get_by_index(net, ifindex);
+
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, ifindex);
if (dev)
in_dev = in_dev_get(dev);
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
return in_dev;
}
+EXPORT_SYMBOL(inetdev_by_index);
/* Called only from RTNL semaphored context. No locks. */
@@ -557,7 +561,7 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
* Determine a default network mask, based on the IP address.
*/
-static __inline__ int inet_abc_len(__be32 addr)
+static inline int inet_abc_len(__be32 addr)
{
int rc = -1; /* Something else, probably a multicast. */
@@ -646,13 +650,15 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
rtnl_lock();
ret = -ENODEV;
- if ((dev = __dev_get_by_name(net, ifr.ifr_name)) == NULL)
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev)
goto done;
if (colon)
*colon = ':';
- if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+ in_dev = __in_dev_get_rtnl(dev);
+ if (in_dev) {
if (tryaddrmatch) {
/* Matthias Andree */
/* compare label and address (4.4BSD style) */
@@ -720,7 +726,8 @@ int devinet_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (!ifa) {
ret = -ENOBUFS;
- if ((ifa = inet_alloc_ifa()) == NULL)
+ ifa = inet_alloc_ifa();
+ if (!ifa)
break;
if (colon)
memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ);
@@ -822,10 +829,10 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len)
struct ifreq ifr;
int done = 0;
- if (!in_dev || (ifa = in_dev->ifa_list) == NULL)
+ if (!in_dev)
goto out;
- for (; ifa; ifa = ifa->ifa_next) {
+ for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
if (!buf) {
done += sizeof(ifr);
continue;
@@ -875,36 +882,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (!addr)
addr = ifa->ifa_local;
} endfor_ifa(in_dev);
-no_in_dev:
- rcu_read_unlock();
if (addr)
- goto out;
+ goto out_unlock;
+no_in_dev:
/* Not loopback addresses on loopback should be preferred
in this case. It is importnat that lo is the first interface
in dev_base list.
*/
- read_lock(&dev_base_lock);
- rcu_read_lock();
- for_each_netdev(net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev)) == NULL)
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
continue;
for_primary_ifa(in_dev) {
if (ifa->ifa_scope != RT_SCOPE_LINK &&
ifa->ifa_scope <= scope) {
addr = ifa->ifa_local;
- goto out_unlock_both;
+ goto out_unlock;
}
} endfor_ifa(in_dev);
}
-out_unlock_both:
- read_unlock(&dev_base_lock);
+out_unlock:
rcu_read_unlock();
-out:
return addr;
}
+EXPORT_SYMBOL(inet_select_addr);
static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
__be32 local, int scope)
@@ -940,7 +944,7 @@ static __be32 confirm_addr_indev(struct in_device *in_dev, __be32 dst,
}
} endfor_ifa(in_dev);
- return same? addr : 0;
+ return same ? addr : 0;
}
/*
@@ -961,17 +965,16 @@ __be32 inet_confirm_addr(struct in_device *in_dev,
return confirm_addr_indev(in_dev, dst, local, scope);
net = dev_net(in_dev->dev);
- read_lock(&dev_base_lock);
rcu_read_lock();
- for_each_netdev(net, dev) {
- if ((in_dev = __in_dev_get_rcu(dev))) {
+ for_each_netdev_rcu(net, dev) {
+ in_dev = __in_dev_get_rcu(dev);
+ if (in_dev) {
addr = confirm_addr_indev(in_dev, dst, local, scope);
if (addr)
break;
}
}
rcu_read_unlock();
- read_unlock(&dev_base_lock);
return addr;
}
@@ -984,14 +987,16 @@ int register_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(register_inetaddr_notifier);
int unregister_inetaddr_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&inetaddr_chain, nb);
}
+EXPORT_SYMBOL(unregister_inetaddr_notifier);
-/* Rename ifa_labels for a device name change. Make some effort to preserve existing
- * alias numbering and to create unique labels if possible.
+/* Rename ifa_labels for a device name change. Make some effort to preserve
+ * existing alias numbering and to create unique labels if possible.
*/
static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
{
@@ -1010,11 +1015,10 @@ static void inetdev_changename(struct net_device *dev, struct in_device *in_dev)
sprintf(old, ":%d", named);
dot = old;
}
- if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) {
+ if (strlen(dot) + strlen(dev->name) < IFNAMSIZ)
strcat(ifa->ifa_label, dot);
- } else {
+ else
strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot);
- }
skip:
rtmsg_ifa(RTM_NEWADDR, ifa, NULL, 0);
}
@@ -1061,8 +1065,9 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
if (!inetdev_valid_mtu(dev->mtu))
break;
if (dev->flags & IFF_LOOPBACK) {
- struct in_ifaddr *ifa;
- if ((ifa = inet_alloc_ifa()) != NULL) {
+ struct in_ifaddr *ifa = inet_alloc_ifa();
+
+ if (ifa) {
ifa->ifa_local =
ifa->ifa_address = htonl(INADDR_LOOPBACK);
ifa->ifa_prefixlen = 8;
@@ -1170,38 +1175,54 @@ nla_put_failure:
static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
- int idx, ip_idx;
+ int h, s_h;
+ int idx, s_idx;
+ int ip_idx, s_ip_idx;
struct net_device *dev;
struct in_device *in_dev;
struct in_ifaddr *ifa;
- int s_ip_idx, s_idx = cb->args[0];
+ struct hlist_head *head;
+ struct hlist_node *node;
- s_ip_idx = ip_idx = cb->args[1];
- idx = 0;
- for_each_netdev(net, dev) {
- if (idx < s_idx)
- goto cont;
- if (idx > s_idx)
- s_ip_idx = 0;
- if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
- goto cont;
-
- for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
- ifa = ifa->ifa_next, ip_idx++) {
- if (ip_idx < s_ip_idx)
- continue;
- if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid,
+ s_h = cb->args[0];
+ s_idx = idx = cb->args[1];
+ s_ip_idx = ip_idx = cb->args[2];
+
+ for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
+ idx = 0;
+ head = &net->dev_index_head[h];
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(dev, node, head, index_hlist) {
+ if (idx < s_idx)
+ goto cont;
+ if (h > s_h || idx > s_idx)
+ s_ip_idx = 0;
+ in_dev = __in_dev_get_rcu(dev);
+ if (!in_dev)
+ goto cont;
+
+ for (ifa = in_dev->ifa_list, ip_idx = 0; ifa;
+ ifa = ifa->ifa_next, ip_idx++) {
+ if (ip_idx < s_ip_idx)
+ continue;
+ if (inet_fill_ifaddr(skb, ifa,
+ NETLINK_CB(cb->skb).pid,
cb->nlh->nlmsg_seq,
- RTM_NEWADDR, NLM_F_MULTI) <= 0)
- goto done;
- }
+ RTM_NEWADDR, NLM_F_MULTI) <= 0) {
+ rcu_read_unlock();
+ goto done;
+ }
+ }
cont:
- idx++;
+ idx++;
+ }
+ rcu_read_unlock();
}
done:
- cb->args[0] = idx;
- cb->args[1] = ip_idx;
+ cb->args[0] = h;
+ cb->args[1] = idx;
+ cb->args[2] = ip_idx;
return skb->len;
}
@@ -1239,18 +1260,18 @@ static void devinet_copy_dflt_conf(struct net *net, int i)
{
struct net_device *dev;
- read_lock(&dev_base_lock);
- for_each_netdev(net, dev) {
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
struct in_device *in_dev;
- rcu_read_lock();
+
in_dev = __in_dev_get_rcu(dev);
if (in_dev && !test_bit(i, in_dev->cnf.state))
in_dev->cnf.data[i] = net->ipv4.devconf_dflt->data[i];
- rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
+/* called with RTNL locked */
static void inet_forward_change(struct net *net)
{
struct net_device *dev;
@@ -1259,7 +1280,6 @@ static void inet_forward_change(struct net *net)
IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;
IPV4_DEVCONF_DFLT(net, FORWARDING) = on;
- read_lock(&dev_base_lock);
for_each_netdev(net, dev) {
struct in_device *in_dev;
if (on)
@@ -1270,7 +1290,6 @@ static void inet_forward_change(struct net *net)
IN_DEV_CONF_SET(in_dev, FORWARDING, on);
rcu_read_unlock();
}
- read_unlock(&dev_base_lock);
}
static int devinet_conf_proc(ctl_table *ctl, int write,
@@ -1293,72 +1312,25 @@ static int devinet_conf_proc(ctl_table *ctl, int write,
return ret;
}
-static int devinet_conf_sysctl(ctl_table *table,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- struct ipv4_devconf *cnf;
- struct net *net;
- int *valp = table->data;
- int new;
- int i;
-
- if (!newval || !newlen)
- return 0;
-
- if (newlen != sizeof(int))
- return -EINVAL;
-
- if (get_user(new, (int __user *)newval))
- return -EFAULT;
-
- if (new == *valp)
- return 0;
-
- if (oldval && oldlenp) {
- size_t len;
-
- if (get_user(len, oldlenp))
- return -EFAULT;
-
- if (len) {
- if (len > table->maxlen)
- len = table->maxlen;
- if (copy_to_user(oldval, valp, len))
- return -EFAULT;
- if (put_user(len, oldlenp))
- return -EFAULT;
- }
- }
-
- *valp = new;
-
- cnf = table->extra1;
- net = table->extra2;
- i = (int *)table->data - cnf->data;
-
- set_bit(i, cnf->state);
-
- if (cnf == net->ipv4.devconf_dflt)
- devinet_copy_dflt_conf(net, i);
-
- return 1;
-}
-
static int devinet_sysctl_forward(ctl_table *ctl, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int *valp = ctl->data;
int val = *valp;
+ loff_t pos = *ppos;
int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
if (write && *valp != val) {
struct net *net = ctl->extra2;
if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {
- if (!rtnl_trylock())
+ if (!rtnl_trylock()) {
+ /* Restore the original values before restarting */
+ *valp = val;
+ *ppos = pos;
return restart_syscall();
+ }
if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {
inet_forward_change(net);
} else if (*valp) {
@@ -1390,57 +1362,37 @@ int ipv4_doint_and_flush(ctl_table *ctl, int write,
return ret;
}
-int ipv4_doint_and_flush_strategy(ctl_table *table,
- void __user *oldval, size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- int ret = devinet_conf_sysctl(table, oldval, oldlenp, newval, newlen);
- struct net *net = table->extra2;
-
- if (ret == 1)
- rt_cache_flush(net, 0);
-
- return ret;
-}
-
-
-#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc, sysctl) \
+#define DEVINET_SYSCTL_ENTRY(attr, name, mval, proc) \
{ \
- .ctl_name = NET_IPV4_CONF_ ## attr, \
.procname = name, \
.data = ipv4_devconf.data + \
- NET_IPV4_CONF_ ## attr - 1, \
+ IPV4_DEVCONF_ ## attr - 1, \
.maxlen = sizeof(int), \
.mode = mval, \
.proc_handler = proc, \
- .strategy = sysctl, \
.extra1 = &ipv4_devconf, \
}
#define DEVINET_SYSCTL_RW_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc, \
- devinet_conf_sysctl)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, devinet_conf_proc)
#define DEVINET_SYSCTL_RO_ENTRY(attr, name) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc, \
- devinet_conf_sysctl)
+ DEVINET_SYSCTL_ENTRY(attr, name, 0444, devinet_conf_proc)
-#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc, sysctl) \
- DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc, sysctl)
+#define DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, proc) \
+ DEVINET_SYSCTL_ENTRY(attr, name, 0644, proc)
#define DEVINET_SYSCTL_FLUSHING_ENTRY(attr, name) \
- DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush, \
- ipv4_doint_and_flush_strategy)
+ DEVINET_SYSCTL_COMPLEX_ENTRY(attr, name, ipv4_doint_and_flush)
static struct devinet_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table devinet_vars[__NET_IPV4_CONF_MAX];
+ struct ctl_table devinet_vars[__IPV4_DEVCONF_MAX];
char *dev_name;
} devinet_sysctl = {
.devinet_vars = {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
- devinet_sysctl_forward,
- devinet_conf_sysctl),
+ devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
@@ -1450,6 +1402,8 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(SEND_REDIRECTS, "send_redirects"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_SOURCE_ROUTE,
"accept_source_route"),
+ DEVINET_SYSCTL_RW_ENTRY(ACCEPT_LOCAL, "accept_local"),
+ DEVINET_SYSCTL_RW_ENTRY(SRC_VMARK, "src_valid_mark"),
DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP, "proxy_arp"),
DEVINET_SYSCTL_RW_ENTRY(MEDIUM_ID, "medium_id"),
DEVINET_SYSCTL_RW_ENTRY(BOOTP_RELAY, "bootp_relay"),
@@ -1460,6 +1414,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
+ DEVINET_SYSCTL_RW_ENTRY(PROXY_ARP_PVLAN, "proxy_arp_pvlan"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -1471,7 +1426,7 @@ static struct devinet_sysctl_table {
};
static int __devinet_sysctl_register(struct net *net, char *dev_name,
- int ctl_name, struct ipv4_devconf *p)
+ struct ipv4_devconf *p)
{
int i;
struct devinet_sysctl_table *t;
@@ -1479,9 +1434,9 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
#define DEVINET_CTL_PATH_DEV 3
struct ctl_path devinet_ctl_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
- { .procname = "conf", .ctl_name = NET_IPV4_CONF, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
+ { .procname = "conf", },
{ /* to be set */ },
{ },
};
@@ -1506,7 +1461,6 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name,
goto free;
devinet_ctl_path[DEVINET_CTL_PATH_DEV].procname = t->dev_name;
- devinet_ctl_path[DEVINET_CTL_PATH_DEV].ctl_name = ctl_name;
t->sysctl_header = register_net_sysctl_table(net, devinet_ctl_path,
t->devinet_vars);
@@ -1539,10 +1493,9 @@ static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf)
static void devinet_sysctl_register(struct in_device *idev)
{
- neigh_sysctl_register(idev->dev, idev->arp_parms, NET_IPV4,
- NET_IPV4_NEIGH, "ipv4", NULL, NULL);
+ neigh_sysctl_register(idev->dev, idev->arp_parms, "ipv4", NULL);
__devinet_sysctl_register(dev_net(idev->dev), idev->dev->name,
- idev->dev->ifindex, &idev->cnf);
+ &idev->cnf);
}
static void devinet_sysctl_unregister(struct in_device *idev)
@@ -1553,14 +1506,12 @@ static void devinet_sysctl_unregister(struct in_device *idev)
static struct ctl_table ctl_forward_entry[] = {
{
- .ctl_name = NET_IPV4_FORWARD,
.procname = "ip_forward",
.data = &ipv4_devconf.data[
- NET_IPV4_CONF_FORWARDING - 1],
+ IPV4_DEVCONF_FORWARDING - 1],
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = devinet_sysctl_forward,
- .strategy = devinet_conf_sysctl,
.extra1 = &ipv4_devconf,
.extra2 = &init_net,
},
@@ -1568,8 +1519,8 @@ static struct ctl_table ctl_forward_entry[] = {
};
static __net_initdata struct ctl_path net_ipv4_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
{ },
};
#endif
@@ -1587,7 +1538,7 @@ static __net_init int devinet_init_net(struct net *net)
all = &ipv4_devconf;
dflt = &ipv4_devconf_dflt;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);
if (all == NULL)
goto err_alloc_all;
@@ -1601,20 +1552,18 @@ static __net_init int devinet_init_net(struct net *net)
if (tbl == NULL)
goto err_alloc_ctl;
- tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1];
+ tbl[0].data = &all->data[IPV4_DEVCONF_FORWARDING - 1];
tbl[0].extra1 = all;
tbl[0].extra2 = net;
#endif
}
#ifdef CONFIG_SYSCTL
- err = __devinet_sysctl_register(net, "all",
- NET_PROTO_CONF_ALL, all);
+ err = __devinet_sysctl_register(net, "all", all);
if (err < 0)
goto err_reg_all;
- err = __devinet_sysctl_register(net, "default",
- NET_PROTO_CONF_DEFAULT, dflt);
+ err = __devinet_sysctl_register(net, "default", dflt);
if (err < 0)
goto err_reg_dflt;
@@ -1680,8 +1629,3 @@ void __init devinet_init(void)
rtnl_register(PF_INET, RTM_GETADDR, NULL, inet_dump_ifaddr);
}
-EXPORT_SYMBOL(in_dev_finish_destroy);
-EXPORT_SYMBOL(inet_select_addr);
-EXPORT_SYMBOL(inetdev_by_index);
-EXPORT_SYMBOL(register_inetaddr_notifier);
-EXPORT_SYMBOL(unregister_inetaddr_notifier);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 12f7287e902d..14ca1f1c3fb0 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -422,7 +422,7 @@ static void esp4_err(struct sk_buff *skb, u32 info)
icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
return;
- x = xfrm_state_lookup(net, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
+ x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET);
if (!x)
return;
NETDEBUG(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n",
@@ -530,7 +530,7 @@ static int esp_init_authenc(struct xfrm_state *x)
}
err = crypto_aead_setauthsize(
- aead, aalg_desc->uinfo.auth.icv_truncbits / 8);
+ aead, x->aalg->alg_trunc_len / 8);
if (err)
goto free_key;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index aa00398be80e..4f0ed458c883 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -34,6 +34,7 @@
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -125,7 +126,7 @@ void fib_select_default(struct net *net,
#endif
tb = fib_get_table(net, table);
if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
- tb->tb_select_default(tb, flp, res);
+ fib_table_select_default(tb, flp, res);
}
static void fib_flush(struct net *net)
@@ -139,7 +140,7 @@ static void fib_flush(struct net *net)
for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
head = &net->ipv4.fib_table_hash[h];
hlist_for_each_entry(tb, node, head, tb_hlist)
- flushed += tb->tb_flush(tb);
+ flushed += fib_table_flush(tb);
}
if (flushed)
@@ -162,7 +163,7 @@ struct net_device * ip_dev_find(struct net *net, __be32 addr)
#endif
local_table = fib_get_table(net, RT_TABLE_LOCAL);
- if (!local_table || local_table->tb_lookup(local_table, &fl, &res))
+ if (!local_table || fib_table_lookup(local_table, &fl, &res))
return NULL;
if (res.type != RTN_LOCAL)
goto out;
@@ -200,7 +201,7 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
local_table = fib_get_table(net, RT_TABLE_LOCAL);
if (local_table) {
ret = RTN_UNICAST;
- if (!local_table->tb_lookup(local_table, &fl, &res)) {
+ if (!fib_table_lookup(local_table, &fl, &res)) {
if (!dev || dev == res.fi->fib_dev)
ret = res.type;
fib_res_put(&res);
@@ -241,16 +242,19 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
.iif = oif };
struct fib_result res;
- int no_addr, rpf;
+ int no_addr, rpf, accept_local;
int ret;
struct net *net;
- no_addr = rpf = 0;
+ no_addr = rpf = accept_local = 0;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
if (in_dev) {
no_addr = in_dev->ifa_list == NULL;
rpf = IN_DEV_RPFILTER(in_dev);
+ accept_local = IN_DEV_ACCEPT_LOCAL(in_dev);
+ if (mark && !IN_DEV_SRC_VMARK(in_dev))
+ fl.mark = 0;
}
rcu_read_unlock();
@@ -260,8 +264,10 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
net = dev_net(dev);
if (fib_lookup(net, &fl, &res))
goto last_resort;
- if (res.type != RTN_UNICAST)
- goto e_inval_res;
+ if (res.type != RTN_UNICAST) {
+ if (res.type != RTN_LOCAL || !accept_local)
+ goto e_inval_res;
+ }
*spec_dst = FIB_RES_PREFSRC(res);
fib_combine_itag(itag, &res);
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -476,13 +482,13 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
if (cmd == SIOCDELRT) {
tb = fib_get_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(tb, &cfg);
else
err = -ESRCH;
} else {
tb = fib_new_table(net, cfg.fc_table);
if (tb)
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(tb, &cfg);
else
err = -ENOBUFS;
}
@@ -597,7 +603,7 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
goto errout;
}
- err = tb->tb_delete(tb, &cfg);
+ err = fib_table_delete(tb, &cfg);
errout:
return err;
}
@@ -619,7 +625,7 @@ static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *ar
goto errout;
}
- err = tb->tb_insert(tb, &cfg);
+ err = fib_table_insert(tb, &cfg);
errout:
return err;
}
@@ -650,7 +656,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
if (dumped)
memset(&cb->args[2], 0, sizeof(cb->args) -
2 * sizeof(cb->args[0]));
- if (tb->tb_dump(tb, skb, cb) < 0)
+ if (fib_table_dump(tb, skb, cb) < 0)
goto out;
dumped = 1;
next:
@@ -704,9 +710,9 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
cfg.fc_scope = RT_SCOPE_HOST;
if (cmd == RTM_NEWROUTE)
- tb->tb_insert(tb, &cfg);
+ fib_table_insert(tb, &cfg);
else
- tb->tb_delete(tb, &cfg);
+ fib_table_delete(tb, &cfg);
}
void fib_add_ifaddr(struct in_ifaddr *ifa)
@@ -835,7 +841,7 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
local_bh_disable();
frn->tb_id = tb->tb_id;
- frn->err = tb->tb_lookup(tb, &fl, &res);
+ frn->err = fib_table_lookup(tb, &fl, &res);
if (!frn->err) {
frn->prefixlen = res.prefixlen;
@@ -878,7 +884,7 @@ static void nl_fib_input(struct sk_buff *skb)
netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
}
-static int nl_fib_lookup_init(struct net *net)
+static int __net_init nl_fib_lookup_init(struct net *net)
{
struct sock *sk;
sk = netlink_kernel_create(net, NETLINK_FIB_LOOKUP, 0,
@@ -895,11 +901,11 @@ static void nl_fib_lookup_exit(struct net *net)
net->ipv4.fibnl = NULL;
}
-static void fib_disable_ip(struct net_device *dev, int force)
+static void fib_disable_ip(struct net_device *dev, int force, int delay)
{
if (fib_sync_down_dev(dev, force))
fib_flush(dev_net(dev));
- rt_cache_flush(dev_net(dev), 0);
+ rt_cache_flush(dev_net(dev), delay);
arp_ifdown(dev);
}
@@ -922,7 +928,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
/* Last address was deleted from this interface.
Disable IP.
*/
- fib_disable_ip(dev, 1);
+ fib_disable_ip(dev, 1, 0);
} else {
rt_cache_flush(dev_net(dev), -1);
}
@@ -937,7 +943,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
struct in_device *in_dev = __in_dev_get_rtnl(dev);
if (event == NETDEV_UNREGISTER) {
- fib_disable_ip(dev, 2);
+ fib_disable_ip(dev, 2, -1);
return NOTIFY_DONE;
}
@@ -955,12 +961,15 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
rt_cache_flush(dev_net(dev), -1);
break;
case NETDEV_DOWN:
- fib_disable_ip(dev, 0);
+ fib_disable_ip(dev, 0, 0);
break;
case NETDEV_CHANGEMTU:
case NETDEV_CHANGE:
rt_cache_flush(dev_net(dev), 0);
break;
+ case NETDEV_UNREGISTER_BATCH:
+ rt_cache_flush_batch();
+ break;
}
return NOTIFY_DONE;
}
@@ -996,7 +1005,7 @@ fail:
return err;
}
-static void __net_exit ip_fib_net_exit(struct net *net)
+static void ip_fib_net_exit(struct net *net)
{
unsigned int i;
@@ -1012,7 +1021,7 @@ static void __net_exit ip_fib_net_exit(struct net *net)
head = &net->ipv4.fib_table_hash[i];
hlist_for_each_entry_safe(tb, node, tmp, head, tb_hlist) {
hlist_del(node);
- tb->tb_flush(tb);
+ fib_table_flush(tb);
kfree(tb);
}
}
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index ecd39454235c..4ed7e0dea1bc 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -32,6 +32,7 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
@@ -242,8 +243,8 @@ fn_new_zone(struct fn_hash *table, int z)
return fz;
}
-static int
-fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+int fib_table_lookup(struct fib_table *tb,
+ const struct flowi *flp, struct fib_result *res)
{
int err;
struct fn_zone *fz;
@@ -274,8 +275,8 @@ out:
return err;
}
-static void
-fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
+void fib_table_select_default(struct fib_table *tb,
+ const struct flowi *flp, struct fib_result *res)
{
int order, last_idx;
struct hlist_node *node;
@@ -366,7 +367,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
return NULL;
}
-static int fn_hash_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
struct fib_node *new_f = NULL;
@@ -544,8 +545,7 @@ out:
return err;
}
-
-static int fn_hash_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct fn_hash *table = (struct fn_hash *)tb->tb_data;
struct fib_node *f;
@@ -662,7 +662,7 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
return found;
}
-static int fn_hash_flush(struct fib_table *tb)
+int fib_table_flush(struct fib_table *tb)
{
struct fn_hash *table = (struct fn_hash *) tb->tb_data;
struct fn_zone *fz;
@@ -743,7 +743,8 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
return skb->len;
}
-static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
int m, s_m;
struct fn_zone *fz;
@@ -787,12 +788,7 @@ struct fib_table *fib_hash_table(u32 id)
tb->tb_id = id;
tb->tb_default = -1;
- tb->tb_lookup = fn_hash_lookup;
- tb->tb_insert = fn_hash_insert;
- tb->tb_delete = fn_hash_delete;
- tb->tb_flush = fn_hash_flush;
- tb->tb_select_default = fn_hash_select_default;
- tb->tb_dump = fn_hash_dump;
+
memset(tb->tb_data, 0, sizeof(struct fn_hash));
return tb;
}
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 92d9d97ec5e3..ca2d07b1c706 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -94,7 +94,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
goto errout;
- err = tbl->tb_lookup(tbl, flp, (struct fib_result *) arg->result);
+ err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
if (err > 0)
err = -EAGAIN;
errout:
@@ -284,7 +284,7 @@ static int fib_default_rules_init(struct fib_rules_ops *ops)
{
int err;
- err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, FIB_RULE_PERMANENT);
+ err = fib_default_rule_add(ops, 0, RT_TABLE_LOCAL, 0);
if (err < 0)
return err;
err = fib_default_rule_add(ops, 0x7FFE, RT_TABLE_MAIN, 0);
@@ -301,13 +301,9 @@ int __net_init fib4_rules_init(struct net *net)
int err;
struct fib_rules_ops *ops;
- ops = kmemdup(&fib4_rules_ops_template, sizeof(*ops), GFP_KERNEL);
- if (ops == NULL)
- return -ENOMEM;
- INIT_LIST_HEAD(&ops->rules_list);
- ops->fro_net = net;
-
- fib_rules_register(ops);
+ ops = fib_rules_register(&fib4_rules_ops_template, net);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
err = fib_default_rules_init(ops);
if (err < 0)
@@ -318,12 +314,10 @@ int __net_init fib4_rules_init(struct net *net)
fail:
/* also cleans all rules already added */
fib_rules_unregister(ops);
- kfree(ops);
return err;
}
void __net_exit fib4_rules_exit(struct net *net)
{
fib_rules_unregister(net->ipv4.rules_ops);
- kfree(net->ipv4.rules_ops);
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 9b096d6ff3f2..20f09c5b31e8 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -32,6 +32,7 @@
#include <linux/proc_fs.h>
#include <linux/skbuff.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -62,8 +63,8 @@ static DEFINE_SPINLOCK(fib_multipath_lock);
#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \
-for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++)
+#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
+for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
#else /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -72,7 +73,7 @@ for (nhsel=0, nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++,
#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
for (nhsel=0; nhsel < 1; nhsel++)
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh * nh = (struct fib_nh *)((fi)->fib_nh); \
+#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
for (nhsel=0; nhsel < 1; nhsel++)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
@@ -145,9 +146,9 @@ void free_fib_info(struct fib_info *fi)
return;
}
change_nexthops(fi) {
- if (nh->nh_dev)
- dev_put(nh->nh_dev);
- nh->nh_dev = NULL;
+ if (nexthop_nh->nh_dev)
+ dev_put(nexthop_nh->nh_dev);
+ nexthop_nh->nh_dev = NULL;
} endfor_nexthops(fi);
fib_info_cnt--;
release_net(fi->fib_net);
@@ -162,9 +163,9 @@ void fib_release_info(struct fib_info *fi)
if (fi->fib_prefsrc)
hlist_del(&fi->fib_lhash);
change_nexthops(fi) {
- if (!nh->nh_dev)
+ if (!nexthop_nh->nh_dev)
continue;
- hlist_del(&nh->nh_hash);
+ hlist_del(&nexthop_nh->nh_hash);
} endfor_nexthops(fi)
fi->fib_dead = 1;
fib_info_put(fi);
@@ -228,7 +229,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
head = &fib_info_hash[hash];
hlist_for_each_entry(fi, node, head, fib_hash) {
- if (fi->fib_net != nfi->fib_net)
+ if (!net_eq(fi->fib_net, nfi->fib_net))
continue;
if (fi->fib_nhs != nfi->fib_nhs)
continue;
@@ -395,19 +396,20 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
if (!rtnh_ok(rtnh, remaining))
return -EINVAL;
- nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
- nh->nh_oif = rtnh->rtnh_ifindex;
- nh->nh_weight = rtnh->rtnh_hops + 1;
+ nexthop_nh->nh_flags =
+ (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
+ nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
+ nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- nh->nh_gw = nla ? nla_get_be32(nla) : 0;
+ nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
#ifdef CONFIG_NET_CLS_ROUTE
nla = nla_find(attrs, attrlen, RTA_FLOW);
- nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
+ nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
#endif
}
@@ -527,10 +529,6 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
if (nh->nh_gw) {
struct fib_result res;
-#ifdef CONFIG_IP_ROUTE_PERVASIVE
- if (nh->nh_flags&RTNH_F_PERVASIVE)
- return 0;
-#endif
if (nh->nh_flags&RTNH_F_ONLINK) {
struct net_device *dev;
@@ -738,7 +736,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi->fib_nhs = nhs;
change_nexthops(fi) {
- nh->nh_parent = fi;
+ nexthop_nh->nh_parent = fi;
} endfor_nexthops(fi)
if (cfg->fc_mx) {
@@ -808,7 +806,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} else {
change_nexthops(fi) {
- if ((err = fib_check_nh(cfg, fi, nh)) != 0)
+ if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
goto failure;
} endfor_nexthops(fi)
}
@@ -843,11 +841,11 @@ link_it:
struct hlist_head *head;
unsigned int hash;
- if (!nh->nh_dev)
+ if (!nexthop_nh->nh_dev)
continue;
- hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
+ hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
head = &fib_info_devhash[hash];
- hlist_add_head(&nh->nh_hash, head);
+ hlist_add_head(&nexthop_nh->nh_hash, head);
} endfor_nexthops(fi)
spin_unlock_bh(&fib_info_lock);
return fi;
@@ -1047,7 +1045,7 @@ int fib_sync_down_addr(struct net *net, __be32 local)
return 0;
hlist_for_each_entry(fi, node, head, fib_lhash) {
- if (fi->fib_net != net)
+ if (!net_eq(fi->fib_net, net))
continue;
if (fi->fib_prefsrc == local) {
fi->fib_flags |= RTNH_F_DEAD;
@@ -1080,21 +1078,21 @@ int fib_sync_down_dev(struct net_device *dev, int force)
prev_fi = fi;
dead = 0;
change_nexthops(fi) {
- if (nh->nh_flags&RTNH_F_DEAD)
+ if (nexthop_nh->nh_flags&RTNH_F_DEAD)
dead++;
- else if (nh->nh_dev == dev &&
- nh->nh_scope != scope) {
- nh->nh_flags |= RTNH_F_DEAD;
+ else if (nexthop_nh->nh_dev == dev &&
+ nexthop_nh->nh_scope != scope) {
+ nexthop_nh->nh_flags |= RTNH_F_DEAD;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nh->nh_power;
- nh->nh_power = 0;
+ fi->fib_power -= nexthop_nh->nh_power;
+ nexthop_nh->nh_power = 0;
spin_unlock_bh(&fib_multipath_lock);
#endif
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (force > 1 && nh->nh_dev == dev) {
+ if (force > 1 && nexthop_nh->nh_dev == dev) {
dead = fi->fib_nhs;
break;
}
@@ -1144,18 +1142,20 @@ int fib_sync_up(struct net_device *dev)
prev_fi = fi;
alive = 0;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
+ if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
alive++;
continue;
}
- if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
+ if (nexthop_nh->nh_dev == NULL ||
+ !(nexthop_nh->nh_dev->flags&IFF_UP))
continue;
- if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
+ if (nexthop_nh->nh_dev != dev ||
+ !__in_dev_get_rtnl(dev))
continue;
alive++;
spin_lock_bh(&fib_multipath_lock);
- nh->nh_power = 0;
- nh->nh_flags &= ~RTNH_F_DEAD;
+ nexthop_nh->nh_power = 0;
+ nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
spin_unlock_bh(&fib_multipath_lock);
} endfor_nexthops(fi)
@@ -1182,9 +1182,9 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD)) {
- power += nh->nh_weight;
- nh->nh_power = nh->nh_weight;
+ if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+ power += nexthop_nh->nh_weight;
+ nexthop_nh->nh_power = nexthop_nh->nh_weight;
}
} endfor_nexthops(fi);
fi->fib_power = power;
@@ -1204,9 +1204,10 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
w = jiffies % fi->fib_power;
change_nexthops(fi) {
- if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
- if ((w -= nh->nh_power) <= 0) {
- nh->nh_power--;
+ if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+ nexthop_nh->nh_power) {
+ if ((w -= nexthop_nh->nh_power) <= 0) {
+ nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
spin_unlock_bh(&fib_multipath_lock);
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 291bdf50a21f..59a838795e3e 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -71,6 +71,7 @@
#include <linux/netlink.h>
#include <linux/init.h>
#include <linux/list.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -961,7 +962,9 @@ fib_find_node(struct trie *t, u32 key)
struct node *n;
pos = 0;
- n = rcu_dereference(t->trie);
+ n = rcu_dereference_check(t->trie,
+ rcu_read_lock_held() ||
+ lockdep_rtnl_is_held());
while (n != NULL && NODE_TYPE(n) == T_TNODE) {
tn = (struct tnode *) n;
@@ -1174,7 +1177,7 @@ done:
/*
* Caller must hold RTNL.
*/
-static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
struct fib_alias *fa, *new_fa;
@@ -1373,8 +1376,8 @@ static int check_leaf(struct trie *t, struct leaf *l,
return 1;
}
-static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
- struct fib_result *res)
+int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int ret;
@@ -1595,7 +1598,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
/*
* Caller must hold RTNL.
*/
-static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
+int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
{
struct trie *t = (struct trie *) tb->tb_data;
u32 key, mask;
@@ -1786,7 +1789,7 @@ static struct leaf *trie_leafindex(struct trie *t, int index)
/*
* Caller must hold RTNL.
*/
-static int fn_trie_flush(struct fib_table *tb)
+int fib_table_flush(struct fib_table *tb)
{
struct trie *t = (struct trie *) tb->tb_data;
struct leaf *l, *ll = NULL;
@@ -1807,9 +1810,9 @@ static int fn_trie_flush(struct fib_table *tb)
return found;
}
-static void fn_trie_select_default(struct fib_table *tb,
- const struct flowi *flp,
- struct fib_result *res)
+void fib_table_select_default(struct fib_table *tb,
+ const struct flowi *flp,
+ struct fib_result *res)
{
struct trie *t = (struct trie *) tb->tb_data;
int order, last_idx;
@@ -1952,8 +1955,8 @@ static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
return skb->len;
}
-static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
- struct netlink_callback *cb)
+int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
+ struct netlink_callback *cb)
{
struct leaf *l;
struct trie *t = (struct trie *) tb->tb_data;
@@ -2020,12 +2023,6 @@ struct fib_table *fib_hash_table(u32 id)
tb->tb_id = id;
tb->tb_default = -1;
- tb->tb_lookup = fn_trie_lookup;
- tb->tb_insert = fn_trie_insert;
- tb->tb_delete = fn_trie_delete;
- tb->tb_flush = fn_trie_flush;
- tb->tb_select_default = fn_trie_select_default;
- tb->tb_dump = fn_trie_dump;
t = (struct trie *) tb->tb_data;
memset(t, 0, sizeof(*t));
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 5bc13fe816d1..ac4dec132735 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -74,6 +74,7 @@
#include <linux/netdevice.h>
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
+#include <linux/slab.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -114,7 +115,7 @@ struct icmp_bxm {
/* An array of errno for error messages from dest unreach. */
/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */
-struct icmp_err icmp_err_convert[] = {
+const struct icmp_err icmp_err_convert[] = {
{
.errno = ENETUNREACH, /* ICMP_NET_UNREACH */
.fatal = 0,
@@ -501,15 +502,16 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
if (!(rt->rt_flags & RTCF_LOCAL)) {
struct net_device *dev = NULL;
+ rcu_read_lock();
if (rt->fl.iif &&
net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)
- dev = dev_get_by_index(net, rt->fl.iif);
+ dev = dev_get_by_index_rcu(net, rt->fl.iif);
- if (dev) {
+ if (dev)
saddr = inet_select_addr(dev, 0, RT_SCOPE_LINK);
- dev_put(dev);
- } else
+ else
saddr = 0;
+ rcu_read_unlock();
}
tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) |
@@ -1165,6 +1167,10 @@ static int __net_init icmp_sk_init(struct net *net)
sk->sk_sndbuf =
(2 * ((64 * 1024) + sizeof(struct sk_buff)));
+ /*
+ * Speedup sock_wfree()
+ */
+ sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
inet_sk(sk)->pmtudisc = IP_PMTUDISC_DONT;
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index d41e5de79a82..15d3eeda92f5 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -71,6 +71,7 @@
*/
#include <linux/module.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/types.h>
@@ -946,7 +947,6 @@ int igmp_rcv(struct sk_buff *skb)
break;
case IGMP_HOST_MEMBERSHIP_REPORT:
case IGMPV2_HOST_MEMBERSHIP_REPORT:
- case IGMPV3_HOST_MEMBERSHIP_REPORT:
/* Is it our report looped back? */
if (skb_rtable(skb)->fl.iif == 0)
break;
@@ -960,6 +960,7 @@ int igmp_rcv(struct sk_buff *skb)
in_dev_put(in_dev);
return pim_rcv_v1(skb);
#endif
+ case IGMPV3_HOST_MEMBERSHIP_REPORT:
case IGMP_DVMRP:
case IGMP_TRACE:
case IGMP_HOST_LEAVE_MESSAGE:
@@ -1799,7 +1800,7 @@ int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr)
iml->next = inet->mc_list;
iml->sflist = NULL;
iml->sfmode = MCAST_EXCLUDE;
- inet->mc_list = iml;
+ rcu_assign_pointer(inet->mc_list, iml);
ip_mc_inc_group(in_dev, addr);
err = 0;
done:
@@ -1807,24 +1808,46 @@ done:
return err;
}
+static void ip_sf_socklist_reclaim(struct rcu_head *rp)
+{
+ struct ip_sf_socklist *psf;
+
+ psf = container_of(rp, struct ip_sf_socklist, rcu);
+ /* sk_omem_alloc should have been decreased by the caller*/
+ kfree(psf);
+}
+
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
+ struct ip_sf_socklist *psf = iml->sflist;
int err;
- if (iml->sflist == NULL) {
+ if (psf == NULL) {
/* any-source empty exclude case */
return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
iml->sfmode, 0, NULL, 0);
}
err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr,
- iml->sfmode, iml->sflist->sl_count,
- iml->sflist->sl_addr, 0);
- sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max));
- iml->sflist = NULL;
+ iml->sfmode, psf->sl_count, psf->sl_addr, 0);
+ rcu_assign_pointer(iml->sflist, NULL);
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psf->sl_max), &sk->sk_omem_alloc);
+ call_rcu(&psf->rcu, ip_sf_socklist_reclaim);
return err;
}
+
+static void ip_mc_socklist_reclaim(struct rcu_head *rp)
+{
+ struct ip_mc_socklist *iml;
+
+ iml = container_of(rp, struct ip_mc_socklist, rcu);
+ /* sk_omem_alloc should have been decreased by the caller*/
+ kfree(iml);
+}
+
+
/*
* Ask a socket to leave a group.
*/
@@ -1854,12 +1877,14 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
(void) ip_mc_leave_src(sk, iml, in_dev);
- *imlp = iml->next;
+ rcu_assign_pointer(*imlp, iml->next);
if (in_dev)
ip_mc_dec_group(in_dev, group);
rtnl_unlock();
- sock_kfree_s(sk, iml, sizeof(*iml));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
return 0;
}
if (!in_dev)
@@ -1899,8 +1924,9 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
err = -EADDRNOTAVAIL;
for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
- if (pmc->multi.imr_multiaddr.s_addr == imr.imr_multiaddr.s_addr
- && pmc->multi.imr_ifindex == imr.imr_ifindex)
+ if ((pmc->multi.imr_multiaddr.s_addr ==
+ imr.imr_multiaddr.s_addr) &&
+ (pmc->multi.imr_ifindex == imr.imr_ifindex))
break;
}
if (!pmc) { /* must have a prior join */
@@ -1973,9 +1999,12 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
if (psl) {
for (i=0; i<psl->sl_count; i++)
newpsl->sl_addr[i] = psl->sl_addr[i];
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
}
- pmc->sflist = psl = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
+ psl = newpsl;
}
rv = 1; /* > 0 for insert logic below if sl_count is 0 */
for (i=0; i<psl->sl_count; i++) {
@@ -2071,11 +2100,13 @@ int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex)
if (psl) {
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
psl->sl_count, psl->sl_addr, 0);
- sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(IP_SFLSIZE(psl->sl_max), &sk->sk_omem_alloc);
+ call_rcu(&psl->rcu, ip_sf_socklist_reclaim);
} else
(void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode,
0, NULL, 0);
- pmc->sflist = newpsl;
+ rcu_assign_pointer(pmc->sflist, newpsl);
pmc->sfmode = msf->imsf_fmode;
err = 0;
done:
@@ -2208,30 +2239,40 @@ int ip_mc_sf_allow(struct sock *sk, __be32 loc_addr, __be32 rmt_addr, int dif)
struct ip_mc_socklist *pmc;
struct ip_sf_socklist *psl;
int i;
+ int ret;
+ ret = 1;
if (!ipv4_is_multicast(loc_addr))
- return 1;
+ goto out;
- for (pmc=inet->mc_list; pmc; pmc=pmc->next) {
+ rcu_read_lock();
+ for (pmc=rcu_dereference(inet->mc_list); pmc; pmc=rcu_dereference(pmc->next)) {
if (pmc->multi.imr_multiaddr.s_addr == loc_addr &&
pmc->multi.imr_ifindex == dif)
break;
}
+ ret = inet->mc_all;
if (!pmc)
- return inet->mc_all;
+ goto unlock;
psl = pmc->sflist;
+ ret = (pmc->sfmode == MCAST_EXCLUDE);
if (!psl)
- return pmc->sfmode == MCAST_EXCLUDE;
+ goto unlock;
for (i=0; i<psl->sl_count; i++) {
if (psl->sl_addr[i] == rmt_addr)
break;
}
+ ret = 0;
if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count)
- return 0;
+ goto unlock;
if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count)
- return 0;
- return 1;
+ goto unlock;
+ ret = 1;
+unlock:
+ rcu_read_unlock();
+out:
+ return ret;
}
/*
@@ -2250,7 +2291,7 @@ void ip_mc_drop_socket(struct sock *sk)
rtnl_lock();
while ((iml = inet->mc_list) != NULL) {
struct in_device *in_dev;
- inet->mc_list = iml->next;
+ rcu_assign_pointer(inet->mc_list, iml->next);
in_dev = inetdev_by_index(net, iml->multi.imr_ifindex);
(void) ip_mc_leave_src(sk, iml, in_dev);
@@ -2258,7 +2299,9 @@ void ip_mc_drop_socket(struct sock *sk)
ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr);
in_dev_put(in_dev);
}
- sock_kfree_s(sk, iml, sizeof(*iml));
+ /* decrease mem now to avoid the memleak warning */
+ atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
+ call_rcu(&iml->rcu, ip_mc_socklist_reclaim);
}
rtnl_unlock();
}
@@ -2311,9 +2354,10 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
state->in_dev = NULL;
- for_each_netdev(net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *in_dev;
- in_dev = in_dev_get(state->dev);
+
+ in_dev = __in_dev_get_rcu(state->dev);
if (!in_dev)
continue;
read_lock(&in_dev->mc_list_lock);
@@ -2323,7 +2367,6 @@ static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq)
break;
}
read_unlock(&in_dev->mc_list_lock);
- in_dev_put(in_dev);
}
return im;
}
@@ -2333,16 +2376,15 @@ static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_li
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
im = im->next;
while (!im) {
- if (likely(state->in_dev != NULL)) {
+ if (likely(state->in_dev != NULL))
read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
- }
- state->dev = next_net_device(state->dev);
+
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->in_dev = NULL;
break;
}
- state->in_dev = in_dev_get(state->dev);
+ state->in_dev = __in_dev_get_rcu(state->dev);
if (!state->in_dev)
continue;
read_lock(&state->in_dev->mc_list_lock);
@@ -2361,9 +2403,9 @@ static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(dev_base_lock)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2379,16 +2421,15 @@ static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mc_seq_stop(struct seq_file *seq, void *v)
- __releases(dev_base_lock)
+ __releases(rcu)
{
struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq);
if (likely(state->in_dev != NULL)) {
read_unlock(&state->in_dev->mc_list_lock);
- in_dev_put(state->in_dev);
state->in_dev = NULL;
}
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mc_seq_show(struct seq_file *seq, void *v)
@@ -2462,9 +2503,9 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
state->idev = NULL;
state->im = NULL;
- for_each_netdev(net, state->dev) {
+ for_each_netdev_rcu(net, state->dev) {
struct in_device *idev;
- idev = in_dev_get(state->dev);
+ idev = __in_dev_get_rcu(state->dev);
if (unlikely(idev == NULL))
continue;
read_lock(&idev->mc_list_lock);
@@ -2480,7 +2521,6 @@ static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq)
spin_unlock_bh(&im->lock);
}
read_unlock(&idev->mc_list_lock);
- in_dev_put(idev);
}
return psf;
}
@@ -2494,16 +2534,15 @@ static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_l
spin_unlock_bh(&state->im->lock);
state->im = state->im->next;
while (!state->im) {
- if (likely(state->idev != NULL)) {
+ if (likely(state->idev != NULL))
read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
- }
- state->dev = next_net_device(state->dev);
+
+ state->dev = next_net_device_rcu(state->dev);
if (!state->dev) {
state->idev = NULL;
goto out;
}
- state->idev = in_dev_get(state->dev);
+ state->idev = __in_dev_get_rcu(state->dev);
if (!state->idev)
continue;
read_lock(&state->idev->mc_list_lock);
@@ -2528,8 +2567,9 @@ static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos)
}
static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
{
- read_lock(&dev_base_lock);
+ rcu_read_lock();
return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
}
@@ -2545,6 +2585,7 @@ static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
{
struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq);
if (likely(state->im != NULL)) {
@@ -2553,11 +2594,10 @@ static void igmp_mcf_seq_stop(struct seq_file *seq, void *v)
}
if (likely(state->idev != NULL)) {
read_unlock(&state->idev->mc_list_lock);
- in_dev_put(state->idev);
state->idev = NULL;
}
state->dev = NULL;
- read_unlock(&dev_base_lock);
+ rcu_read_unlock();
}
static int igmp_mcf_seq_show(struct seq_file *seq, void *v)
@@ -2605,7 +2645,7 @@ static const struct file_operations igmp_mcf_seq_fops = {
.release = seq_release_net,
};
-static int igmp_net_init(struct net *net)
+static int __net_init igmp_net_init(struct net *net)
{
struct proc_dir_entry *pde;
@@ -2623,7 +2663,7 @@ out_igmp:
return -ENOMEM;
}
-static void igmp_net_exit(struct net *net)
+static void __net_exit igmp_net_exit(struct net *net)
{
proc_net_remove(net, "mcfilter");
proc_net_remove(net, "igmp");
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 537731b3bcb3..8da6429269dd 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -112,7 +112,7 @@ again:
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (ib_net(tb) == net && tb->port == rover) {
+ if (net_eq(ib_net(tb), net) && tb->port == rover) {
if (tb->fastreuse > 0 &&
sk->sk_reuse &&
sk->sk_state != TCP_LISTEN &&
@@ -158,7 +158,7 @@ have_snum:
hashinfo->bhash_size)];
spin_lock(&head->lock);
inet_bind_bucket_for_each(tb, node, &head->chain)
- if (ib_net(tb) == net && tb->port == snum)
+ if (net_eq(ib_net(tb), net) && tb->port == snum)
goto tb_found;
}
tb = NULL;
@@ -358,6 +358,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
const struct inet_request_sock *ireq = inet_rsk(req);
struct ip_options *opt = inet_rsk(req)->opt;
struct flowi fl = { .oif = sk->sk_bound_dev_if,
+ .mark = sk->sk_mark,
.nl_u = { .ip4_u =
{ .daddr = ((opt && opt->srr) ?
opt->faddr :
@@ -367,7 +368,7 @@ struct dst_entry *inet_csk_route_req(struct sock *sk,
.proto = sk->sk_protocol,
.flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
- { .sport = inet_sk(sk)->sport,
+ { .sport = inet_sk(sk)->inet_sport,
.dport = ireq->rmt_port } } };
struct net *net = sock_net(sk);
@@ -528,9 +529,11 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
syn_ack_recalc(req, thresh, max_retries,
queue->rskq_defer_accept,
&expire, &resend);
+ if (req->rsk_ops->syn_ack_timeout)
+ req->rsk_ops->syn_ack_timeout(parent, req);
if (!expire &&
(!resend ||
- !req->rsk_ops->rtx_syn_ack(parent, req) ||
+ !req->rsk_ops->rtx_syn_ack(parent, req, NULL) ||
inet_rsk(req)->acked)) {
unsigned long timeo;
@@ -574,9 +577,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
newsk->sk_state = TCP_SYN_RECV;
newicsk->icsk_bind_hash = NULL;
- inet_sk(newsk)->dport = inet_rsk(req)->rmt_port;
- inet_sk(newsk)->num = ntohs(inet_rsk(req)->loc_port);
- inet_sk(newsk)->sport = inet_rsk(req)->loc_port;
+ inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port;
+ inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port);
+ inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port;
newsk->sk_write_space = sk_stream_write_space;
newicsk->icsk_retransmits = 0;
@@ -607,8 +610,8 @@ void inet_csk_destroy_sock(struct sock *sk)
/* It cannot be in hash table! */
WARN_ON(!sk_unhashed(sk));
- /* If it has not 0 inet_sk(sk)->num, it must be bound */
- WARN_ON(inet_sk(sk)->num && !inet_csk(sk)->icsk_bind_hash);
+ /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */
+ WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash);
sk->sk_prot->destroy(sk);
@@ -643,8 +646,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
* after validation is complete.
*/
sk->sk_state = TCP_LISTEN;
- if (!sk->sk_prot->get_port(sk, inet->num)) {
- inet->sport = htons(inet->num);
+ if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
+ inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
sk->sk_prot->hash(sk);
@@ -720,8 +723,8 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
const struct inet_sock *inet = inet_sk(sk);
sin->sin_family = AF_INET;
- sin->sin_addr.s_addr = inet->daddr;
- sin->sin_port = inet->dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
+ sin->sin_port = inet->inet_dport;
}
EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index a706a47f4dbb..e5fa2ddce320 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/fcntl.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/init.h>
#include <linux/time.h>
@@ -116,10 +117,10 @@ static int inet_csk_diag_fill(struct sock *sk,
r->id.idiag_cookie[0] = (u32)(unsigned long)sk;
r->id.idiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
- r->id.idiag_sport = inet->sport;
- r->id.idiag_dport = inet->dport;
- r->id.idiag_src[0] = inet->rcv_saddr;
- r->id.idiag_dst[0] = inet->daddr;
+ r->id.idiag_sport = inet->inet_sport;
+ r->id.idiag_dport = inet->inet_dport;
+ r->id.idiag_src[0] = inet->inet_rcv_saddr;
+ r->id.idiag_dst[0] = inet->inet_daddr;
#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
if (r->idiag_family == AF_INET6) {
@@ -368,7 +369,7 @@ static int inet_diag_bc_run(const void *bc, int len,
yes = entry->sport >= op[1].no;
break;
case INET_DIAG_BC_S_LE:
- yes = entry->dport <= op[1].no;
+ yes = entry->sport <= op[1].no;
break;
case INET_DIAG_BC_D_GE:
yes = entry->dport >= op[1].no;
@@ -504,11 +505,11 @@ static int inet_csk_diag_dump(struct sock *sk,
} else
#endif
{
- entry.saddr = &inet->rcv_saddr;
- entry.daddr = &inet->daddr;
+ entry.saddr = &inet->inet_rcv_saddr;
+ entry.daddr = &inet->inet_daddr;
}
- entry.sport = inet->num;
- entry.dport = ntohs(inet->dport);
+ entry.sport = inet->inet_num;
+ entry.dport = ntohs(inet->inet_dport);
entry.userlocks = sk->sk_userlocks;
if (!inet_diag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry))
@@ -584,7 +585,7 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk,
if (tmo < 0)
tmo = 0;
- r->id.idiag_sport = inet->sport;
+ r->id.idiag_sport = inet->inet_sport;
r->id.idiag_dport = ireq->rmt_port;
r->id.idiag_src[0] = ireq->loc_addr;
r->id.idiag_dst[0] = ireq->rmt_addr;
@@ -639,7 +640,7 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) {
bc = (struct rtattr *)(r + 1);
- entry.sport = inet->num;
+ entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
@@ -732,7 +733,7 @@ static int inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
}
- if (r->id.idiag_sport != inet->sport &&
+ if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_listen;
@@ -774,7 +775,7 @@ skip_listen_ht:
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV)))
goto unlock;
- for (i = s_i; i < hashinfo->ehash_size; i++) {
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
struct inet_ehash_bucket *head = &hashinfo->ehash[i];
spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
struct sock *sk;
@@ -797,10 +798,10 @@ skip_listen_ht:
goto next_normal;
if (!(r->idiag_states & (1 << sk->sk_state)))
goto next_normal;
- if (r->id.idiag_sport != inet->sport &&
+ if (r->id.idiag_sport != inet->inet_sport &&
r->id.idiag_sport)
goto next_normal;
- if (r->id.idiag_dport != inet->dport &&
+ if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next_normal;
if (inet_csk_diag_dump(sk, skb, cb) < 0) {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index eaf3e2c8646a..a2ca6aed763b 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -19,6 +19,7 @@
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
+#include <linux/slab.h>
#include <net/inet_frag.h>
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 625cc5f64c94..2b79377b468d 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -64,7 +64,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
atomic_inc(&hashinfo->bsockets);
- inet_sk(sk)->num = snum;
+ inet_sk(sk)->inet_num = snum;
sk_add_bind_node(sk, &tb->owners);
tb->num_owners++;
inet_csk(sk)->icsk_bind_hash = tb;
@@ -76,7 +76,7 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
static void __inet_put_port(struct sock *sk)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->num,
+ const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num,
hashinfo->bhash_size);
struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash];
struct inet_bind_bucket *tb;
@@ -88,7 +88,7 @@ static void __inet_put_port(struct sock *sk)
__sk_del_bind_node(sk);
tb->num_owners--;
inet_csk(sk)->icsk_bind_hash = NULL;
- inet_sk(sk)->num = 0;
+ inet_sk(sk)->inet_num = 0;
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
spin_unlock(&head->lock);
}
@@ -105,7 +105,7 @@ EXPORT_SYMBOL(inet_put_port);
void __inet_inherit_port(struct sock *sk, struct sock *child)
{
struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
- const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->num,
+ const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
table->bhash_size);
struct inet_bind_hashbucket *head = &table->bhash[bhash];
struct inet_bind_bucket *tb;
@@ -126,9 +126,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
int score = -1;
struct inet_sock *inet = inet_sk(sk);
- if (net_eq(sock_net(sk), net) && inet->num == hnum &&
+ if (net_eq(sock_net(sk), net) && inet->inet_num == hnum &&
!ipv6_only_sock(sk)) {
- __be32 rcv_saddr = inet->rcv_saddr;
+ __be32 rcv_saddr = inet->inet_rcv_saddr;
score = sk->sk_family == PF_INET ? 1 : 0;
if (rcv_saddr) {
if (rcv_saddr != daddr)
@@ -209,7 +209,7 @@ struct sock * __inet_lookup_established(struct net *net,
* have wildcards anyways.
*/
unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & (hashinfo->ehash_size - 1);
+ unsigned int slot = hash & hashinfo->ehash_mask;
struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
rcu_read_lock();
@@ -273,18 +273,20 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr = inet->rcv_saddr;
- __be32 saddr = inet->daddr;
+ __be32 daddr = inet->inet_rcv_saddr;
+ __be32 saddr = inet->inet_daddr;
int dif = sk->sk_bound_dev_if;
INET_ADDR_COOKIE(acookie, saddr, daddr)
- const __portpair ports = INET_COMBINED_PORTS(inet->dport, lport);
+ const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
struct net *net = sock_net(sk);
- unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->dport);
+ unsigned int hash = inet_ehashfn(net, daddr, lport,
+ saddr, inet->inet_dport);
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
struct sock *sk2;
const struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
+ int twrefcnt = 0;
spin_lock(lock);
@@ -312,25 +314,28 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
unique:
/* Must record num and sport now. Otherwise we will see
* in hash table socket with a funny identity. */
- inet->num = lport;
- inet->sport = htons(lport);
+ inet->inet_num = lport;
+ inet->inet_sport = htons(lport);
sk->sk_hash = hash;
WARN_ON(!sk_unhashed(sk));
__sk_nulls_add_node_rcu(sk, &head->chain);
+ if (tw) {
+ twrefcnt = inet_twsk_unhash(tw);
+ NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
+ }
spin_unlock(lock);
+ if (twrefcnt)
+ inet_twsk_put(tw);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
if (twp) {
*twp = tw;
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
} else if (tw) {
/* Silly. Should hash-dance instead... */
inet_twsk_deschedule(tw, death_row);
- NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
inet_twsk_put(tw);
}
-
return 0;
not_unique:
@@ -341,16 +346,18 @@ not_unique:
static inline u32 inet_sk_port_offset(const struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr,
- inet->dport);
+ return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
+ inet->inet_daddr,
+ inet->inet_dport);
}
-void __inet_hash_nolisten(struct sock *sk)
+int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
spinlock_t *lock;
struct inet_ehash_bucket *head;
+ int twrefcnt = 0;
WARN_ON(!sk_unhashed(sk));
@@ -361,8 +368,13 @@ void __inet_hash_nolisten(struct sock *sk)
spin_lock(lock);
__sk_nulls_add_node_rcu(sk, list);
+ if (tw) {
+ WARN_ON(sk->sk_hash != tw->tw_hash);
+ twrefcnt = inet_twsk_unhash(tw);
+ }
spin_unlock(lock);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ return twrefcnt;
}
EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
@@ -372,7 +384,7 @@ static void __inet_hash(struct sock *sk)
struct inet_listen_hashbucket *ilb;
if (sk->sk_state != TCP_LISTEN) {
- __inet_hash_nolisten(sk);
+ __inet_hash_nolisten(sk, NULL);
return;
}
@@ -421,14 +433,15 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *sk, u32 port_offset,
int (*check_established)(struct inet_timewait_death_row *,
struct sock *, __u16, struct inet_timewait_sock **),
- void (*hash)(struct sock *sk))
+ int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->num;
+ const unsigned short snum = inet_sk(sk)->inet_num;
struct inet_bind_hashbucket *head;
struct inet_bind_bucket *tb;
int ret;
struct net *net = sock_net(sk);
+ int twrefcnt = 1;
if (!snum) {
int i, remaining, low, high, port;
@@ -452,7 +465,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
* unique enough.
*/
inet_bind_bucket_for_each(tb, node, &head->chain) {
- if (ib_net(tb) == net && tb->port == port) {
+ if (net_eq(ib_net(tb), net) &&
+ tb->port == port) {
if (tb->fastreuse >= 0)
goto next_port;
WARN_ON(hlist_empty(&tb->owners));
@@ -485,14 +499,19 @@ ok:
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, port);
if (sk_unhashed(sk)) {
- inet_sk(sk)->sport = htons(port);
- hash(sk);
+ inet_sk(sk)->inet_sport = htons(port);
+ twrefcnt += hash(sk, tw);
}
+ if (tw)
+ twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
spin_unlock(&head->lock);
if (tw) {
inet_twsk_deschedule(tw, death_row);
- inet_twsk_put(tw);
+ while (twrefcnt) {
+ twrefcnt--;
+ inet_twsk_put(tw);
+ }
}
ret = 0;
@@ -503,7 +522,7 @@ ok:
tb = inet_csk(sk)->icsk_bind_hash;
spin_lock_bh(&head->lock);
if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- hash(sk);
+ hash(sk, NULL);
spin_unlock_bh(&head->lock);
return 0;
} else {
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
index 6a667dae315e..47038cb6c138 100644
--- a/net/ipv4/inet_lro.c
+++ b/net/ipv4/inet_lro.c
@@ -64,15 +64,15 @@ static int lro_tcp_ip_check(struct iphdr *iph, struct tcphdr *tcph,
if (iph->ihl != IPH_LEN_WO_OPTIONS)
return -1;
- if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack
- || tcph->rst || tcph->syn || tcph->fin)
+ if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
+ tcph->rst || tcph->syn || tcph->fin)
return -1;
if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
return -1;
- if (tcph->doff != TCPH_LEN_WO_OPTIONS
- && tcph->doff != TCPH_LEN_W_TIMESTAMP)
+ if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
+ tcph->doff != TCPH_LEN_W_TIMESTAMP)
return -1;
/* check tcp options (only timestamp allowed) */
@@ -262,10 +262,10 @@ static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
struct iphdr *iph,
struct tcphdr *tcph)
{
- if ((lro_desc->iph->saddr != iph->saddr)
- || (lro_desc->iph->daddr != iph->daddr)
- || (lro_desc->tcph->source != tcph->source)
- || (lro_desc->tcph->dest != tcph->dest))
+ if ((lro_desc->iph->saddr != iph->saddr) ||
+ (lro_desc->iph->daddr != iph->daddr) ||
+ (lro_desc->tcph->source != tcph->source) ||
+ (lro_desc->tcph->dest != tcph->dest))
return -1;
return 0;
}
@@ -339,9 +339,9 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
u64 flags;
int vlan_hdr_len = 0;
- if (!lro_mgr->get_skb_header
- || lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
- &flags, priv))
+ if (!lro_mgr->get_skb_header ||
+ lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
+ &flags, priv))
goto out;
if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
@@ -351,8 +351,8 @@ static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
if (!lro_desc)
goto out;
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
vlan_hdr_len = VLAN_HLEN;
if (!lro_desc->active) { /* start new lro session */
@@ -446,9 +446,9 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
int hdr_len = LRO_MAX_PG_HLEN;
int vlan_hdr_len = 0;
- if (!lro_mgr->get_frag_header
- || lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
- (void *)&tcph, &flags, priv)) {
+ if (!lro_mgr->get_frag_header ||
+ lro_mgr->get_frag_header(frags, (void *)&mac_hdr, (void *)&iph,
+ (void *)&tcph, &flags, priv)) {
mac_hdr = page_address(frags->page) + frags->page_offset;
goto out1;
}
@@ -472,8 +472,8 @@ static struct sk_buff *__lro_proc_segment(struct net_lro_mgr *lro_mgr,
if (!skb)
goto out;
- if ((skb->protocol == htons(ETH_P_8021Q))
- && !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
+ if ((skb->protocol == htons(ETH_P_8021Q)) &&
+ !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
vlan_hdr_len = VLAN_HLEN;
iph = (void *)(skb->data + vlan_hdr_len);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 13f0781f35cd..c5af909cf701 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -10,44 +10,92 @@
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
+#include <linux/slab.h>
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
+
+/**
+ * inet_twsk_unhash - unhash a timewait socket from established hash
+ * @tw: timewait socket
+ *
+ * unhash a timewait socket from established hash, if hashed.
+ * ehash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_unhash(struct inet_timewait_sock *tw)
+{
+ if (hlist_nulls_unhashed(&tw->tw_node))
+ return 0;
+
+ hlist_nulls_del_rcu(&tw->tw_node);
+ sk_nulls_node_init(&tw->tw_node);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
+/**
+ * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
+ * @tw: timewait socket
+ * @hashinfo: hashinfo pointer
+ *
+ * unhash a timewait socket from bind hash, if hashed.
+ * bind hash lock must be held by caller.
+ * Returns 1 if caller should call inet_twsk_put() after lock release.
+ */
+int inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
+ struct inet_hashinfo *hashinfo)
+{
+ struct inet_bind_bucket *tb = tw->tw_tb;
+
+ if (!tb)
+ return 0;
+
+ __hlist_del(&tw->tw_bind_node);
+ tw->tw_tb = NULL;
+ inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ /*
+ * We cannot call inet_twsk_put() ourself under lock,
+ * caller must call it for us.
+ */
+ return 1;
+}
+
/* Must be called with locally disabled BHs. */
static void __inet_twsk_kill(struct inet_timewait_sock *tw,
struct inet_hashinfo *hashinfo)
{
struct inet_bind_hashbucket *bhead;
- struct inet_bind_bucket *tb;
+ int refcnt;
/* Unlink from established hashes. */
spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
spin_lock(lock);
- if (hlist_nulls_unhashed(&tw->tw_node)) {
- spin_unlock(lock);
- return;
- }
- hlist_nulls_del_rcu(&tw->tw_node);
- sk_nulls_node_init(&tw->tw_node);
+ refcnt = inet_twsk_unhash(tw);
spin_unlock(lock);
/* Disassociate with bind bucket. */
bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
hashinfo->bhash_size)];
+
spin_lock(&bhead->lock);
- tb = tw->tw_tb;
- __hlist_del(&tw->tw_bind_node);
- tw->tw_tb = NULL;
- inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
+ refcnt += inet_twsk_bind_unhash(tw, hashinfo);
spin_unlock(&bhead->lock);
+
#ifdef SOCK_REFCNT_DEBUG
if (atomic_read(&tw->tw_refcnt) != 1) {
printk(KERN_DEBUG "%s timewait_sock %p refcnt=%d\n",
tw->tw_prot->name, tw, atomic_read(&tw->tw_refcnt));
}
#endif
- inet_twsk_put(tw);
+ while (refcnt) {
+ inet_twsk_put(tw);
+ refcnt--;
+ }
}
static noinline void inet_twsk_free(struct inet_timewait_sock *tw)
@@ -86,7 +134,7 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
Note, that any socket with inet->num != 0 MUST be bound in
binding cache, even if it is closed.
*/
- bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->num,
+ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
hashinfo->bhash_size)];
spin_lock(&bhead->lock);
tw->tw_tb = icsk->icsk_bind_hash;
@@ -101,16 +149,24 @@ void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
* Should be done before removing sk from established chain
* because readers are lockless and search established first.
*/
- atomic_inc(&tw->tw_refcnt);
inet_twsk_add_node_rcu(tw, &ehead->twchain);
/* Step 3: Remove SK from established hash. */
if (__sk_nulls_del_node_init_rcu(sk))
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+ /*
+ * Notes :
+ * - We initially set tw_refcnt to 0 in inet_twsk_alloc()
+ * - We add one reference for the bhash link
+ * - We add one reference for the ehash link
+ * - We want this refcnt update done before allowing other
+ * threads to find this tw in ehash chain.
+ */
+ atomic_add(1 + 1 + 1, &tw->tw_refcnt);
+
spin_unlock(lock);
}
-
EXPORT_SYMBOL_GPL(__inet_twsk_hashdance);
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state)
@@ -124,14 +180,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
kmemcheck_annotate_bitfield(tw, flags);
/* Give us an identity. */
- tw->tw_daddr = inet->daddr;
- tw->tw_rcv_saddr = inet->rcv_saddr;
+ tw->tw_daddr = inet->inet_daddr;
+ tw->tw_rcv_saddr = inet->inet_rcv_saddr;
tw->tw_bound_dev_if = sk->sk_bound_dev_if;
- tw->tw_num = inet->num;
+ tw->tw_num = inet->inet_num;
tw->tw_state = TCP_TIME_WAIT;
tw->tw_substate = state;
- tw->tw_sport = inet->sport;
- tw->tw_dport = inet->dport;
+ tw->tw_sport = inet->inet_sport;
+ tw->tw_dport = inet->inet_dport;
tw->tw_family = sk->sk_family;
tw->tw_reuse = sk->sk_reuse;
tw->tw_hash = sk->sk_hash;
@@ -139,14 +195,18 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int stat
tw->tw_transparent = inet->transparent;
tw->tw_prot = sk->sk_prot_creator;
twsk_net_set(tw, hold_net(sock_net(sk)));
- atomic_set(&tw->tw_refcnt, 1);
+ /*
+ * Because we use RCU lookups, we should not set tw_refcnt
+ * to a non null value before everything is setup for this
+ * timewait socket.
+ */
+ atomic_set(&tw->tw_refcnt, 0);
inet_twsk_dead_node_init(tw);
__module_get(tw->tw_prot->owner);
}
return tw;
}
-
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
/* Returns non-zero if quota exceeded. */
@@ -225,7 +285,6 @@ void inet_twdr_hangman(unsigned long data)
out:
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twdr_hangman);
void inet_twdr_twkill_work(struct work_struct *work)
@@ -256,7 +315,6 @@ void inet_twdr_twkill_work(struct work_struct *work)
spin_unlock_bh(&twdr->death_lock);
}
}
-
EXPORT_SYMBOL_GPL(inet_twdr_twkill_work);
/* These are always called from BH context. See callers in
@@ -276,7 +334,6 @@ void inet_twsk_deschedule(struct inet_timewait_sock *tw,
spin_unlock(&twdr->death_lock);
__inet_twsk_kill(tw, twdr->hashinfo);
}
-
EXPORT_SYMBOL(inet_twsk_deschedule);
void inet_twsk_schedule(struct inet_timewait_sock *tw,
@@ -357,7 +414,6 @@ void inet_twsk_schedule(struct inet_timewait_sock *tw,
mod_timer(&twdr->tw_timer, jiffies + twdr->period);
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twsk_schedule);
void inet_twdr_twcal_tick(unsigned long data)
@@ -418,40 +474,48 @@ out:
#endif
spin_unlock(&twdr->death_lock);
}
-
EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick);
-void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo,
+void inet_twsk_purge(struct inet_hashinfo *hashinfo,
struct inet_timewait_death_row *twdr, int family)
{
struct inet_timewait_sock *tw;
struct sock *sk;
struct hlist_nulls_node *node;
- int h;
+ unsigned int slot;
- local_bh_disable();
- for (h = 0; h < (hashinfo->ehash_size); h++) {
- struct inet_ehash_bucket *head =
- inet_ehash_bucket(hashinfo, h);
- spinlock_t *lock = inet_ehash_lockp(hashinfo, h);
+ for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+restart_rcu:
+ rcu_read_lock();
restart:
- spin_lock(lock);
- sk_nulls_for_each(sk, node, &head->twchain) {
-
+ sk_nulls_for_each_rcu(sk, node, &head->twchain) {
tw = inet_twsk(sk);
- if (!net_eq(twsk_net(tw), net) ||
- tw->tw_family != family)
+ if ((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))
continue;
- atomic_inc(&tw->tw_refcnt);
- spin_unlock(lock);
+ if (unlikely(!atomic_inc_not_zero(&tw->tw_refcnt)))
+ continue;
+
+ if (unlikely((tw->tw_family != family) ||
+ atomic_read(&twsk_net(tw)->count))) {
+ inet_twsk_put(tw);
+ goto restart;
+ }
+
+ rcu_read_unlock();
inet_twsk_deschedule(tw, twdr);
inet_twsk_put(tw);
-
- goto restart;
+ goto restart_rcu;
}
- spin_unlock(lock);
+ /* If the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot)
+ goto restart;
+ rcu_read_unlock();
}
- local_bh_enable();
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index b1fbe18feb5a..6bcfe52a9c87 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -67,9 +67,6 @@
* ip_id_count: idlock
*/
-/* Exported for inet_getid inline function. */
-DEFINE_SPINLOCK(inet_peer_idlock);
-
static struct kmem_cache *peer_cachep __read_mostly;
#define node_height(x) x->avl_height
@@ -390,7 +387,7 @@ struct inet_peer *inet_getpeer(__be32 daddr, int create)
n->v4daddr = daddr;
atomic_set(&n->refcnt, 1);
atomic_set(&n->rid, 0);
- n->ip_id_count = secure_ip_id(daddr);
+ atomic_set(&n->ip_id_count, secure_ip_id(daddr));
n->tcp_ts_stamp = 0;
write_lock_bh(&peer_pool_lock);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index a2991bc8e32e..af10942b326c 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -25,6 +25,7 @@
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp.h>
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index d3fe10be7219..75347ea70ea0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -32,6 +32,9 @@
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
+#include <linux/slab.h>
+#include <net/route.h>
+#include <net/dst.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -205,11 +208,35 @@ static void ip_expire(unsigned long arg)
if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) {
struct sk_buff *head = qp->q.fragments;
- /* Send an ICMP "Fragment Reassembly Timeout" message. */
- if ((head->dev = dev_get_by_index(net, qp->iif)) != NULL) {
- icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- dev_put(head->dev);
+ rcu_read_lock();
+ head->dev = dev_get_by_index_rcu(net, qp->iif);
+ if (!head->dev)
+ goto out_rcu_unlock;
+
+ /*
+ * Only search router table for the head fragment,
+ * when defraging timeout at PRE_ROUTING HOOK.
+ */
+ if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) {
+ const struct iphdr *iph = ip_hdr(head);
+ int err = ip_route_input(head, iph->daddr, iph->saddr,
+ iph->tos, head->dev);
+ if (unlikely(err))
+ goto out_rcu_unlock;
+
+ /*
+ * Only an end host needs to send an ICMP
+ * "Fragment Reassembly Timeout" message, per RFC792.
+ */
+ if (skb_rtable(head)->rt_type != RTN_LOCAL)
+ goto out_rcu_unlock;
+
}
+
+ /* Send an ICMP "Fragment Reassembly Timeout" message. */
+ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
+out_rcu_unlock:
+ rcu_read_unlock();
}
out:
spin_unlock(&qp->q.lock);
@@ -603,7 +630,6 @@ static int zero;
static struct ctl_table ip4_frags_ns_ctl_table[] = {
{
- .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH,
.procname = "ipfrag_high_thresh",
.data = &init_net.ipv4.frags.high_thresh,
.maxlen = sizeof(int),
@@ -611,7 +637,6 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH,
.procname = "ipfrag_low_thresh",
.data = &init_net.ipv4.frags.low_thresh,
.maxlen = sizeof(int),
@@ -619,26 +644,22 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_IPFRAG_TIME,
.procname = "ipfrag_time",
.data = &init_net.ipv4.frags.timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{ }
};
static struct ctl_table ip4_frags_ctl_table[] = {
{
- .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL,
.procname = "ipfrag_secret_interval",
.data = &ip4_frags.secret_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
.procname = "ipfrag_max_dist",
@@ -651,13 +672,13 @@ static struct ctl_table ip4_frags_ctl_table[] = {
{ }
};
-static int ip4_frags_ns_ctl_register(struct net *net)
+static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
table = ip4_frags_ns_ctl_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
@@ -675,13 +696,13 @@ static int ip4_frags_ns_ctl_register(struct net *net)
return 0;
err_reg:
- if (net != &init_net)
+ if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
return -ENOMEM;
}
-static void ip4_frags_ns_ctl_unregister(struct net *net)
+static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
struct ctl_table *table;
@@ -709,7 +730,7 @@ static inline void ip4_frags_ctl_register(void)
}
#endif
-static int ipv4_frags_init_net(struct net *net)
+static int __net_init ipv4_frags_init_net(struct net *net)
{
/*
* Fragment cache limits. We will commit 256K at one time. Should we
@@ -731,7 +752,7 @@ static int ipv4_frags_init_net(struct net *net)
return ip4_frags_ns_ctl_register(net);
}
-static void ipv4_frags_exit_net(struct net *net)
+static void __net_exit ipv4_frags_exit_net(struct net *net)
{
ip4_frags_ns_ctl_unregister(net);
inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 143333852624..fe381d12ecdd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -14,6 +14,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -125,7 +126,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
#define HASH_SIZE 16
-static int ipgre_net_id;
+static int ipgre_net_id __read_mostly;
struct ipgre_net {
struct ip_tunnel *tunnels[4][HASH_SIZE];
@@ -156,8 +157,13 @@ struct ipgre_net {
#define tunnels_r tunnels[2]
#define tunnels_l tunnels[1]
#define tunnels_wc tunnels[0]
+/*
+ * Locking : hash tables are protected by RCU and a spinlock
+ */
+static DEFINE_SPINLOCK(ipgre_lock);
-static DEFINE_RWLOCK(ipgre_lock);
+#define for_each_ip_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
/* Given src, dst and key, find appropriate for input tunnel. */
@@ -175,7 +181,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
ARPHRD_ETHER : ARPHRD_IPGRE;
int score, cand_score = 4;
- for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
if (local != t->parms.iph.saddr ||
remote != t->parms.iph.daddr ||
key != t->parms.i_key ||
@@ -200,7 +206,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
if (remote != t->parms.iph.daddr ||
key != t->parms.i_key ||
!(t->dev->flags & IFF_UP))
@@ -224,7 +230,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for (t = ign->tunnels_l[h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
if ((local != t->parms.iph.saddr &&
(local != t->parms.iph.daddr ||
!ipv4_is_multicast(local))) ||
@@ -250,7 +256,7 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
}
}
- for (t = ign->tunnels_wc[h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
if (t->parms.i_key != key ||
!(t->dev->flags & IFF_UP))
continue;
@@ -276,8 +282,9 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
if (cand != NULL)
return cand;
- if (ign->fb_tunnel_dev->flags & IFF_UP)
- return netdev_priv(ign->fb_tunnel_dev);
+ dev = ign->fb_tunnel_dev;
+ if (dev->flags & IFF_UP)
+ return netdev_priv(dev);
return NULL;
}
@@ -311,10 +318,10 @@ static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
struct ip_tunnel **tp = ipgre_bucket(ign, t);
+ spin_lock_bh(&ipgre_lock);
t->next = *tp;
- write_lock_bh(&ipgre_lock);
- *tp = t;
- write_unlock_bh(&ipgre_lock);
+ rcu_assign_pointer(*tp, t);
+ spin_unlock_bh(&ipgre_lock);
}
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
@@ -323,9 +330,9 @@ static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
if (t == *tp) {
- write_lock_bh(&ipgre_lock);
+ spin_lock_bh(&ipgre_lock);
*tp = t->next;
- write_unlock_bh(&ipgre_lock);
+ spin_unlock_bh(&ipgre_lock);
break;
}
}
@@ -476,7 +483,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
}
- read_lock(&ipgre_lock);
+ rcu_read_lock();
t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
flags & GRE_KEY ?
*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
@@ -494,7 +501,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
t->err_count = 1;
t->err_time = jiffies;
out:
- read_unlock(&ipgre_lock);
+ rcu_read_unlock();
return;
}
@@ -573,7 +580,7 @@ static int ipgre_rcv(struct sk_buff *skb)
gre_proto = *(__be16 *)(h + 2);
- read_lock(&ipgre_lock);
+ rcu_read_lock();
if ((tunnel = ipgre_tunnel_lookup(skb->dev,
iph->saddr, iph->daddr, key,
gre_proto))) {
@@ -647,13 +654,13 @@ static int ipgre_rcv(struct sk_buff *skb)
ipgre_ecn_decapsulate(iph, skb);
netif_rx(skb);
- read_unlock(&ipgre_lock);
+ rcu_read_unlock();
return(0);
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
- read_unlock(&ipgre_lock);
+ rcu_read_unlock();
drop_nolock:
kfree_skb(skb);
return(0);
@@ -662,7 +669,8 @@ drop_nolock:
static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &tunnel->dev->stats;
+ struct net_device_stats *stats = &dev->stats;
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
struct iphdr *old_iph = ip_hdr(skb);
struct iphdr *tiph;
u8 tos;
@@ -786,7 +794,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
}
if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
- icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
ip_rt_put(rt);
goto tx_error;
}
@@ -803,14 +811,16 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
tunnel->err_count = 0;
}
- max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+ max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;
if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
(skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+ if (max_headroom > dev->needed_headroom)
+ dev->needed_headroom = max_headroom;
if (!new_skb) {
ip_rt_put(rt);
- stats->tx_dropped++;
+ txq->tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -1137,12 +1147,9 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
if (saddr)
memcpy(&iph->saddr, saddr, 4);
-
- if (daddr) {
+ if (daddr)
memcpy(&iph->daddr, daddr, 4);
- return t->hlen;
- }
- if (iph->daddr && !ipv4_is_multicast(iph->daddr))
+ if (iph->daddr)
return t->hlen;
return -t->hlen;
@@ -1283,33 +1290,27 @@ static const struct net_protocol ipgre_protocol = {
.netns_ok = 1,
};
-static void ipgre_destroy_tunnels(struct ipgre_net *ign)
+static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
{
int prio;
for (prio = 0; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
- while ((t = ign->tunnels[prio][h]) != NULL)
- unregister_netdevice(t->dev);
+ struct ip_tunnel *t = ign->tunnels[prio][h];
+
+ while (t != NULL) {
+ unregister_netdevice_queue(t->dev, head);
+ t = t->next;
+ }
}
}
}
-static int ipgre_init_net(struct net *net)
+static int __net_init ipgre_init_net(struct net *net)
{
+ struct ipgre_net *ign = net_generic(net, ipgre_net_id);
int err;
- struct ipgre_net *ign;
-
- err = -ENOMEM;
- ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
- if (ign == NULL)
- goto err_alloc;
-
- err = net_assign_generic(net, ipgre_net_id, ign);
- if (err < 0)
- goto err_assign;
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
ipgre_tunnel_setup);
@@ -1330,27 +1331,26 @@ static int ipgre_init_net(struct net *net)
err_reg_dev:
free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
- /* nothing */
-err_assign:
- kfree(ign);
-err_alloc:
return err;
}
-static void ipgre_exit_net(struct net *net)
+static void __net_exit ipgre_exit_net(struct net *net)
{
struct ipgre_net *ign;
+ LIST_HEAD(list);
ign = net_generic(net, ipgre_net_id);
rtnl_lock();
- ipgre_destroy_tunnels(ign);
+ ipgre_destroy_tunnels(ign, &list);
+ unregister_netdevice_many(&list);
rtnl_unlock();
- kfree(ign);
}
static struct pernet_operations ipgre_net_ops = {
.init = ipgre_init_net,
.exit = ipgre_exit_net,
+ .id = &ipgre_net_id,
+ .size = sizeof(struct ipgre_net),
};
static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -1471,7 +1471,7 @@ static void ipgre_tap_setup(struct net_device *dev)
dev->features |= NETIF_F_NETNS_LOCAL;
}
-static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
+static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
struct nlattr *data[])
{
struct ip_tunnel *nt;
@@ -1665,15 +1665,16 @@ static int __init ipgre_init(void)
printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
- if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
+ err = register_pernet_device(&ipgre_net_ops);
+ if (err < 0)
+ return err;
+
+ err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
+ if (err < 0) {
printk(KERN_INFO "ipgre init: can't add protocol\n");
- return -EAGAIN;
+ goto add_proto_failed;
}
- err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
- if (err < 0)
- goto gen_device_failed;
-
err = rtnl_link_register(&ipgre_link_ops);
if (err < 0)
goto rtnl_link_failed;
@@ -1688,9 +1689,9 @@ out:
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
- unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
-gen_device_failed:
inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+add_proto_failed:
+ unregister_pernet_device(&ipgre_net_ops);
goto out;
}
@@ -1698,9 +1699,9 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
- unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
printk(KERN_INFO "ipgre close: can't remove protocol\n");
+ unregister_pernet_device(&ipgre_net_ops);
}
module_init(ipgre_init);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 6c98b43badf4..f8ab7a380d4a 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -119,6 +119,7 @@
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
+#include <linux/slab.h>
#include <linux/net.h>
#include <linux/socket.h>
@@ -161,10 +162,10 @@ int ip_call_ra_chain(struct sk_buff *skb)
/* If socket is bound to an interface, only report
* the packet if it came from that interface.
*/
- if (sk && inet_sk(sk)->num == protocol &&
+ if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- sock_net(sk) == dev_net(dev)) {
+ net_eq(sock_net(sk), dev_net(dev))) {
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN)) {
read_unlock(&ip_ra_lock);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index 94bf105ef3c9..4c09a31fd140 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -11,6 +11,7 @@
#include <linux/capability.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/types.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index f9895180f481..c65f18e0936e 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -51,6 +51,7 @@
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
+#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/sockios.h>
@@ -254,7 +255,7 @@ int ip_mc_output(struct sk_buff *skb)
*/
if (rt->rt_flags&RTCF_MULTICAST) {
- if ((!sk || inet_sk(sk)->mc_loop)
+ if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
/* Small optimization: do not loopback not local frames,
which returned after forwarding; they will be dropped
@@ -264,9 +265,11 @@ int ip_mc_output(struct sk_buff *skb)
This check is duplicated in ip_mr_input at the moment.
*/
- && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
+ &&
+ ((rt->rt_flags & RTCF_LOCAL) ||
+ !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
- ) {
+ ) {
struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
if (newskb)
NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
@@ -329,7 +332,7 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
__be32 daddr;
/* Use correct destination address if we have options. */
- daddr = inet->daddr;
+ daddr = inet->inet_daddr;
if(opt && opt->srr)
daddr = opt->faddr;
@@ -338,13 +341,13 @@ int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
.mark = sk->sk_mark,
.nl_u = { .ip4_u =
{ .daddr = daddr,
- .saddr = inet->saddr,
+ .saddr = inet->inet_saddr,
.tos = RT_CONN_FLAGS(sk) } },
.proto = sk->sk_protocol,
.flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
- { .sport = inet->sport,
- .dport = inet->dport } } };
+ { .sport = inet->inet_sport,
+ .dport = inet->inet_dport } } };
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
@@ -379,7 +382,7 @@ packet_routed:
if (opt && opt->optlen) {
iph->ihl += opt->optlen >> 2;
- ip_options_build(skb, opt, inet->daddr, rt, 0);
+ ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
}
ip_select_ident_more(iph, &rt->u.dst, sk,
@@ -501,8 +504,8 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
if (skb->sk) {
frag->sk = skb->sk;
frag->destructor = sock_wfree;
- truesizes += frag->truesize;
}
+ truesizes += frag->truesize;
}
/* Everything is OK. Generate! */
@@ -846,7 +849,8 @@ int ip_append_data(struct sock *sk,
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
+ mtu-exthdrlen);
return -EMSGSIZE;
}
@@ -1100,7 +1104,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
return -EMSGSIZE;
}
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index e982b5c1ee17..1e64dabbd232 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -23,6 +23,7 @@
#include <linux/icmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -245,7 +246,7 @@ int ip_ra_control(struct sock *sk, unsigned char on,
{
struct ip_ra_chain *ra, *new_ra, **rap;
- if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW)
+ if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num == IPPROTO_RAW)
return -EINVAL;
new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL;
@@ -451,7 +452,8 @@ static int do_ip_setsockopt(struct sock *sk, int level,
(1<<IP_TTL) | (1<<IP_HDRINCL) |
(1<<IP_MTU_DISCOVER) | (1<<IP_RECVERR) |
(1<<IP_ROUTER_ALERT) | (1<<IP_FREEBIND) |
- (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT))) ||
+ (1<<IP_PASSSEC) | (1<<IP_TRANSPARENT) |
+ (1<<IP_MINTTL))) ||
optname == IP_MULTICAST_TTL ||
optname == IP_MULTICAST_ALL ||
optname == IP_MULTICAST_LOOP ||
@@ -480,7 +482,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
case IP_OPTIONS:
{
struct ip_options *opt = NULL;
- if (optlen > 40 || optlen < 0)
+ if (optlen > 40)
goto e_inval;
err = ip_options_get_from_user(sock_net(sk), &opt,
optval, optlen);
@@ -492,7 +494,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
if (sk->sk_family == PF_INET ||
(!((1 << sk->sk_state) &
(TCPF_LISTEN | TCPF_CLOSE)) &&
- inet->daddr != LOOPBACK4_IPV6)) {
+ inet->inet_daddr != LOOPBACK4_IPV6)) {
#endif
if (inet->opt)
icsk->icsk_ext_hdr_len -= inet->opt->optlen;
@@ -575,7 +577,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->hdrincl = val ? 1 : 0;
break;
case IP_MTU_DISCOVER:
- if (val < 0 || val > 3)
+ if (val < IP_PMTUDISC_DONT || val > IP_PMTUDISC_PROBE)
goto e_inval;
inet->pmtudisc = val;
break;
@@ -936,6 +938,14 @@ mc_msf_out:
inet->transparent = !!val;
break;
+ case IP_MINTTL:
+ if (optlen < 1)
+ goto e_inval;
+ if (val < 0 || val > 255)
+ goto e_inval;
+ inet->min_ttl = val;
+ break;
+
default:
err = -ENOPROTOOPT;
break;
@@ -1180,8 +1190,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (inet->cmsg_flags & IP_CMSG_PKTINFO) {
struct in_pktinfo info;
- info.ipi_addr.s_addr = inet->rcv_saddr;
- info.ipi_spec_dst.s_addr = inet->rcv_saddr;
+ info.ipi_addr.s_addr = inet->inet_rcv_saddr;
+ info.ipi_spec_dst.s_addr = inet->inet_rcv_saddr;
info.ipi_ifindex = inet->mc_index;
put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info);
}
@@ -1198,6 +1208,9 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_TRANSPARENT:
val = inet->transparent;
break;
+ case IP_MINTTL:
+ val = inet->min_ttl;
+ break;
default:
release_sock(sk);
return -ENOPROTOOPT;
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c
index 38fbf04150ae..629067571f02 100644
--- a/net/ipv4/ipcomp.c
+++ b/net/ipv4/ipcomp.c
@@ -25,6 +25,7 @@
static void ipcomp4_err(struct sk_buff *skb, u32 info)
{
+ struct net *net = dev_net(skb->dev);
__be32 spi;
struct iphdr *iph = (struct iphdr *)skb->data;
struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
@@ -35,7 +36,7 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
return;
spi = htonl(ntohs(ipch->cpi));
- x = xfrm_state_lookup(&init_net, (xfrm_address_t *)&iph->daddr,
+ x = xfrm_state_lookup(net, skb->mark, (xfrm_address_t *)&iph->daddr,
spi, IPPROTO_COMP, AF_INET);
if (!x)
return;
@@ -47,9 +48,10 @@ static void ipcomp4_err(struct sk_buff *skb, u32 info)
/* We always hold one tunnel user reference to indicate a tunnel */
static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
struct xfrm_state *t;
- t = xfrm_state_alloc(&init_net);
+ t = xfrm_state_alloc(net);
if (t == NULL)
goto out;
@@ -61,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x)
t->props.mode = x->props.mode;
t->props.saddr.a4 = x->props.saddr.a4;
t->props.flags = x->props.flags;
+ memcpy(&t->mark, &x->mark, sizeof(t->mark));
if (xfrm_init_state(t))
goto error;
@@ -82,10 +85,12 @@ error:
*/
static int ipcomp_tunnel_attach(struct xfrm_state *x)
{
+ struct net *net = xs_net(x);
int err = 0;
struct xfrm_state *t;
+ u32 mark = x->mark.v & x->mark.m;
- t = xfrm_state_lookup(&init_net, (xfrm_address_t *)&x->id.daddr.a4,
+ t = xfrm_state_lookup(net, mark, (xfrm_address_t *)&x->id.daddr.a4,
x->props.saddr.a4, IPPROTO_IPIP, AF_INET);
if (!t) {
t = ipcomp_tunnel_create(x);
@@ -124,16 +129,12 @@ static int ipcomp4_init_state(struct xfrm_state *x)
if (x->props.mode == XFRM_MODE_TUNNEL) {
err = ipcomp_tunnel_attach(x);
if (err)
- goto error_tunnel;
+ goto out;
}
err = 0;
out:
return err;
-
-error_tunnel:
- ipcomp_destroy(x);
- goto out;
}
static const struct xfrm_type ipcomp_type = {
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index f8d04c256454..067ce9e043dc 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -53,6 +53,7 @@
#include <linux/root_dev.h>
#include <linux/delay.h>
#include <linux/nfs_fs.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/arp.h>
#include <net/ip.h>
@@ -187,6 +188,16 @@ struct ic_device {
static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */
static struct net_device *ic_dev __initdata = NULL; /* Selected device */
+static bool __init ic_device_match(struct net_device *dev)
+{
+ if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
+ (!(dev->flags & IFF_LOOPBACK) &&
+ (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
+ strncmp(dev->name, "dummy", 5)))
+ return true;
+ return false;
+}
+
static int __init ic_open_devs(void)
{
struct ic_device *d, **last;
@@ -207,10 +218,7 @@ static int __init ic_open_devs(void)
for_each_netdev(&init_net, dev) {
if (dev->flags & IFF_LOOPBACK)
continue;
- if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) :
- (!(dev->flags & IFF_LOOPBACK) &&
- (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) &&
- strncmp(dev->name, "dummy", 5))) {
+ if (ic_device_match(dev)) {
int able = 0;
if (dev->mtu >= 364)
able |= IC_BOOTP;
@@ -228,7 +236,7 @@ static int __init ic_open_devs(void)
}
if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) {
rtnl_unlock();
- return -1;
+ return -ENOMEM;
}
d->dev = dev;
*last = d;
@@ -253,7 +261,7 @@ static int __init ic_open_devs(void)
printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name);
else
printk(KERN_ERR "IP-Config: No network devices available.\n");
- return -1;
+ return -ENODEV;
}
return 0;
}
@@ -1172,10 +1180,9 @@ static int __init ic_dynamic(void)
schedule_timeout_uninterruptible(1);
#ifdef IPCONFIG_DHCP
/* DHCP isn't done until we get a DHCPACK. */
- if ((ic_got_reply & IC_BOOTP)
- && (ic_proto_enabled & IC_USE_DHCP)
- && ic_dhcp_msgtype != DHCPACK)
- {
+ if ((ic_got_reply & IC_BOOTP) &&
+ (ic_proto_enabled & IC_USE_DHCP) &&
+ ic_dhcp_msgtype != DHCPACK) {
ic_got_reply = 0;
printk(",");
continue;
@@ -1304,6 +1311,32 @@ __be32 __init root_nfs_parse_addr(char *name)
return addr;
}
+#define DEVICE_WAIT_MAX 12 /* 12 seconds */
+
+static int __init wait_for_devices(void)
+{
+ int i;
+
+ msleep(CONF_PRE_OPEN);
+ for (i = 0; i < DEVICE_WAIT_MAX; i++) {
+ struct net_device *dev;
+ int found = 0;
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ if (ic_device_match(dev)) {
+ found = 1;
+ break;
+ }
+ }
+ rtnl_unlock();
+ if (found)
+ return 0;
+ ssleep(1);
+ }
+ return -ENODEV;
+}
+
/*
* IP Autoconfig dispatcher.
*/
@@ -1314,6 +1347,7 @@ static int __init ip_auto_config(void)
#ifdef IPCONFIG_DYNAMIC
int retries = CONF_OPEN_RETRIES;
#endif
+ int err;
#ifdef CONFIG_PROC_FS
proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops);
@@ -1326,12 +1360,15 @@ static int __init ip_auto_config(void)
#ifdef IPCONFIG_DYNAMIC
try_try_again:
#endif
- /* Give hardware a chance to settle */
- msleep(CONF_PRE_OPEN);
+ /* Wait for devices to appear */
+ err = wait_for_devices();
+ if (err)
+ return err;
/* Setup all network devices */
- if (ic_open_devs() < 0)
- return -1;
+ err = ic_open_devs();
+ if (err)
+ return err;
/* Give drivers a chance to settle */
ssleep(CONF_POST_OPEN);
@@ -1344,9 +1381,9 @@ static int __init ip_auto_config(void)
*/
if (ic_myaddr == NONE ||
#ifdef CONFIG_ROOT_NFS
- (root_server_addr == NONE
- && ic_servaddr == NONE
- && ROOT_DEV == Root_NFS) ||
+ (root_server_addr == NONE &&
+ ic_servaddr == NONE &&
+ ROOT_DEV == Root_NFS) ||
#endif
ic_first_dev->next) {
#ifdef IPCONFIG_DYNAMIC
@@ -1447,7 +1484,7 @@ late_initcall(ip_auto_config);
/*
* Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel
- * command line parameter. See Documentation/filesystems/nfsroot.txt.
+ * command line parameter. See Documentation/filesystems/nfs/nfsroot.txt.
*/
static int __init ic_proto_name(char *name)
{
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ae40ed1ba560..0b27b14dcc9d 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -95,6 +95,7 @@
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
@@ -119,7 +120,7 @@
#define HASH_SIZE 16
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)
-static int ipip_net_id;
+static int ipip_net_id __read_mostly;
struct ipip_net {
struct ip_tunnel *tunnels_r_l[HASH_SIZE];
struct ip_tunnel *tunnels_r[HASH_SIZE];
@@ -130,11 +131,16 @@ struct ipip_net {
struct net_device *fb_tunnel_dev;
};
-static void ipip_fb_tunnel_init(struct net_device *dev);
static void ipip_tunnel_init(struct net_device *dev);
static void ipip_tunnel_setup(struct net_device *dev);
-static DEFINE_RWLOCK(ipip_lock);
+/*
+ * Locking : hash tables are protected by RCU and a spinlock
+ */
+static DEFINE_SPINLOCK(ipip_lock);
+
+#define for_each_ip_tunnel_rcu(start) \
+ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
__be32 remote, __be32 local)
@@ -144,20 +150,21 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
struct ip_tunnel *t;
struct ipip_net *ipn = net_generic(net, ipip_net_id);
- for (t = ipn->tunnels_r_l[h0^h1]; t; t = t->next) {
+ for_each_ip_tunnel_rcu(ipn->tunnels_r_l[h0 ^ h1])
if (local == t->parms.iph.saddr &&
remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- }
- for (t = ipn->tunnels_r[h0]; t; t = t->next) {
+
+ for_each_ip_tunnel_rcu(ipn->tunnels_r[h0])
if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP))
return t;
- }
- for (t = ipn->tunnels_l[h1]; t; t = t->next) {
+
+ for_each_ip_tunnel_rcu(ipn->tunnels_l[h1])
if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP))
return t;
- }
- if ((t = ipn->tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP))
+
+ t = rcu_dereference(ipn->tunnels_wc[0]);
+ if (t && (t->dev->flags&IFF_UP))
return t;
return NULL;
}
@@ -193,9 +200,9 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
if (t == *tp) {
- write_lock_bh(&ipip_lock);
+ spin_lock_bh(&ipip_lock);
*tp = t->next;
- write_unlock_bh(&ipip_lock);
+ spin_unlock_bh(&ipip_lock);
break;
}
}
@@ -205,10 +212,10 @@ static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
{
struct ip_tunnel **tp = ipip_bucket(ipn, t);
+ spin_lock_bh(&ipip_lock);
t->next = *tp;
- write_lock_bh(&ipip_lock);
- *tp = t;
- write_unlock_bh(&ipip_lock);
+ rcu_assign_pointer(*tp, t);
+ spin_unlock_bh(&ipip_lock);
}
static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -267,9 +274,9 @@ static void ipip_tunnel_uninit(struct net_device *dev)
struct ipip_net *ipn = net_generic(net, ipip_net_id);
if (dev == ipn->fb_tunnel_dev) {
- write_lock_bh(&ipip_lock);
+ spin_lock_bh(&ipip_lock);
ipn->tunnels_wc[0] = NULL;
- write_unlock_bh(&ipip_lock);
+ spin_unlock_bh(&ipip_lock);
} else
ipip_tunnel_unlink(ipn, netdev_priv(dev));
dev_put(dev);
@@ -318,7 +325,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
err = -ENOENT;
- read_lock(&ipip_lock);
+ rcu_read_lock();
t = ipip_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr);
if (t == NULL || t->parms.iph.daddr == 0)
goto out;
@@ -333,7 +340,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
t->err_count = 1;
t->err_time = jiffies;
out:
- read_unlock(&ipip_lock);
+ rcu_read_unlock();
return err;
}
@@ -351,11 +358,11 @@ static int ipip_rcv(struct sk_buff *skb)
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
- read_lock(&ipip_lock);
+ rcu_read_lock();
if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
iph->saddr, iph->daddr)) != NULL) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
- read_unlock(&ipip_lock);
+ rcu_read_unlock();
kfree_skb(skb);
return 0;
}
@@ -374,10 +381,10 @@ static int ipip_rcv(struct sk_buff *skb)
nf_reset(skb);
ipip_ecn_decapsulate(iph, skb);
netif_rx(skb);
- read_unlock(&ipip_lock);
+ rcu_read_unlock();
return 0;
}
- read_unlock(&ipip_lock);
+ rcu_read_unlock();
return -1;
}
@@ -390,7 +397,8 @@ static int ipip_rcv(struct sk_buff *skb)
static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- struct net_device_stats *stats = &tunnel->dev->stats;
+ struct net_device_stats *stats = &dev->stats;
+ struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
struct iphdr *tiph = &tunnel->parms.iph;
u8 tos = tunnel->parms.iph.tos;
__be16 df = tiph->frag_off;
@@ -480,7 +488,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
- stats->tx_dropped++;
+ txq->tx_dropped++;
dev_kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -722,7 +730,7 @@ static void ipip_tunnel_init(struct net_device *dev)
ipip_tunnel_bind_dev(dev);
}
-static void ipip_fb_tunnel_init(struct net_device *dev)
+static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
struct iphdr *iph = &tunnel->parms.iph;
@@ -748,33 +756,27 @@ static struct xfrm_tunnel ipip_handler = {
static const char banner[] __initconst =
KERN_INFO "IPv4 over IPv4 tunneling driver\n";
-static void ipip_destroy_tunnels(struct ipip_net *ipn)
+static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
{
int prio;
for (prio = 1; prio < 4; prio++) {
int h;
for (h = 0; h < HASH_SIZE; h++) {
- struct ip_tunnel *t;
- while ((t = ipn->tunnels[prio][h]) != NULL)
- unregister_netdevice(t->dev);
+ struct ip_tunnel *t = ipn->tunnels[prio][h];
+
+ while (t != NULL) {
+ unregister_netdevice_queue(t->dev, head);
+ t = t->next;
+ }
}
}
}
-static int ipip_init_net(struct net *net)
+static int __net_init ipip_init_net(struct net *net)
{
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
int err;
- struct ipip_net *ipn;
-
- err = -ENOMEM;
- ipn = kzalloc(sizeof(struct ipip_net), GFP_KERNEL);
- if (ipn == NULL)
- goto err_alloc;
-
- err = net_assign_generic(net, ipip_net_id, ipn);
- if (err < 0)
- goto err_assign;
ipn->tunnels[0] = ipn->tunnels_wc;
ipn->tunnels[1] = ipn->tunnels_l;
@@ -801,27 +803,26 @@ err_reg_dev:
free_netdev(ipn->fb_tunnel_dev);
err_alloc_dev:
/* nothing */
-err_assign:
- kfree(ipn);
-err_alloc:
return err;
}
-static void ipip_exit_net(struct net *net)
+static void __net_exit ipip_exit_net(struct net *net)
{
- struct ipip_net *ipn;
+ struct ipip_net *ipn = net_generic(net, ipip_net_id);
+ LIST_HEAD(list);
- ipn = net_generic(net, ipip_net_id);
rtnl_lock();
- ipip_destroy_tunnels(ipn);
- unregister_netdevice(ipn->fb_tunnel_dev);
+ ipip_destroy_tunnels(ipn, &list);
+ unregister_netdevice_queue(ipn->fb_tunnel_dev, &list);
+ unregister_netdevice_many(&list);
rtnl_unlock();
- kfree(ipn);
}
static struct pernet_operations ipip_net_ops = {
.init = ipip_init_net,
.exit = ipip_exit_net,
+ .id = &ipip_net_id,
+ .size = sizeof(struct ipip_net),
};
static int __init ipip_init(void)
@@ -830,15 +831,14 @@ static int __init ipip_init(void)
printk(banner);
- if (xfrm4_tunnel_register(&ipip_handler, AF_INET)) {
+ err = register_pernet_device(&ipip_net_ops);
+ if (err < 0)
+ return err;
+ err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
+ if (err < 0) {
+ unregister_pernet_device(&ipip_net_ops);
printk(KERN_INFO "ipip init: can't register tunnel\n");
- return -EAGAIN;
}
-
- err = register_pernet_gen_device(&ipip_net_id, &ipip_net_ops);
- if (err)
- xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
-
return err;
}
@@ -847,7 +847,7 @@ static void __exit ipip_fini(void)
if (xfrm4_tunnel_deregister(&ipip_handler, AF_INET))
printk(KERN_INFO "ipip close: can't deregister tunnel\n");
- unregister_pernet_gen_device(ipip_net_id, &ipip_net_ops);
+ unregister_pernet_device(&ipip_net_ops);
}
module_init(ipip_init);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 99508d66a642..9d4f6d1340a4 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -47,6 +47,7 @@
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -275,7 +276,8 @@ failure:
* @notify: Set to 1, if the caller is a notifier_call
*/
-static int vif_delete(struct net *net, int vifi, int notify)
+static int vif_delete(struct net *net, int vifi, int notify,
+ struct list_head *head)
{
struct vif_device *v;
struct net_device *dev;
@@ -319,7 +321,7 @@ static int vif_delete(struct net *net, int vifi, int notify)
}
if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
- unregister_netdevice(dev);
+ unregister_netdevice_queue(dev, head);
dev_put(dev);
return 0;
@@ -469,8 +471,18 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock)
return err;
}
break;
+
+ case VIFF_USE_IFINDEX:
case 0:
- dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+ if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
+ dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
+ if (dev && dev->ip_ptr == NULL) {
+ dev_put(dev);
+ return -EADDRNOTAVAIL;
+ }
+ } else
+ dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
+
if (!dev)
return -EADDRNOTAVAIL;
err = dev_set_allmulti(dev, 1);
@@ -791,6 +803,9 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
int line;
struct mfc_cache *uc, *c, **cp;
+ if (mfc->mfcc_parent >= MAXVIFS)
+ return -ENFILE;
+
line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
for (cp = &net->ipv4.mfc_cache_array[line];
@@ -862,14 +877,16 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
static void mroute_clean_tables(struct net *net)
{
int i;
+ LIST_HEAD(list);
/*
* Shut down all active vif entries
*/
for (i = 0; i < net->ipv4.maxvif; i++) {
if (!(net->ipv4.vif_table[i].flags&VIFF_STATIC))
- vif_delete(net, i, 0);
+ vif_delete(net, i, 0, &list);
}
+ unregister_netdevice_many(&list);
/*
* Wipe the cache
@@ -948,7 +965,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
switch (optname) {
case MRT_INIT:
if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->num != IPPROTO_IGMP)
+ inet_sk(sk)->inet_num != IPPROTO_IGMP)
return -EOPNOTSUPP;
if (optlen != sizeof(int))
return -ENOPROTOOPT;
@@ -985,7 +1002,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
if (optname == MRT_ADD_VIF) {
ret = vif_add(net, &vif, sk == net->ipv4.mroute_sk);
} else {
- ret = vif_delete(net, vif.vifc_vifi, 0);
+ ret = vif_delete(net, vif.vifc_vifi, 0, NULL);
}
rtnl_unlock();
return ret;
@@ -1148,17 +1165,16 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
struct net *net = dev_net(dev);
struct vif_device *v;
int ct;
-
- if (!net_eq(dev_net(dev), net))
- return NOTIFY_DONE;
+ LIST_HEAD(list);
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
v = &net->ipv4.vif_table[0];
for (ct = 0; ct < net->ipv4.maxvif; ct++, v++) {
if (v->dev == dev)
- vif_delete(net, ct, 1);
+ vif_delete(net, ct, 1, &list);
}
+ unregister_netdevice_many(&list);
return NOTIFY_DONE;
}
@@ -1601,17 +1617,20 @@ ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
int ct;
struct rtnexthop *nhp;
struct net *net = mfc_net(c);
- struct net_device *dev = net->ipv4.vif_table[c->mfc_parent].dev;
u8 *b = skb_tail_pointer(skb);
struct rtattr *mp_head;
- if (dev)
- RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
+ /* If cache is unresolved, don't try to parse IIF and OIF */
+ if (c->mfc_parent > MAXVIFS)
+ return -ENOENT;
+
+ if (VIF_EXISTS(net, c->mfc_parent))
+ RTA_PUT(skb, RTA_IIF, 4, &net->ipv4.vif_table[c->mfc_parent].dev->ifindex);
mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
- if (c->mfc_un.res.ttls[ct] < 255) {
+ if (VIF_EXISTS(net, ct) && c->mfc_un.res.ttls[ct] < 255) {
if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
goto rtattr_failure;
nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 1725dc0ef688..82fb43c5c59e 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -4,6 +4,7 @@
#include <linux/netfilter_ipv4.h>
#include <linux/ip.h>
#include <linux/skbuff.h>
+#include <linux/gfp.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
@@ -155,10 +156,10 @@ static int nf_ip_reroute(struct sk_buff *skb,
if (entry->hook == NF_INET_LOCAL_OUT) {
const struct iphdr *iph = ip_hdr(skb);
- if (!(iph->tos == rt_info->tos
- && skb->mark == rt_info->mark
- && iph->daddr == rt_info->daddr
- && iph->saddr == rt_info->saddr))
+ if (!(iph->tos == rt_info->tos &&
+ skb->mark == rt_info->mark &&
+ iph->daddr == rt_info->daddr &&
+ iph->saddr == rt_info->saddr))
return ip_route_me_harder(skb, RTN_UNSPEC);
}
return 0;
@@ -248,9 +249,9 @@ module_exit(ipv4_netfilter_fini);
#ifdef CONFIG_SYSCTL
struct ctl_path nf_net_ipv4_netfilter_sysctl_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
- { .procname = "netfilter", .ctl_name = NET_IPV4_NETFILTER, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
+ { .procname = "netfilter", },
{ }
};
EXPORT_SYMBOL_GPL(nf_net_ipv4_netfilter_sysctl_path);
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 27774c99d888..f07d77f65751 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -27,6 +27,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
+#include "../../netfilter/xt_repldata.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -58,6 +59,12 @@ do { \
#define ARP_NF_ASSERT(x)
#endif
+void *arpt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(arpt, ARPT);
+}
+EXPORT_SYMBOL_GPL(arpt_alloc_initial_table);
+
static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
const char *hdr_addr, int len)
{
@@ -226,7 +233,14 @@ arpt_error(struct sk_buff *skb, const struct xt_target_param *par)
return NF_DROP;
}
-static inline struct arpt_entry *get_entry(void *base, unsigned int offset)
+static inline const struct arpt_entry_target *
+arpt_get_target_c(const struct arpt_entry *e)
+{
+ return arpt_get_target((struct arpt_entry *)e);
+}
+
+static inline struct arpt_entry *
+get_entry(const void *base, unsigned int offset)
{
return (struct arpt_entry *)(base + offset);
}
@@ -273,7 +287,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
arp = arp_hdr(skb);
do {
- struct arpt_entry_target *t;
+ const struct arpt_entry_target *t;
int hdr_len;
if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
@@ -285,7 +299,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
(2 * skb->dev->addr_len);
ADD_COUNTER(e->counters, hdr_len, 1);
- t = arpt_get_target(e);
+ t = arpt_get_target_c(e);
/* Standard target? */
if (!t->u.kernel.target->target) {
@@ -351,7 +365,7 @@ static inline bool unconditional(const struct arpt_arp *arp)
/* Figures out from what hook each rule can be called: returns 0 if
* there are loops. Puts hook bitmask in comefrom.
*/
-static int mark_source_chains(struct xt_table_info *newinfo,
+static int mark_source_chains(const struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -372,7 +386,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
for (;;) {
const struct arpt_standard_target *t
- = (void *)arpt_get_target(e);
+ = (void *)arpt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) {
@@ -384,11 +398,11 @@ static int mark_source_chains(struct xt_table_info *newinfo,
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct arpt_entry)
- && (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0)
- && t->verdict < 0
- && unconditional(&e->arp)) || visited) {
+ if ((e->target_offset == sizeof(struct arpt_entry) &&
+ (strcmp(t->target.u.user.name,
+ ARPT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->arp)) ||
+ visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -427,8 +441,8 @@ static int mark_source_chains(struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- ARPT_STANDARD_TARGET) == 0
- && newpos >= 0) {
+ ARPT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct arpt_entry)) {
duprintf("mark_source_chains: "
@@ -456,7 +470,7 @@ static int mark_source_chains(struct xt_table_info *newinfo,
return 1;
}
-static inline int check_entry(struct arpt_entry *e, const char *name)
+static inline int check_entry(const struct arpt_entry *e, const char *name)
{
const struct arpt_entry_target *t;
@@ -468,7 +482,7 @@ static inline int check_entry(struct arpt_entry *e, const char *name)
if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
return -EINVAL;
- t = arpt_get_target(e);
+ t = arpt_get_target_c(e);
if (e->target_offset + t->u.target_size > e->next_offset)
return -EINVAL;
@@ -498,8 +512,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
}
static inline int
-find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
{
struct arpt_entry_target *t;
struct xt_target *target;
@@ -524,8 +537,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size,
ret = check_target(e, name);
if (ret)
goto err;
-
- (*i)++;
return 0;
err:
module_put(t->u.kernel.target->me);
@@ -533,14 +544,14 @@ out:
return ret;
}
-static bool check_underflow(struct arpt_entry *e)
+static bool check_underflow(const struct arpt_entry *e)
{
const struct arpt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->arp))
return false;
- t = arpt_get_target(e);
+ t = arpt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
verdict = ((struct arpt_standard_target *)t)->verdict;
@@ -550,17 +561,16 @@ static bool check_underflow(struct arpt_entry *e)
static inline int check_entry_size_and_hooks(struct arpt_entry *e,
struct xt_table_info *newinfo,
- unsigned char *base,
- unsigned char *limit,
+ const unsigned char *base,
+ const unsigned char *limit,
const unsigned int *hook_entries,
const unsigned int *underflows,
- unsigned int valid_hooks,
- unsigned int *i)
+ unsigned int valid_hooks)
{
unsigned int h;
- if ((unsigned long)e % __alignof__(struct arpt_entry) != 0
- || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -592,19 +602,14 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
/* Clear counters and comefrom */
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
-
- (*i)++;
return 0;
}
-static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
+static inline void cleanup_entry(struct arpt_entry *e)
{
struct xt_tgdtor_param par;
struct arpt_entry_target *t;
- if (i && (*i)-- == 0)
- return 1;
-
t = arpt_get_target(e);
par.target = t->u.kernel.target;
par.targinfo = t->data;
@@ -612,26 +617,20 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i)
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- return 0;
}
/* Checks and translates the user-supplied table segment (held in
* newinfo).
*/
-static int translate_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info *newinfo,
- void *entry0,
- unsigned int size,
- unsigned int number,
- const unsigned int *hook_entries,
- const unsigned int *underflows)
+static int translate_table(struct xt_table_info *newinfo, void *entry0,
+ const struct arpt_replace *repl)
{
+ struct arpt_entry *iter;
unsigned int i;
- int ret;
+ int ret = 0;
- newinfo->size = size;
- newinfo->number = number;
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
/* Init all hooks to impossible value. */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
@@ -643,52 +642,63 @@ static int translate_table(const char *name,
i = 0;
/* Walk through entries, checking offsets. */
- ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry_size_and_hooks,
- newinfo,
- entry0,
- entry0 + size,
- hook_entries, underflows, valid_hooks, &i);
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ break;
+ ++i;
+ }
duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret);
if (ret != 0)
return ret;
- if (i != number) {
+ if (i != repl->num_entries) {
duprintf("translate_table: %u not %u entries\n",
- i, number);
+ i, repl->num_entries);
return -EINVAL;
}
/* Check hooks all assigned */
for (i = 0; i < NF_ARP_NUMHOOKS; i++) {
/* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
+ if (!(repl->valid_hooks & (1 << i)))
continue;
if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
+ i, repl->hook_entry[i]);
return -EINVAL;
}
if (newinfo->underflow[i] == 0xFFFFFFFF) {
duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
+ i, repl->underflow[i]);
return -EINVAL;
}
}
- if (!mark_source_chains(newinfo, valid_hooks, entry0)) {
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0)) {
duprintf("Looping hook\n");
return -ELOOP;
}
/* Finally, each sanity check must pass */
i = 0;
- ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- find_check_entry, name, size, &i);
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
if (ret != 0) {
- ARPT_ENTRY_ITERATE(entry0, newinfo->size,
- cleanup_entry, &i);
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter);
+ }
return ret;
}
@@ -701,30 +711,10 @@ static int translate_table(const char *name,
return ret;
}
-/* Gets counters. */
-static inline int add_entry_to_counter(const struct arpt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
-static inline int set_entry_to_counter(const struct arpt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
static void get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
+ struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
unsigned int curcpu;
@@ -740,32 +730,32 @@ static void get_counters(const struct xt_table_info *t,
curcpu = smp_processor_id();
i = 0;
- ARPT_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries[curcpu], t->size) {
+ SET_COUNTER(counters[i], iter->counters.bcnt,
+ iter->counters.pcnt);
+ ++i;
+ }
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
xt_info_wrlock(cpu);
- ARPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ ADD_COUNTER(counters[i], iter->counters.bcnt,
+ iter->counters.pcnt);
+ ++i;
+ }
xt_info_wrunlock(cpu);
}
local_bh_enable();
}
-static struct xt_counters *alloc_counters(struct xt_table *table)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -783,11 +773,11 @@ static struct xt_counters *alloc_counters(struct xt_table *table)
}
static int copy_entries_to_user(unsigned int total_size,
- struct xt_table *table,
+ const struct xt_table *table,
void __user *userptr)
{
unsigned int off, num;
- struct arpt_entry *e;
+ const struct arpt_entry *e;
struct xt_counters *counters;
struct xt_table_info *private = table->private;
int ret = 0;
@@ -807,7 +797,7 @@ static int copy_entries_to_user(unsigned int total_size,
/* FIXME: use iterator macros --RR */
/* ... then go back and fix counters and names */
for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
- struct arpt_entry_target *t;
+ const struct arpt_entry_target *t;
e = (struct arpt_entry *)(loc_cpu_entry + off);
if (copy_to_user(userptr + off
@@ -818,7 +808,7 @@ static int copy_entries_to_user(unsigned int total_size,
goto free_counters;
}
- t = arpt_get_target(e);
+ t = arpt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
+ offsetof(struct arpt_entry_target,
u.user.name),
@@ -835,7 +825,7 @@ static int copy_entries_to_user(unsigned int total_size,
}
#ifdef CONFIG_COMPAT
-static void compat_standard_from_user(void *dst, void *src)
+static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -844,7 +834,7 @@ static void compat_standard_from_user(void *dst, void *src)
memcpy(dst, &v, sizeof(v));
}
-static int compat_standard_to_user(void __user *dst, void *src)
+static int compat_standard_to_user(void __user *dst, const void *src)
{
compat_int_t cv = *(int *)src;
@@ -853,18 +843,18 @@ static int compat_standard_to_user(void __user *dst, void *src)
return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
}
-static int compat_calc_entry(struct arpt_entry *e,
+static int compat_calc_entry(const struct arpt_entry *e,
const struct xt_table_info *info,
- void *base, struct xt_table_info *newinfo)
+ const void *base, struct xt_table_info *newinfo)
{
- struct arpt_entry_target *t;
+ const struct arpt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
entry_offset = (void *)e - base;
- t = arpt_get_target(e);
+ t = arpt_get_target_c(e);
off += xt_compat_target_offset(t->u.kernel.target);
newinfo->size -= off;
ret = xt_compat_add_offset(NFPROTO_ARP, entry_offset, off);
@@ -885,7 +875,9 @@ static int compat_calc_entry(struct arpt_entry *e,
static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
+ struct arpt_entry *iter;
void *loc_cpu_entry;
+ int ret;
if (!newinfo || !info)
return -EINVAL;
@@ -894,13 +886,17 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
- return ARPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
- compat_calc_entry, info, loc_cpu_entry,
- newinfo);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
}
#endif
-static int get_info(struct net *net, void __user *user, int *len, int compat)
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
{
char name[ARPT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -925,10 +921,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
if (t && !IS_ERR(t)) {
struct arpt_getinfo info;
const struct xt_table_info *private = t->private;
-
#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
if (compat) {
- struct xt_table_info tmp;
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(NFPROTO_ARP);
private = &tmp;
@@ -959,7 +955,7 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
}
static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
- int *len)
+ const int *len)
{
int ret;
struct arpt_get_entries get;
@@ -1010,6 +1006,7 @@ static int __do_replace(struct net *net, const char *name,
struct xt_table_info *oldinfo;
struct xt_counters *counters;
void *loc_cpu_old_entry;
+ struct arpt_entry *iter;
ret = 0;
counters = vmalloc_node(num_counters * sizeof(struct xt_counters),
@@ -1053,8 +1050,8 @@ static int __do_replace(struct net *net, const char *name,
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
- NULL);
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter);
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
@@ -1073,12 +1070,14 @@ static int __do_replace(struct net *net, const char *name,
return ret;
}
-static int do_replace(struct net *net, void __user *user, unsigned int len)
+static int do_replace(struct net *net, const void __user *user,
+ unsigned int len)
{
int ret;
struct arpt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct arpt_entry *iter;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1099,9 +1098,7 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
goto free_newinfo;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
+ ret = translate_table(newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -1114,27 +1111,15 @@ static int do_replace(struct net *net, void __user *user, unsigned int len)
return 0;
free_newinfo_untrans:
- ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
-static int do_add_counters(struct net *net, void __user *user, unsigned int len,
- int compat)
+static int do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
{
unsigned int i, curcpu;
struct xt_counters_info tmp;
@@ -1147,6 +1132,7 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
const struct xt_table_info *private;
int ret = 0;
void *loc_cpu_entry;
+ struct arpt_entry *iter;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1204,11 +1190,10 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
- ARPT_ENTRY_ITERATE(loc_cpu_entry,
- private->size,
- add_counter_to_entry,
- paddc,
- &i);
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
xt_info_wrunlock(curcpu);
unlock_up_free:
local_bh_enable();
@@ -1221,28 +1206,22 @@ static int do_add_counters(struct net *net, void __user *user, unsigned int len,
}
#ifdef CONFIG_COMPAT
-static inline int
-compat_release_entry(struct compat_arpt_entry *e, unsigned int *i)
+static inline void compat_release_entry(struct compat_arpt_entry *e)
{
struct arpt_entry_target *t;
- if (i && (*i)-- == 0)
- return 1;
-
t = compat_arpt_get_target(e);
module_put(t->u.kernel.target->me);
- return 0;
}
static inline int
check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
struct xt_table_info *newinfo,
unsigned int *size,
- unsigned char *base,
- unsigned char *limit,
- unsigned int *hook_entries,
- unsigned int *underflows,
- unsigned int *i,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
const char *name)
{
struct arpt_entry_target *t;
@@ -1251,8 +1230,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
int ret, off, h;
duprintf("check_compat_entry_size_and_hooks %p\n", e);
- if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0
- || (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1302,8 +1281,6 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
/* Clear counters and comefrom */
memset(&e->counters, 0, sizeof(e->counters));
e->comefrom = 0;
-
- (*i)++;
return 0;
release_target:
@@ -1347,19 +1324,6 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
return ret;
}
-static inline int compat_check_entry(struct arpt_entry *e, const char *name,
- unsigned int *i)
-{
- int ret;
-
- ret = check_target(e, name);
- if (ret)
- return ret;
-
- (*i)++;
- return 0;
-}
-
static int translate_compat_table(const char *name,
unsigned int valid_hooks,
struct xt_table_info **pinfo,
@@ -1372,8 +1336,10 @@ static int translate_compat_table(const char *name,
unsigned int i, j;
struct xt_table_info *newinfo, *info;
void *pos, *entry0, *entry1;
+ struct compat_arpt_entry *iter0;
+ struct arpt_entry *iter1;
unsigned int size;
- int ret;
+ int ret = 0;
info = *pinfo;
entry0 = *pentry0;
@@ -1390,13 +1356,17 @@ static int translate_compat_table(const char *name,
j = 0;
xt_compat_lock(NFPROTO_ARP);
/* Walk through entries, checking offsets. */
- ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
- check_compat_entry_size_and_hooks,
- info, &size, entry0,
- entry0 + total_size,
- hook_entries, underflows, &j, name);
- if (ret != 0)
- goto out_unlock;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
ret = -EINVAL;
if (j != number) {
@@ -1435,9 +1405,12 @@ static int translate_compat_table(const char *name,
entry1 = newinfo->entries[raw_smp_processor_id()];
pos = entry1;
size = total_size;
- ret = COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size,
- compat_copy_entry_from_user,
- &pos, &size, name, newinfo, entry1);
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
xt_compat_flush_offsets(NFPROTO_ARP);
xt_compat_unlock(NFPROTO_ARP);
if (ret)
@@ -1448,13 +1421,32 @@ static int translate_compat_table(const char *name,
goto free_newinfo;
i = 0;
- ret = ARPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
- name, &i);
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = check_target(iter1, name);
+ if (ret != 0)
+ break;
+ ++i;
+ }
if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
j -= i;
- COMPAT_ARPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i,
- compat_release_entry, &j);
- ARPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i);
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1);
+ }
xt_free_table_info(newinfo);
return ret;
}
@@ -1472,7 +1464,11 @@ static int translate_compat_table(const char *name,
free_newinfo:
xt_free_table_info(newinfo);
out:
- COMPAT_ARPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
return ret;
out_unlock:
xt_compat_flush_offsets(NFPROTO_ARP);
@@ -1499,6 +1495,7 @@ static int compat_do_replace(struct net *net, void __user *user,
struct compat_arpt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct arpt_entry *iter;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1536,7 +1533,8 @@ static int compat_do_replace(struct net *net, void __user *user,
return 0;
free_newinfo_untrans:
- ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1570,7 +1568,7 @@ static int compat_do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user,
static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
compat_uint_t *size,
struct xt_counters *counters,
- unsigned int *i)
+ unsigned int i)
{
struct arpt_entry_target *t;
struct compat_arpt_entry __user *ce;
@@ -1578,14 +1576,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
compat_uint_t origsize;
int ret;
- ret = -EFAULT;
origsize = *size;
ce = (struct compat_arpt_entry __user *)*dstptr;
- if (copy_to_user(ce, e, sizeof(struct arpt_entry)))
- goto out;
-
- if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i])))
- goto out;
+ if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
*dstptr += sizeof(struct compat_arpt_entry);
*size -= sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
@@ -1595,18 +1591,12 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
t = arpt_get_target(e);
ret = xt_compat_target_to_user(t, dstptr, size);
if (ret)
- goto out;
- ret = -EFAULT;
+ return ret;
next_offset = e->next_offset - (origsize - *size);
- if (put_user(target_offset, &ce->target_offset))
- goto out;
- if (put_user(next_offset, &ce->next_offset))
- goto out;
-
- (*i)++;
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
return 0;
-out:
- return ret;
}
static int compat_copy_entries_to_user(unsigned int total_size,
@@ -1620,6 +1610,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
int ret = 0;
void *loc_cpu_entry;
unsigned int i = 0;
+ struct arpt_entry *iter;
counters = alloc_counters(table);
if (IS_ERR(counters))
@@ -1629,9 +1620,12 @@ static int compat_copy_entries_to_user(unsigned int total_size,
loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- ret = ARPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
- compat_copy_entry_to_user,
- &pos, &size, counters, &i);
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
vfree(counters);
return ret;
}
@@ -1799,12 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net,
loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(table->name, table->valid_hooks,
- newinfo, loc_cpu_entry, repl->size,
- repl->num_entries,
- repl->hook_entry,
- repl->underflow);
-
+ ret = translate_table(newinfo, loc_cpu_entry, repl);
duprintf("arpt_register_table: translate table gives %d\n", ret);
if (ret != 0)
goto out_free;
@@ -1827,13 +1816,14 @@ void arpt_unregister_table(struct xt_table *table)
struct xt_table_info *private;
void *loc_cpu_entry;
struct module *table_owner = table->me;
+ struct arpt_entry *iter;
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
- ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
- cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 97337601827a..79ca5e70d497 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -6,7 +6,9 @@
*/
#include <linux/module.h>
+#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_arp/arp_tables.h>
+#include <linux/slab.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("David S. Miller <davem@redhat.com>");
@@ -15,93 +17,37 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
-static const struct
-{
- struct arpt_replace repl;
- struct arpt_standard entries[3];
- struct arpt_error term;
-} initial_table __net_initdata = {
- .repl = {
- .name = "filter",
- .valid_hooks = FILTER_VALID_HOOKS,
- .num_entries = 4,
- .size = sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error),
- .hook_entry = {
- [NF_ARP_IN] = 0,
- [NF_ARP_OUT] = sizeof(struct arpt_standard),
- [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
- },
- .underflow = {
- [NF_ARP_IN] = 0,
- [NF_ARP_OUT] = sizeof(struct arpt_standard),
- [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard),
- },
- },
- .entries = {
- ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_IN */
- ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_OUT */
- ARPT_STANDARD_INIT(NF_ACCEPT), /* ARP_FORWARD */
- },
- .term = ARPT_ERROR_INIT,
-};
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
+ .priority = NF_IP_PRI_FILTER,
};
/* The work comes in here from netfilter.c */
-static unsigned int arpt_in_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+static unsigned int
+arptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- return arpt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.arptable_filter);
-}
+ const struct net *net = dev_net((in != NULL) ? in : out);
-static unsigned int arpt_out_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return arpt_do_table(skb, hook, in, out,
- dev_net(out)->ipv4.arptable_filter);
+ return arpt_do_table(skb, hook, in, out, net->ipv4.arptable_filter);
}
-static struct nf_hook_ops arpt_ops[] __read_mostly = {
- {
- .hook = arpt_in_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_ARP,
- .hooknum = NF_ARP_IN,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = arpt_out_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_ARP,
- .hooknum = NF_ARP_OUT,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = arpt_in_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_ARP,
- .hooknum = NF_ARP_FORWARD,
- .priority = NF_IP_PRI_FILTER,
- },
-};
+static struct nf_hook_ops *arpfilter_ops __read_mostly;
static int __net_init arptable_filter_net_init(struct net *net)
{
- /* Register table */
+ struct arpt_replace *repl;
+
+ repl = arpt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
net->ipv4.arptable_filter =
- arpt_register_table(net, &packet_filter, &initial_table.repl);
+ arpt_register_table(net, &packet_filter, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.arptable_filter))
return PTR_ERR(net->ipv4.arptable_filter);
return 0;
@@ -125,9 +71,11 @@ static int __init arptable_filter_init(void)
if (ret < 0)
return ret;
- ret = nf_register_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
- if (ret < 0)
+ arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops)) {
+ ret = PTR_ERR(arpfilter_ops);
goto cleanup_table;
+ }
return ret;
cleanup_table:
@@ -137,7 +85,7 @@ cleanup_table:
static void __exit arptable_filter_fini(void)
{
- nf_unregister_hooks(arpt_ops, ARRAY_SIZE(arpt_ops));
+ xt_hook_unlink(&packet_filter, arpfilter_ops);
unregister_pernet_subsys(&arptable_filter_net_ops);
}
diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c
index c156db215987..e2787048aa0a 100644
--- a/net/ipv4/netfilter/ip_queue.c
+++ b/net/ipv4/netfilter/ip_queue.c
@@ -26,6 +26,7 @@
#include <linux/security.h>
#include <linux/net.h>
#include <linux/mutex.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/route.h>
@@ -497,10 +498,9 @@ ipq_rcv_nl_event(struct notifier_block *this,
{
struct netlink_notify *n = ptr;
- if (event == NETLINK_URELEASE &&
- n->protocol == NETLINK_FIREWALL && n->pid) {
+ if (event == NETLINK_URELEASE && n->protocol == NETLINK_FIREWALL) {
write_lock_bh(&queue_lock);
- if ((n->net == &init_net) && (n->pid == peer_pid))
+ if ((net_eq(n->net, &init_net)) && (n->pid == peer_pid))
__ipq_reset();
write_unlock_bh(&queue_lock);
}
@@ -516,14 +516,13 @@ static struct ctl_table_header *ipq_sysctl_header;
static ctl_table ipq_table[] = {
{
- .ctl_name = NET_IPQ_QMAX,
.procname = NET_IPQ_QMAX_NAME,
.data = &queue_maxlen,
.maxlen = sizeof(queue_maxlen),
.mode = 0644,
.proc_handler = proc_dointvec
},
- { .ctl_name = 0 }
+ { }
};
#endif
@@ -622,7 +621,7 @@ cleanup_netlink_notifier:
static void __exit ip_queue_fini(void)
{
nf_unregister_queue_handlers(&nfqh);
- synchronize_net();
+
ipq_flush(NULL, 0);
#ifdef CONFIG_SYSCTL
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cde755d5eeab..b29c66df8d1f 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -28,6 +28,7 @@
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter_ipv4/ip_tables.h>
#include <net/netfilter/nf_log.h>
+#include "../../netfilter/xt_repldata.h"
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
@@ -66,6 +67,12 @@ do { \
#define inline
#endif
+void *ipt_alloc_initial_table(const struct xt_table *info)
+{
+ return xt_alloc_initial_table(ipt, IPT);
+}
+EXPORT_SYMBOL_GPL(ipt_alloc_initial_table);
+
/*
We keep a set of rules for each CPU, so we can avoid write-locking
them in the softirq when updating the counters and therefore
@@ -89,9 +96,9 @@ ip_packet_match(const struct iphdr *ip,
#define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
- IPT_INV_SRCIP)
- || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
- IPT_INV_DSTIP)) {
+ IPT_INV_SRCIP) ||
+ FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+ IPT_INV_DSTIP)) {
dprintf("Source or dest mismatch.\n");
dprintf("SRC: %pI4. Mask: %pI4. Target: %pI4.%s\n",
@@ -122,8 +129,8 @@ ip_packet_match(const struct iphdr *ip,
}
/* Check specific protocol */
- if (ipinfo->proto
- && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
+ if (ipinfo->proto &&
+ FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) {
dprintf("Packet protocol %hi does not match %hi.%s\n",
ip->protocol, ipinfo->proto,
ipinfo->invflags&IPT_INV_PROTO ? " (INV)":"");
@@ -169,7 +176,7 @@ ipt_error(struct sk_buff *skb, const struct xt_target_param *par)
/* Performance critical - called for every packet */
static inline bool
-do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
+do_match(const struct ipt_entry_match *m, const struct sk_buff *skb,
struct xt_match_param *par)
{
par->match = m->u.kernel.match;
@@ -184,7 +191,7 @@ do_match(struct ipt_entry_match *m, const struct sk_buff *skb,
/* Performance critical */
static inline struct ipt_entry *
-get_entry(void *base, unsigned int offset)
+get_entry(const void *base, unsigned int offset)
{
return (struct ipt_entry *)(base + offset);
}
@@ -199,6 +206,13 @@ static inline bool unconditional(const struct ipt_ip *ip)
#undef FWINV
}
+/* for const-correctness */
+static inline const struct ipt_entry_target *
+ipt_get_target_c(const struct ipt_entry *e)
+{
+ return ipt_get_target((struct ipt_entry *)e);
+}
+
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
static const char *const hooknames[] = {
@@ -233,11 +247,11 @@ static struct nf_loginfo trace_loginfo = {
/* Mildly perf critical (only if packet tracing is on) */
static inline int
-get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
+get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
const char *hookname, const char **chainname,
const char **comment, unsigned int *rulenum)
{
- struct ipt_standard_target *t = (void *)ipt_get_target(s);
+ const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
/* Head of user chain: ERROR target with chainname */
@@ -246,11 +260,11 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
} else if (s == e) {
(*rulenum)++;
- if (s->target_offset == sizeof(struct ipt_entry)
- && strcmp(t->target.u.kernel.target->name,
- IPT_STANDARD_TARGET) == 0
- && t->verdict < 0
- && unconditional(&s->ip)) {
+ if (s->target_offset == sizeof(struct ipt_entry) &&
+ strcmp(t->target.u.kernel.target->name,
+ IPT_STANDARD_TARGET) == 0 &&
+ t->verdict < 0 &&
+ unconditional(&s->ip)) {
/* Tail of chains: STANDARD target (return/policy) */
*comment = *chainname == hookname
? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -263,17 +277,18 @@ get_chainname_rulenum(struct ipt_entry *s, struct ipt_entry *e,
return 0;
}
-static void trace_packet(struct sk_buff *skb,
+static void trace_packet(const struct sk_buff *skb,
unsigned int hook,
const struct net_device *in,
const struct net_device *out,
const char *tablename,
- struct xt_table_info *private,
- struct ipt_entry *e)
+ const struct xt_table_info *private,
+ const struct ipt_entry *e)
{
- void *table_base;
+ const void *table_base;
const struct ipt_entry *root;
const char *hookname, *chainname, *comment;
+ const struct ipt_entry *iter;
unsigned int rulenum = 0;
table_base = private->entries[smp_processor_id()];
@@ -282,10 +297,10 @@ static void trace_packet(struct sk_buff *skb,
hookname = chainname = hooknames[hook];
comment = comments[NF_IP_TRACE_COMMENT_RULE];
- IPT_ENTRY_ITERATE(root,
- private->size - private->hook_entry[hook],
- get_chainname_rulenum,
- e, hookname, &chainname, &comment, &rulenum);
+ xt_entry_foreach(iter, root, private->size - private->hook_entry[hook])
+ if (get_chainname_rulenum(iter, e, hookname,
+ &chainname, &comment, &rulenum) != 0)
+ break;
nf_log_packet(AF_INET, hook, skb, in, out, &trace_loginfo,
"TRACE: %s:%s:%s:%u ",
@@ -315,9 +330,9 @@ ipt_do_table(struct sk_buff *skb,
/* Initializing verdict to NF_DROP keeps gcc happy. */
unsigned int verdict = NF_DROP;
const char *indev, *outdev;
- void *table_base;
+ const void *table_base;
struct ipt_entry *e, *back;
- struct xt_table_info *private;
+ const struct xt_table_info *private;
struct xt_match_param mtpar;
struct xt_target_param tgpar;
@@ -350,17 +365,22 @@ ipt_do_table(struct sk_buff *skb,
back = get_entry(table_base, private->underflow[hook]);
do {
- struct ipt_entry_target *t;
+ const struct ipt_entry_target *t;
+ const struct xt_entry_match *ematch;
IP_NF_ASSERT(e);
IP_NF_ASSERT(back);
if (!ip_packet_match(ip, indev, outdev,
- &e->ip, mtpar.fragoff) ||
- IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0) {
+ &e->ip, mtpar.fragoff)) {
+ no_match:
e = ipt_next_entry(e);
continue;
}
+ xt_ematch_foreach(ematch, e)
+ if (do_match(ematch, skb, &mtpar) != 0)
+ goto no_match;
+
ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
t = ipt_get_target(e);
@@ -388,8 +408,8 @@ ipt_do_table(struct sk_buff *skb,
back = get_entry(table_base, back->comefrom);
continue;
}
- if (table_base + v != ipt_next_entry(e)
- && !(e->ip.flags & IPT_F_GOTO)) {
+ if (table_base + v != ipt_next_entry(e) &&
+ !(e->ip.flags & IPT_F_GOTO)) {
/* Save old back ptr in next entry */
struct ipt_entry *next = ipt_next_entry(e);
next->comefrom = (void *)back - table_base;
@@ -443,7 +463,7 @@ ipt_do_table(struct sk_buff *skb,
/* Figures out from what hook each rule can be called: returns 0 if
there are loops. Puts hook bitmask in comefrom. */
static int
-mark_source_chains(struct xt_table_info *newinfo,
+mark_source_chains(const struct xt_table_info *newinfo,
unsigned int valid_hooks, void *entry0)
{
unsigned int hook;
@@ -461,8 +481,8 @@ mark_source_chains(struct xt_table_info *newinfo,
e->counters.pcnt = pos;
for (;;) {
- struct ipt_standard_target *t
- = (void *)ipt_get_target(e);
+ const struct ipt_standard_target *t
+ = (void *)ipt_get_target_c(e);
int visited = e->comefrom & (1 << hook);
if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
@@ -473,11 +493,11 @@ mark_source_chains(struct xt_table_info *newinfo,
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct ipt_entry)
- && (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0)
- && t->verdict < 0
- && unconditional(&e->ip)) || visited) {
+ if ((e->target_offset == sizeof(struct ipt_entry) &&
+ (strcmp(t->target.u.user.name,
+ IPT_STANDARD_TARGET) == 0) &&
+ t->verdict < 0 && unconditional(&e->ip)) ||
+ visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -524,8 +544,8 @@ mark_source_chains(struct xt_table_info *newinfo,
int newpos = t->verdict;
if (strcmp(t->target.u.user.name,
- IPT_STANDARD_TARGET) == 0
- && newpos >= 0) {
+ IPT_STANDARD_TARGET) == 0 &&
+ newpos >= 0) {
if (newpos > newinfo->size -
sizeof(struct ipt_entry)) {
duprintf("mark_source_chains: "
@@ -552,27 +572,23 @@ mark_source_chains(struct xt_table_info *newinfo,
return 1;
}
-static int
-cleanup_match(struct ipt_entry_match *m, unsigned int *i)
+static void cleanup_match(struct ipt_entry_match *m, struct net *net)
{
struct xt_mtdtor_param par;
- if (i && (*i)-- == 0)
- return 1;
-
+ par.net = net;
par.match = m->u.kernel.match;
par.matchinfo = m->data;
par.family = NFPROTO_IPV4;
if (par.match->destroy != NULL)
par.match->destroy(&par);
module_put(par.match->me);
- return 0;
}
static int
-check_entry(struct ipt_entry *e, const char *name)
+check_entry(const struct ipt_entry *e, const char *name)
{
- struct ipt_entry_target *t;
+ const struct ipt_entry_target *t;
if (!ip_checkentry(&e->ip)) {
duprintf("ip_tables: ip check failed %p %s.\n", e, name);
@@ -583,7 +599,7 @@ check_entry(struct ipt_entry *e, const char *name)
e->next_offset)
return -EINVAL;
- t = ipt_get_target(e);
+ t = ipt_get_target_c(e);
if (e->target_offset + t->u.target_size > e->next_offset)
return -EINVAL;
@@ -591,8 +607,7 @@ check_entry(struct ipt_entry *e, const char *name)
}
static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
- unsigned int *i)
+check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
{
const struct ipt_ip *ip = par->entryinfo;
int ret;
@@ -607,13 +622,11 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
par.match->name);
return ret;
}
- ++*i;
return 0;
}
static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
- unsigned int *i)
+find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
{
struct xt_match *match;
int ret;
@@ -627,7 +640,7 @@ find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par,
}
m->u.kernel.match = match;
- ret = check_match(m, par, i);
+ ret = check_match(m, par);
if (ret)
goto err;
@@ -637,10 +650,11 @@ err:
return ret;
}
-static int check_target(struct ipt_entry *e, const char *name)
+static int check_target(struct ipt_entry *e, struct net *net, const char *name)
{
struct ipt_entry_target *t = ipt_get_target(e);
struct xt_tgchk_param par = {
+ .net = net,
.table = name,
.entryinfo = e,
.target = t->u.kernel.target,
@@ -661,27 +675,32 @@ static int check_target(struct ipt_entry *e, const char *name)
}
static int
-find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
- unsigned int *i)
+find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
+ unsigned int size)
{
struct ipt_entry_target *t;
struct xt_target *target;
int ret;
unsigned int j;
struct xt_mtchk_param mtpar;
+ struct xt_entry_match *ematch;
ret = check_entry(e, name);
if (ret)
return ret;
j = 0;
+ mtpar.net = net;
mtpar.table = name;
mtpar.entryinfo = &e->ip;
mtpar.hook_mask = e->comefrom;
mtpar.family = NFPROTO_IPV4;
- ret = IPT_MATCH_ITERATE(e, find_check_match, &mtpar, &j);
- if (ret != 0)
- goto cleanup_matches;
+ xt_ematch_foreach(ematch, e) {
+ ret = find_check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
t = ipt_get_target(e);
target = try_then_request_module(xt_find_target(AF_INET,
@@ -695,27 +714,29 @@ find_check_entry(struct ipt_entry *e, const char *name, unsigned int size,
}
t->u.kernel.target = target;
- ret = check_target(e, name);
+ ret = check_target(e, net, name);
if (ret)
goto err;
-
- (*i)++;
return 0;
err:
module_put(t->u.kernel.target->me);
cleanup_matches:
- IPT_MATCH_ITERATE(e, cleanup_match, &j);
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
return ret;
}
-static bool check_underflow(struct ipt_entry *e)
+static bool check_underflow(const struct ipt_entry *e)
{
const struct ipt_entry_target *t;
unsigned int verdict;
if (!unconditional(&e->ip))
return false;
- t = ipt_get_target(e);
+ t = ipt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
return false;
verdict = ((struct ipt_standard_target *)t)->verdict;
@@ -726,17 +747,16 @@ static bool check_underflow(struct ipt_entry *e)
static int
check_entry_size_and_hooks(struct ipt_entry *e,
struct xt_table_info *newinfo,
- unsigned char *base,
- unsigned char *limit,
+ const unsigned char *base,
+ const unsigned char *limit,
const unsigned int *hook_entries,
const unsigned int *underflows,
- unsigned int valid_hooks,
- unsigned int *i)
+ unsigned int valid_hooks)
{
unsigned int h;
- if ((unsigned long)e % __alignof__(struct ipt_entry) != 0
- || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -768,50 +788,42 @@ check_entry_size_and_hooks(struct ipt_entry *e,
/* Clear counters and comefrom */
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
-
- (*i)++;
return 0;
}
-static int
-cleanup_entry(struct ipt_entry *e, unsigned int *i)
+static void
+cleanup_entry(struct ipt_entry *e, struct net *net)
{
struct xt_tgdtor_param par;
struct ipt_entry_target *t;
-
- if (i && (*i)-- == 0)
- return 1;
+ struct xt_entry_match *ematch;
/* Cleanup all matches */
- IPT_MATCH_ITERATE(e, cleanup_match, NULL);
+ xt_ematch_foreach(ematch, e)
+ cleanup_match(ematch, net);
t = ipt_get_target(e);
+ par.net = net;
par.target = t->u.kernel.target;
par.targinfo = t->data;
par.family = NFPROTO_IPV4;
if (par.target->destroy != NULL)
par.target->destroy(&par);
module_put(par.target->me);
- return 0;
}
/* Checks and translates the user-supplied table segment (held in
newinfo) */
static int
-translate_table(const char *name,
- unsigned int valid_hooks,
- struct xt_table_info *newinfo,
- void *entry0,
- unsigned int size,
- unsigned int number,
- const unsigned int *hook_entries,
- const unsigned int *underflows)
+translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
+ const struct ipt_replace *repl)
{
+ struct ipt_entry *iter;
unsigned int i;
- int ret;
+ int ret = 0;
- newinfo->size = size;
- newinfo->number = number;
+ newinfo->size = repl->size;
+ newinfo->number = repl->num_entries;
/* Init all hooks to impossible value. */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
@@ -822,49 +834,58 @@ translate_table(const char *name,
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
- check_entry_size_and_hooks,
- newinfo,
- entry0,
- entry0 + size,
- hook_entries, underflows, valid_hooks, &i);
- if (ret != 0)
- return ret;
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = check_entry_size_and_hooks(iter, newinfo, entry0,
+ entry0 + repl->size,
+ repl->hook_entry,
+ repl->underflow,
+ repl->valid_hooks);
+ if (ret != 0)
+ return ret;
+ ++i;
+ }
- if (i != number) {
+ if (i != repl->num_entries) {
duprintf("translate_table: %u not %u entries\n",
- i, number);
+ i, repl->num_entries);
return -EINVAL;
}
/* Check hooks all assigned */
for (i = 0; i < NF_INET_NUMHOOKS; i++) {
/* Only hooks which are valid */
- if (!(valid_hooks & (1 << i)))
+ if (!(repl->valid_hooks & (1 << i)))
continue;
if (newinfo->hook_entry[i] == 0xFFFFFFFF) {
duprintf("Invalid hook entry %u %u\n",
- i, hook_entries[i]);
+ i, repl->hook_entry[i]);
return -EINVAL;
}
if (newinfo->underflow[i] == 0xFFFFFFFF) {
duprintf("Invalid underflow %u %u\n",
- i, underflows[i]);
+ i, repl->underflow[i]);
return -EINVAL;
}
}
- if (!mark_source_chains(newinfo, valid_hooks, entry0))
+ if (!mark_source_chains(newinfo, repl->valid_hooks, entry0))
return -ELOOP;
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
- find_check_entry, name, size, &i);
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ ret = find_check_entry(iter, net, repl->name, repl->size);
+ if (ret != 0)
+ break;
+ ++i;
+ }
if (ret != 0) {
- IPT_ENTRY_ITERATE(entry0, newinfo->size,
- cleanup_entry, &i);
+ xt_entry_foreach(iter, entry0, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter, net);
+ }
return ret;
}
@@ -877,33 +898,11 @@ translate_table(const char *name,
return ret;
}
-/* Gets counters. */
-static inline int
-add_entry_to_counter(const struct ipt_entry *e,
- struct xt_counters total[],
- unsigned int *i)
-{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
-static inline int
-set_entry_to_counter(const struct ipt_entry *e,
- struct ipt_counters total[],
- unsigned int *i)
-{
- SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
-
- (*i)++;
- return 0;
-}
-
static void
get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
+ struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
unsigned int curcpu;
@@ -919,32 +918,32 @@ get_counters(const struct xt_table_info *t,
curcpu = smp_processor_id();
i = 0;
- IPT_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries[curcpu], t->size) {
+ SET_COUNTER(counters[i], iter->counters.bcnt,
+ iter->counters.pcnt);
+ ++i;
+ }
for_each_possible_cpu(cpu) {
if (cpu == curcpu)
continue;
i = 0;
xt_info_wrlock(cpu);
- IPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ ADD_COUNTER(counters[i], iter->counters.bcnt,
+ iter->counters.pcnt);
+ ++i; /* macro does multi eval of i */
+ }
xt_info_wrunlock(cpu);
}
local_bh_enable();
}
-static struct xt_counters * alloc_counters(struct xt_table *table)
+static struct xt_counters *alloc_counters(const struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- struct xt_table_info *private = table->private;
+ const struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -962,11 +961,11 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
static int
copy_entries_to_user(unsigned int total_size,
- struct xt_table *table,
+ const struct xt_table *table,
void __user *userptr)
{
unsigned int off, num;
- struct ipt_entry *e;
+ const struct ipt_entry *e;
struct xt_counters *counters;
const struct xt_table_info *private = table->private;
int ret = 0;
@@ -1018,7 +1017,7 @@ copy_entries_to_user(unsigned int total_size,
}
}
- t = ipt_get_target(e);
+ t = ipt_get_target_c(e);
if (copy_to_user(userptr + off + e->target_offset
+ offsetof(struct ipt_entry_target,
u.user.name),
@@ -1035,7 +1034,7 @@ copy_entries_to_user(unsigned int total_size,
}
#ifdef CONFIG_COMPAT
-static void compat_standard_from_user(void *dst, void *src)
+static void compat_standard_from_user(void *dst, const void *src)
{
int v = *(compat_int_t *)src;
@@ -1044,7 +1043,7 @@ static void compat_standard_from_user(void *dst, void *src)
memcpy(dst, &v, sizeof(v));
}
-static int compat_standard_to_user(void __user *dst, void *src)
+static int compat_standard_to_user(void __user *dst, const void *src)
{
compat_int_t cv = *(int *)src;
@@ -1053,25 +1052,20 @@ static int compat_standard_to_user(void __user *dst, void *src)
return copy_to_user(dst, &cv, sizeof(cv)) ? -EFAULT : 0;
}
-static inline int
-compat_calc_match(struct ipt_entry_match *m, int *size)
-{
- *size += xt_compat_match_offset(m->u.kernel.match);
- return 0;
-}
-
-static int compat_calc_entry(struct ipt_entry *e,
+static int compat_calc_entry(const struct ipt_entry *e,
const struct xt_table_info *info,
- void *base, struct xt_table_info *newinfo)
+ const void *base, struct xt_table_info *newinfo)
{
- struct ipt_entry_target *t;
+ const struct xt_entry_match *ematch;
+ const struct ipt_entry_target *t;
unsigned int entry_offset;
int off, i, ret;
off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - base;
- IPT_MATCH_ITERATE(e, compat_calc_match, &off);
- t = ipt_get_target(e);
+ xt_ematch_foreach(ematch, e)
+ off += xt_compat_match_offset(ematch->u.kernel.match);
+ t = ipt_get_target_c(e);
off += xt_compat_target_offset(t->u.kernel.target);
newinfo->size -= off;
ret = xt_compat_add_offset(AF_INET, entry_offset, off);
@@ -1092,7 +1086,9 @@ static int compat_calc_entry(struct ipt_entry *e,
static int compat_table_info(const struct xt_table_info *info,
struct xt_table_info *newinfo)
{
+ struct ipt_entry *iter;
void *loc_cpu_entry;
+ int ret;
if (!newinfo || !info)
return -EINVAL;
@@ -1101,13 +1097,17 @@ static int compat_table_info(const struct xt_table_info *info,
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
loc_cpu_entry = info->entries[raw_smp_processor_id()];
- return IPT_ENTRY_ITERATE(loc_cpu_entry, info->size,
- compat_calc_entry, info, loc_cpu_entry,
- newinfo);
+ xt_entry_foreach(iter, loc_cpu_entry, info->size) {
+ ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
+ if (ret != 0)
+ return ret;
+ }
+ return 0;
}
#endif
-static int get_info(struct net *net, void __user *user, int *len, int compat)
+static int get_info(struct net *net, void __user *user,
+ const int *len, int compat)
{
char name[IPT_TABLE_MAXNAMELEN];
struct xt_table *t;
@@ -1132,10 +1132,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
if (t && !IS_ERR(t)) {
struct ipt_getinfo info;
const struct xt_table_info *private = t->private;
-
#ifdef CONFIG_COMPAT
+ struct xt_table_info tmp;
+
if (compat) {
- struct xt_table_info tmp;
ret = compat_table_info(private, &tmp);
xt_compat_flush_offsets(AF_INET);
private = &tmp;
@@ -1167,7 +1167,8 @@ static int get_info(struct net *net, void __user *user, int *len, int compat)
}
static int
-get_entries(struct net *net, struct ipt_get_entries __user *uptr, int *len)
+get_entries(struct net *net, struct ipt_get_entries __user *uptr,
+ const int *len)
{
int ret;
struct ipt_get_entries get;
@@ -1215,6 +1216,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct xt_table_info *oldinfo;
struct xt_counters *counters;
void *loc_cpu_old_entry;
+ struct ipt_entry *iter;
ret = 0;
counters = vmalloc(num_counters * sizeof(struct xt_counters));
@@ -1257,8 +1259,9 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
- NULL);
+ xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+ cleanup_entry(iter, net);
+
xt_free_table_info(oldinfo);
if (copy_to_user(counters_ptr, counters,
sizeof(struct xt_counters) * num_counters) != 0)
@@ -1277,12 +1280,13 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
}
static int
-do_replace(struct net *net, void __user *user, unsigned int len)
+do_replace(struct net *net, const void __user *user, unsigned int len)
{
int ret;
struct ipt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct ipt_entry *iter;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1303,9 +1307,7 @@ do_replace(struct net *net, void __user *user, unsigned int len)
goto free_newinfo;
}
- ret = translate_table(tmp.name, tmp.valid_hooks,
- newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
- tmp.hook_entry, tmp.underflow);
+ ret = translate_table(net, newinfo, loc_cpu_entry, &tmp);
if (ret != 0)
goto free_newinfo;
@@ -1318,27 +1320,16 @@ do_replace(struct net *net, void __user *user, unsigned int len)
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
static int
-add_counter_to_entry(struct ipt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
-static int
-do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
+do_add_counters(struct net *net, const void __user *user,
+ unsigned int len, int compat)
{
unsigned int i, curcpu;
struct xt_counters_info tmp;
@@ -1351,6 +1342,7 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
const struct xt_table_info *private;
int ret = 0;
void *loc_cpu_entry;
+ struct ipt_entry *iter;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1408,11 +1400,10 @@ do_add_counters(struct net *net, void __user *user, unsigned int len, int compat
curcpu = smp_processor_id();
loc_cpu_entry = private->entries[curcpu];
xt_info_wrlock(curcpu);
- IPT_ENTRY_ITERATE(loc_cpu_entry,
- private->size,
- add_counter_to_entry,
- paddc,
- &i);
+ xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+ ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+ ++i;
+ }
xt_info_wrunlock(curcpu);
unlock_up_free:
local_bh_enable();
@@ -1440,45 +1431,40 @@ struct compat_ipt_replace {
static int
compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
unsigned int *size, struct xt_counters *counters,
- unsigned int *i)
+ unsigned int i)
{
struct ipt_entry_target *t;
struct compat_ipt_entry __user *ce;
u_int16_t target_offset, next_offset;
compat_uint_t origsize;
- int ret;
+ const struct xt_entry_match *ematch;
+ int ret = 0;
- ret = -EFAULT;
origsize = *size;
ce = (struct compat_ipt_entry __user *)*dstptr;
- if (copy_to_user(ce, e, sizeof(struct ipt_entry)))
- goto out;
-
- if (copy_to_user(&ce->counters, &counters[*i], sizeof(counters[*i])))
- goto out;
+ if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 ||
+ copy_to_user(&ce->counters, &counters[i],
+ sizeof(counters[i])) != 0)
+ return -EFAULT;
*dstptr += sizeof(struct compat_ipt_entry);
*size -= sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
- ret = IPT_MATCH_ITERATE(e, xt_compat_match_to_user, dstptr, size);
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_to_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
target_offset = e->target_offset - (origsize - *size);
- if (ret)
- goto out;
t = ipt_get_target(e);
ret = xt_compat_target_to_user(t, dstptr, size);
if (ret)
- goto out;
- ret = -EFAULT;
+ return ret;
next_offset = e->next_offset - (origsize - *size);
- if (put_user(target_offset, &ce->target_offset))
- goto out;
- if (put_user(next_offset, &ce->next_offset))
- goto out;
-
- (*i)++;
+ if (put_user(target_offset, &ce->target_offset) != 0 ||
+ put_user(next_offset, &ce->next_offset) != 0)
+ return -EFAULT;
return 0;
-out:
- return ret;
}
static int
@@ -1486,7 +1472,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
const char *name,
const struct ipt_ip *ip,
unsigned int hookmask,
- int *size, unsigned int *i)
+ int *size)
{
struct xt_match *match;
@@ -1500,47 +1486,32 @@ compat_find_calc_match(struct ipt_entry_match *m,
}
m->u.kernel.match = match;
*size += xt_compat_match_offset(match);
-
- (*i)++;
- return 0;
-}
-
-static int
-compat_release_match(struct ipt_entry_match *m, unsigned int *i)
-{
- if (i && (*i)-- == 0)
- return 1;
-
- module_put(m->u.kernel.match->me);
return 0;
}
-static int
-compat_release_entry(struct compat_ipt_entry *e, unsigned int *i)
+static void compat_release_entry(struct compat_ipt_entry *e)
{
struct ipt_entry_target *t;
-
- if (i && (*i)-- == 0)
- return 1;
+ struct xt_entry_match *ematch;
/* Cleanup all matches */
- COMPAT_IPT_MATCH_ITERATE(e, compat_release_match, NULL);
+ xt_ematch_foreach(ematch, e)
+ module_put(ematch->u.kernel.match->me);
t = compat_ipt_get_target(e);
module_put(t->u.kernel.target->me);
- return 0;
}
static int
check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
struct xt_table_info *newinfo,
unsigned int *size,
- unsigned char *base,
- unsigned char *limit,
- unsigned int *hook_entries,
- unsigned int *underflows,
- unsigned int *i,
+ const unsigned char *base,
+ const unsigned char *limit,
+ const unsigned int *hook_entries,
+ const unsigned int *underflows,
const char *name)
{
+ struct xt_entry_match *ematch;
struct ipt_entry_target *t;
struct xt_target *target;
unsigned int entry_offset;
@@ -1548,8 +1519,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
int ret, off, h;
duprintf("check_compat_entry_size_and_hooks %p\n", e);
- if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0
- || (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+ if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1569,10 +1540,13 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
entry_offset = (void *)e - (void *)base;
j = 0;
- ret = COMPAT_IPT_MATCH_ITERATE(e, compat_find_calc_match, name,
- &e->ip, e->comefrom, &off, &j);
- if (ret != 0)
- goto release_matches;
+ xt_ematch_foreach(ematch, e) {
+ ret = compat_find_calc_match(ematch, name,
+ &e->ip, e->comefrom, &off);
+ if (ret != 0)
+ goto release_matches;
+ ++j;
+ }
t = compat_ipt_get_target(e);
target = try_then_request_module(xt_find_target(AF_INET,
@@ -1604,14 +1578,16 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
/* Clear counters and comefrom */
memset(&e->counters, 0, sizeof(e->counters));
e->comefrom = 0;
-
- (*i)++;
return 0;
out:
module_put(t->u.kernel.target->me);
release_matches:
- IPT_MATCH_ITERATE(e, compat_release_match, &j);
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ module_put(ematch->u.kernel.match->me);
+ }
return ret;
}
@@ -1625,6 +1601,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
struct ipt_entry *de;
unsigned int origsize;
int ret, h;
+ struct xt_entry_match *ematch;
ret = 0;
origsize = *size;
@@ -1635,10 +1612,11 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
*dstptr += sizeof(struct ipt_entry);
*size += sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
- ret = COMPAT_IPT_MATCH_ITERATE(e, xt_compat_match_from_user,
- dstptr, size);
- if (ret)
- return ret;
+ xt_ematch_foreach(ematch, e) {
+ ret = xt_compat_match_from_user(ematch, dstptr, size);
+ if (ret != 0)
+ return ret;
+ }
de->target_offset = e->target_offset - (origsize - *size);
t = compat_ipt_get_target(e);
target = t->u.kernel.target;
@@ -1655,36 +1633,43 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
}
static int
-compat_check_entry(struct ipt_entry *e, const char *name,
- unsigned int *i)
+compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
{
+ struct xt_entry_match *ematch;
struct xt_mtchk_param mtpar;
unsigned int j;
- int ret;
+ int ret = 0;
j = 0;
+ mtpar.net = net;
mtpar.table = name;
mtpar.entryinfo = &e->ip;
mtpar.hook_mask = e->comefrom;
mtpar.family = NFPROTO_IPV4;
- ret = IPT_MATCH_ITERATE(e, check_match, &mtpar, &j);
- if (ret)
- goto cleanup_matches;
+ xt_ematch_foreach(ematch, e) {
+ ret = check_match(ematch, &mtpar);
+ if (ret != 0)
+ goto cleanup_matches;
+ ++j;
+ }
- ret = check_target(e, name);
+ ret = check_target(e, net, name);
if (ret)
goto cleanup_matches;
-
- (*i)++;
return 0;
cleanup_matches:
- IPT_MATCH_ITERATE(e, cleanup_match, &j);
+ xt_ematch_foreach(ematch, e) {
+ if (j-- == 0)
+ break;
+ cleanup_match(ematch, net);
+ }
return ret;
}
static int
-translate_compat_table(const char *name,
+translate_compat_table(struct net *net,
+ const char *name,
unsigned int valid_hooks,
struct xt_table_info **pinfo,
void **pentry0,
@@ -1696,6 +1681,8 @@ translate_compat_table(const char *name,
unsigned int i, j;
struct xt_table_info *newinfo, *info;
void *pos, *entry0, *entry1;
+ struct compat_ipt_entry *iter0;
+ struct ipt_entry *iter1;
unsigned int size;
int ret;
@@ -1714,13 +1701,17 @@ translate_compat_table(const char *name,
j = 0;
xt_compat_lock(AF_INET);
/* Walk through entries, checking offsets. */
- ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size,
- check_compat_entry_size_and_hooks,
- info, &size, entry0,
- entry0 + total_size,
- hook_entries, underflows, &j, name);
- if (ret != 0)
- goto out_unlock;
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = check_compat_entry_size_and_hooks(iter0, info, &size,
+ entry0,
+ entry0 + total_size,
+ hook_entries,
+ underflows,
+ name);
+ if (ret != 0)
+ goto out_unlock;
+ ++j;
+ }
ret = -EINVAL;
if (j != number) {
@@ -1759,9 +1750,12 @@ translate_compat_table(const char *name,
entry1 = newinfo->entries[raw_smp_processor_id()];
pos = entry1;
size = total_size;
- ret = COMPAT_IPT_ENTRY_ITERATE(entry0, total_size,
- compat_copy_entry_from_user,
- &pos, &size, name, newinfo, entry1);
+ xt_entry_foreach(iter0, entry0, total_size) {
+ ret = compat_copy_entry_from_user(iter0, &pos, &size,
+ name, newinfo, entry1);
+ if (ret != 0)
+ break;
+ }
xt_compat_flush_offsets(AF_INET);
xt_compat_unlock(AF_INET);
if (ret)
@@ -1772,13 +1766,32 @@ translate_compat_table(const char *name,
goto free_newinfo;
i = 0;
- ret = IPT_ENTRY_ITERATE(entry1, newinfo->size, compat_check_entry,
- name, &i);
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ ret = compat_check_entry(iter1, net, name);
+ if (ret != 0)
+ break;
+ ++i;
+ }
if (ret) {
+ /*
+ * The first i matches need cleanup_entry (calls ->destroy)
+ * because they had called ->check already. The other j-i
+ * entries need only release.
+ */
+ int skip = i;
j -= i;
- COMPAT_IPT_ENTRY_ITERATE_CONTINUE(entry0, newinfo->size, i,
- compat_release_entry, &j);
- IPT_ENTRY_ITERATE(entry1, newinfo->size, cleanup_entry, &i);
+ xt_entry_foreach(iter0, entry0, newinfo->size) {
+ if (skip-- > 0)
+ continue;
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
+ xt_entry_foreach(iter1, entry1, newinfo->size) {
+ if (i-- == 0)
+ break;
+ cleanup_entry(iter1, net);
+ }
xt_free_table_info(newinfo);
return ret;
}
@@ -1796,7 +1809,11 @@ translate_compat_table(const char *name,
free_newinfo:
xt_free_table_info(newinfo);
out:
- COMPAT_IPT_ENTRY_ITERATE(entry0, total_size, compat_release_entry, &j);
+ xt_entry_foreach(iter0, entry0, total_size) {
+ if (j-- == 0)
+ break;
+ compat_release_entry(iter0);
+ }
return ret;
out_unlock:
xt_compat_flush_offsets(AF_INET);
@@ -1811,6 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
struct compat_ipt_replace tmp;
struct xt_table_info *newinfo;
void *loc_cpu_entry;
+ struct ipt_entry *iter;
if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
return -EFAULT;
@@ -1833,7 +1851,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
goto free_newinfo;
}
- ret = translate_compat_table(tmp.name, tmp.valid_hooks,
+ ret = translate_compat_table(net, tmp.name, tmp.valid_hooks,
&newinfo, &loc_cpu_entry, tmp.size,
tmp.num_entries, tmp.hook_entry,
tmp.underflow);
@@ -1849,7 +1867,8 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return 0;
free_newinfo_untrans:
- IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, newinfo->size)
+ cleanup_entry(iter, net);
free_newinfo:
xt_free_table_info(newinfo);
return ret;
@@ -1898,6 +1917,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
int ret = 0;
const void *loc_cpu_entry;
unsigned int i = 0;
+ struct ipt_entry *iter;
counters = alloc_counters(table);
if (IS_ERR(counters))
@@ -1910,9 +1930,12 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
loc_cpu_entry = private->entries[raw_smp_processor_id()];
pos = userptr;
size = total_size;
- ret = IPT_ENTRY_ITERATE(loc_cpu_entry, total_size,
- compat_copy_entry_to_user,
- &pos, &size, counters, &i);
+ xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+ ret = compat_copy_entry_to_user(iter, &pos,
+ &size, counters, i++);
+ if (ret != 0)
+ break;
+ }
vfree(counters);
return ret;
@@ -2086,11 +2109,7 @@ struct xt_table *ipt_register_table(struct net *net,
loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
- ret = translate_table(table->name, table->valid_hooks,
- newinfo, loc_cpu_entry, repl->size,
- repl->num_entries,
- repl->hook_entry,
- repl->underflow);
+ ret = translate_table(net, newinfo, loc_cpu_entry, repl);
if (ret != 0)
goto out_free;
@@ -2108,17 +2127,19 @@ out:
return ERR_PTR(ret);
}
-void ipt_unregister_table(struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table)
{
struct xt_table_info *private;
void *loc_cpu_entry;
struct module *table_owner = table->me;
+ struct ipt_entry *iter;
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
- IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, cleanup_entry, NULL);
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
if (private->number > private->initial_entries)
module_put(table_owner);
xt_free_table_info(private);
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 2e4f98b85524..ab828400ed71 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -14,6 +14,7 @@
#include <linux/jhash.h>
#include <linux/bitops.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/udp.h>
@@ -303,9 +304,9 @@ clusterip_tg(struct sk_buff *skb, const struct xt_target_param *par)
/* special case: ICMP error handling. conntrack distinguishes between
* error messages (RELATED) and information requests (see below) */
- if (ip_hdr(skb)->protocol == IPPROTO_ICMP
- && (ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_RELATED+IP_CT_IS_REPLY))
+ if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
+ (ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
return XT_CONTINUE;
/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
@@ -362,8 +363,8 @@ static bool clusterip_tg_check(const struct xt_tgchk_param *par)
return false;
}
- if (e->ip.dmsk.s_addr != htonl(0xffffffff)
- || e->ip.dst.s_addr == 0) {
+ if (e->ip.dmsk.s_addr != htonl(0xffffffff) ||
+ e->ip.dst.s_addr == 0) {
printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n");
return false;
}
@@ -495,14 +496,14 @@ arp_mangle(unsigned int hook,
struct clusterip_config *c;
/* we don't care about non-ethernet and non-ipv4 ARP */
- if (arp->ar_hrd != htons(ARPHRD_ETHER)
- || arp->ar_pro != htons(ETH_P_IP)
- || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
+ if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
+ arp->ar_pro != htons(ETH_P_IP) ||
+ arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN)
return NF_ACCEPT;
/* we only want to mangle arp requests and replies */
- if (arp->ar_op != htons(ARPOP_REPLY)
- && arp->ar_op != htons(ARPOP_REQUEST))
+ if (arp->ar_op != htons(ARPOP_REPLY) &&
+ arp->ar_op != htons(ARPOP_REQUEST))
return NF_ACCEPT;
payload = (void *)(arp+1);
@@ -560,8 +561,7 @@ struct clusterip_seq_position {
static void *clusterip_seq_start(struct seq_file *s, loff_t *pos)
{
- const struct proc_dir_entry *pde = s->private;
- struct clusterip_config *c = pde->data;
+ struct clusterip_config *c = s->private;
unsigned int weight;
u_int32_t local_nodes;
struct clusterip_seq_position *idx;
@@ -632,10 +632,9 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
if (!ret) {
struct seq_file *sf = file->private_data;
- struct proc_dir_entry *pde = PDE(inode);
- struct clusterip_config *c = pde->data;
+ struct clusterip_config *c = PDE(inode)->data;
- sf->private = pde;
+ sf->private = c;
clusterip_config_get(c);
}
@@ -645,8 +644,7 @@ static int clusterip_proc_open(struct inode *inode, struct file *file)
static int clusterip_proc_release(struct inode *inode, struct file *file)
{
- struct proc_dir_entry *pde = PDE(inode);
- struct clusterip_config *c = pde->data;
+ struct clusterip_config *c = PDE(inode)->data;
int ret;
ret = seq_release(inode, file);
@@ -660,10 +658,9 @@ static int clusterip_proc_release(struct inode *inode, struct file *file)
static ssize_t clusterip_proc_write(struct file *file, const char __user *input,
size_t size, loff_t *ofs)
{
+ struct clusterip_config *c = PDE(file->f_path.dentry->d_inode)->data;
#define PROC_WRITELEN 10
char buffer[PROC_WRITELEN+1];
- const struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
- struct clusterip_config *c = pde->data;
unsigned long nodenum;
if (copy_from_user(buffer, input, PROC_WRITELEN))
diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c
index f7e2fa0974dc..ea5cea2415c1 100644
--- a/net/ipv4/netfilter/ipt_ECN.c
+++ b/net/ipv4/netfilter/ipt_ECN.c
@@ -50,7 +50,7 @@ set_ect_tcp(struct sk_buff *skb, const struct ipt_ECN_info *einfo)
struct tcphdr _tcph, *tcph;
__be16 oldval;
- /* Not enought header? */
+ /* Not enough header? */
tcph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
if (!tcph)
return false;
@@ -85,8 +85,8 @@ ecn_tg(struct sk_buff *skb, const struct xt_target_param *par)
if (!set_ect_ip(skb, einfo))
return NF_DROP;
- if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR)
- && ip_hdr(skb)->protocol == IPPROTO_TCP)
+ if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) &&
+ ip_hdr(skb)->protocol == IPPROTO_TCP)
if (!set_ect_tcp(skb, einfo))
return NF_DROP;
@@ -108,8 +108,8 @@ static bool ecn_tg_check(const struct xt_tgchk_param *par)
einfo->ip_ect);
return false;
}
- if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR))
- && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
+ if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) &&
+ (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & XT_INV_PROTO))) {
printk(KERN_WARNING "ECN: cannot use TCP operations on a "
"non-tcp rule\n");
return false;
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index acc44c69eb68..ee128efa1c8d 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -74,8 +74,8 @@ static void dump_packet(const struct nf_loginfo *info,
if (ntohs(ih->frag_off) & IP_OFFSET)
printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
- if ((logflags & IPT_LOG_IPOPT)
- && ih->ihl * 4 > sizeof(struct iphdr)) {
+ if ((logflags & IPT_LOG_IPOPT) &&
+ ih->ihl * 4 > sizeof(struct iphdr)) {
const unsigned char *op;
unsigned char _opt[4 * 15 - sizeof(struct iphdr)];
unsigned int i, optsize;
@@ -146,8 +146,8 @@ static void dump_packet(const struct nf_loginfo *info,
/* Max length: 11 "URGP=65535 " */
printk("URGP=%u ", ntohs(th->urg_ptr));
- if ((logflags & IPT_LOG_TCPOPT)
- && th->doff * 4 > sizeof(struct tcphdr)) {
+ if ((logflags & IPT_LOG_TCPOPT) &&
+ th->doff * 4 > sizeof(struct tcphdr)) {
unsigned char _opt[4 * 15 - sizeof(struct tcphdr)];
const unsigned char *op;
unsigned int i, optsize;
@@ -238,9 +238,9 @@ static void dump_packet(const struct nf_loginfo *info,
printk("TYPE=%u CODE=%u ", ich->type, ich->code);
/* Max length: 25 "INCOMPLETE [65535 bytes] " */
- if (ich->type <= NR_ICMP_TYPES
- && required_len[ich->type]
- && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
+ if (ich->type <= NR_ICMP_TYPES &&
+ required_len[ich->type] &&
+ skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
printk("INCOMPLETE [%u bytes] ",
skb->len - iphoff - ih->ihl*4);
break;
@@ -276,8 +276,8 @@ static void dump_packet(const struct nf_loginfo *info,
}
/* Max length: 10 "MTU=65535 " */
- if (ich->type == ICMP_DEST_UNREACH
- && ich->code == ICMP_FRAG_NEEDED)
+ if (ich->type == ICMP_DEST_UNREACH &&
+ ich->code == ICMP_FRAG_NEEDED)
printk("MTU=%u ", ntohs(ich->un.frag.mtu));
}
break;
@@ -407,8 +407,8 @@ ipt_log_packet(u_int8_t pf,
if (in && !out) {
/* MAC logging for input chain only. */
printk("MAC=");
- if (skb->dev && skb->dev->hard_header_len
- && skb->mac_header != skb->network_header) {
+ if (skb->dev && skb->dev->hard_header_len &&
+ skb->mac_header != skb->network_header) {
int i;
const unsigned char *p = skb_mac_header(skb);
for (i = 0; i < skb->dev->hard_header_len; i++,p++)
diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c
index dada0863946d..650b54042b01 100644
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -59,8 +59,8 @@ masquerade_tg(struct sk_buff *skb, const struct xt_target_param *par)
ct = nf_ct_get(skb, &ctinfo);
nat = nfct_nat(ct);
- NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED
- || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
+ NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED ||
+ ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY));
/* Source address is 0.0.0.0 - locally generated packet that is
* probably not supposed to be masqueraded.
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c
index c93ae44bff2a..a0e8bcf04159 100644
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -12,6 +12,7 @@
#include <linux/module.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/icmp.h>
@@ -184,8 +185,8 @@ static bool reject_tg_check(const struct xt_tgchk_param *par)
return false;
} else if (rejinfo->with == IPT_TCP_RESET) {
/* Must specify that it's a TCP packet */
- if (e->ip.proto != IPPROTO_TCP
- || (e->ip.invflags & XT_INV_PROTO)) {
+ if (e->ip.proto != IPPROTO_TCP ||
+ (e->ip.invflags & XT_INV_PROTO)) {
printk("ipt_REJECT: TCP_RESET invalid for non-tcp\n");
return false;
}
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index d32cc4bb328a..0dbe697f164f 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -33,6 +33,7 @@
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/socket.h>
+#include <linux/slab.h>
#include <linux/skbuff.h>
#include <linux/kernel.h>
#include <linux/timer.h>
@@ -226,9 +227,9 @@ static void ipt_ulog_packet(unsigned int hooknum,
else
*(pm->prefix) = '\0';
- if (in && in->hard_header_len > 0
- && skb->mac_header != skb->network_header
- && in->hard_header_len <= ULOG_MAC_LEN) {
+ if (in && in->hard_header_len > 0 &&
+ skb->mac_header != skb->network_header &&
+ in->hard_header_len <= ULOG_MAC_LEN) {
memcpy(pm->mac, skb_mac_header(skb), in->hard_header_len);
pm->mac_len = in->hard_header_len;
} else
@@ -338,7 +339,7 @@ struct compat_ipt_ulog_info {
char prefix[ULOG_PREFIX_LEN];
};
-static void ulog_tg_compat_from_user(void *dst, void *src)
+static void ulog_tg_compat_from_user(void *dst, const void *src)
{
const struct compat_ipt_ulog_info *cl = src;
struct ipt_ulog_info l = {
@@ -351,7 +352,7 @@ static void ulog_tg_compat_from_user(void *dst, void *src)
memcpy(dst, &l, sizeof(l));
}
-static int ulog_tg_compat_to_user(void __user *dst, void *src)
+static int ulog_tg_compat_to_user(void __user *dst, const void *src)
{
const struct ipt_ulog_info *l = src;
struct compat_ipt_ulog_info cl = {
diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c
index 6289b64144c6..2a1e56b71908 100644
--- a/net/ipv4/netfilter/ipt_ecn.c
+++ b/net/ipv4/netfilter/ipt_ecn.c
@@ -96,8 +96,8 @@ static bool ecn_mt_check(const struct xt_mtchk_param *par)
if (info->invert & IPT_ECN_OP_MATCH_MASK)
return false;
- if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)
- && ip->proto != IPPROTO_TCP) {
+ if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) &&
+ ip->proto != IPPROTO_TCP) {
printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for"
" non-tcp packets\n");
return false;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index df566cbd68e5..55392466daa4 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -13,6 +13,7 @@
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
MODULE_LICENSE("GPL");
@@ -23,104 +24,32 @@ MODULE_DESCRIPTION("iptables filter table");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
-static struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} initial_table __net_initdata = {
- .repl = {
- .name = "filter",
- .valid_hooks = FILTER_VALID_HOOKS,
- .num_entries = 4,
- .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_INET_LOCAL_IN] = 0,
- [NF_INET_FORWARD] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
- },
- .underflow = {
- [NF_INET_LOCAL_IN] = 0,
- [NF_INET_FORWARD] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
- },
- },
- .entries = {
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
- IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
- },
- .term = IPT_ERROR_INIT, /* ERROR */
-};
-
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_FILTER,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ipt_local_in_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_filter);
-}
-
static unsigned int
-ipt_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+iptable_filter_hook(unsigned int hook, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_filter);
-}
+ const struct net *net;
-static unsigned int
-ipt_local_out_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
+ if (hook == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
return NF_ACCEPT;
- return ipt_do_table(skb, hook, in, out,
- dev_net(out)->ipv4.iptable_filter);
+
+ net = dev_net((in != NULL) ? in : out);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_filter);
}
-static struct nf_hook_ops ipt_ops[] __read_mostly = {
- {
- .hook = ipt_local_in_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = ipt_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_FORWARD,
- .priority = NF_IP_PRI_FILTER,
- },
- {
- .hook = ipt_local_out_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_FILTER,
- },
-};
+static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
static int forward = NF_ACCEPT;
@@ -128,9 +57,18 @@ module_param(forward, bool, 0000);
static int __net_init iptable_filter_net_init(struct net *net)
{
- /* Register table */
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_filter);
+ if (repl == NULL)
+ return -ENOMEM;
+ /* Entry 1 is the FORWARD hook */
+ ((struct ipt_standard *)repl->entries)[1].target.verdict =
+ -forward - 1;
+
net->ipv4.iptable_filter =
- ipt_register_table(net, &packet_filter, &initial_table.repl);
+ ipt_register_table(net, &packet_filter, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.iptable_filter))
return PTR_ERR(net->ipv4.iptable_filter);
return 0;
@@ -138,7 +76,7 @@ static int __net_init iptable_filter_net_init(struct net *net)
static void __net_exit iptable_filter_net_exit(struct net *net)
{
- ipt_unregister_table(net->ipv4.iptable_filter);
+ ipt_unregister_table(net, net->ipv4.iptable_filter);
}
static struct pernet_operations iptable_filter_net_ops = {
@@ -155,17 +93,16 @@ static int __init iptable_filter_init(void)
return -EINVAL;
}
- /* Entry 1 is the FORWARD hook */
- initial_table.entries[1].target.verdict = -forward - 1;
-
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
return ret;
/* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
+ filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops)) {
+ ret = PTR_ERR(filter_ops);
goto cleanup_table;
+ }
return ret;
@@ -176,7 +113,7 @@ static int __init iptable_filter_init(void)
static void __exit iptable_filter_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
+ xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&iptable_filter_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 036047f9b0f2..294a2a32f293 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -12,6 +12,7 @@
#include <linux/netfilter_ipv4/ip_tables.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>
@@ -27,101 +28,16 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
-/* Ouch - five different hooks? Maybe this should be a config option..... -- BC */
-static const struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[5];
- struct ipt_error term;
-} initial_table __net_initdata = {
- .repl = {
- .name = "mangle",
- .valid_hooks = MANGLE_VALID_HOOKS,
- .num_entries = 6,
- .size = sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
- },
- .underflow = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_LOCAL_IN] = sizeof(struct ipt_standard),
- [NF_INET_FORWARD] = sizeof(struct ipt_standard) * 2,
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 3,
- [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard) * 4,
- },
- },
- .entries = {
- IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
- IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
- IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
- },
- .term = IPT_ERROR_INIT, /* ERROR */
-};
-
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_MANGLE,
};
-/* The work comes in here from netfilter.c. */
-static unsigned int
-ipt_pre_routing_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_mangle);
-}
-
-static unsigned int
-ipt_post_routing_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(out)->ipv4.iptable_mangle);
-}
-
-static unsigned int
-ipt_local_in_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_mangle);
-}
-
-static unsigned int
-ipt_forward_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_mangle);
-}
-
static unsigned int
-ipt_local_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
{
unsigned int ret;
const struct iphdr *iph;
@@ -130,8 +46,8 @@ ipt_local_hook(unsigned int hook,
u_int32_t mark;
/* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr)
- || ip_hdrlen(skb) < sizeof(struct iphdr))
+ if (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
/* Save things which could affect route */
@@ -141,7 +57,7 @@ ipt_local_hook(unsigned int hook,
daddr = iph->daddr;
tos = iph->tos;
- ret = ipt_do_table(skb, hook, in, out,
+ ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
dev_net(out)->ipv4.iptable_mangle);
/* Reroute for ANY change. */
if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) {
@@ -158,49 +74,36 @@ ipt_local_hook(unsigned int hook,
return ret;
}
-static struct nf_hook_ops ipt_ops[] __read_mostly = {
- {
- .hook = ipt_pre_routing_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_local_in_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_forward_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_FORWARD,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_local_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_MANGLE,
- },
- {
- .hook = ipt_post_routing_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_MANGLE,
- },
-};
+/* The work comes in here from netfilter.c. */
+static unsigned int
+iptable_mangle_hook(unsigned int hook,
+ struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
+{
+ if (hook == NF_INET_LOCAL_OUT)
+ return ipt_mangle_out(skb, out);
+ if (hook == NF_INET_POST_ROUTING)
+ return ipt_do_table(skb, hook, in, out,
+ dev_net(out)->ipv4.iptable_mangle);
+ /* PREROUTING/INPUT/FORWARD: */
+ return ipt_do_table(skb, hook, in, out,
+ dev_net(in)->ipv4.iptable_mangle);
+}
+
+static struct nf_hook_ops *mangle_ops __read_mostly;
static int __net_init iptable_mangle_net_init(struct net *net)
{
- /* Register table */
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_mangler);
+ if (repl == NULL)
+ return -ENOMEM;
net->ipv4.iptable_mangle =
- ipt_register_table(net, &packet_mangler, &initial_table.repl);
+ ipt_register_table(net, &packet_mangler, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.iptable_mangle))
return PTR_ERR(net->ipv4.iptable_mangle);
return 0;
@@ -208,7 +111,7 @@ static int __net_init iptable_mangle_net_init(struct net *net)
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
- ipt_unregister_table(net->ipv4.iptable_mangle);
+ ipt_unregister_table(net, net->ipv4.iptable_mangle);
}
static struct pernet_operations iptable_mangle_net_ops = {
@@ -225,9 +128,11 @@ static int __init iptable_mangle_init(void)
return ret;
/* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
+ mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
goto cleanup_table;
+ }
return ret;
@@ -238,7 +143,7 @@ static int __init iptable_mangle_init(void)
static void __exit iptable_mangle_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
+ xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&iptable_mangle_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 993edc23be09..07fb710cd722 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -5,94 +5,49 @@
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
-static const struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[2];
- struct ipt_error term;
-} initial_table __net_initdata = {
- .repl = {
- .name = "raw",
- .valid_hooks = RAW_VALID_HOOKS,
- .num_entries = 3,
- .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
- },
- .underflow = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard)
- },
- },
- .entries = {
- IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
- },
- .term = IPT_ERROR_INIT, /* ERROR */
-};
-
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_RAW,
};
/* The work comes in here from netfilter.c. */
static unsigned int
-ipt_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+iptable_raw_hook(unsigned int hook, struct sk_buff *skb,
+ const struct net_device *in, const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_raw);
-}
+ const struct net *net;
-static unsigned int
-ipt_local_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* root is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr) ||
- ip_hdrlen(skb) < sizeof(struct iphdr))
+ if (hook == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* root is playing with raw sockets. */
return NF_ACCEPT;
- return ipt_do_table(skb, hook, in, out,
- dev_net(out)->ipv4.iptable_raw);
+
+ net = dev_net((in != NULL) ? in : out);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_raw);
}
-/* 'raw' is the very first table. */
-static struct nf_hook_ops ipt_ops[] __read_mostly = {
- {
- .hook = ipt_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_RAW,
- .owner = THIS_MODULE,
- },
- {
- .hook = ipt_local_hook,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_RAW,
- .owner = THIS_MODULE,
- },
-};
+static struct nf_hook_ops *rawtable_ops __read_mostly;
static int __net_init iptable_raw_net_init(struct net *net)
{
- /* Register table */
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&packet_raw);
+ if (repl == NULL)
+ return -ENOMEM;
net->ipv4.iptable_raw =
- ipt_register_table(net, &packet_raw, &initial_table.repl);
+ ipt_register_table(net, &packet_raw, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.iptable_raw))
return PTR_ERR(net->ipv4.iptable_raw);
return 0;
@@ -100,7 +55,7 @@ static int __net_init iptable_raw_net_init(struct net *net)
static void __net_exit iptable_raw_net_exit(struct net *net)
{
- ipt_unregister_table(net->ipv4.iptable_raw);
+ ipt_unregister_table(net, net->ipv4.iptable_raw);
}
static struct pernet_operations iptable_raw_net_ops = {
@@ -117,9 +72,11 @@ static int __init iptable_raw_init(void)
return ret;
/* Register hooks */
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
+ rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops)) {
+ ret = PTR_ERR(rawtable_ops);
goto cleanup_table;
+ }
return ret;
@@ -130,7 +87,7 @@ static int __init iptable_raw_init(void)
static void __exit iptable_raw_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
+ xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&iptable_raw_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index 99eb76c65d25..be45bdc4c602 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -17,6 +17,7 @@
*/
#include <linux/module.h>
#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/slab.h>
#include <net/ip.h>
MODULE_LICENSE("GPL");
@@ -27,109 +28,44 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
-static const struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} initial_table __net_initdata = {
- .repl = {
- .name = "security",
- .valid_hooks = SECURITY_VALID_HOOKS,
- .num_entries = 4,
- .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_INET_LOCAL_IN] = 0,
- [NF_INET_FORWARD] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
- },
- .underflow = {
- [NF_INET_LOCAL_IN] = 0,
- [NF_INET_FORWARD] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2,
- },
- },
- .entries = {
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_IN */
- IPT_STANDARD_INIT(NF_ACCEPT), /* FORWARD */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
- },
- .term = IPT_ERROR_INIT, /* ERROR */
-};
-
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .priority = NF_IP_PRI_SECURITY,
};
static unsigned int
-ipt_local_in_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_security);
-}
-
-static unsigned int
-ipt_forward_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
+iptable_security_hook(unsigned int hook, struct sk_buff *skb,
+ const struct net_device *in,
+ const struct net_device *out,
+ int (*okfn)(struct sk_buff *))
{
- return ipt_do_table(skb, hook, in, out,
- dev_net(in)->ipv4.iptable_security);
-}
+ const struct net *net;
-static unsigned int
-ipt_local_out_hook(unsigned int hook,
- struct sk_buff *skb,
- const struct net_device *in,
- const struct net_device *out,
- int (*okfn)(struct sk_buff *))
-{
- /* Somebody is playing with raw sockets. */
- if (skb->len < sizeof(struct iphdr)
- || ip_hdrlen(skb) < sizeof(struct iphdr))
+ if (hook == NF_INET_LOCAL_OUT &&
+ (skb->len < sizeof(struct iphdr) ||
+ ip_hdrlen(skb) < sizeof(struct iphdr)))
+ /* Somebody is playing with raw sockets. */
return NF_ACCEPT;
- return ipt_do_table(skb, hook, in, out,
- dev_net(out)->ipv4.iptable_security);
+
+ net = dev_net((in != NULL) ? in : out);
+ return ipt_do_table(skb, hook, in, out, net->ipv4.iptable_security);
}
-static struct nf_hook_ops ipt_ops[] __read_mostly = {
- {
- .hook = ipt_local_in_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_SECURITY,
- },
- {
- .hook = ipt_forward_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_FORWARD,
- .priority = NF_IP_PRI_SECURITY,
- },
- {
- .hook = ipt_local_out_hook,
- .owner = THIS_MODULE,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_SECURITY,
- },
-};
+static struct nf_hook_ops *sectbl_ops __read_mostly;
static int __net_init iptable_security_net_init(struct net *net)
{
- net->ipv4.iptable_security =
- ipt_register_table(net, &security_table, &initial_table.repl);
+ struct ipt_replace *repl;
+ repl = ipt_alloc_initial_table(&security_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.iptable_security =
+ ipt_register_table(net, &security_table, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.iptable_security))
return PTR_ERR(net->ipv4.iptable_security);
@@ -138,7 +74,7 @@ static int __net_init iptable_security_net_init(struct net *net)
static void __net_exit iptable_security_net_exit(struct net *net)
{
- ipt_unregister_table(net->ipv4.iptable_security);
+ ipt_unregister_table(net, net->ipv4.iptable_security);
}
static struct pernet_operations iptable_security_net_ops = {
@@ -154,9 +90,11 @@ static int __init iptable_security_init(void)
if (ret < 0)
return ret;
- ret = nf_register_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
- if (ret < 0)
+ sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops)) {
+ ret = PTR_ERR(sectbl_ops);
goto cleanup_table;
+ }
return ret;
@@ -167,7 +105,7 @@ cleanup_table:
static void __exit iptable_security_fini(void)
{
- nf_unregister_hooks(ipt_ops, ARRAY_SIZE(ipt_ops));
+ xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&iptable_security_net_ops);
}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index aa95bb82ee6c..2bb1f87051c4 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -22,6 +22,7 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/nf_nat_helper.h>
@@ -195,7 +196,6 @@ static int log_invalid_proto_max = 255;
static ctl_table ip_ct_sysctl_table[] = {
{
- .ctl_name = NET_IPV4_NF_CONNTRACK_MAX,
.procname = "ip_conntrack_max",
.data = &nf_conntrack_max,
.maxlen = sizeof(int),
@@ -203,7 +203,6 @@ static ctl_table ip_ct_sysctl_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT,
.procname = "ip_conntrack_count",
.data = &init_net.ct.count,
.maxlen = sizeof(int),
@@ -211,15 +210,13 @@ static ctl_table ip_ct_sysctl_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS,
.procname = "ip_conntrack_buckets",
- .data = &nf_conntrack_htable_size,
+ .data = &init_net.ct.htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM,
.procname = "ip_conntrack_checksum",
.data = &init_net.ct.sysctl_checksum,
.maxlen = sizeof(int),
@@ -227,19 +224,15 @@ static ctl_table ip_ct_sysctl_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID,
.procname = "ip_conntrack_log_invalid",
.data = &init_net.ct.sysctl_log_invalid,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
.extra1 = &log_invalid_proto_min,
.extra2 = &log_invalid_proto_max,
},
- {
- .ctl_name = 0
- }
+ { }
};
#endif /* CONFIG_SYSCTL && CONFIG_NF_CONNTRACK_PROC_COMPAT */
@@ -255,10 +248,10 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
struct nf_conntrack_tuple tuple;
memset(&tuple, 0, sizeof(tuple));
- tuple.src.u3.ip = inet->rcv_saddr;
- tuple.src.u.tcp.port = inet->sport;
- tuple.dst.u3.ip = inet->daddr;
- tuple.dst.u.tcp.port = inet->dport;
+ tuple.src.u3.ip = inet->inet_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.ip = inet->inet_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
tuple.src.l3num = PF_INET;
tuple.dst.protonum = sk->sk_protocol;
@@ -274,7 +267,7 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len)
return -EINVAL;
}
- h = nf_conntrack_find_get(sock_net(sk), &tuple);
+ h = nf_conntrack_find_get(sock_net(sk), NF_CT_DEFAULT_ZONE, &tuple);
if (h) {
struct sockaddr_in sin;
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 8668a3defda6..2fb7b76da94f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
struct hlist_nulls_node *n;
for (st->bucket = 0;
- st->bucket < nf_conntrack_htable_size;
+ st->bucket < net->ct.htable_size;
st->bucket++) {
n = rcu_dereference(net->ct.hash[st->bucket].first);
if (!is_a_nulls(n))
@@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
head = rcu_dereference(head->next);
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
- if (++st->bucket >= nf_conntrack_htable_size)
+ if (++st->bucket >= net->ct.htable_size)
return NULL;
}
head = rcu_dereference(net->ct.hash[st->bucket].first);
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
index d71ba7677344..7404bde95994 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
@@ -18,6 +18,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
static unsigned int nf_ct_icmp_timeout __read_mostly = 30*HZ;
@@ -54,8 +55,8 @@ static const u_int8_t invmap[] = {
static bool icmp_invert_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig)
{
- if (orig->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[orig->dst.u.icmp.type])
+ if (orig->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[orig->dst.u.icmp.type])
return false;
tuple->src.u.icmp.id = orig->src.u.icmp.id;
@@ -101,8 +102,8 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
[ICMP_ADDRESS] = 1
};
- if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new)
- || !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
+ if (ct->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) ||
+ !valid_new[ct->tuplehash[0].tuple.dst.u.icmp.type]) {
/* Can't create a new ICMP `conn' with this. */
pr_debug("icmp: can't create new conn with type %u\n",
ct->tuplehash[0].tuple.dst.u.icmp.type);
@@ -114,13 +115,14 @@ static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */
static int
-icmp_error_message(struct net *net, struct sk_buff *skb,
+icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
enum ip_conntrack_info *ctinfo,
unsigned int hooknum)
{
struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_l4proto *innerproto;
const struct nf_conntrack_tuple_hash *h;
+ u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE;
NF_CT_ASSERT(skb->nfct == NULL);
@@ -146,7 +148,7 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
*ctinfo = IP_CT_RELATED;
- h = nf_conntrack_find_get(net, &innertuple);
+ h = nf_conntrack_find_get(net, zone, &innertuple);
if (!h) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
@@ -163,7 +165,8 @@ icmp_error_message(struct net *net, struct sk_buff *skb,
/* Small and modified version of icmp_rcv */
static int
-icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
+icmp_error(struct net *net, struct nf_conn *tmpl,
+ struct sk_buff *skb, unsigned int dataoff,
enum ip_conntrack_info *ctinfo, u_int8_t pf, unsigned int hooknum)
{
const struct icmphdr *icmph;
@@ -201,14 +204,14 @@ icmp_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
}
/* Need to track icmp error message? */
- if (icmph->type != ICMP_DEST_UNREACH
- && icmph->type != ICMP_SOURCE_QUENCH
- && icmph->type != ICMP_TIME_EXCEEDED
- && icmph->type != ICMP_PARAMETERPROB
- && icmph->type != ICMP_REDIRECT)
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_SOURCE_QUENCH &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB &&
+ icmph->type != ICMP_REDIRECT)
return NF_ACCEPT;
- return icmp_error_message(net, skb, ctinfo, hooknum);
+ return icmp_error_message(net, tmpl, skb, ctinfo, hooknum);
}
#if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
@@ -238,17 +241,17 @@ static const struct nla_policy icmp_nla_policy[CTA_PROTO_MAX+1] = {
static int icmp_nlattr_to_tuple(struct nlattr *tb[],
struct nf_conntrack_tuple *tuple)
{
- if (!tb[CTA_PROTO_ICMP_TYPE]
- || !tb[CTA_PROTO_ICMP_CODE]
- || !tb[CTA_PROTO_ICMP_ID])
+ if (!tb[CTA_PROTO_ICMP_TYPE] ||
+ !tb[CTA_PROTO_ICMP_CODE] ||
+ !tb[CTA_PROTO_ICMP_ID])
return -EINVAL;
tuple->dst.u.icmp.type = nla_get_u8(tb[CTA_PROTO_ICMP_TYPE]);
tuple->dst.u.icmp.code = nla_get_u8(tb[CTA_PROTO_ICMP_CODE]);
tuple->src.u.icmp.id = nla_get_be16(tb[CTA_PROTO_ICMP_ID]);
- if (tuple->dst.u.icmp.type >= sizeof(invmap)
- || !invmap[tuple->dst.u.icmp.type])
+ if (tuple->dst.u.icmp.type >= sizeof(invmap) ||
+ !invmap[tuple->dst.u.icmp.type])
return -EINVAL;
return 0;
@@ -270,9 +273,7 @@ static struct ctl_table icmp_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = 0
- }
+ { }
};
#ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
static struct ctl_table icmp_compat_sysctl_table[] = {
@@ -283,9 +284,7 @@ static struct ctl_table icmp_compat_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .ctl_name = 0
- }
+ { }
};
#endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
#endif /* CONFIG_SYSCTL */
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index fa2d6b6fc3e5..cb763ae9ed90 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -14,8 +14,13 @@
#include <net/route.h>
#include <net/ip.h>
+#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#include <net/netfilter/nf_conntrack.h>
+#endif
+#include <net/netfilter/nf_conntrack_zones.h>
/* Returns new sk_buff, or NULL */
static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
@@ -34,6 +39,27 @@ static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
return err;
}
+static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
+ struct sk_buff *skb)
+{
+ u16 zone = NF_CT_DEFAULT_ZONE;
+
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+ if (skb->nfct)
+ zone = nf_ct_zone((struct nf_conn *)skb->nfct);
+#endif
+
+#ifdef CONFIG_BRIDGE_NETFILTER
+ if (skb->nf_bridge &&
+ skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)
+ return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone;
+#endif
+ if (hooknum == NF_INET_PRE_ROUTING)
+ return IP_DEFRAG_CONNTRACK_IN + zone;
+ else
+ return IP_DEFRAG_CONNTRACK_OUT + zone;
+}
+
static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
@@ -44,16 +70,14 @@ static unsigned int ipv4_conntrack_defrag(unsigned int hooknum,
#if !defined(CONFIG_NF_NAT) && !defined(CONFIG_NF_NAT_MODULE)
/* Previously seen (loopback)? Ignore. Do this before
fragment check. */
- if (skb->nfct)
+ if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct))
return NF_ACCEPT;
#endif
#endif
/* Gather fragments. */
if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
- if (nf_ct_ipv4_gather_frags(skb,
- hooknum == NF_INET_PRE_ROUTING ?
- IP_DEFRAG_CONNTRACK_IN :
- IP_DEFRAG_CONNTRACK_OUT))
+ enum ip_defrag_users user = nf_ct_defrag_user(hooknum, skb);
+ if (nf_ct_ipv4_gather_frags(skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index fe1a64479dd0..4f8bddb760c9 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -12,6 +12,7 @@
#include <linux/types.h>
#include <linux/timer.h>
#include <linux/skbuff.h>
+#include <linux/gfp.h>
#include <net/checksum.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -30,14 +31,12 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_zones.h>
static DEFINE_SPINLOCK(nf_nat_lock);
static struct nf_conntrack_l3proto *l3proto __read_mostly;
-/* Calculated at init based on memory size */
-static unsigned int nf_nat_htable_size __read_mostly;
-
#define MAX_IP_NAT_PROTO 256
static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO]
__read_mostly;
@@ -72,15 +71,16 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
-hash_by_src(const struct nf_conntrack_tuple *tuple)
+hash_by_src(const struct net *net, u16 zone,
+ const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
/* Original src, to ensure we map it consistently if poss. */
hash = jhash_3words((__force u32)tuple->src.u3.ip,
- (__force u32)tuple->src.u.all,
+ (__force u32)tuple->src.u.all ^ zone,
tuple->dst.protonum, 0);
- return ((u64)hash * nf_nat_htable_size) >> 32;
+ return ((u64)hash * net->ipv4.nat_htable_size) >> 32;
}
/* Is this tuple already taken? (not by us) */
@@ -142,12 +142,12 @@ same_src(const struct nf_conn *ct,
/* Only called for SRC manip */
static int
-find_appropriate_src(struct net *net,
+find_appropriate_src(struct net *net, u16 zone,
const struct nf_conntrack_tuple *tuple,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
- unsigned int h = hash_by_src(tuple);
+ unsigned int h = hash_by_src(net, zone, tuple);
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
const struct hlist_node *n;
@@ -155,7 +155,7 @@ find_appropriate_src(struct net *net,
rcu_read_lock();
hlist_for_each_entry_rcu(nat, n, &net->ipv4.nat_bysource[h], bysource) {
ct = nat->ct;
- if (same_src(ct, tuple)) {
+ if (same_src(ct, tuple) && nf_ct_zone(ct) == zone) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
@@ -178,7 +178,7 @@ find_appropriate_src(struct net *net,
the ip with the lowest src-ip/dst-ip/proto usage.
*/
static void
-find_best_ips_proto(struct nf_conntrack_tuple *tuple,
+find_best_ips_proto(u16 zone, struct nf_conntrack_tuple *tuple,
const struct nf_nat_range *range,
const struct nf_conn *ct,
enum nf_nat_manip_type maniptype)
@@ -212,7 +212,7 @@ find_best_ips_proto(struct nf_conntrack_tuple *tuple,
maxip = ntohl(range->max_ip);
j = jhash_2words((__force u32)tuple->src.u3.ip,
range->flags & IP_NAT_RANGE_PERSISTENT ?
- 0 : (__force u32)tuple->dst.u3.ip, 0);
+ 0 : (__force u32)tuple->dst.u3.ip ^ zone, 0);
j = ((u64)j * (maxip - minip + 1)) >> 32;
*var_ipp = htonl(minip + j);
}
@@ -232,6 +232,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
{
struct net *net = nf_ct_net(ct);
const struct nf_nat_protocol *proto;
+ u16 zone = nf_ct_zone(ct);
/* 1) If this srcip/proto/src-proto-part is currently mapped,
and that same mapping gives a unique tuple within the given
@@ -242,7 +243,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
manips not an issue. */
if (maniptype == IP_NAT_MANIP_SRC &&
!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
- if (find_appropriate_src(net, orig_tuple, tuple, range)) {
+ if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) {
pr_debug("get_unique_tuple: Found current src map\n");
if (!nf_nat_used_tuple(tuple, ct))
return;
@@ -252,7 +253,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
/* 2) Select the least-used IP/proto combination in the given
range. */
*tuple = *orig_tuple;
- find_best_ips_proto(tuple, range, ct, maniptype);
+ find_best_ips_proto(zone, tuple, range, ct, maniptype);
/* 3) The per-protocol part of the manip is made to map into
the range to make a unique tuple. */
@@ -330,7 +331,8 @@ nf_nat_setup_info(struct nf_conn *ct,
if (have_to_hash) {
unsigned int srchash;
- srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ srchash = hash_by_src(net, nf_ct_zone(ct),
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
spin_lock_bh(&nf_nat_lock);
/* nf_conntrack_alter_reply might re-allocate exntension aera */
nat = nfct_nat(ct);
@@ -679,8 +681,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
static int __net_init nf_nat_net_init(struct net *net)
{
- net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size,
- &net->ipv4.nat_vmalloced, 0);
+ /* Leave them the same for the moment. */
+ net->ipv4.nat_htable_size = net->ct.htable_size;
+ net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size,
+ &net->ipv4.nat_vmalloced, 0);
if (!net->ipv4.nat_bysource)
return -ENOMEM;
return 0;
@@ -703,7 +707,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
nf_ct_iterate_cleanup(net, &clean_nat, NULL);
synchronize_rcu();
nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced,
- nf_nat_htable_size);
+ net->ipv4.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
@@ -724,9 +728,6 @@ static int __init nf_nat_init(void)
return ret;
}
- /* Leave them the same for the moment. */
- nf_nat_htable_size = nf_conntrack_htable_size;
-
ret = register_pernet_subsys(&nf_nat_net_ops);
if (ret < 0)
goto cleanup_extend;
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index a1d5d58a58bf..86e0e84ff0a0 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -27,76 +27,29 @@ MODULE_ALIAS("ip_nat_ftp");
/* FIXME: Time out? --RR */
-static int
-mangle_rfc959_packet(struct sk_buff *skb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
+static int nf_nat_ftp_fmt_cmd(enum nf_ct_ftp_type type,
+ char *buffer, size_t buflen,
+ __be32 addr, u16 port)
{
- char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")];
-
- sprintf(buffer, "%u,%u,%u,%u,%u,%u",
- NIPQUAD(newip), port>>8, port&0xFF);
-
- pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
- return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_eprt_packet(struct sk_buff *skb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- char buffer[sizeof("|1|255.255.255.255|65535|")];
-
- sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port);
-
- pr_debug("calling nf_nat_mangle_tcp_packet\n");
-
- return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
-}
-
-/* |1|132.235.1.2|6275| */
-static int
-mangle_epsv_packet(struct sk_buff *skb,
- __be32 newip,
- u_int16_t port,
- unsigned int matchoff,
- unsigned int matchlen,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo)
-{
- char buffer[sizeof("|||65535|")];
-
- sprintf(buffer, "|||%u|", port);
-
- pr_debug("calling nf_nat_mangle_tcp_packet\n");
+ switch (type) {
+ case NF_CT_FTP_PORT:
+ case NF_CT_FTP_PASV:
+ return snprintf(buffer, buflen, "%u,%u,%u,%u,%u,%u",
+ ((unsigned char *)&addr)[0],
+ ((unsigned char *)&addr)[1],
+ ((unsigned char *)&addr)[2],
+ ((unsigned char *)&addr)[3],
+ port >> 8,
+ port & 0xFF);
+ case NF_CT_FTP_EPRT:
+ return snprintf(buffer, buflen, "|1|%pI4|%u|", &addr, port);
+ case NF_CT_FTP_EPSV:
+ return snprintf(buffer, buflen, "|||%u|", port);
+ }
- return nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
- matchlen, buffer, strlen(buffer));
+ return 0;
}
-static int (*mangle[])(struct sk_buff *, __be32, u_int16_t,
- unsigned int, unsigned int, struct nf_conn *,
- enum ip_conntrack_info)
-= {
- [NF_CT_FTP_PORT] = mangle_rfc959_packet,
- [NF_CT_FTP_PASV] = mangle_rfc959_packet,
- [NF_CT_FTP_EPRT] = mangle_eprt_packet,
- [NF_CT_FTP_EPSV] = mangle_epsv_packet
-};
-
/* So, this packet has hit the connection tracking matching code.
Mangle it, and change the expectation to match the new version. */
static unsigned int nf_nat_ftp(struct sk_buff *skb,
@@ -110,6 +63,8 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
u_int16_t port;
int dir = CTINFO2DIR(ctinfo);
struct nf_conn *ct = exp->master;
+ char buffer[sizeof("|1|255.255.255.255|65535|")];
+ unsigned int buflen;
pr_debug("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen);
@@ -132,11 +87,21 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
if (port == 0)
return NF_DROP;
- if (!mangle[type](skb, newip, port, matchoff, matchlen, ct, ctinfo)) {
- nf_ct_unexpect_related(exp);
- return NF_DROP;
- }
+ buflen = nf_nat_ftp_fmt_cmd(type, buffer, sizeof(buffer), newip, port);
+ if (!buflen)
+ goto out;
+
+ pr_debug("calling nf_nat_mangle_tcp_packet\n");
+
+ if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, matchoff,
+ matchlen, buffer, buflen))
+ goto out;
+
return NF_ACCEPT;
+
+out:
+ nf_ct_unexpect_related(exp);
+ return NF_DROP;
}
static void __exit nf_nat_ftp_fini(void)
diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index f9520fa3aba9..4a0c6b548eee 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -8,6 +8,7 @@
* published by the Free Software Foundation.
*/
#include <linux/module.h>
+#include <linux/gfp.h>
#include <linux/kmod.h>
#include <linux/types.h>
#include <linux/timer.h>
@@ -41,18 +42,14 @@ adjust_tcp_sequence(u32 seq,
struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- int dir;
- struct nf_nat_seq *this_way, *other_way;
+ enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
struct nf_conn_nat *nat = nfct_nat(ct);
+ struct nf_nat_seq *this_way = &nat->seq[dir];
- pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n", seq, seq);
-
- dir = CTINFO2DIR(ctinfo);
-
- this_way = &nat->seq[dir];
- other_way = &nat->seq[!dir];
+ pr_debug("adjust_tcp_sequence: seq = %u, sizediff = %d\n",
+ seq, sizediff);
- pr_debug("nf_nat_resize_packet: Seq_offset before: ");
+ pr_debug("adjust_tcp_sequence: Seq_offset before: ");
DUMP_OFFSET(this_way);
spin_lock_bh(&nf_nat_seqofs_lock);
@@ -63,13 +60,13 @@ adjust_tcp_sequence(u32 seq,
* retransmit */
if (this_way->offset_before == this_way->offset_after ||
before(this_way->correction_pos, seq)) {
- this_way->correction_pos = seq;
- this_way->offset_before = this_way->offset_after;
- this_way->offset_after += sizediff;
+ this_way->correction_pos = seq;
+ this_way->offset_before = this_way->offset_after;
+ this_way->offset_after += sizediff;
}
spin_unlock_bh(&nf_nat_seqofs_lock);
- pr_debug("nf_nat_resize_packet: Seq_offset after: ");
+ pr_debug("adjust_tcp_sequence: Seq_offset after: ");
DUMP_OFFSET(this_way);
}
@@ -145,6 +142,17 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
return 1;
}
+void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+ __be32 seq, s16 off)
+{
+ if (!off)
+ return;
+ set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
+ adjust_tcp_sequence(ntohl(seq), off, ct, ctinfo);
+ nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
+}
+EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
+
/* Generic function for mangling variable-length address changes inside
* NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
* command in FTP).
@@ -153,14 +161,13 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra)
* skb enlargement, ...
*
* */
-int
-nf_nat_mangle_tcp_packet(struct sk_buff *skb,
- struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- unsigned int match_offset,
- unsigned int match_len,
- const char *rep_buffer,
- unsigned int rep_len)
+int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ unsigned int match_offset,
+ unsigned int match_len,
+ const char *rep_buffer,
+ unsigned int rep_len, bool adjust)
{
struct rtable *rt = skb_rtable(skb);
struct iphdr *iph;
@@ -206,16 +213,13 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb,
inet_proto_csum_replace2(&tcph->check, skb,
htons(oldlen), htons(datalen), 1);
- if (rep_len != match_len) {
- set_bit(IPS_SEQ_ADJUST_BIT, &ct->status);
- adjust_tcp_sequence(ntohl(tcph->seq),
- (int)rep_len - (int)match_len,
- ct, ctinfo);
- nf_conntrack_event_cache(IPCT_NATSEQADJ, ct);
- }
+ if (adjust && rep_len != match_len)
+ nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
+ (int)rep_len - (int)match_len);
+
return 1;
}
-EXPORT_SYMBOL(nf_nat_mangle_tcp_packet);
+EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet);
/* Generic function for mangling variable-length address changes inside
* NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 9eb171056c63..4c060038d29f 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -25,6 +25,7 @@
#include <net/netfilter/nf_nat_rule.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
@@ -74,7 +75,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
pr_debug("trying to unexpect other dir: ");
nf_ct_dump_tuple_ip(&t);
- other_exp = nf_ct_expect_find_get(net, &t);
+ other_exp = nf_ct_expect_find_get(net, nf_ct_zone(ct), &t);
if (other_exp) {
nf_ct_unexpect_related(other_exp);
nf_ct_expect_put(other_exp);
diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index 9e81e0dfb4ec..26de2c1f7fab 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -15,6 +15,7 @@
#include <linux/kmod.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
+#include <linux/slab.h>
#include <net/checksum.h>
#include <net/route.h>
#include <linux/bitops.h>
@@ -28,36 +29,6 @@
(1 << NF_INET_POST_ROUTING) | \
(1 << NF_INET_LOCAL_OUT))
-static const struct
-{
- struct ipt_replace repl;
- struct ipt_standard entries[3];
- struct ipt_error term;
-} nat_initial_table __net_initdata = {
- .repl = {
- .name = "nat",
- .valid_hooks = NAT_VALID_HOOKS,
- .num_entries = 4,
- .size = sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error),
- .hook_entry = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
- },
- .underflow = {
- [NF_INET_PRE_ROUTING] = 0,
- [NF_INET_POST_ROUTING] = sizeof(struct ipt_standard),
- [NF_INET_LOCAL_OUT] = sizeof(struct ipt_standard) * 2
- },
- },
- .entries = {
- IPT_STANDARD_INIT(NF_ACCEPT), /* PRE_ROUTING */
- IPT_STANDARD_INIT(NF_ACCEPT), /* POST_ROUTING */
- IPT_STANDARD_INIT(NF_ACCEPT), /* LOCAL_OUT */
- },
- .term = IPT_ERROR_INIT, /* ERROR */
-};
-
static const struct xt_table nat_table = {
.name = "nat",
.valid_hooks = NAT_VALID_HOOKS,
@@ -186,8 +157,13 @@ static struct xt_target ipt_dnat_reg __read_mostly = {
static int __net_init nf_nat_rule_net_init(struct net *net)
{
- net->ipv4.nat_table = ipt_register_table(net, &nat_table,
- &nat_initial_table.repl);
+ struct ipt_replace *repl;
+
+ repl = ipt_alloc_initial_table(&nat_table);
+ if (repl == NULL)
+ return -ENOMEM;
+ net->ipv4.nat_table = ipt_register_table(net, &nat_table, repl);
+ kfree(repl);
if (IS_ERR(net->ipv4.nat_table))
return PTR_ERR(net->ipv4.nat_table);
return 0;
@@ -195,7 +171,7 @@ static int __net_init nf_nat_rule_net_init(struct net *net)
static void __net_exit nf_nat_rule_net_exit(struct net *net)
{
- ipt_unregister_table(net->ipv4.nat_table);
+ ipt_unregister_table(net, net->ipv4.nat_table);
}
static struct pernet_operations nf_nat_rule_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 07d61a57613c..11b538deaaec 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -1,4 +1,4 @@
-/* SIP extension for UDP NAT alteration.
+/* SIP extension for NAT alteration.
*
* (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar>
* based on RR's ip_nat_ftp.c and other modules.
@@ -15,6 +15,7 @@
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/udp.h>
+#include <linux/tcp.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_helper.h>
@@ -29,25 +30,42 @@ MODULE_DESCRIPTION("SIP NAT helper");
MODULE_ALIAS("ip_nat_sip");
-static unsigned int mangle_packet(struct sk_buff *skb,
+static unsigned int mangle_packet(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int matchoff, unsigned int matchlen,
const char *buffer, unsigned int buflen)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
-
- if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, matchoff, matchlen,
- buffer, buflen))
- return 0;
+ struct tcphdr *th;
+ unsigned int baseoff;
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP) {
+ th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+ baseoff = ip_hdrlen(skb) + th->doff * 4;
+ matchoff += dataoff - baseoff;
+
+ if (!__nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ matchoff, matchlen,
+ buffer, buflen, false))
+ return 0;
+ } else {
+ baseoff = ip_hdrlen(skb) + sizeof(struct udphdr);
+ matchoff += dataoff - baseoff;
+
+ if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo,
+ matchoff, matchlen,
+ buffer, buflen))
+ return 0;
+ }
/* Reload data pointer and adjust datalen value */
- *dptr = skb->data + ip_hdrlen(skb) + sizeof(struct udphdr);
+ *dptr = skb->data + dataoff;
*datalen += buflen - matchlen;
return 1;
}
-static int map_addr(struct sk_buff *skb,
+static int map_addr(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen,
unsigned int matchoff, unsigned int matchlen,
union nf_inet_addr *addr, __be16 port)
@@ -76,11 +94,11 @@ static int map_addr(struct sk_buff *skb,
buflen = sprintf(buffer, "%pI4:%u", &newaddr, ntohs(newport));
- return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+ return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
buffer, buflen);
}
-static int map_sip_addr(struct sk_buff *skb,
+static int map_sip_addr(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen,
enum sip_header_types type)
{
@@ -93,16 +111,18 @@ static int map_sip_addr(struct sk_buff *skb,
if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, type, NULL,
&matchoff, &matchlen, &addr, &port) <= 0)
return 1;
- return map_addr(skb, dptr, datalen, matchoff, matchlen, &addr, port);
+ return map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
+ &addr, port);
}
-static unsigned int ip_nat_sip(struct sk_buff *skb,
+static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- unsigned int dataoff, matchoff, matchlen;
+ unsigned int coff, matchoff, matchlen;
+ enum sip_header_types hdr;
union nf_inet_addr addr;
__be16 port;
int request, in_header;
@@ -112,16 +132,21 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
if (ct_sip_parse_request(ct, *dptr, *datalen,
&matchoff, &matchlen,
&addr, &port) > 0 &&
- !map_addr(skb, dptr, datalen, matchoff, matchlen,
+ !map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
&addr, port))
return NF_DROP;
request = 1;
} else
request = 0;
+ if (nf_ct_protonum(ct) == IPPROTO_TCP)
+ hdr = SIP_HDR_VIA_TCP;
+ else
+ hdr = SIP_HDR_VIA_UDP;
+
/* Translate topmost Via header and parameters */
if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
- SIP_HDR_VIA, NULL, &matchoff, &matchlen,
+ hdr, NULL, &matchoff, &matchlen,
&addr, &port) > 0) {
unsigned int matchend, poff, plen, buflen, n;
char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
@@ -138,7 +163,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
goto next;
}
- if (!map_addr(skb, dptr, datalen, matchoff, matchlen,
+ if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
&addr, port))
return NF_DROP;
@@ -153,8 +178,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
buflen = sprintf(buffer, "%pI4",
&ct->tuplehash[!dir].tuple.dst.u3.ip);
- if (!mangle_packet(skb, dptr, datalen, poff, plen,
- buffer, buflen))
+ if (!mangle_packet(skb, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen))
return NF_DROP;
}
@@ -167,8 +192,8 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
buflen = sprintf(buffer, "%pI4",
&ct->tuplehash[!dir].tuple.src.u3.ip);
- if (!mangle_packet(skb, dptr, datalen, poff, plen,
- buffer, buflen))
+ if (!mangle_packet(skb, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen))
return NF_DROP;
}
@@ -181,31 +206,45 @@ static unsigned int ip_nat_sip(struct sk_buff *skb,
htons(n) != ct->tuplehash[!dir].tuple.src.u.udp.port) {
__be16 p = ct->tuplehash[!dir].tuple.src.u.udp.port;
buflen = sprintf(buffer, "%u", ntohs(p));
- if (!mangle_packet(skb, dptr, datalen, poff, plen,
- buffer, buflen))
+ if (!mangle_packet(skb, dataoff, dptr, datalen,
+ poff, plen, buffer, buflen))
return NF_DROP;
}
}
next:
/* Translate Contact headers */
- dataoff = 0;
+ coff = 0;
in_header = 0;
- while (ct_sip_parse_header_uri(ct, *dptr, &dataoff, *datalen,
+ while (ct_sip_parse_header_uri(ct, *dptr, &coff, *datalen,
SIP_HDR_CONTACT, &in_header,
&matchoff, &matchlen,
&addr, &port) > 0) {
- if (!map_addr(skb, dptr, datalen, matchoff, matchlen,
+ if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
&addr, port))
return NF_DROP;
}
- if (!map_sip_addr(skb, dptr, datalen, SIP_HDR_FROM) ||
- !map_sip_addr(skb, dptr, datalen, SIP_HDR_TO))
+ if (!map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_FROM) ||
+ !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
return NF_DROP;
+
return NF_ACCEPT;
}
+static void ip_nat_sip_seq_adjust(struct sk_buff *skb, s16 off)
+{
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+ const struct tcphdr *th;
+
+ if (nf_ct_protonum(ct) != IPPROTO_TCP || off == 0)
+ return;
+
+ th = (struct tcphdr *)(skb->data + ip_hdrlen(skb));
+ nf_nat_set_seq_adjust(ct, ctinfo, th->seq, off);
+}
+
/* Handles expected signalling connections and media streams */
static void ip_nat_sip_expected(struct nf_conn *ct,
struct nf_conntrack_expect *exp)
@@ -232,7 +271,7 @@ static void ip_nat_sip_expected(struct nf_conn *ct,
}
}
-static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
+static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen,
struct nf_conntrack_expect *exp,
unsigned int matchoff,
@@ -279,8 +318,8 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb,
if (exp->tuple.dst.u3.ip != exp->saved_ip ||
exp->tuple.dst.u.udp.port != exp->saved_proto.udp.port) {
buflen = sprintf(buffer, "%pI4:%u", &newip, port);
- if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen,
- buffer, buflen))
+ if (!mangle_packet(skb, dataoff, dptr, datalen,
+ matchoff, matchlen, buffer, buflen))
goto err;
}
return NF_ACCEPT;
@@ -290,7 +329,7 @@ err:
return NF_DROP;
}
-static int mangle_content_len(struct sk_buff *skb,
+static int mangle_content_len(struct sk_buff *skb, unsigned int dataoff,
const char **dptr, unsigned int *datalen)
{
enum ip_conntrack_info ctinfo;
@@ -312,12 +351,13 @@ static int mangle_content_len(struct sk_buff *skb,
return 0;
buflen = sprintf(buffer, "%u", c_len);
- return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+ return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
buffer, buflen);
}
-static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
- unsigned int dataoff, unsigned int *datalen,
+static int mangle_sdp_packet(struct sk_buff *skb, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
enum sdp_header_types type,
enum sdp_header_types term,
char *buffer, int buflen)
@@ -326,16 +366,16 @@ static int mangle_sdp_packet(struct sk_buff *skb, const char **dptr,
struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
unsigned int matchlen, matchoff;
- if (ct_sip_get_sdp_header(ct, *dptr, dataoff, *datalen, type, term,
+ if (ct_sip_get_sdp_header(ct, *dptr, sdpoff, *datalen, type, term,
&matchoff, &matchlen) <= 0)
return -ENOENT;
- return mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+ return mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
buffer, buflen) ? 0 : -EINVAL;
}
-static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
- unsigned int dataoff,
- unsigned int *datalen,
+static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
enum sdp_header_types type,
enum sdp_header_types term,
const union nf_inet_addr *addr)
@@ -344,16 +384,15 @@ static unsigned int ip_nat_sdp_addr(struct sk_buff *skb, const char **dptr,
unsigned int buflen;
buflen = sprintf(buffer, "%pI4", &addr->ip);
- if (mangle_sdp_packet(skb, dptr, dataoff, datalen, type, term,
+ if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff, type, term,
buffer, buflen))
return 0;
- return mangle_content_len(skb, dptr, datalen);
+ return mangle_content_len(skb, dataoff, dptr, datalen);
}
-static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
- const char **dptr,
- unsigned int *datalen,
+static unsigned int ip_nat_sdp_port(struct sk_buff *skb, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
unsigned int matchoff,
unsigned int matchlen,
u_int16_t port)
@@ -362,16 +401,16 @@ static unsigned int ip_nat_sdp_port(struct sk_buff *skb,
unsigned int buflen;
buflen = sprintf(buffer, "%u", port);
- if (!mangle_packet(skb, dptr, datalen, matchoff, matchlen,
+ if (!mangle_packet(skb, dataoff, dptr, datalen, matchoff, matchlen,
buffer, buflen))
return 0;
- return mangle_content_len(skb, dptr, datalen);
+ return mangle_content_len(skb, dataoff, dptr, datalen);
}
-static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
- unsigned int dataoff,
- unsigned int *datalen,
+static unsigned int ip_nat_sdp_session(struct sk_buff *skb, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
+ unsigned int sdpoff,
const union nf_inet_addr *addr)
{
char buffer[sizeof("nnn.nnn.nnn.nnn")];
@@ -379,12 +418,12 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
/* Mangle session description owner and contact addresses */
buflen = sprintf(buffer, "%pI4", &addr->ip);
- if (mangle_sdp_packet(skb, dptr, dataoff, datalen,
+ if (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
SDP_HDR_OWNER_IP4, SDP_HDR_MEDIA,
buffer, buflen))
return 0;
- switch (mangle_sdp_packet(skb, dptr, dataoff, datalen,
+ switch (mangle_sdp_packet(skb, dataoff, dptr, datalen, sdpoff,
SDP_HDR_CONNECTION_IP4, SDP_HDR_MEDIA,
buffer, buflen)) {
case 0:
@@ -401,14 +440,13 @@ static unsigned int ip_nat_sdp_session(struct sk_buff *skb, const char **dptr,
return 0;
}
- return mangle_content_len(skb, dptr, datalen);
+ return mangle_content_len(skb, dataoff, dptr, datalen);
}
/* So, this packet has hit the connection tracking matching code.
Mangle it, and change the expectation to match the new version. */
-static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
- const char **dptr,
- unsigned int *datalen,
+static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
+ const char **dptr, unsigned int *datalen,
struct nf_conntrack_expect *rtp_exp,
struct nf_conntrack_expect *rtcp_exp,
unsigned int mediaoff,
@@ -456,7 +494,8 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb,
/* Update media port. */
if (rtp_exp->tuple.dst.u.udp.port != rtp_exp->saved_proto.udp.port &&
- !ip_nat_sdp_port(skb, dptr, datalen, mediaoff, medialen, port))
+ !ip_nat_sdp_port(skb, dataoff, dptr, datalen,
+ mediaoff, medialen, port))
goto err2;
return NF_ACCEPT;
@@ -471,6 +510,7 @@ err1:
static void __exit nf_nat_sip_fini(void)
{
rcu_assign_pointer(nf_nat_sip_hook, NULL);
+ rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, NULL);
rcu_assign_pointer(nf_nat_sip_expect_hook, NULL);
rcu_assign_pointer(nf_nat_sdp_addr_hook, NULL);
rcu_assign_pointer(nf_nat_sdp_port_hook, NULL);
@@ -482,12 +522,14 @@ static void __exit nf_nat_sip_fini(void)
static int __init nf_nat_sip_init(void)
{
BUG_ON(nf_nat_sip_hook != NULL);
+ BUG_ON(nf_nat_sip_seq_adjust_hook != NULL);
BUG_ON(nf_nat_sip_expect_hook != NULL);
BUG_ON(nf_nat_sdp_addr_hook != NULL);
BUG_ON(nf_nat_sdp_port_hook != NULL);
BUG_ON(nf_nat_sdp_session_hook != NULL);
BUG_ON(nf_nat_sdp_media_hook != NULL);
rcu_assign_pointer(nf_nat_sip_hook, ip_nat_sip);
+ rcu_assign_pointer(nf_nat_sip_seq_adjust_hook, ip_nat_sip_seq_adjust);
rcu_assign_pointer(nf_nat_sip_expect_hook, ip_nat_sip_expect);
rcu_assign_pointer(nf_nat_sdp_addr_hook, ip_nat_sdp_addr);
rcu_assign_pointer(nf_nat_sdp_port_hook, ip_nat_sdp_port);
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index d9521f6f9ed0..4d85b6e55f29 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -43,6 +43,7 @@
#include <linux/moduleparam.h>
#include <linux/types.h>
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/udp.h>
@@ -1038,7 +1039,7 @@ static int snmp_parse_mangle(unsigned char *msg,
unsigned int cls, con, tag, vers, pdutype;
struct asn1_ctx ctx;
struct asn1_octstr comm;
- struct snmp_object **obj;
+ struct snmp_object *obj;
if (debug > 1)
hex_dump(msg, len);
@@ -1148,43 +1149,34 @@ static int snmp_parse_mangle(unsigned char *msg,
if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ)
return 0;
- obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC);
- if (obj == NULL) {
- if (net_ratelimit())
- printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__);
- return 0;
- }
-
while (!asn1_eoc_decode(&ctx, eoc)) {
unsigned int i;
- if (!snmp_object_decode(&ctx, obj)) {
- if (*obj) {
- kfree((*obj)->id);
- kfree(*obj);
+ if (!snmp_object_decode(&ctx, &obj)) {
+ if (obj) {
+ kfree(obj->id);
+ kfree(obj);
}
- kfree(obj);
return 0;
}
if (debug > 1) {
printk(KERN_DEBUG "bsalg: object: ");
- for (i = 0; i < (*obj)->id_len; i++) {
+ for (i = 0; i < obj->id_len; i++) {
if (i > 0)
printk(".");
- printk("%lu", (*obj)->id[i]);
+ printk("%lu", obj->id[i]);
}
- printk(": type=%u\n", (*obj)->type);
+ printk(": type=%u\n", obj->type);
}
- if ((*obj)->type == SNMP_IPADDR)
+ if (obj->type == SNMP_IPADDR)
mangle_address(ctx.begin, ctx.pointer - 4 , map, check);
- kfree((*obj)->id);
- kfree(*obj);
+ kfree(obj->id);
+ kfree(obj);
}
- kfree(obj);
if (!asn1_eoc_decode(&ctx, eoc))
return 0;
diff --git a/net/ipv4/netfilter/nf_nat_standalone.c b/net/ipv4/netfilter/nf_nat_standalone.c
index 5f41d017ddd8..c39c9cf6bee6 100644
--- a/net/ipv4/netfilter/nf_nat_standalone.c
+++ b/net/ipv4/netfilter/nf_nat_standalone.c
@@ -7,6 +7,7 @@
*/
#include <linux/types.h>
#include <linux/icmp.h>
+#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
@@ -197,11 +198,11 @@ nf_nat_out(unsigned int hooknum,
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
- if (ct->tuplehash[dir].tuple.src.u3.ip !=
- ct->tuplehash[!dir].tuple.dst.u3.ip
- || ct->tuplehash[dir].tuple.src.u.all !=
- ct->tuplehash[!dir].tuple.dst.u.all
- )
+ if ((ct->tuplehash[dir].tuple.src.u3.ip !=
+ ct->tuplehash[!dir].tuple.dst.u3.ip) ||
+ (ct->tuplehash[dir].tuple.src.u.all !=
+ ct->tuplehash[!dir].tuple.dst.u.all)
+ )
return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP;
}
#endif
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index f25542c48b7d..4f1f337f4337 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -127,8 +127,8 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_SENTINEL
};
-static struct {
- char *name;
+static const struct {
+ const char *name;
int index;
} icmpmibmap[] = {
{ "DestUnreachs", ICMP_DEST_UNREACH },
@@ -249,6 +249,8 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPSackShifted", LINUX_MIB_SACKSHIFTED),
SNMP_MIB_ITEM("TCPSackMerged", LINUX_MIB_SACKMERGED),
SNMP_MIB_ITEM("TCPSackShiftFallback", LINUX_MIB_SACKSHIFTFALLBACK),
+ SNMP_MIB_ITEM("TCPBacklogDrop", LINUX_MIB_TCPBACKLOGDROP),
+ SNMP_MIB_ITEM("TCPMinTTLDrop", LINUX_MIB_TCPMINTTLDROP),
SNMP_MIB_SENTINEL
};
@@ -280,7 +282,7 @@ static void icmpmsg_put(struct seq_file *seq)
count = 0;
for (i = 0; i < ICMPMSG_MIB_MAX; i++) {
- val = snmp_fold_field((void **) net->mib.icmpmsg_statistics, i);
+ val = snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics, i);
if (val) {
type[count] = i;
vals[count++] = val;
@@ -307,18 +309,18 @@ static void icmp_put(struct seq_file *seq)
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " Out%s", icmpmibmap[i].name);
seq_printf(seq, "\nIcmp: %lu %lu",
- snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
- snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INMSGS),
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_INERRORS));
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+ snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
icmpmibmap[i].index));
seq_printf(seq, " %lu %lu",
- snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
- snmp_fold_field((void **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTMSGS),
+ snmp_fold_field((void __percpu **) net->mib.icmp_statistics, ICMP_MIB_OUTERRORS));
for (i=0; icmpmibmap[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **) net->mib.icmpmsg_statistics,
+ snmp_fold_field((void __percpu **) net->mib.icmpmsg_statistics,
icmpmibmap[i].index | 0x100));
}
@@ -341,7 +343,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.ip_statistics,
+ snmp_fold_field((void __percpu **)net->mib.ip_statistics,
snmp4_ipstats_list[i].entry));
icmp_put(seq); /* RFC 2011 compatibility */
@@ -356,11 +358,11 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld",
- snmp_fold_field((void **)net->mib.tcp_statistics,
+ snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
else
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.tcp_statistics,
+ snmp_fold_field((void __percpu **)net->mib.tcp_statistics,
snmp4_tcp_list[i].entry));
}
@@ -371,7 +373,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdp:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.udp_statistics,
+ snmp_fold_field((void __percpu **)net->mib.udp_statistics,
snmp4_udp_list[i].entry));
/* the UDP and UDP-Lite MIBs are the same */
@@ -382,7 +384,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nUdpLite:");
for (i = 0; snmp4_udp_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.udplite_statistics,
+ snmp_fold_field((void __percpu **)net->mib.udplite_statistics,
snmp4_udp_list[i].entry));
seq_putc(seq, '\n');
@@ -419,7 +421,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nTcpExt:");
for (i = 0; snmp4_net_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.net_statistics,
+ snmp_fold_field((void __percpu **)net->mib.net_statistics,
snmp4_net_list[i].entry));
seq_puts(seq, "\nIpExt:");
@@ -429,7 +431,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
seq_puts(seq, "\nIpExt:");
for (i = 0; snmp4_ipextstats_list[i].name != NULL; i++)
seq_printf(seq, " %lu",
- snmp_fold_field((void **)net->mib.ip_statistics,
+ snmp_fold_field((void __percpu **)net->mib.ip_statistics,
snmp4_ipextstats_list[i].entry));
seq_putc(seq, '\n');
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ab996f9c0fe0..cc6f097fbd5f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -60,7 +60,6 @@
#include <net/net_namespace.h>
#include <net/dst.h>
#include <net/sock.h>
-#include <linux/gfp.h>
#include <linux/ip.h>
#include <linux/net.h>
#include <net/ip.h>
@@ -87,7 +86,7 @@ void raw_hash_sk(struct sock *sk)
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
struct hlist_head *head;
- head = &h->ht[inet_sk(sk)->num & (RAW_HTABLE_SIZE - 1)];
+ head = &h->ht[inet_sk(sk)->inet_num & (RAW_HTABLE_SIZE - 1)];
write_lock_bh(&h->lock);
sk_add_node(sk, head);
@@ -115,9 +114,9 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
sk_for_each_from(sk, node) {
struct inet_sock *inet = inet_sk(sk);
- if (net_eq(sock_net(sk), net) && inet->num == num &&
- !(inet->daddr && inet->daddr != raddr) &&
- !(inet->rcv_saddr && inet->rcv_saddr != laddr) &&
+ if (net_eq(sock_net(sk), net) && inet->inet_num == num &&
+ !(inet->inet_daddr && inet->inet_daddr != raddr) &&
+ !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) &&
!(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
goto found; /* gotcha */
}
@@ -292,7 +291,6 @@ static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb)
/* Charge it to the socket. */
if (sock_queue_rcv_skb(sk, skb) < 0) {
- atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return NET_RX_DROP;
}
@@ -327,7 +325,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length,
int err;
if (length > rt->u.dst.dev->mtu) {
- ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport,
+ ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
rt->u.dst.dev->mtu);
return -EMSGSIZE;
}
@@ -500,10 +498,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
err = -EDESTADDRREQ;
if (sk->sk_state != TCP_ESTABLISHED)
goto out;
- daddr = inet->daddr;
+ daddr = inet->inet_daddr;
}
- ipc.addr = inet->saddr;
+ ipc.addr = inet->inet_saddr;
ipc.opt = NULL;
ipc.shtx.flags = 0;
ipc.oif = sk->sk_bound_dev_if;
@@ -645,9 +643,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
goto out;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
sk_dst_reset(sk);
ret = 0;
out: return ret;
@@ -692,7 +690,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if (err)
goto done;
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_ts_and_drops(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -717,7 +715,7 @@ static int raw_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
- if (inet_sk(sk)->num == IPPROTO_ICMP)
+ if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
}
@@ -754,7 +752,7 @@ static int do_raw_setsockopt(struct sock *sk, int level, int optname,
char __user *optval, unsigned int optlen)
{
if (optname == ICMP_FILTER) {
- if (inet_sk(sk)->num != IPPROTO_ICMP)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
return -EOPNOTSUPP;
else
return raw_seticmpfilter(sk, optval, optlen);
@@ -784,7 +782,7 @@ static int do_raw_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen)
{
if (optname == ICMP_FILTER) {
- if (inet_sk(sk)->num != IPPROTO_ICMP)
+ if (inet_sk(sk)->inet_num != IPPROTO_ICMP)
return -EOPNOTSUPP;
else
return raw_geticmpfilter(sk, optval, optlen);
@@ -943,10 +941,10 @@ EXPORT_SYMBOL_GPL(raw_seq_stop);
static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
{
struct inet_sock *inet = inet_sk(sp);
- __be32 dest = inet->daddr,
- src = inet->rcv_saddr;
+ __be32 dest = inet->inet_daddr,
+ src = inet->inet_rcv_saddr;
__u16 destp = 0,
- srcp = inet->num;
+ srcp = inet->inet_num;
seq_printf(seq, "%4d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d\n",
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 5b1050a5d874..cb562fdd9b9a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -90,6 +90,7 @@
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
+#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
@@ -146,7 +147,6 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
-static void rt_emergency_hash_rebuild(struct net *net);
static struct dst_ops ipv4_dst_ops = {
@@ -287,12 +287,12 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
if (!rt_hash_table[st->bucket].chain)
continue;
rcu_read_lock_bh();
- r = rcu_dereference(rt_hash_table[st->bucket].chain);
+ r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
while (r) {
if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
r->rt_genid == st->genid)
return r;
- r = rcu_dereference(r->u.dst.rt_next);
+ r = rcu_dereference_bh(r->u.dst.rt_next);
}
rcu_read_unlock_bh();
}
@@ -314,7 +314,7 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
rcu_read_lock_bh();
r = rt_hash_table[st->bucket].chain;
}
- return rcu_dereference(r);
+ return rcu_dereference_bh(r);
}
static struct rtable *rt_cache_get_next(struct seq_file *seq,
@@ -513,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = {
};
#ifdef CONFIG_NET_CLS_ROUTE
-static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
- int length, int *eof, void *data)
-{
- unsigned int i;
-
- if ((offset & 3) || (length & 3))
- return -EIO;
-
- if (offset >= sizeof(struct ip_rt_acct) * 256) {
- *eof = 1;
- return 0;
- }
-
- if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
- length = sizeof(struct ip_rt_acct) * 256 - offset;
- *eof = 1;
+static int rt_acct_proc_show(struct seq_file *m, void *v)
+{
+ struct ip_rt_acct *dst, *src;
+ unsigned int i, j;
+
+ dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
+ if (!dst)
+ return -ENOMEM;
+
+ for_each_possible_cpu(i) {
+ src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
+ for (j = 0; j < 256; j++) {
+ dst[j].o_bytes += src[j].o_bytes;
+ dst[j].o_packets += src[j].o_packets;
+ dst[j].i_bytes += src[j].i_bytes;
+ dst[j].i_packets += src[j].i_packets;
+ }
}
- offset /= sizeof(u32);
-
- if (length > 0) {
- u32 *dst = (u32 *) buffer;
-
- *start = buffer;
- memset(dst, 0, length);
-
- for_each_possible_cpu(i) {
- unsigned int j;
- u32 *src;
+ seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
+ kfree(dst);
+ return 0;
+}
- src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
- for (j = 0; j < length/4; j++)
- dst[j] += src[j];
- }
- }
- return length;
+static int rt_acct_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rt_acct_proc_show, NULL);
}
+
+static const struct file_operations rt_acct_proc_fops = {
+ .owner = THIS_MODULE,
+ .open = rt_acct_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
@@ -567,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
goto err2;
#ifdef CONFIG_NET_CLS_ROUTE
- pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
- ip_rt_acct_read, NULL);
+ pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
if (!pde)
goto err3;
#endif
@@ -588,7 +586,9 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
remove_proc_entry("rt_cache", net->proc_net_stat);
remove_proc_entry("rt_cache", net->proc_net);
+#ifdef CONFIG_NET_CLS_ROUTE
remove_proc_entry("rt_acct", net->proc_net);
+#endif
}
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
@@ -703,7 +703,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
- return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
+ return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}
static inline int rt_is_expired(struct rtable *rth)
@@ -780,11 +780,30 @@ static void rt_do_flush(int process_context)
#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
+/*
+ * Given a hash chain and an item in this hash chain,
+ * find if a previous entry has the same hash_inputs
+ * (but differs on tos, mark or oif)
+ * Returns 0 if an alias is found.
+ * Returns ONE if rth has no alias before itself.
+ */
+static int has_noalias(const struct rtable *head, const struct rtable *rth)
+{
+ const struct rtable *aux = head;
+
+ while (aux != rth) {
+ if (compare_hash_inputs(&aux->fl, &rth->fl))
+ return 0;
+ aux = aux->u.dst.rt_next;
+ }
+ return ONE;
+}
+
static void rt_check_expire(void)
{
static unsigned int rover;
unsigned int i = rover, goal;
- struct rtable *rth, *aux, **rthp;
+ struct rtable *rth, **rthp;
unsigned long samples = 0;
unsigned long sum = 0, sum2 = 0;
unsigned long delta;
@@ -835,15 +854,7 @@ nofree:
* attributes don't unfairly skew
* the length computation
*/
- for (aux = rt_hash_table[i].chain;;) {
- if (aux == rth) {
- length += ONE;
- break;
- }
- if (compare_hash_inputs(&aux->fl, &rth->fl))
- break;
- aux = aux->u.dst.rt_next;
- }
+ length += has_noalias(rt_hash_table[i].chain, rth);
continue;
}
} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
@@ -902,6 +913,12 @@ void rt_cache_flush(struct net *net, int delay)
rt_do_flush(!in_softirq());
}
+/* Flush previous cache invalidated entries from the cache */
+void rt_cache_flush_batch(void)
+{
+ rt_do_flush(!in_softirq());
+}
+
/*
* We change rt_genid and let gc do the cleanup
*/
@@ -916,10 +933,8 @@ static void rt_secret_rebuild_oneshot(struct net *net)
{
del_timer_sync(&net->ipv4.rt_secret_timer);
rt_cache_invalidate(net);
- if (ip_rt_secret_interval) {
- net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
- add_timer(&net->ipv4.rt_secret_timer);
- }
+ if (ip_rt_secret_interval)
+ mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}
static void rt_emergency_hash_rebuild(struct net *net)
@@ -1067,8 +1082,23 @@ work_done:
out: return 0;
}
+/*
+ * Returns number of entries in a hash chain that have different hash_inputs
+ */
+static int slow_chain_length(const struct rtable *head)
+{
+ int length = 0;
+ const struct rtable *rth = head;
+
+ while (rth) {
+ length += has_noalias(head, rth);
+ rth = rth->u.dst.rt_next;
+ }
+ return length >> FRACT_BITS;
+}
+
static int rt_intern_hash(unsigned hash, struct rtable *rt,
- struct rtable **rp, struct sk_buff *skb)
+ struct rtable **rp, struct sk_buff *skb, int ifindex)
{
struct rtable *rth, **rthp;
unsigned long now;
@@ -1179,14 +1209,20 @@ restart:
rt_free(cand);
}
} else {
- if (chain_length > rt_chain_length_max) {
+ if (chain_length > rt_chain_length_max &&
+ slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
struct net *net = dev_net(rt->u.dst.dev);
int num = ++net->ipv4.current_rt_cache_rebuild_count;
- if (!rt_caching(dev_net(rt->u.dst.dev))) {
+ if (!rt_caching(net)) {
printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
rt->u.dst.dev->name, num);
}
- rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+ rt_emergency_hash_rebuild(net);
+ spin_unlock_bh(rt_hash_lock_addr(hash));
+
+ hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
+ ifindex, rt_genid(net));
+ goto restart;
}
}
@@ -1346,9 +1382,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
return;
net = dev_net(dev);
- if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
- || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
- || ipv4_is_zeronet(new_gw))
+ if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
+ ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
+ ipv4_is_zeronet(new_gw))
goto reject_redirect;
if (!rt_caching(net))
@@ -1411,7 +1447,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
dev_hold(rt->u.dst.dev);
if (rt->idev)
in_dev_hold(rt->idev);
- rt->u.dst.obsolete = 0;
+ rt->u.dst.obsolete = -1;
rt->u.dst.lastuse = jiffies;
rt->u.dst.path = &rt->u.dst;
rt->u.dst.neighbour = NULL;
@@ -1447,7 +1483,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
&netevent);
rt_del(hash, rth);
- if (!rt_intern_hash(hash, rt, &rt, NULL))
+ if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
ip_rt_put(rt);
goto do_next;
}
@@ -1476,11 +1512,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
struct dst_entry *ret = dst;
if (rt) {
- if (dst->obsolete) {
+ if (dst->obsolete > 0) {
ip_rt_put(rt);
ret = NULL;
} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
- rt->u.dst.expires) {
+ (rt->u.dst.expires &&
+ time_after_eq(jiffies, rt->u.dst.expires))) {
unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
rt->fl.oif,
rt_genid(dev_net(dst->dev)));
@@ -1628,9 +1665,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
__be32 daddr = iph->daddr;
unsigned short est_mtu = 0;
- if (ipv4_config.no_pmtu_disc)
- return 0;
-
for (k = 0; k < 2; k++) {
for (i = 0; i < 2; i++) {
unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
@@ -1699,7 +1733,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
- return NULL;
+ if (rt_is_expired((struct rtable *)dst))
+ return NULL;
+ return dst;
}
static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1861,7 +1897,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (!rth)
goto e_nobufs;
- rth->u.dst.output= ip_rt_bug;
+ rth->u.dst.output = ip_rt_bug;
+ rth->u.dst.obsolete = -1;
atomic_set(&rth->u.dst.__refcnt, 1);
rth->u.dst.flags= DST_HOST;
@@ -1900,7 +1937,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
in_dev_put(in_dev);
hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
e_nobufs:
in_dev_put(in_dev);
@@ -1987,8 +2024,13 @@ static int __mkroute_input(struct sk_buff *skb,
if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create route, if it is
* invalid for proxy arp. DNAT routes are always valid.
+ *
+ * Proxy arp feature have been extended to allow, ARP
+ * replies back to the same interface, to support
+ * Private VLAN switch technologies. See arp.c.
*/
- if (out_dev == in_dev) {
+ if (out_dev == in_dev &&
+ IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
err = -EINVAL;
goto cleanup;
}
@@ -2022,6 +2064,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->fl.oif = 0;
rth->rt_spec_dst= spec_dst;
+ rth->u.dst.obsolete = -1;
rth->u.dst.input = ip_forward;
rth->u.dst.output = ip_output;
rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
@@ -2061,7 +2104,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
/* put it into the cache */
hash = rt_hash(daddr, saddr, fl->iif,
rt_genid(dev_net(rth->u.dst.dev)));
- return rt_intern_hash(hash, rth, NULL, skb);
+ return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
}
/*
@@ -2186,6 +2229,7 @@ local_input:
goto e_nobufs;
rth->u.dst.output= ip_rt_bug;
+ rth->u.dst.obsolete = -1;
rth->rt_genid = rt_genid(net);
atomic_set(&rth->u.dst.__refcnt, 1);
@@ -2217,7 +2261,7 @@ local_input:
}
rth->rt_type = res.type;
hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
- err = rt_intern_hash(hash, rth, NULL, skb);
+ err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
goto done;
no_route:
@@ -2314,10 +2358,11 @@ skip_cache:
ip_hdr(skb)->protocol);
if (our
#ifdef CONFIG_IP_MROUTE
- || (!ipv4_is_local_multicast(daddr) &&
- IN_DEV_MFORWARD(in_dev))
+ ||
+ (!ipv4_is_local_multicast(daddr) &&
+ IN_DEV_MFORWARD(in_dev))
#endif
- ) {
+ ) {
rcu_read_unlock();
return ip_route_input_mc(skb, daddr, saddr,
tos, dev, our);
@@ -2411,6 +2456,7 @@ static int __mkroute_output(struct rtable **result,
rth->rt_spec_dst= fl->fl4_src;
rth->u.dst.output=ip_output;
+ rth->u.dst.obsolete = -1;
rth->rt_genid = rt_genid(dev_net(dev_out));
RT_CACHE_STAT_INC(out_slow_tot);
@@ -2462,7 +2508,7 @@ static int ip_mkroute_output(struct rtable **rp,
if (err == 0) {
hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
rt_genid(dev_net(dev_out)));
- err = rt_intern_hash(hash, rth, rp, NULL);
+ err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
}
return err;
@@ -2514,9 +2560,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
of another iface. --ANK
*/
- if (oldflp->oif == 0
- && (ipv4_is_multicast(oldflp->fl4_dst) ||
- oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+ if (oldflp->oif == 0 &&
+ (ipv4_is_multicast(oldflp->fl4_dst) ||
+ oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
dev_out = ip_dev_find(net, oldflp->fl4_src);
if (dev_out == NULL)
@@ -2685,8 +2731,8 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
rcu_read_lock_bh();
- for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
- rth = rcu_dereference(rth->u.dst.rt_next)) {
+ for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
+ rth = rcu_dereference_bh(rth->u.dst.rt_next)) {
if (rth->fl.fl4_dst == flp->fl4_dst &&
rth->fl.fl4_src == flp->fl4_src &&
rth->fl.iif == 0 &&
@@ -2855,7 +2901,7 @@ static int rt_fill_info(struct net *net,
error = rt->u.dst.error;
expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
if (rt->peer) {
- id = rt->peer->ip_id_count;
+ id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
if (rt->peer->tcp_ts_stamp) {
ts = rt->peer->tcp_ts;
tsage = get_seconds() - rt->peer->tcp_ts_stamp;
@@ -3004,8 +3050,8 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
if (!rt_hash_table[h].chain)
continue;
rcu_read_lock_bh();
- for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
- rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
+ for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
+ rt = rcu_dereference_bh(rt->u.dst.rt_next), idx++) {
if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
continue;
if (rt_is_expired(rt))
@@ -3056,23 +3102,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
return -EINVAL;
}
-static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval,
- size_t newlen)
-{
- int delay;
- struct net *net;
- if (newlen != sizeof(int))
- return -EINVAL;
- if (get_user(delay, (int __user *)newval))
- return -EFAULT;
- net = (struct net *)table->extra1;
- rt_cache_flush(net, delay);
- return 0;
-}
-
static void rt_secret_reschedule(int old)
{
struct net *net;
@@ -3085,22 +3114,20 @@ static void rt_secret_reschedule(int old)
rtnl_lock();
for_each_net(net) {
int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
+ long time;
if (!new)
continue;
if (deleted) {
- long time = net->ipv4.rt_secret_timer.expires - jiffies;
+ time = net->ipv4.rt_secret_timer.expires - jiffies;
if (time <= 0 || (time += diff) <= 0)
time = 0;
-
- net->ipv4.rt_secret_timer.expires = time;
} else
- net->ipv4.rt_secret_timer.expires = new;
+ time = new;
- net->ipv4.rt_secret_timer.expires += jiffies;
- add_timer(&net->ipv4.rt_secret_timer);
+ mod_timer(&net->ipv4.rt_secret_timer, jiffies + time);
}
rtnl_unlock();
}
@@ -3117,23 +3144,8 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
return ret;
}
-static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval,
- size_t newlen)
-{
- int old = ip_rt_secret_interval;
- int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
-
- rt_secret_reschedule(old);
-
- return ret;
-}
-
static ctl_table ipv4_route_table[] = {
{
- .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
.procname = "gc_thresh",
.data = &ipv4_dst_ops.gc_thresh,
.maxlen = sizeof(int),
@@ -3141,7 +3153,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
.procname = "max_size",
.data = &ip_rt_max_size,
.maxlen = sizeof(int),
@@ -3151,43 +3162,34 @@ static ctl_table ipv4_route_table[] = {
{
/* Deprecated. Use gc_min_interval_ms */
- .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
.procname = "gc_min_interval",
.data = &ip_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
.procname = "gc_min_interval_ms",
.data = &ip_rt_gc_min_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
- .strategy = sysctl_ms_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
.procname = "gc_timeout",
.data = &ip_rt_gc_timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
.procname = "gc_interval",
.data = &ip_rt_gc_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
.procname = "redirect_load",
.data = &ip_rt_redirect_load,
.maxlen = sizeof(int),
@@ -3195,7 +3197,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
.procname = "redirect_number",
.data = &ip_rt_redirect_number,
.maxlen = sizeof(int),
@@ -3203,7 +3204,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
.procname = "redirect_silence",
.data = &ip_rt_redirect_silence,
.maxlen = sizeof(int),
@@ -3211,7 +3211,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
.procname = "error_cost",
.data = &ip_rt_error_cost,
.maxlen = sizeof(int),
@@ -3219,7 +3218,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
.procname = "error_burst",
.data = &ip_rt_error_burst,
.maxlen = sizeof(int),
@@ -3227,7 +3225,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
.procname = "gc_elasticity",
.data = &ip_rt_gc_elasticity,
.maxlen = sizeof(int),
@@ -3235,16 +3232,13 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
.procname = "mtu_expires",
.data = &ip_rt_mtu_expires,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies,
},
{
- .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
.procname = "min_pmtu",
.data = &ip_rt_min_pmtu,
.maxlen = sizeof(int),
@@ -3252,7 +3246,6 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
.procname = "min_adv_mss",
.data = &ip_rt_min_advmss,
.maxlen = sizeof(int),
@@ -3260,50 +3253,46 @@ static ctl_table ipv4_route_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
.procname = "secret_interval",
.data = &ip_rt_secret_interval,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = ipv4_sysctl_rt_secret_interval,
- .strategy = ipv4_sysctl_rt_secret_interval_strategy,
},
- { .ctl_name = 0 }
+ { }
};
static struct ctl_table empty[1];
static struct ctl_table ipv4_skeleton[] =
{
- { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
+ { .procname = "route",
.mode = 0555, .child = ipv4_route_table},
- { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
+ { .procname = "neigh",
.mode = 0555, .child = empty},
{ }
};
static __net_initdata struct ctl_path ipv4_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
{ },
};
static struct ctl_table ipv4_route_flush_table[] = {
{
- .ctl_name = NET_IPV4_ROUTE_FLUSH,
.procname = "flush",
.maxlen = sizeof(int),
.mode = 0200,
.proc_handler = ipv4_sysctl_rtcache_flush,
- .strategy = ipv4_sysctl_rtcache_flush_strategy,
},
- { .ctl_name = 0 },
+ { },
};
static __net_initdata struct ctl_path ipv4_route_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
- { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
+ { .procname = "route", },
{ },
};
@@ -3312,7 +3301,7 @@ static __net_init int sysctl_route_net_init(struct net *net)
struct ctl_table *tbl;
tbl = ipv4_route_flush_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
if (tbl == NULL)
goto err_dup;
@@ -3380,7 +3369,7 @@ static __net_initdata struct pernet_operations rt_secret_timer_ops = {
#ifdef CONFIG_NET_CLS_ROUTE
-struct ip_rt_acct *ip_rt_acct __read_mostly;
+struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
static __initdata unsigned long rhash_entries;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index a6e0e077ac33..5c24db4a3c91 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -253,6 +253,8 @@ EXPORT_SYMBOL(cookie_check_timestamp);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
struct ip_options *opt)
{
+ struct tcp_options_received tcp_opt;
+ u8 *hash_location;
struct inet_request_sock *ireq;
struct tcp_request_sock *treq;
struct tcp_sock *tp = tcp_sk(sk);
@@ -263,7 +265,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
int mss;
struct rtable *rt;
__u8 rcv_wscale;
- struct tcp_options_received tcp_opt;
if (!sysctl_tcp_syncookies || !th->ack)
goto out;
@@ -278,7 +279,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
/* check for timestamp cookie support */
memset(&tcp_opt, 0, sizeof(tcp_opt));
- tcp_parse_options(skb, &tcp_opt, 0);
+ tcp_parse_options(skb, &tcp_opt, &hash_location, 0);
if (tcp_opt.saw_tstamp)
cookie_check_timestamp(&tcp_opt);
@@ -333,7 +334,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
* no easy way to do this.
*/
{
- struct flowi fl = { .nl_u = { .ip4_u =
+ struct flowi fl = { .mark = sk->sk_mark,
+ .nl_u = { .ip4_u =
{ .daddr = ((opt && opt->srr) ?
opt->faddr :
ireq->rmt_addr),
@@ -356,7 +358,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb,
tcp_select_initial_window(tcp_full_space(sk), req->mss,
&req->rcv_wnd, &req->window_clamp,
- ireq->wscale_ok, &rcv_wscale);
+ ireq->wscale_ok, &rcv_wscale,
+ dst_metric(&rt->u.dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 2dcf04d9b005..1cd5c15174b8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -12,6 +12,7 @@
#include <linux/inetdevice.h>
#include <linux/seqlock.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <net/snmp.h>
#include <net/icmp.h>
#include <net/ip.h>
@@ -63,34 +64,6 @@ static int ipv4_local_port_range(ctl_table *table, int write,
return ret;
}
-/* Validate changes from sysctl interface. */
-static int ipv4_sysctl_local_port_range(ctl_table *table,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- int ret;
- int range[2];
- ctl_table tmp = {
- .data = &range,
- .maxlen = sizeof(range),
- .mode = table->mode,
- .extra1 = &ip_local_port_range_min,
- .extra2 = &ip_local_port_range_max,
- };
-
- inet_get_local_port_range(range, range + 1);
- ret = sysctl_intvec(&tmp, oldval, oldlenp, newval, newlen);
- if (ret == 0 && newval && newlen) {
- if (range[1] < range[0])
- ret = -EINVAL;
- else
- set_local_port_range(range);
- }
- return ret;
-}
-
-
static int proc_tcp_congestion_control(ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -109,25 +82,6 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write,
return ret;
}
-static int sysctl_tcp_congestion_control(ctl_table *table,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval, size_t newlen)
-{
- char val[TCP_CA_NAME_MAX];
- ctl_table tbl = {
- .data = val,
- .maxlen = TCP_CA_NAME_MAX,
- };
- int ret;
-
- tcp_get_default_congestion_control(val);
- ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
- if (ret == 1 && newval && newlen)
- ret = tcp_set_default_congestion_control(val);
- return ret;
-}
-
static int proc_tcp_available_congestion_control(ctl_table *ctl,
int write,
void __user *buffer, size_t *lenp,
@@ -165,32 +119,8 @@ static int proc_allowed_congestion_control(ctl_table *ctl,
return ret;
}
-static int strategy_allowed_congestion_control(ctl_table *table,
- void __user *oldval,
- size_t __user *oldlenp,
- void __user *newval,
- size_t newlen)
-{
- ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX };
- int ret;
-
- tbl.data = kmalloc(tbl.maxlen, GFP_USER);
- if (!tbl.data)
- return -ENOMEM;
-
- tcp_get_available_congestion_control(tbl.data, tbl.maxlen);
- ret = sysctl_string(&tbl, oldval, oldlenp, newval, newlen);
- if (ret == 1 && newval && newlen)
- ret = tcp_set_allowed_congestion_control(tbl.data);
- kfree(tbl.data);
-
- return ret;
-
-}
-
static struct ctl_table ipv4_table[] = {
{
- .ctl_name = NET_IPV4_TCP_TIMESTAMPS,
.procname = "tcp_timestamps",
.data = &sysctl_tcp_timestamps,
.maxlen = sizeof(int),
@@ -198,7 +128,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_WINDOW_SCALING,
.procname = "tcp_window_scaling",
.data = &sysctl_tcp_window_scaling,
.maxlen = sizeof(int),
@@ -206,7 +135,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_SACK,
.procname = "tcp_sack",
.data = &sysctl_tcp_sack,
.maxlen = sizeof(int),
@@ -214,7 +142,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE,
.procname = "tcp_retrans_collapse",
.data = &sysctl_tcp_retrans_collapse,
.maxlen = sizeof(int),
@@ -222,17 +149,14 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_DEFAULT_TTL,
.procname = "ip_default_ttl",
.data = &sysctl_ip_default_ttl,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = ipv4_doint_and_flush,
- .strategy = ipv4_doint_and_flush_strategy,
.extra2 = &init_net,
},
{
- .ctl_name = NET_IPV4_NO_PMTU_DISC,
.procname = "ip_no_pmtu_disc",
.data = &ipv4_config.no_pmtu_disc,
.maxlen = sizeof(int),
@@ -240,7 +164,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_NONLOCAL_BIND,
.procname = "ip_nonlocal_bind",
.data = &sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
@@ -248,7 +171,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_SYN_RETRIES,
.procname = "tcp_syn_retries",
.data = &sysctl_tcp_syn_retries,
.maxlen = sizeof(int),
@@ -256,7 +178,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_SYNACK_RETRIES,
.procname = "tcp_synack_retries",
.data = &sysctl_tcp_synack_retries,
.maxlen = sizeof(int),
@@ -264,7 +185,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_MAX_ORPHANS,
.procname = "tcp_max_orphans",
.data = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
@@ -272,7 +192,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_MAX_TW_BUCKETS,
.procname = "tcp_max_tw_buckets",
.data = &tcp_death_row.sysctl_max_tw_buckets,
.maxlen = sizeof(int),
@@ -280,7 +199,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_DYNADDR,
.procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr,
.maxlen = sizeof(int),
@@ -288,16 +206,13 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME,
.procname = "tcp_keepalive_time",
.data = &sysctl_tcp_keepalive_time,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES,
.procname = "tcp_keepalive_probes",
.data = &sysctl_tcp_keepalive_probes,
.maxlen = sizeof(int),
@@ -305,26 +220,21 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL,
.procname = "tcp_keepalive_intvl",
.data = &sysctl_tcp_keepalive_intvl,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_IPV4_TCP_RETRIES1,
.procname = "tcp_retries1",
.data = &sysctl_tcp_retries1,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
.extra2 = &tcp_retr1_max
},
{
- .ctl_name = NET_IPV4_TCP_RETRIES2,
.procname = "tcp_retries2",
.data = &sysctl_tcp_retries2,
.maxlen = sizeof(int),
@@ -332,17 +242,14 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT,
.procname = "tcp_fin_timeout",
.data = &sysctl_tcp_fin_timeout,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
#ifdef CONFIG_SYN_COOKIES
{
- .ctl_name = NET_TCP_SYNCOOKIES,
.procname = "tcp_syncookies",
.data = &sysctl_tcp_syncookies,
.maxlen = sizeof(int),
@@ -351,7 +258,6 @@ static struct ctl_table ipv4_table[] = {
},
#endif
{
- .ctl_name = NET_TCP_TW_RECYCLE,
.procname = "tcp_tw_recycle",
.data = &tcp_death_row.sysctl_tw_recycle,
.maxlen = sizeof(int),
@@ -359,7 +265,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_ABORT_ON_OVERFLOW,
.procname = "tcp_abort_on_overflow",
.data = &sysctl_tcp_abort_on_overflow,
.maxlen = sizeof(int),
@@ -367,7 +272,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_STDURG,
.procname = "tcp_stdurg",
.data = &sysctl_tcp_stdurg,
.maxlen = sizeof(int),
@@ -375,7 +279,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_RFC1337,
.procname = "tcp_rfc1337",
.data = &sysctl_tcp_rfc1337,
.maxlen = sizeof(int),
@@ -383,7 +286,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_MAX_SYN_BACKLOG,
.procname = "tcp_max_syn_backlog",
.data = &sysctl_max_syn_backlog,
.maxlen = sizeof(int),
@@ -391,17 +293,14 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_LOCAL_PORT_RANGE,
.procname = "ip_local_port_range",
.data = &sysctl_local_ports.range,
.maxlen = sizeof(sysctl_local_ports.range),
.mode = 0644,
.proc_handler = ipv4_local_port_range,
- .strategy = ipv4_sysctl_local_port_range,
},
#ifdef CONFIG_IP_MULTICAST
{
- .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS,
.procname = "igmp_max_memberships",
.data = &sysctl_igmp_max_memberships,
.maxlen = sizeof(int),
@@ -411,7 +310,6 @@ static struct ctl_table ipv4_table[] = {
#endif
{
- .ctl_name = NET_IPV4_IGMP_MAX_MSF,
.procname = "igmp_max_msf",
.data = &sysctl_igmp_max_msf,
.maxlen = sizeof(int),
@@ -419,7 +317,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_INET_PEER_THRESHOLD,
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -427,43 +324,34 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_INET_PEER_MINTTL,
.procname = "inet_peer_minttl",
.data = &inet_peer_minttl,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_IPV4_INET_PEER_MAXTTL,
.procname = "inet_peer_maxttl",
.data = &inet_peer_maxttl,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME,
.procname = "inet_peer_gc_mintime",
.data = &inet_peer_gc_mintime,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME,
.procname = "inet_peer_gc_maxtime",
.data = &inet_peer_gc_maxtime,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
- .strategy = sysctl_jiffies
},
{
- .ctl_name = NET_TCP_ORPHAN_RETRIES,
.procname = "tcp_orphan_retries",
.data = &sysctl_tcp_orphan_retries,
.maxlen = sizeof(int),
@@ -471,7 +359,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_FACK,
.procname = "tcp_fack",
.data = &sysctl_tcp_fack,
.maxlen = sizeof(int),
@@ -479,7 +366,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_REORDERING,
.procname = "tcp_reordering",
.data = &sysctl_tcp_reordering,
.maxlen = sizeof(int),
@@ -487,7 +373,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_ECN,
.procname = "tcp_ecn",
.data = &sysctl_tcp_ecn,
.maxlen = sizeof(int),
@@ -495,7 +380,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_DSACK,
.procname = "tcp_dsack",
.data = &sysctl_tcp_dsack,
.maxlen = sizeof(int),
@@ -503,7 +387,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_MEM,
.procname = "tcp_mem",
.data = &sysctl_tcp_mem,
.maxlen = sizeof(sysctl_tcp_mem),
@@ -511,7 +394,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_WMEM,
.procname = "tcp_wmem",
.data = &sysctl_tcp_wmem,
.maxlen = sizeof(sysctl_tcp_wmem),
@@ -519,7 +401,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_RMEM,
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
@@ -527,7 +408,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_APP_WIN,
.procname = "tcp_app_win",
.data = &sysctl_tcp_app_win,
.maxlen = sizeof(int),
@@ -535,7 +415,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_ADV_WIN_SCALE,
.procname = "tcp_adv_win_scale",
.data = &sysctl_tcp_adv_win_scale,
.maxlen = sizeof(int),
@@ -543,7 +422,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_TW_REUSE,
.procname = "tcp_tw_reuse",
.data = &sysctl_tcp_tw_reuse,
.maxlen = sizeof(int),
@@ -551,7 +429,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_FRTO,
.procname = "tcp_frto",
.data = &sysctl_tcp_frto,
.maxlen = sizeof(int),
@@ -559,7 +436,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_FRTO_RESPONSE,
.procname = "tcp_frto_response",
.data = &sysctl_tcp_frto_response,
.maxlen = sizeof(int),
@@ -567,7 +443,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_LOW_LATENCY,
.procname = "tcp_low_latency",
.data = &sysctl_tcp_low_latency,
.maxlen = sizeof(int),
@@ -575,7 +450,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save",
.data = &sysctl_tcp_nometrics_save,
.maxlen = sizeof(int),
@@ -583,7 +457,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_MODERATE_RCVBUF,
.procname = "tcp_moderate_rcvbuf",
.data = &sysctl_tcp_moderate_rcvbuf,
.maxlen = sizeof(int),
@@ -591,7 +464,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_TSO_WIN_DIVISOR,
.procname = "tcp_tso_win_divisor",
.data = &sysctl_tcp_tso_win_divisor,
.maxlen = sizeof(int),
@@ -599,15 +471,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_CONG_CONTROL,
.procname = "tcp_congestion_control",
.mode = 0644,
.maxlen = TCP_CA_NAME_MAX,
.proc_handler = proc_tcp_congestion_control,
- .strategy = sysctl_tcp_congestion_control,
},
{
- .ctl_name = NET_TCP_ABC,
.procname = "tcp_abc",
.data = &sysctl_tcp_abc,
.maxlen = sizeof(int),
@@ -615,7 +484,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_MTU_PROBING,
.procname = "tcp_mtu_probing",
.data = &sysctl_tcp_mtu_probing,
.maxlen = sizeof(int),
@@ -623,7 +491,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_TCP_BASE_MSS,
.procname = "tcp_base_mss",
.data = &sysctl_tcp_base_mss,
.maxlen = sizeof(int),
@@ -631,7 +498,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS,
.procname = "tcp_workaround_signed_windows",
.data = &sysctl_tcp_workaround_signed_windows,
.maxlen = sizeof(int),
@@ -640,7 +506,6 @@ static struct ctl_table ipv4_table[] = {
},
#ifdef CONFIG_NET_DMA
{
- .ctl_name = NET_TCP_DMA_COPYBREAK,
.procname = "tcp_dma_copybreak",
.data = &sysctl_tcp_dma_copybreak,
.maxlen = sizeof(int),
@@ -649,7 +514,6 @@ static struct ctl_table ipv4_table[] = {
},
#endif
{
- .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE,
.procname = "tcp_slow_start_after_idle",
.data = &sysctl_tcp_slow_start_after_idle,
.maxlen = sizeof(int),
@@ -658,7 +522,6 @@ static struct ctl_table ipv4_table[] = {
},
#ifdef CONFIG_NETLABEL
{
- .ctl_name = NET_CIPSOV4_CACHE_ENABLE,
.procname = "cipso_cache_enable",
.data = &cipso_v4_cache_enabled,
.maxlen = sizeof(int),
@@ -666,7 +529,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_CIPSOV4_CACHE_BUCKET_SIZE,
.procname = "cipso_cache_bucket_size",
.data = &cipso_v4_cache_bucketsize,
.maxlen = sizeof(int),
@@ -674,7 +536,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_CIPSOV4_RBM_OPTFMT,
.procname = "cipso_rbm_optfmt",
.data = &cipso_v4_rbm_optfmt,
.maxlen = sizeof(int),
@@ -682,7 +543,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = NET_CIPSOV4_RBM_STRICTVALID,
.procname = "cipso_rbm_strictvalid",
.data = &cipso_v4_rbm_strictvalid,
.maxlen = sizeof(int),
@@ -697,15 +557,12 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_tcp_available_congestion_control,
},
{
- .ctl_name = NET_TCP_ALLOWED_CONG_CONTROL,
.procname = "tcp_allowed_congestion_control",
.maxlen = TCP_CA_BUF_MAX,
.mode = 0644,
.proc_handler = proc_allowed_congestion_control,
- .strategy = strategy_allowed_congestion_control,
},
{
- .ctl_name = NET_TCP_MAX_SSTHRESH,
.procname = "tcp_max_ssthresh",
.data = &sysctl_tcp_max_ssthresh,
.maxlen = sizeof(int),
@@ -713,41 +570,55 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .ctl_name = CTL_UNNUMBERED,
+ .procname = "tcp_cookie_size",
+ .data = &sysctl_tcp_cookie_size,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_thin_linear_timeouts",
+ .data = &sysctl_tcp_thin_linear_timeouts,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_thin_dupack",
+ .data = &sysctl_tcp_thin_dupack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
.procname = "udp_mem",
.data = &sysctl_udp_mem,
.maxlen = sizeof(sysctl_udp_mem),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
.extra1 = &zero
},
{
- .ctl_name = CTL_UNNUMBERED,
.procname = "udp_rmem_min",
.data = &sysctl_udp_rmem_min,
.maxlen = sizeof(sysctl_udp_rmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
.extra1 = &zero
},
{
- .ctl_name = CTL_UNNUMBERED,
.procname = "udp_wmem_min",
.data = &sysctl_udp_wmem_min,
.maxlen = sizeof(sysctl_udp_wmem_min),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .strategy = sysctl_intvec,
.extra1 = &zero
},
- { .ctl_name = 0 }
+ { }
};
static struct ctl_table ipv4_net_table[] = {
{
- .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL,
.procname = "icmp_echo_ignore_all",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_all,
.maxlen = sizeof(int),
@@ -755,7 +626,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS,
.procname = "icmp_echo_ignore_broadcasts",
.data = &init_net.ipv4.sysctl_icmp_echo_ignore_broadcasts,
.maxlen = sizeof(int),
@@ -763,7 +633,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES,
.procname = "icmp_ignore_bogus_error_responses",
.data = &init_net.ipv4.sysctl_icmp_ignore_bogus_error_responses,
.maxlen = sizeof(int),
@@ -771,7 +640,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR,
.procname = "icmp_errors_use_inbound_ifaddr",
.data = &init_net.ipv4.sysctl_icmp_errors_use_inbound_ifaddr,
.maxlen = sizeof(int),
@@ -779,16 +647,13 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = NET_IPV4_ICMP_RATELIMIT,
.procname = "icmp_ratelimit",
.data = &init_net.ipv4.sysctl_icmp_ratelimit,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
- .strategy = sysctl_ms_jiffies
},
{
- .ctl_name = NET_IPV4_ICMP_RATEMASK,
.procname = "icmp_ratemask",
.data = &init_net.ipv4.sysctl_icmp_ratemask,
.maxlen = sizeof(int),
@@ -796,7 +661,6 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
- .ctl_name = CTL_UNNUMBERED,
.procname = "rt_cache_rebuild_count",
.data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
.maxlen = sizeof(int),
@@ -807,8 +671,8 @@ static struct ctl_table ipv4_net_table[] = {
};
struct ctl_path net_ipv4_ctl_path[] = {
- { .procname = "net", .ctl_name = CTL_NET, },
- { .procname = "ipv4", .ctl_name = NET_IPV4, },
+ { .procname = "net", },
+ { .procname = "ipv4", },
{ },
};
EXPORT_SYMBOL_GPL(net_ipv4_ctl_path);
@@ -818,7 +682,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
struct ctl_table *table;
table = ipv4_net_table;
- if (net != &init_net) {
+ if (!net_eq(net, &init_net)) {
table = kmemdup(table, sizeof(ipv4_net_table), GFP_KERNEL);
if (table == NULL)
goto err_alloc;
@@ -849,7 +713,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
return 0;
err_reg:
- if (net != &init_net)
+ if (!net_eq(net, &init_net))
kfree(table);
err_alloc:
return -ENOMEM;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index f1813bc71088..0f8caf64caa3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -264,6 +264,8 @@
#include <linux/cache.h>
#include <linux/err.h>
#include <linux/crypto.h>
+#include <linux/time.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/tcp.h>
@@ -428,7 +430,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (tp->urg_seq == tp->copied_seq &&
!sock_flag(sk, SOCK_URGINLINE) &&
tp->urg_data)
- target--;
+ target++;
/* Potential race condition. If read of tp below will
* escape above sk->sk_state, we can be illegally awaken
@@ -535,8 +537,7 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb)
tp->nonagle &= ~TCP_NAGLE_PUSH;
}
-static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
- struct sk_buff *skb)
+static inline void tcp_mark_urg(struct tcp_sock *tp, int flags)
{
if (flags & MSG_OOB)
tp->snd_up = tp->write_seq;
@@ -545,13 +546,13 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags,
static inline void tcp_push(struct sock *sk, int flags, int mss_now,
int nonagle)
{
- struct tcp_sock *tp = tcp_sk(sk);
-
if (tcp_send_head(sk)) {
- struct sk_buff *skb = tcp_write_queue_tail(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
if (!(flags & MSG_MORE) || forced_push(tp))
- tcp_mark_push(tp, skb);
- tcp_mark_urg(tp, flags, skb);
+ tcp_mark_push(tp, tcp_write_queue_tail(sk));
+
+ tcp_mark_urg(tp, flags);
__tcp_push_pending_frames(sk, mss_now,
(flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle);
}
@@ -876,12 +877,12 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,
#define TCP_PAGE(sk) (sk->sk_sndmsg_page)
#define TCP_OFF(sk) (sk->sk_sndmsg_off)
-static inline int select_size(struct sock *sk)
+static inline int select_size(struct sock *sk, int sg)
{
struct tcp_sock *tp = tcp_sk(sk);
int tmp = tp->mss_cache;
- if (sk->sk_route_caps & NETIF_F_SG) {
+ if (sg) {
if (sk_can_gso(sk))
tmp = 0;
else {
@@ -905,7 +906,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
struct sk_buff *skb;
int iovlen, flags;
int mss_now, size_goal;
- int err, copied;
+ int sg, err, copied;
long timeo;
lock_sock(sk);
@@ -933,6 +934,8 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
+ sg = sk->sk_route_caps & NETIF_F_SG;
+
while (--iovlen >= 0) {
int seglen = iov->iov_len;
unsigned char __user *from = iov->iov_base;
@@ -958,8 +961,9 @@ new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
- skb = sk_stream_alloc_skb(sk, select_size(sk),
- sk->sk_allocation);
+ skb = sk_stream_alloc_skb(sk,
+ select_size(sk, sg),
+ sk->sk_allocation);
if (!skb)
goto wait_for_memory;
@@ -996,9 +1000,7 @@ new_segment:
/* We can extend the last page
* fragment. */
merge = 1;
- } else if (i == MAX_SKB_FRAGS ||
- (!i &&
- !(sk->sk_route_caps & NETIF_F_SG))) {
+ } else if (i == MAX_SKB_FRAGS || !sg) {
/* Need to add new fragment and cannot
* do this because interface is non-SG,
* or because all the page slots are
@@ -1253,6 +1255,39 @@ static void tcp_prequeue_process(struct sock *sk)
tp->ucopy.memory = 0;
}
+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+ dma_cookie_t done, used;
+ dma_cookie_t last_issued;
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (!tp->ucopy.dma_chan)
+ return;
+
+ last_issued = tp->ucopy.dma_cookie;
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+ do {
+ if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+ last_issued, &done,
+ &used) == DMA_SUCCESS) {
+ /* Safe to free early-copied skbs now */
+ __skb_queue_purge(&sk->sk_async_wait_queue);
+ break;
+ } else {
+ struct sk_buff *skb;
+ while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+ (dma_async_is_complete(skb->dma_cookie, done,
+ used) == DMA_SUCCESS)) {
+ __skb_dequeue(&sk->sk_async_wait_queue);
+ kfree_skb(skb);
+ }
+ }
+ } while (wait);
+}
+#endif
+
static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
{
struct sk_buff *skb;
@@ -1334,6 +1369,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_eat_skb(sk, skb, 0);
if (!desc->count)
break;
+ tp->copied_seq = seq;
}
tp->copied_seq = seq;
@@ -1545,6 +1581,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
/* __ Set realtime policy in scheduler __ */
}
+#ifdef CONFIG_NET_DMA
+ if (tp->ucopy.dma_chan)
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
if (copied >= target) {
/* Do not sleep, just process backlog. */
release_sock(sk);
@@ -1553,6 +1593,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
sk_wait_data(sk, &timeo);
#ifdef CONFIG_NET_DMA
+ tcp_service_net_dma(sk, false); /* Don't block */
tp->ucopy.wakeup = 0;
#endif
@@ -1632,6 +1673,9 @@ do_prequeue:
copied = -EFAULT;
break;
}
+
+ dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
if ((offset + used) == skb->len)
copied_early = 1;
@@ -1701,27 +1745,9 @@ skip_copy:
}
#ifdef CONFIG_NET_DMA
- if (tp->ucopy.dma_chan) {
- dma_cookie_t done, used;
-
- dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
- while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
- tp->ucopy.dma_cookie, &done,
- &used) == DMA_IN_PROGRESS) {
- /* do partial cleanup of sk_async_wait_queue */
- while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
- (dma_async_is_complete(skb->dma_cookie, done,
- used) == DMA_SUCCESS)) {
- __skb_dequeue(&sk->sk_async_wait_queue);
- kfree_skb(skb);
- }
- }
+ tcp_service_net_dma(sk, true); /* Wait for queue to drain */
+ tp->ucopy.dma_chan = NULL;
- /* Safe to free early-copied skbs now */
- __skb_queue_purge(&sk->sk_async_wait_queue);
- tp->ucopy.dma_chan = NULL;
- }
if (tp->ucopy.pinned_list) {
dma_unpin_iovec_pages(tp->ucopy.pinned_list);
tp->ucopy.pinned_list = NULL;
@@ -2042,7 +2068,7 @@ int tcp_disconnect(struct sock *sk, int flags)
__skb_queue_purge(&sk->sk_async_wait_queue);
#endif
- inet->dport = 0;
+ inet->inet_dport = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
@@ -2059,6 +2085,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_cnt = 0;
tp->bytes_acked = 0;
+ tp->window_clamp = 0;
tcp_set_ca_state(sk, TCP_CA_Open);
tcp_clear_retrans(tp);
inet_csk_delack_init(sk);
@@ -2066,7 +2093,7 @@ int tcp_disconnect(struct sock *sk, int flags)
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
- WARN_ON(inet->num && !icsk->icsk_bind_hash);
+ WARN_ON(inet->inet_num && !icsk->icsk_bind_hash);
sk->sk_error_report(sk);
return err;
@@ -2083,8 +2110,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
int val;
int err = 0;
- /* This is a string value all the others are int's */
- if (optname == TCP_CONGESTION) {
+ /* These are data/string values, all the others are ints */
+ switch (optname) {
+ case TCP_CONGESTION: {
char name[TCP_CA_NAME_MAX];
if (optlen < 1)
@@ -2101,6 +2129,93 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
release_sock(sk);
return err;
}
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = NULL;
+
+ if (sizeof(ctd) > optlen)
+ return -EINVAL;
+ if (copy_from_user(&ctd, optval, sizeof(ctd)))
+ return -EFAULT;
+
+ if (ctd.tcpct_used > sizeof(ctd.tcpct_value) ||
+ ctd.tcpct_s_data_desired > TCP_MSS_DESIRED)
+ return -EINVAL;
+
+ if (ctd.tcpct_cookie_desired == 0) {
+ /* default to global value */
+ } else if ((0x1 & ctd.tcpct_cookie_desired) ||
+ ctd.tcpct_cookie_desired > TCP_COOKIE_MAX ||
+ ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) {
+ return -EINVAL;
+ }
+
+ if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) {
+ /* Supercedes all other values */
+ lock_sock(sk);
+ if (tp->cookie_values != NULL) {
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ tp->cookie_values = NULL;
+ }
+ tp->rx_opt.cookie_in_always = 0; /* false */
+ tp->rx_opt.cookie_out_never = 1; /* true */
+ release_sock(sk);
+ return err;
+ }
+
+ /* Allocate ancillary memory before locking.
+ */
+ if (ctd.tcpct_used > 0 ||
+ (tp->cookie_values == NULL &&
+ (sysctl_tcp_cookie_size > 0 ||
+ ctd.tcpct_cookie_desired > 0 ||
+ ctd.tcpct_s_data_desired > 0))) {
+ cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used,
+ GFP_KERNEL);
+ if (cvp == NULL)
+ return -ENOMEM;
+ }
+ lock_sock(sk);
+ tp->rx_opt.cookie_in_always =
+ (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags);
+ tp->rx_opt.cookie_out_never = 0; /* false */
+
+ if (tp->cookie_values != NULL) {
+ if (cvp != NULL) {
+ /* Changed values are recorded by a changed
+ * pointer, ensuring the cookie will differ,
+ * without separately hashing each value later.
+ */
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ kref_init(&cvp->kref);
+ tp->cookie_values = cvp;
+ } else {
+ cvp = tp->cookie_values;
+ }
+ }
+ if (cvp != NULL) {
+ cvp->cookie_desired = ctd.tcpct_cookie_desired;
+
+ if (ctd.tcpct_used > 0) {
+ memcpy(cvp->s_data_payload, ctd.tcpct_value,
+ ctd.tcpct_used);
+ cvp->s_data_desired = ctd.tcpct_used;
+ cvp->s_data_constant = 1; /* true */
+ } else {
+ /* No constant payload data. */
+ cvp->s_data_desired = ctd.tcpct_s_data_desired;
+ cvp->s_data_constant = 0; /* false */
+ }
+ }
+ release_sock(sk);
+ return err;
+ }
+ default:
+ /* fallthru */
+ break;
+ };
if (optlen < sizeof(int))
return -EINVAL;
@@ -2139,6 +2254,20 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
}
break;
+ case TCP_THIN_LINEAR_TIMEOUTS:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_lto = val;
+ break;
+
+ case TCP_THIN_DUPACK:
+ if (val < 0 || val > 1)
+ err = -EINVAL;
+ else
+ tp->thin_dupack = val;
+ break;
+
case TCP_CORK:
/* When set indicates to always queue non-full frames.
* Later the user clears this option and we transmit
@@ -2425,6 +2554,42 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
if (copy_to_user(optval, icsk->icsk_ca_ops->name, len))
return -EFAULT;
return 0;
+
+ case TCP_COOKIE_TRANSACTIONS: {
+ struct tcp_cookie_transactions ctd;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+ if (len < sizeof(ctd))
+ return -EINVAL;
+
+ memset(&ctd, 0, sizeof(ctd));
+ ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ?
+ TCP_COOKIE_IN_ALWAYS : 0)
+ | (tp->rx_opt.cookie_out_never ?
+ TCP_COOKIE_OUT_NEVER : 0);
+
+ if (cvp != NULL) {
+ ctd.tcpct_flags |= (cvp->s_data_in ?
+ TCP_S_DATA_IN : 0)
+ | (cvp->s_data_out ?
+ TCP_S_DATA_OUT : 0);
+
+ ctd.tcpct_cookie_desired = cvp->cookie_desired;
+ ctd.tcpct_s_data_desired = cvp->s_data_desired;
+
+ memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0],
+ cvp->cookie_pair_size);
+ ctd.tcpct_used = cvp->cookie_pair_size;
+ }
+
+ if (put_user(sizeof(ctd), optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &ctd, sizeof(ctd)))
+ return -EFAULT;
+ return 0;
+ }
default:
return -ENOPROTOOPT;
}
@@ -2662,10 +2827,10 @@ EXPORT_SYMBOL(tcp_gro_complete);
#ifdef CONFIG_TCP_MD5SIG
static unsigned long tcp_md5sig_users;
-static struct tcp_md5sig_pool **tcp_md5sig_pool;
+static struct tcp_md5sig_pool * __percpu *tcp_md5sig_pool;
static DEFINE_SPINLOCK(tcp_md5sig_pool_lock);
-static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
+static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool * __percpu *pool)
{
int cpu;
for_each_possible_cpu(cpu) {
@@ -2682,7 +2847,7 @@ static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool)
void tcp_free_md5sig_pool(void)
{
- struct tcp_md5sig_pool **pool = NULL;
+ struct tcp_md5sig_pool * __percpu *pool = NULL;
spin_lock_bh(&tcp_md5sig_pool_lock);
if (--tcp_md5sig_users == 0) {
@@ -2696,10 +2861,11 @@ void tcp_free_md5sig_pool(void)
EXPORT_SYMBOL(tcp_free_md5sig_pool);
-static struct tcp_md5sig_pool **__tcp_alloc_md5sig_pool(struct sock *sk)
+static struct tcp_md5sig_pool * __percpu *
+__tcp_alloc_md5sig_pool(struct sock *sk)
{
int cpu;
- struct tcp_md5sig_pool **pool;
+ struct tcp_md5sig_pool * __percpu *pool;
pool = alloc_percpu(struct tcp_md5sig_pool *);
if (!pool)
@@ -2726,9 +2892,9 @@ out_free:
return NULL;
}
-struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(struct sock *sk)
+struct tcp_md5sig_pool * __percpu *tcp_alloc_md5sig_pool(struct sock *sk)
{
- struct tcp_md5sig_pool **pool;
+ struct tcp_md5sig_pool * __percpu *pool;
int alloc = 0;
retry:
@@ -2747,7 +2913,9 @@ retry:
if (alloc) {
/* we cannot hold spinlock here because this may sleep. */
- struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(sk);
+ struct tcp_md5sig_pool * __percpu *p;
+
+ p = __tcp_alloc_md5sig_pool(sk);
spin_lock_bh(&tcp_md5sig_pool_lock);
if (!p) {
tcp_md5sig_users--;
@@ -2771,7 +2939,7 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool);
struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu)
{
- struct tcp_md5sig_pool **p;
+ struct tcp_md5sig_pool * __percpu *p;
spin_lock_bh(&tcp_md5sig_pool_lock);
p = tcp_md5sig_pool;
if (p)
@@ -2847,6 +3015,135 @@ EXPORT_SYMBOL(tcp_md5_hash_key);
#endif
+/**
+ * Each Responder maintains up to two secret values concurrently for
+ * efficient secret rollover. Each secret value has 4 states:
+ *
+ * Generating. (tcp_secret_generating != tcp_secret_primary)
+ * Generates new Responder-Cookies, but not yet used for primary
+ * verification. This is a short-term state, typically lasting only
+ * one round trip time (RTT).
+ *
+ * Primary. (tcp_secret_generating == tcp_secret_primary)
+ * Used both for generation and primary verification.
+ *
+ * Retiring. (tcp_secret_retiring != tcp_secret_secondary)
+ * Used for verification, until the first failure that can be
+ * verified by the newer Generating secret. At that time, this
+ * cookie's state is changed to Secondary, and the Generating
+ * cookie's state is changed to Primary. This is a short-term state,
+ * typically lasting only one round trip time (RTT).
+ *
+ * Secondary. (tcp_secret_retiring == tcp_secret_secondary)
+ * Used for secondary verification, after primary verification
+ * failures. This state lasts no more than twice the Maximum Segment
+ * Lifetime (2MSL). Then, the secret is discarded.
+ */
+struct tcp_cookie_secret {
+ /* The secret is divided into two parts. The digest part is the
+ * equivalent of previously hashing a secret and saving the state,
+ * and serves as an initialization vector (IV). The message part
+ * serves as the trailing secret.
+ */
+ u32 secrets[COOKIE_WORKSPACE_WORDS];
+ unsigned long expires;
+};
+
+#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL)
+#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2)
+#define TCP_SECRET_LIFE (HZ * 600)
+
+static struct tcp_cookie_secret tcp_secret_one;
+static struct tcp_cookie_secret tcp_secret_two;
+
+/* Essentially a circular list, without dynamic allocation. */
+static struct tcp_cookie_secret *tcp_secret_generating;
+static struct tcp_cookie_secret *tcp_secret_primary;
+static struct tcp_cookie_secret *tcp_secret_retiring;
+static struct tcp_cookie_secret *tcp_secret_secondary;
+
+static DEFINE_SPINLOCK(tcp_secret_locker);
+
+/* Select a pseudo-random word in the cookie workspace.
+ */
+static inline u32 tcp_cookie_work(const u32 *ws, const int n)
+{
+ return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])];
+}
+
+/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed.
+ * Called in softirq context.
+ * Returns: 0 for success.
+ */
+int tcp_cookie_generator(u32 *bakery)
+{
+ unsigned long jiffy = jiffies;
+
+ if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) {
+ spin_lock_bh(&tcp_secret_locker);
+ if (!time_after_eq(jiffy, tcp_secret_generating->expires)) {
+ /* refreshed by another */
+ memcpy(bakery,
+ &tcp_secret_generating->secrets[0],
+ COOKIE_WORKSPACE_WORDS);
+ } else {
+ /* still needs refreshing */
+ get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS);
+
+ /* The first time, paranoia assumes that the
+ * randomization function isn't as strong. But,
+ * this secret initialization is delayed until
+ * the last possible moment (packet arrival).
+ * Although that time is observable, it is
+ * unpredictably variable. Mash in the most
+ * volatile clock bits available, and expire the
+ * secret extra quickly.
+ */
+ if (unlikely(tcp_secret_primary->expires ==
+ tcp_secret_secondary->expires)) {
+ struct timespec tv;
+
+ getnstimeofday(&tv);
+ bakery[COOKIE_DIGEST_WORDS+0] ^=
+ (u32)tv.tv_nsec;
+
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_1MSL
+ + (0x0f & tcp_cookie_work(bakery, 0));
+ } else {
+ tcp_secret_secondary->expires = jiffy
+ + TCP_SECRET_LIFE
+ + (0xff & tcp_cookie_work(bakery, 1));
+ tcp_secret_primary->expires = jiffy
+ + TCP_SECRET_2MSL
+ + (0x1f & tcp_cookie_work(bakery, 2));
+ }
+ memcpy(&tcp_secret_secondary->secrets[0],
+ bakery, COOKIE_WORKSPACE_WORDS);
+
+ rcu_assign_pointer(tcp_secret_generating,
+ tcp_secret_secondary);
+ rcu_assign_pointer(tcp_secret_retiring,
+ tcp_secret_primary);
+ /*
+ * Neither call_rcu() nor synchronize_rcu() needed.
+ * Retiring data is not freed. It is replaced after
+ * further (locked) pointer updates, and a quiet time
+ * (minimum 1MSL, maximum LIFE - 2MSL).
+ */
+ }
+ spin_unlock_bh(&tcp_secret_locker);
+ } else {
+ rcu_read_lock_bh();
+ memcpy(bakery,
+ &rcu_dereference(tcp_secret_generating)->secrets[0],
+ COOKIE_WORKSPACE_WORDS);
+ rcu_read_unlock_bh();
+ }
+ return 0;
+}
+EXPORT_SYMBOL(tcp_cookie_generator);
+
void tcp_done(struct sock *sk)
{
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
@@ -2881,6 +3178,7 @@ void __init tcp_init(void)
struct sk_buff *skb = NULL;
unsigned long nr_pages, limit;
int order, i, max_share;
+ unsigned long jiffy = jiffies;
BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
@@ -2903,11 +3201,10 @@ void __init tcp_init(void)
(totalram_pages >= 128 * 1024) ?
13 : 15,
0,
- &tcp_hashinfo.ehash_size,
NULL,
+ &tcp_hashinfo.ehash_mask,
thash_entries ? 0 : 512 * 1024);
- tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
- for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
+ for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
}
@@ -2916,7 +3213,7 @@ void __init tcp_init(void)
tcp_hashinfo.bhash =
alloc_large_system_hash("TCP bind",
sizeof(struct inet_bind_hashbucket),
- tcp_hashinfo.ehash_size,
+ tcp_hashinfo.ehash_mask + 1,
(totalram_pages >= 128 * 1024) ?
13 : 15,
0,
@@ -2971,10 +3268,19 @@ void __init tcp_init(void)
sysctl_tcp_rmem[2] = max(87380, max_share);
printk(KERN_INFO "TCP: Hash tables configured "
- "(established %d bind %d)\n",
- tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size);
+ "(established %u bind %u)\n",
+ tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
tcp_register_congestion_control(&tcp_reno);
+
+ memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets));
+ memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets));
+ tcp_secret_one.expires = jiffy; /* past due */
+ tcp_secret_two.expires = jiffy; /* past due */
+ tcp_secret_generating = &tcp_secret_one;
+ tcp_secret_primary = &tcp_secret_one;
+ tcp_secret_retiring = &tcp_secret_two;
+ tcp_secret_secondary = &tcp_secret_two;
}
EXPORT_SYMBOL(tcp_close);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 6428b342b164..0ec9bd0ae94f 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -10,6 +10,7 @@
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/list.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
int sysctl_tcp_max_ssthresh = 0;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index fcbcd4ff6c5f..939edb3b8e4d 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -27,7 +27,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
} else {
- r->idiag_rqueue = tp->rcv_nxt - tp->copied_seq;
+ r->idiag_rqueue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
r->idiag_wqueue = tp->write_seq - tp->snd_una;
}
if (info != NULL)
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 26d5c7fc7de5..7c94a4955416 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -92,8 +92,8 @@ static inline void measure_rtt(struct sock *sk, u32 srtt)
if (icsk->icsk_ca_state == TCP_CA_Open) {
if (ca->maxRTT < ca->minRTT)
ca->maxRTT = ca->minRTT;
- if (ca->maxRTT < srtt
- && srtt <= ca->maxRTT + msecs_to_jiffies(20))
+ if (ca->maxRTT < srtt &&
+ srtt <= ca->maxRTT + msecs_to_jiffies(20))
ca->maxRTT = srtt;
}
}
@@ -123,9 +123,9 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
ca->packetcount += pkts_acked;
- if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1)
- && now - ca->lasttime >= ca->minRTT
- && ca->minRTT > 0) {
+ if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) &&
+ now - ca->lasttime >= ca->minRTT &&
+ ca->minRTT > 0) {
__u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime);
if (htcp_ccount(ca) <= 3) {
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d86784be7ab3..f240f57b2199 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -62,6 +62,7 @@
*/
#include <linux/mm.h>
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/kernel.h>
@@ -89,6 +90,8 @@ int sysctl_tcp_frto __read_mostly = 2;
int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
+int sysctl_tcp_thin_dupack __read_mostly;
+
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_abc __read_mostly;
@@ -140,7 +143,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
* "len" is invariant segment length, including TCP header.
*/
len += skb->data - skb_transport_header(skb);
- if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) ||
+ if (len >= TCP_MSS_DEFAULT + sizeof(struct tcphdr) ||
/* If PSH is not set, packet should be
* full sized, provided peer TCP is not badly broken.
* This observation (if it is correct 8)) allows
@@ -411,7 +414,7 @@ void tcp_initialize_rcv_mss(struct sock *sk)
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);
hint = min(hint, tp->rcv_wnd / 2);
- hint = min(hint, TCP_MIN_RCVMSS);
+ hint = min(hint, TCP_MSS_DEFAULT);
hint = max(hint, TCP_MIN_MSS);
inet_csk(sk)->icsk_ack.rcv_mss = hint;
@@ -2300,7 +2303,7 @@ static inline int tcp_fackets_out(struct tcp_sock *tp)
* they differ. Since neither occurs due to loss, TCP should really
* ignore them.
*/
-static inline int tcp_dupack_heurestics(struct tcp_sock *tp)
+static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
{
return tcp_is_fack(tp) ? tp->fackets_out : tp->sacked_out + 1;
}
@@ -2425,7 +2428,7 @@ static int tcp_time_to_recover(struct sock *sk)
return 1;
/* Not-A-Trick#2 : Classic rule... */
- if (tcp_dupack_heurestics(tp) > tp->reordering)
+ if (tcp_dupack_heuristics(tp) > tp->reordering)
return 1;
/* Trick#3 : when we use RFC2988 timer restart, fast
@@ -2447,6 +2450,16 @@ static int tcp_time_to_recover(struct sock *sk)
return 1;
}
+ /* If a thin stream is detected, retransmit after first
+ * received dupack. Employ only if SACK is supported in order
+ * to avoid possible corner-case series of spurious retransmissions
+ * Use only if there are no unsent data.
+ */
+ if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
+ tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
+ tcp_is_sack(tp) && !tcp_send_head(sk))
+ return 1;
+
return 0;
}
@@ -2499,6 +2512,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
int err;
unsigned int mss;
+ if (packets == 0)
+ return;
+
WARN_ON(packets > tp->packets_out);
if (tp->lost_skb_hint) {
skb = tp->lost_skb_hint;
@@ -2717,6 +2733,35 @@ static void tcp_try_undo_dsack(struct sock *sk)
}
}
+/* We can clear retrans_stamp when there are no retransmissions in the
+ * window. It would seem that it is trivially available for us in
+ * tp->retrans_out, however, that kind of assumptions doesn't consider
+ * what will happen if errors occur when sending retransmission for the
+ * second time. ...It could the that such segment has only
+ * TCPCB_EVER_RETRANS set at the present time. It seems that checking
+ * the head skb is enough except for some reneging corner cases that
+ * are not worth the effort.
+ *
+ * Main reason for all this complexity is the fact that connection dying
+ * time now depends on the validity of the retrans_stamp, in particular,
+ * that successive retransmissions of a segment must not advance
+ * retrans_stamp under any conditions.
+ */
+static int tcp_any_retrans_done(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb;
+
+ if (tp->retrans_out)
+ return 1;
+
+ skb = tcp_write_queue_head(sk);
+ if (unlikely(skb && TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS))
+ return 1;
+
+ return 0;
+}
+
/* Undo during fast recovery after partial ACK. */
static int tcp_try_undo_partial(struct sock *sk, int acked)
@@ -2729,7 +2774,7 @@ static int tcp_try_undo_partial(struct sock *sk, int acked)
/* Plain luck! Hole if filled with delayed
* packet, rather than with a retransmit.
*/
- if (tp->retrans_out == 0)
+ if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1);
@@ -2788,7 +2833,7 @@ static void tcp_try_keep_open(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
int state = TCP_CA_Open;
- if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker)
+ if (tcp_left_out(tp) || tcp_any_retrans_done(sk) || tp->undo_marker)
state = TCP_CA_Disorder;
if (inet_csk(sk)->icsk_ca_state != state) {
@@ -2803,7 +2848,7 @@ static void tcp_try_to_open(struct sock *sk, int flag)
tcp_verify_left_out(tp);
- if (!tp->frto_counter && tp->retrans_out == 0)
+ if (!tp->frto_counter && !tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
if (flag & FLAG_ECE)
@@ -3698,7 +3743,7 @@ old_ack:
* the fast version below fails.
*/
void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
- int estab)
+ u8 **hvpp, int estab)
{
unsigned char *ptr;
struct tcphdr *th = tcp_hdr(skb);
@@ -3782,7 +3827,30 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
*/
break;
#endif
- }
+ case TCPOPT_COOKIE:
+ /* This option is variable length.
+ */
+ switch (opsize) {
+ case TCPOLEN_COOKIE_BASE:
+ /* not yet implemented */
+ break;
+ case TCPOLEN_COOKIE_PAIR:
+ /* not yet implemented */
+ break;
+ case TCPOLEN_COOKIE_MIN+0:
+ case TCPOLEN_COOKIE_MIN+2:
+ case TCPOLEN_COOKIE_MIN+4:
+ case TCPOLEN_COOKIE_MIN+6:
+ case TCPOLEN_COOKIE_MAX:
+ /* 16-bit multiple */
+ opt_rx->cookie_plus = opsize;
+ *hvpp = ptr;
+ default:
+ /* ignore option */
+ break;
+ };
+ break;
+ };
ptr += opsize-2;
length -= opsize;
@@ -3810,17 +3878,20 @@ static int tcp_parse_aligned_timestamp(struct tcp_sock *tp, struct tcphdr *th)
* If it is wrong it falls back on tcp_parse_options().
*/
static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
- struct tcp_sock *tp)
+ struct tcp_sock *tp, u8 **hvpp)
{
- if (th->doff == sizeof(struct tcphdr) >> 2) {
+ /* In the spirit of fast parsing, compare doff directly to constant
+ * values. Because equality is used, short doff can be ignored here.
+ */
+ if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
return 0;
} else if (tp->rx_opt.tstamp_ok &&
- th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
+ th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
if (tcp_parse_aligned_timestamp(tp, th))
return 1;
}
- tcp_parse_options(skb, &tp->rx_opt, 1);
+ tcp_parse_options(skb, &tp->rx_opt, hvpp, 1);
return 1;
}
@@ -4845,11 +4916,11 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
struct tcp_sock *tp = tcp_sk(sk);
/* More than one full frame received... */
- if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss
+ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
/* ... and right edge of window advances far enough.
* (tcp_recvmsg() will send ACK otherwise). Or...
*/
- && __tcp_select_window(sk) >= tp->rcv_wnd) ||
+ __tcp_select_window(sk) >= tp->rcv_wnd) ||
/* We ACK each frame or... */
tcp_in_quickack_mode(sk) ||
/* We have out of order data. */
@@ -5070,10 +5141,12 @@ out:
static int tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, int syn_inerr)
{
+ u8 *hash_location;
struct tcp_sock *tp = tcp_sk(sk);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&
+ if (tcp_fast_parse_options(skb, th, tp, &hash_location) &&
+ tp->rx_opt.saw_tstamp &&
tcp_paws_discard(sk, skb)) {
if (!th->rst) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
@@ -5361,11 +5434,13 @@ discard:
static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcphdr *th, unsigned len)
{
- struct tcp_sock *tp = tcp_sk(sk);
+ u8 *hash_location;
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct tcp_cookie_values *cvp = tp->cookie_values;
int saved_clamp = tp->rx_opt.mss_clamp;
- tcp_parse_options(skb, &tp->rx_opt, 0);
+ tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0);
if (th->ack) {
/* rfc793:
@@ -5462,6 +5537,31 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* Change state from SYN-SENT only after copied_seq
* is initialized. */
tp->copied_seq = tp->rcv_nxt;
+
+ if (cvp != NULL &&
+ cvp->cookie_pair_size > 0 &&
+ tp->rx_opt.cookie_plus > 0) {
+ int cookie_size = tp->rx_opt.cookie_plus
+ - TCPOLEN_COOKIE_BASE;
+ int cookie_pair_size = cookie_size
+ + cvp->cookie_desired;
+
+ /* A cookie extension option was sent and returned.
+ * Note that each incoming SYNACK replaces the
+ * Responder cookie. The initial exchange is most
+ * fragile, as protection against spoofing relies
+ * entirely upon the sequence and timestamp (above).
+ * This replacement strategy allows the correct pair to
+ * pass through, while any others will be filtered via
+ * Responder verification later.
+ */
+ if (sizeof(cvp->cookie_pair) >= cookie_pair_size) {
+ memcpy(&cvp->cookie_pair[cvp->cookie_desired],
+ hash_location, cookie_size);
+ cvp->cookie_pair_size = cookie_pair_size;
+ }
+ }
+
smp_mb();
tcp_set_state(sk, TCP_ESTABLISHED);
@@ -5699,11 +5799,9 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
/* tcp_ack considers this ACK as duplicate
* and does not calculate rtt.
- * Fix it at least with timestamps.
+ * Force it here.
*/
- if (tp->rx_opt.saw_tstamp &&
- tp->rx_opt.rcv_tsecr && !tp->srtt)
- tcp_ack_saw_tstamp(sk, 0);
+ tcp_ack_update_rtt(sk, 0, 0);
if (tp->rx_opt.tstamp_ok)
tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7cda24b53f61..3c23e70885f4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -60,6 +60,7 @@
#include <linux/jhash.h>
#include <linux/init.h>
#include <linux/times.h>
+#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/icmp.h>
@@ -165,10 +166,10 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
nexthop = inet->opt->faddr;
}
- tmp = ip_route_connect(&rt, nexthop, inet->saddr,
+ tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
IPPROTO_TCP,
- inet->sport, usin->sin_port, sk, 1);
+ inet->inet_sport, usin->sin_port, sk, 1);
if (tmp < 0) {
if (tmp == -ENETUNREACH)
IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
@@ -183,11 +184,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
- if (!inet->saddr)
- inet->saddr = rt->rt_src;
- inet->rcv_saddr = inet->saddr;
+ if (!inet->inet_saddr)
+ inet->inet_saddr = rt->rt_src;
+ inet->inet_rcv_saddr = inet->inet_saddr;
- if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
+ if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* Reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
@@ -204,20 +205,20 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
* when trying new connection.
*/
if (peer != NULL &&
- peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
+ (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
- inet->dport = usin->sin_port;
- inet->daddr = daddr;
+ inet->inet_dport = usin->sin_port;
+ inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
- tp->rx_opt.mss_clamp = 536;
+ tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
/* Socket identity is still unknown (sport may be zero).
* However we set state to SYN-SENT and not releasing socket
@@ -230,7 +231,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
goto failure;
err = ip_route_newports(&rt, IPPROTO_TCP,
- inet->sport, inet->dport, sk);
+ inet->inet_sport, inet->inet_dport, sk);
if (err)
goto failure;
@@ -239,12 +240,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
sk_setup_caps(sk, &rt->u.dst);
if (!tp->write_seq)
- tp->write_seq = secure_tcp_sequence_number(inet->saddr,
- inet->daddr,
- inet->sport,
+ tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
+ inet->inet_daddr,
+ inet->inet_sport,
usin->sin_port);
- inet->id = tp->write_seq ^ jiffies;
+ inet->inet_id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
rt = NULL;
@@ -261,7 +262,7 @@ failure:
tcp_set_state(sk, TCP_CLOSE);
ip_rt_put(rt);
sk->sk_route_caps = 0;
- inet->dport = 0;
+ inet->inet_dport = 0;
return err;
}
@@ -370,6 +371,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
if (sk->sk_state == TCP_CLOSE)
goto out;
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto out;
+ }
+
icsk = inet_csk(sk);
tp = tcp_sk(sk);
seq = ntohl(th->seq);
@@ -520,12 +526,13 @@ void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
struct tcphdr *th = tcp_hdr(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL) {
- th->check = ~tcp_v4_check(len, inet->saddr,
- inet->daddr, 0);
+ th->check = ~tcp_v4_check(len, inet->inet_saddr,
+ inet->inet_daddr, 0);
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct tcphdr, check);
} else {
- th->check = tcp_v4_check(len, inet->saddr, inet->daddr,
+ th->check = tcp_v4_check(len, inet->inet_saddr,
+ inet->inet_daddr,
csum_partial(th,
th->doff << 2,
skb->csum));
@@ -741,8 +748,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
* This still operates on a request_sock only, not on a big
* socket.
*/
-static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
- struct dst_entry *dst)
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+ struct request_sock *req,
+ struct request_values *rvp)
{
const struct inet_request_sock *ireq = inet_rsk(req);
int err = -1;
@@ -752,7 +760,7 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
return -1;
- skb = tcp_make_synack(sk, dst, req);
+ skb = tcp_make_synack(sk, dst, req, rvp);
if (skb) {
struct tcphdr *th = tcp_hdr(skb);
@@ -773,9 +781,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
return err;
}
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
+ struct request_values *rvp)
{
- return __tcp_v4_send_synack(sk, req, NULL);
+ TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+ return tcp_v4_send_synack(sk, NULL, req, rvp);
}
/*
@@ -848,7 +858,7 @@ static struct tcp_md5sig_key *
struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
struct sock *addr_sk)
{
- return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->daddr);
+ return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
}
EXPORT_SYMBOL(tcp_v4_md5_lookup);
@@ -923,7 +933,7 @@ EXPORT_SYMBOL(tcp_v4_md5_do_add);
static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
u8 *newkey, u8 newkeylen)
{
- return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->daddr,
+ return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
newkey, newkeylen);
}
@@ -1089,8 +1099,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
__be32 saddr, daddr;
if (sk) {
- saddr = inet_sk(sk)->saddr;
- daddr = inet_sk(sk)->daddr;
+ saddr = inet_sk(sk)->inet_saddr;
+ daddr = inet_sk(sk)->inet_daddr;
} else if (req) {
saddr = inet_rsk(req)->loc_addr;
daddr = inet_rsk(req)->rmt_addr;
@@ -1189,10 +1199,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
struct request_sock_ops tcp_request_sock_ops __read_mostly = {
.family = PF_INET,
.obj_size = sizeof(struct tcp_request_sock),
- .rtx_syn_ack = tcp_v4_send_synack,
+ .rtx_syn_ack = tcp_v4_rtx_synack,
.send_ack = tcp_v4_reqsk_send_ack,
.destructor = tcp_v4_reqsk_destructor,
.send_reset = tcp_v4_send_reset,
+ .syn_ack_timeout = tcp_syn_ack_timeout,
};
#ifdef CONFIG_TCP_MD5SIG
@@ -1210,13 +1221,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
- struct inet_request_sock *ireq;
+ struct tcp_extend_values tmp_ext;
struct tcp_options_received tmp_opt;
+ u8 *hash_location;
struct request_sock *req;
+ struct inet_request_sock *ireq;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct dst_entry *dst = NULL;
__be32 saddr = ip_hdr(skb)->saddr;
__be32 daddr = ip_hdr(skb)->daddr;
__u32 isn = TCP_SKB_CB(skb)->when;
- struct dst_entry *dst = NULL;
#ifdef CONFIG_SYN_COOKIES
int want_cookie = 0;
#else
@@ -1257,16 +1271,50 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
#endif
tcp_clear_options(&tmp_opt);
- tmp_opt.mss_clamp = 536;
- tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
+ tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+ tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+ if (tmp_opt.cookie_plus > 0 &&
+ tmp_opt.saw_tstamp &&
+ !tp->rx_opt.cookie_out_never &&
+ (sysctl_tcp_cookie_size > 0 ||
+ (tp->cookie_values != NULL &&
+ tp->cookie_values->cookie_desired > 0))) {
+ u8 *c;
+ u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
+ int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
+
+ if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
+ goto drop_and_release;
+
+ /* Secret recipe starts with IP addresses */
+ *mess++ ^= daddr;
+ *mess++ ^= saddr;
- tcp_parse_options(skb, &tmp_opt, 0);
+ /* plus variable length Initiator Cookie */
+ c = (u8 *)mess;
+ while (l-- > 0)
+ *c++ ^= *hash_location++;
+
+#ifdef CONFIG_SYN_COOKIES
+ want_cookie = 0; /* not our kind of cookie */
+#endif
+ tmp_ext.cookie_out_never = 0; /* false */
+ tmp_ext.cookie_plus = tmp_opt.cookie_plus;
+ } else if (!tp->rx_opt.cookie_in_always) {
+ /* redundant indications, but ensure initialization. */
+ tmp_ext.cookie_out_never = 1; /* true */
+ tmp_ext.cookie_plus = 0;
+ } else {
+ goto drop_and_release;
+ }
+ tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
if (want_cookie && !tmp_opt.saw_tstamp)
tcp_clear_options(&tmp_opt);
tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-
tcp_openreq_init(req, &tmp_opt, skb);
ireq = inet_rsk(req);
@@ -1304,7 +1352,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
(dst = inet_csk_route_req(sk, req)) != NULL &&
(peer = rt_get_peer((struct rtable *)dst)) != NULL &&
peer->v4daddr == saddr) {
- if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+ if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
(s32)(peer->tcp_ts - req->ts_recent) >
TCP_PAWS_WINDOW) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1333,7 +1381,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
}
tcp_rsk(req)->snt_isn = isn;
- if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
+ if (tcp_v4_send_synack(sk, dst, req,
+ (struct request_values *)&tmp_ext) ||
+ want_cookie)
goto drop_and_free;
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1380,9 +1430,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
newtp = tcp_sk(newsk);
newinet = inet_sk(newsk);
ireq = inet_rsk(req);
- newinet->daddr = ireq->rmt_addr;
- newinet->rcv_saddr = ireq->loc_addr;
- newinet->saddr = ireq->loc_addr;
+ newinet->inet_daddr = ireq->rmt_addr;
+ newinet->inet_rcv_saddr = ireq->loc_addr;
+ newinet->inet_saddr = ireq->loc_addr;
newinet->opt = ireq->opt;
ireq->opt = NULL;
newinet->mc_index = inet_iif(skb);
@@ -1390,7 +1440,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
inet_csk(newsk)->icsk_ext_hdr_len = 0;
if (newinet->opt)
inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
- newinet->id = newtp->write_seq ^ jiffies;
+ newinet->inet_id = newtp->write_seq ^ jiffies;
tcp_mtup_init(newsk);
tcp_sync_mss(newsk, dst_mtu(dst));
@@ -1403,7 +1453,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
#ifdef CONFIG_TCP_MD5SIG
/* Copy over the MD5 key from the original socket */
- if ((key = tcp_v4_md5_do_lookup(sk, newinet->daddr)) != NULL) {
+ key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
+ if (key != NULL) {
/*
* We're using one, so create a matching key
* on the newsk structure. If we fail to get
@@ -1412,13 +1463,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
*/
char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
if (newkey != NULL)
- tcp_v4_md5_do_add(newsk, newinet->daddr,
+ tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
newkey, key->keylen);
newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
}
#endif
- __inet_hash_nolisten(newsk);
+ __inet_hash_nolisten(newsk, NULL);
__inet_inherit_port(sk, newsk);
return newsk;
@@ -1610,6 +1661,11 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
+ if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+ goto discard_and_relse;
+ }
+
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse;
nf_reset(skb);
@@ -1634,8 +1690,11 @@ process:
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
- } else
- sk_add_backlog(sk, skb);
+ } else if (unlikely(sk_add_backlog(sk, skb))) {
+ bh_unlock_sock(sk);
+ NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+ goto discard_and_relse;
+ }
bh_unlock_sock(sk);
sock_put(sk);
@@ -1711,8 +1770,8 @@ int tcp_v4_remember_stamp(struct sock *sk)
struct inet_peer *peer = NULL;
int release_it = 0;
- if (!rt || rt->rt_dst != inet->daddr) {
- peer = inet_getpeer(inet->daddr, 1);
+ if (!rt || rt->rt_dst != inet->inet_daddr) {
+ peer = inet_getpeer(inet->inet_daddr, 1);
release_it = 1;
} else {
if (!rt->peer)
@@ -1722,9 +1781,9 @@ int tcp_v4_remember_stamp(struct sock *sk)
if (peer) {
if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
- peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
- peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
+ ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+ peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+ peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
peer->tcp_ts = tp->rx_opt.ts_recent;
}
if (release_it)
@@ -1743,9 +1802,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
- (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
- peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
- peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+ ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+ peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+ peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
peer->tcp_ts = tcptw->tw_ts_recent;
}
inet_putpeer(peer);
@@ -1810,7 +1869,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
tp->snd_cwnd_clamp = ~0;
- tp->mss_cache = 536;
+ tp->mss_cache = TCP_MSS_DEFAULT;
tp->reordering = sysctl_tcp_reordering;
icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1826,6 +1885,19 @@ static int tcp_v4_init_sock(struct sock *sk)
tp->af_specific = &tcp_sock_ipv4_specific;
#endif
+ /* TCP Cookie Transactions */
+ if (sysctl_tcp_cookie_size > 0) {
+ /* Default, cookies without s_data_payload. */
+ tp->cookie_values =
+ kzalloc(sizeof(*tp->cookie_values),
+ sk->sk_allocation);
+ if (tp->cookie_values != NULL)
+ kref_init(&tp->cookie_values->kref);
+ }
+ /* Presumed zeroed, in order of appearance:
+ * cookie_in_always, cookie_out_never,
+ * s_data_constant, s_data_in, s_data_out
+ */
sk->sk_sndbuf = sysctl_tcp_wmem[1];
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
@@ -1879,6 +1951,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
sk->sk_sndmsg_page = NULL;
}
+ /* TCP Cookie Transactions */
+ if (tp->cookie_values != NULL) {
+ kref_put(&tp->cookie_values->kref,
+ tcp_cookie_values_release);
+ tp->cookie_values = NULL;
+ }
+
percpu_counter_dec(&tcp_sockets_allocated);
}
@@ -2000,7 +2079,7 @@ static void *established_get_first(struct seq_file *seq)
struct net *net = seq_file_net(seq);
void *rc = NULL;
- for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+ for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
struct sock *sk;
struct hlist_nulls_node *node;
struct inet_timewait_sock *tw;
@@ -2061,10 +2140,10 @@ get_tw:
st->state = TCP_SEQ_STATE_ESTABLISHED;
/* Look for next non empty bucket */
- while (++st->bucket < tcp_hashinfo.ehash_size &&
+ while (++st->bucket <= tcp_hashinfo.ehash_mask &&
empty_bucket(st))
;
- if (st->bucket >= tcp_hashinfo.ehash_size)
+ if (st->bucket > tcp_hashinfo.ehash_mask)
return NULL;
spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
@@ -2225,7 +2304,7 @@ static void get_openreq4(struct sock *sk, struct request_sock *req,
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
i,
ireq->loc_addr,
- ntohs(inet_sk(sk)->sport),
+ ntohs(inet_sk(sk)->inet_sport),
ireq->rmt_addr,
ntohs(ireq->rmt_port),
TCP_SYN_RECV,
@@ -2248,10 +2327,11 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
- __be32 dest = inet->daddr;
- __be32 src = inet->rcv_saddr;
- __u16 destp = ntohs(inet->dport);
- __u16 srcp = ntohs(inet->sport);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
+ int rx_queue;
if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
timer_active = 1;
@@ -2267,12 +2347,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
timer_expires = jiffies;
}
+ if (sk->sk_state == TCP_LISTEN)
+ rx_queue = sk->sk_ack_backlog;
+ else
+ /*
+ * because we dont lock socket, we might find a transient negative value
+ */
+ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
i, src, srcp, dest, destp, sk->sk_state,
tp->write_seq - tp->snd_una,
- sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
- (tp->rcv_nxt - tp->copied_seq),
+ rx_queue,
timer_active,
jiffies_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
@@ -2354,12 +2441,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
},
};
-static int tcp4_proc_init_net(struct net *net)
+static int __net_init tcp4_proc_init_net(struct net *net)
{
return tcp_proc_register(net, &tcp4_seq_afinfo);
}
-static void tcp4_proc_exit_net(struct net *net)
+static void __net_exit tcp4_proc_exit_net(struct net *net)
{
tcp_proc_unregister(net, &tcp4_seq_afinfo);
}
@@ -2463,12 +2550,17 @@ static int __net_init tcp_sk_init(struct net *net)
static void __net_exit tcp_sk_exit(struct net *net)
{
inet_ctl_sock_destroy(net->ipv4.tcp_sock);
- inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+ inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
}
static struct pernet_operations __net_initdata tcp_sk_ops = {
- .init = tcp_sk_init,
- .exit = tcp_sk_exit,
+ .init = tcp_sk_init,
+ .exit = tcp_sk_exit,
+ .exit_batch = tcp_sk_exit_batch,
};
void __init tcp_v4_init(void)
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c
index ce3c41ff50b2..de870377fbba 100644
--- a/net/ipv4/tcp_lp.c
+++ b/net/ipv4/tcp_lp.c
@@ -143,8 +143,8 @@ static u32 tcp_lp_remote_hz_estimator(struct sock *sk)
goto out;
/* we can't calc remote HZ with no different!! */
- if (tp->rx_opt.rcv_tsval == lp->remote_ref_time
- || tp->rx_opt.rcv_tsecr == lp->local_ref_time)
+ if (tp->rx_opt.rcv_tsval == lp->remote_ref_time ||
+ tp->rx_opt.rcv_tsecr == lp->local_ref_time)
goto out;
m = HZ * (tp->rx_opt.rcv_tsval -
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 4c03598ed924..5fabff9ac6d6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,19 +20,14 @@
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
-#ifdef CONFIG_SYSCTL
-#define SYNC_INIT 0 /* let the user enable it */
-#else
-#define SYNC_INIT 1
-#endif
-
-int sysctl_tcp_syncookies __read_mostly = SYNC_INIT;
+int sysctl_tcp_syncookies __read_mostly = 1;
EXPORT_SYMBOL(sysctl_tcp_syncookies);
int sysctl_tcp_abort_on_overflow __read_mostly;
@@ -96,13 +91,14 @@ enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
const struct tcphdr *th)
{
- struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
struct tcp_options_received tmp_opt;
+ u8 *hash_location;
+ struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
int paws_reject = 0;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
- tcp_parse_options(skb, &tmp_opt, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = tcptw->tw_ts_recent;
@@ -389,14 +385,43 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
const struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_request_sock *treq = tcp_rsk(req);
struct inet_connection_sock *newicsk = inet_csk(newsk);
- struct tcp_sock *newtp;
+ struct tcp_sock *newtp = tcp_sk(newsk);
+ struct tcp_sock *oldtp = tcp_sk(sk);
+ struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
+
+ /* TCP Cookie Transactions require space for the cookie pair,
+ * as it differs for each connection. There is no need to
+ * copy any s_data_payload stored at the original socket.
+ * Failure will prevent resuming the connection.
+ *
+ * Presumed copied, in order of appearance:
+ * cookie_in_always, cookie_out_never
+ */
+ if (oldcvp != NULL) {
+ struct tcp_cookie_values *newcvp =
+ kzalloc(sizeof(*newtp->cookie_values),
+ GFP_ATOMIC);
+
+ if (newcvp != NULL) {
+ kref_init(&newcvp->kref);
+ newcvp->cookie_desired =
+ oldcvp->cookie_desired;
+ newtp->cookie_values = newcvp;
+ } else {
+ /* Not Yet Implemented */
+ newtp->cookie_values = NULL;
+ }
+ }
/* Now setup tcp_sock */
- newtp = tcp_sk(newsk);
newtp->pred_flags = 0;
- newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = treq->snt_isn + 1;
- newtp->snd_up = treq->snt_isn + 1;
+
+ newtp->rcv_wup = newtp->copied_seq =
+ newtp->rcv_nxt = treq->rcv_isn + 1;
+
+ newtp->snd_sml = newtp->snd_una =
+ newtp->snd_nxt = newtp->snd_up =
+ treq->snt_isn + 1 + tcp_s_data_size(oldtp);
tcp_prequeue_init(newtp);
@@ -429,8 +454,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
skb_queue_head_init(&newtp->out_of_order_queue);
- newtp->write_seq = treq->snt_isn + 1;
- newtp->pushed_seq = newtp->write_seq;
+ newtp->write_seq = newtp->pushed_seq =
+ treq->snt_isn + 1 + tcp_s_data_size(oldtp);
newtp->rx_opt.saw_tstamp = 0;
@@ -476,7 +501,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
if (newtp->af_specific->md5_lookup(sk, newsk))
newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
- if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
+ if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
TCP_ECN_openreq_child(newtp, req);
@@ -495,15 +520,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct request_sock **prev)
{
+ struct tcp_options_received tmp_opt;
+ u8 *hash_location;
+ struct sock *child;
const struct tcphdr *th = tcp_hdr(skb);
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
int paws_reject = 0;
- struct tcp_options_received tmp_opt;
- struct sock *child;
tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
- tcp_parse_options(skb, &tmp_opt, 0);
+ tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
if (tmp_opt.saw_tstamp) {
tmp_opt.ts_recent = req->ts_recent;
@@ -537,7 +563,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Enforce "SYN-ACK" according to figure 8, figure 6
* of RFC793, fixed by RFC1122.
*/
- req->rsk_ops->rtx_syn_ack(sk, req);
+ req->rsk_ops->rtx_syn_ack(sk, req, NULL);
return NULL;
}
@@ -596,7 +622,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
* Invalid ACK: reset will be sent by listening socket
*/
if ((flg & TCP_FLAG_ACK) &&
- (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
+ (TCP_SKB_CB(skb)->ack_seq !=
+ tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk))))
return sk;
/* Also, it would be not so bad idea to check rcv_tsecr, which
@@ -702,7 +729,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
* in main socket hash table and lock on listening
* socket does not protect us more.
*/
- sk_add_backlog(child, skb);
+ __sk_add_backlog(child, skb);
}
bh_unlock_sock(child);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index fcd278a7080e..0dda86e72ad8 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -37,6 +37,7 @@
#include <net/tcp.h>
#include <linux/compiler.h>
+#include <linux/gfp.h>
#include <linux/module.h>
/* People can turn this off for buggy TCP's found in printers etc. */
@@ -59,6 +60,10 @@ int sysctl_tcp_base_mss __read_mostly = 512;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
+EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+
+
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
@@ -179,7 +184,8 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
*/
void tcp_select_initial_window(int __space, __u32 mss,
__u32 *rcv_wnd, __u32 *window_clamp,
- int wscale_ok, __u8 *rcv_wscale)
+ int wscale_ok, __u8 *rcv_wscale,
+ __u32 init_rcv_wnd)
{
unsigned int space = (__space < 0 ? 0 : __space);
@@ -228,7 +234,13 @@ void tcp_select_initial_window(int __space, __u32 mss,
init_cwnd = 2;
else if (mss > 1460)
init_cwnd = 3;
- if (*rcv_wnd > init_cwnd * mss)
+ /* when initializing use the value from init_rcv_wnd
+ * rather than the default from above
+ */
+ if (init_rcv_wnd &&
+ (*rcv_wnd > init_rcv_wnd * mss))
+ *rcv_wnd = init_rcv_wnd * mss;
+ else if (*rcv_wnd > init_cwnd * mss)
*rcv_wnd = init_cwnd * mss;
}
@@ -362,15 +374,45 @@ static inline int tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
#define OPTION_WSCALE (1 << 3)
+#define OPTION_COOKIE_EXTENSION (1 << 4)
struct tcp_out_options {
u8 options; /* bit field of OPTION_* */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
u16 mss; /* 0 to disable */
__u32 tsval, tsecr; /* need to include OPTION_TS */
+ __u8 *hash_location; /* temporary pointer, overloaded */
};
+/* The sysctl int routines are generic, so check consistency here.
+ */
+static u8 tcp_cookie_size_check(u8 desired)
+{
+ if (desired > 0) {
+ /* previously specified */
+ return desired;
+ }
+ if (sysctl_tcp_cookie_size <= 0) {
+ /* no default specified */
+ return 0;
+ }
+ if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
+ /* value too small, specify minimum */
+ return TCP_COOKIE_MIN;
+ }
+ if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
+ /* value too large, specify maximum */
+ return TCP_COOKIE_MAX;
+ }
+ if (0x1 & sysctl_tcp_cookie_size) {
+ /* 8-bit multiple, illegal, fix it */
+ return (u8)(sysctl_tcp_cookie_size + 0x1);
+ }
+ return (u8)sysctl_tcp_cookie_size;
+}
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -385,17 +427,34 @@ struct tcp_out_options {
* (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
- const struct tcp_out_options *opts,
- __u8 **md5_hash) {
- if (unlikely(OPTION_MD5 & opts->options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *)ptr;
+ struct tcp_out_options *opts)
+{
+ u8 options = opts->options; /* mungable copy */
+
+ /* Having both authentication and cookies for security is redundant,
+ * and there's certainly not enough room. Instead, the cookie-less
+ * extension variant is proposed.
+ *
+ * Consider the pessimal case with authentication. The options
+ * could look like:
+ * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_MD5 & options)) {
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ *ptr++ = htonl((TCPOPT_COOKIE << 24) |
+ (TCPOLEN_COOKIE_BASE << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ } else {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ }
+ options &= ~OPTION_COOKIE_EXTENSION;
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
ptr += 4;
- } else {
- *md5_hash = NULL;
}
if (unlikely(opts->mss)) {
@@ -404,12 +463,13 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
opts->mss);
}
- if (likely(OPTION_TS & opts->options)) {
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
+ options &= ~OPTION_SACK_ADVERTISE;
} else {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
@@ -420,15 +480,52 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
- !(OPTION_TS & opts->options))) {
+ /* Specification requires after timestamp, so do it now.
+ *
+ * Consider the pessimal case without authentication. The options
+ * could look like:
+ * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ __u8 *cookie_copy = opts->hash_location;
+ u8 cookie_size = opts->hash_size;
+
+ /* 8-bit multiple handled in tcp_cookie_size_check() above,
+ * and elsewhere.
+ */
+ if (0x2 & cookie_size) {
+ __u8 *p = (__u8 *)ptr;
+
+ /* 16-bit multiple */
+ *p++ = TCPOPT_COOKIE;
+ *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
+ *p++ = *cookie_copy++;
+ *p++ = *cookie_copy++;
+ ptr++;
+ cookie_size -= 2;
+ } else {
+ /* 32-bit multiple */
+ *ptr++ = htonl(((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_COOKIE << 8) |
+ TCPOLEN_COOKIE_BASE) +
+ cookie_size);
+ }
+
+ if (cookie_size > 0) {
+ memcpy(ptr, cookie_copy, cookie_size);
+ ptr += (cookie_size / 4);
+ }
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
}
- if (unlikely(OPTION_WSCALE & opts->options)) {
+ if (unlikely(OPTION_WSCALE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
@@ -463,13 +560,17 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5) {
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
+ tcp_cookie_size_check(cvp->cookie_desired) :
+ 0;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
}
#else
*md5 = NULL;
@@ -485,26 +586,72 @@ static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
* SACKs don't matter, we never delay an ACK when we have any of those
* going out. */
opts->mss = tcp_advertise_mss(sk);
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = tp->rx_opt.ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(sysctl_tcp_window_scaling)) {
opts->ws = tp->rx_opt.rcv_wscale;
opts->options |= OPTION_WSCALE;
- size += TCPOLEN_WSCALE_ALIGNED;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(sysctl_tcp_sack)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Note that timestamps are required by the specification.
+ *
+ * Odd numbers of bytes are prohibited by the specification, ensuring
+ * that the cookie is 16-bit aligned, and the resulting cookie pair is
+ * 32-bit aligned.
+ */
+ if (*md5 == NULL &&
+ (OPTION_TS & opts->options) &&
+ cookie_size > 0) {
+ int need = TCPOLEN_COOKIE_BASE + cookie_size;
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+
+ if (need > remaining) {
+ /* try shrinking cookie to fit */
+ cookie_size -= 2;
+ need -= 4;
+ }
+ }
+ while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
+ cookie_size -= 4;
+ need -= 4;
+ }
+ if (TCP_COOKIE_MIN <= cookie_size) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
+ opts->hash_size = cookie_size;
+
+ /* Remember for future incarnations. */
+ cvp->cookie_desired = cookie_size;
+
+ if (cvp->cookie_desired != cvp->cookie_pair_size) {
+ /* Currently use random bytes as a nonce,
+ * assuming these are completely unpredictable
+ * by hostile users of the same system.
+ */
+ get_random_bytes(&cvp->cookie_pair[0],
+ cookie_size);
+ cvp->cookie_pair_size = cookie_size;
+ }
+
+ remaining -= need;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
/* Set up TCP options for SYN-ACKs. */
@@ -512,48 +659,77 @@ static unsigned tcp_synack_options(struct sock *sk,
struct request_sock *req,
unsigned mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5) {
- unsigned size = 0;
+ struct tcp_md5sig_key **md5,
+ struct tcp_extend_values *xvp)
+{
struct inet_request_sock *ireq = inet_rsk(req);
- char doing_ts;
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
+ xvp->cookie_plus :
+ 0;
+ bool doing_ts = ireq->tstamp_ok;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ doing_ts &= !ireq->sack_ok;
}
#else
*md5 = NULL;
#endif
- /* we can't fit any SACK blocks in a packet with MD5 + TS
- options. There was discussion about disabling SACK rather than TS in
- order to fit in better with old, buggy kernels, but that was deemed
- to be unnecessary. */
- doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
-
+ /* We always send an MSS option. */
opts->mss = mss;
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(ireq->wscale_ok)) {
opts->ws = ireq->rcv_wscale;
opts->options |= OPTION_WSCALE;
- size += TCPOLEN_WSCALE_ALIGNED;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(doing_ts)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = req->ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!doing_ts))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Similar rationale to tcp_syn_options() applies here, too.
+ * If the <SYN> options fit, the same options should fit now!
+ */
+ if (*md5 == NULL &&
+ doing_ts &&
+ cookie_plus > TCPOLEN_COOKIE_BASE) {
+ int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+ }
+ if (need <= remaining) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+ remaining -= need;
+ } else {
+ /* There's no error return, so flag it. */
+ xvp->cookie_out_never = 1; /* true */
+ opts->hash_size = 0;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
/* Compute TCP options for ESTABLISHED sockets. This is not the
@@ -619,7 +795,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
struct tcp_out_options opts;
unsigned tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
struct tcphdr *th;
int err;
@@ -661,8 +836,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* Build TCP header and checksum it. */
th = tcp_hdr(skb);
- th->source = inet->sport;
- th->dest = inet->dport;
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
@@ -690,7 +865,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
}
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
@@ -698,7 +873,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
- tp->af_specific->calc_md5_hash(md5_hash_location,
+ tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
@@ -1627,11 +1802,6 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle)
{
- struct sk_buff *skb = tcp_send_head(sk);
-
- if (!skb)
- return;
-
/* If we are closed, the bytes will have to remain here.
* In time closedown will finish, we empty the write queue and
* all will be happy.
@@ -1918,8 +2088,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
- && TCP_SKB_CB(skb)->seq != tp->snd_una)
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+ TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
if (skb->len > cur_mss) {
@@ -2219,19 +2389,24 @@ int tcp_send_synack(struct sock *sk)
/* Prepare a SYN-ACK. */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
+ struct request_sock *req,
+ struct request_values *rvp)
{
+ struct tcp_out_options opts;
+ struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_cookie_values *cvp = tp->cookie_values;
struct tcphdr *th;
- int tcp_header_size;
- struct tcp_out_options opts;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
+ int tcp_header_size;
int mss;
+ int s_data_desired = 0;
- skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
+ if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
+ s_data_desired = cvp->s_data_desired;
+ skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15 + s_data_desired, 1, GFP_ATOMIC);
if (skb == NULL)
return NULL;
@@ -2254,7 +2429,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
&req->rcv_wnd,
&req->window_clamp,
ireq->wscale_ok,
- &rcv_wscale);
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale;
}
@@ -2266,8 +2442,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5) +
- sizeof(struct tcphdr);
+ skb, &opts, &md5, xvp)
+ + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
@@ -2284,19 +2460,54 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+
+ if (OPTION_COOKIE_EXTENSION & opts.options) {
+ if (s_data_desired) {
+ u8 *buf = skb_put(skb, s_data_desired);
+
+ /* copy data directly from the listening socket. */
+ memcpy(buf, cvp->s_data_payload, s_data_desired);
+ TCP_SKB_CB(skb)->end_seq += s_data_desired;
+ }
+
+ if (opts.hash_size > 0) {
+ __u32 workspace[SHA_WORKSPACE_WORDS];
+ u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
+ u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
+
+ /* Secret recipe depends on the Timestamp, (future)
+ * Sequence and Acknowledgment Numbers, Initiator
+ * Cookie, and others handled by IP variant caller.
+ */
+ *tail-- ^= opts.tsval;
+ *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
+ *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
+
+ /* recommended */
+ *tail-- ^= ((th->dest << 16) | th->source);
+ *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
+
+ sha_transform((__u32 *)&xvp->cookie_bakery[0],
+ (char *)mess,
+ &workspace[0]);
+ opts.hash_location =
+ (__u8 *)&xvp->cookie_bakery[0];
+ }
+ }
+
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
if (md5) {
- tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location,
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
md5, NULL, req, skb);
}
#endif
@@ -2342,7 +2553,8 @@ static void tcp_connect_init(struct sock *sk)
&tp->rcv_wnd,
&tp->window_clamp,
sysctl_tcp_window_scaling,
- &rcv_wscale);
+ &rcv_wscale,
+ dst_metric(dst, RTAX_INITRWND));
tp->rx_opt.rcv_wscale = rcv_wscale;
tp->rcv_ssthresh = tp->rcv_wnd;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 59f5b5e7c566..f8efada580e8 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -22,6 +22,7 @@
#include <linux/kprobes.h>
#include <linux/socket.h>
#include <linux/tcp.h>
+#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/module.h>
#include <linux/ktime.h>
@@ -39,9 +40,9 @@ static int port __read_mostly = 0;
MODULE_PARM_DESC(port, "Port to match (0=all)");
module_param(port, int, 0);
-static int bufsize __read_mostly = 4096;
+static unsigned int bufsize __read_mostly = 4096;
MODULE_PARM_DESC(bufsize, "Log buffer size in packets (4096)");
-module_param(bufsize, int, 0);
+module_param(bufsize, uint, 0);
static int full __read_mostly;
MODULE_PARM_DESC(full, "Full log (1=every ack packet received, 0=only cwnd changes)");
@@ -75,12 +76,12 @@ static struct {
static inline int tcp_probe_used(void)
{
- return (tcp_probe.head - tcp_probe.tail) % bufsize;
+ return (tcp_probe.head - tcp_probe.tail) & (bufsize - 1);
}
static inline int tcp_probe_avail(void)
{
- return bufsize - tcp_probe_used();
+ return bufsize - tcp_probe_used() - 1;
}
/*
@@ -94,8 +95,9 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
const struct inet_sock *inet = inet_sk(sk);
/* Only update if port matches */
- if ((port == 0 || ntohs(inet->dport) == port || ntohs(inet->sport) == port)
- && (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
+ if ((port == 0 || ntohs(inet->inet_dport) == port ||
+ ntohs(inet->inet_sport) == port) &&
+ (full || tp->snd_cwnd != tcp_probe.lastcwnd)) {
spin_lock(&tcp_probe.lock);
/* If log fills, just silently drop */
@@ -103,10 +105,10 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
struct tcp_log *p = tcp_probe.log + tcp_probe.head;
p->tstamp = ktime_get();
- p->saddr = inet->saddr;
- p->sport = inet->sport;
- p->daddr = inet->daddr;
- p->dport = inet->dport;
+ p->saddr = inet->inet_saddr;
+ p->sport = inet->inet_sport;
+ p->daddr = inet->inet_daddr;
+ p->dport = inet->inet_dport;
p->length = skb->len;
p->snd_nxt = tp->snd_nxt;
p->snd_una = tp->snd_una;
@@ -115,7 +117,7 @@ static int jtcp_rcv_established(struct sock *sk, struct sk_buff *skb,
p->ssthresh = tcp_current_ssthresh(sk);
p->srtt = tp->srtt >> 3;
- tcp_probe.head = (tcp_probe.head + 1) % bufsize;
+ tcp_probe.head = (tcp_probe.head + 1) & (bufsize - 1);
}
tcp_probe.lastcwnd = tp->snd_cwnd;
spin_unlock(&tcp_probe.lock);
@@ -148,7 +150,7 @@ static int tcpprobe_open(struct inode * inode, struct file * file)
static int tcpprobe_sprint(char *tbuf, int n)
{
const struct tcp_log *p
- = tcp_probe.log + tcp_probe.tail % bufsize;
+ = tcp_probe.log + tcp_probe.tail;
struct timespec tv
= ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
@@ -191,7 +193,7 @@ static ssize_t tcpprobe_read(struct file *file, char __user *buf,
width = tcpprobe_sprint(tbuf, sizeof(tbuf));
if (cnt + width < len)
- tcp_probe.tail = (tcp_probe.tail + 1) % bufsize;
+ tcp_probe.tail = (tcp_probe.tail + 1) & (bufsize - 1);
spin_unlock_bh(&tcp_probe.lock);
@@ -221,9 +223,10 @@ static __init int tcpprobe_init(void)
init_waitqueue_head(&tcp_probe.wait);
spin_lock_init(&tcp_probe.lock);
- if (bufsize < 0)
+ if (bufsize == 0)
return -EINVAL;
+ bufsize = roundup_pow_of_two(bufsize);
tcp_probe.log = kcalloc(bufsize, sizeof(struct tcp_log), GFP_KERNEL);
if (!tcp_probe.log)
goto err0;
@@ -235,7 +238,7 @@ static __init int tcpprobe_init(void)
if (ret)
goto err1;
- pr_info("TCP probe registered (port=%d)\n", port);
+ pr_info("TCP probe registered (port=%d) bufsize=%u\n", port, bufsize);
return 0;
err1:
proc_net_remove(&init_net, procname);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index cdb2ca7684d4..8a0ab2977f1f 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -19,6 +19,7 @@
*/
#include <linux/module.h>
+#include <linux/gfp.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
@@ -29,6 +30,7 @@ int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
int sysctl_tcp_orphan_retries __read_mostly;
+int sysctl_tcp_thin_linear_timeouts __read_mostly;
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
@@ -132,6 +134,35 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
}
}
+/* This function calculates a "timeout" which is equivalent to the timeout of a
+ * TCP connection after "boundary" unsuccessful, exponentially backed-off
+ * retransmissions with an initial RTO of TCP_RTO_MIN.
+ */
+static bool retransmits_timed_out(struct sock *sk,
+ unsigned int boundary)
+{
+ unsigned int timeout, linear_backoff_thresh;
+ unsigned int start_ts;
+
+ if (!inet_csk(sk)->icsk_retransmits)
+ return false;
+
+ if (unlikely(!tcp_sk(sk)->retrans_stamp))
+ start_ts = TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+ else
+ start_ts = tcp_sk(sk)->retrans_stamp;
+
+ linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
+
+ if (boundary <= linear_backoff_thresh)
+ timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
+ else
+ timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
+ (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+
+ return (tcp_time_stamp - start_ts) >= timeout;
+}
+
/* A write timeout has occurred. Process the after effects. */
static int tcp_write_timeout(struct sock *sk)
{
@@ -141,14 +172,14 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
- dst_negative_advice(&sk->sk_dst_cache);
+ dst_negative_advice(&sk->sk_dst_cache, sk);
retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
} else {
if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
- dst_negative_advice(&sk->sk_dst_cache);
+ dst_negative_advice(&sk->sk_dst_cache, sk);
}
retry_until = sysctl_tcp_retries2;
@@ -303,15 +334,15 @@ void tcp_retransmit_timer(struct sock *sk)
struct inet_sock *inet = inet_sk(sk);
if (sk->sk_family == AF_INET) {
LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI4:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &inet->daddr, ntohs(inet->dport),
- inet->num, tp->snd_una, tp->snd_nxt);
+ &inet->inet_daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt);
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
else if (sk->sk_family == AF_INET6) {
struct ipv6_pinfo *np = inet6_sk(sk);
LIMIT_NETDEBUG(KERN_DEBUG "TCP: Peer %pI6:%u/%u unexpectedly shrunk window %u:%u (repaired)\n",
- &np->daddr, ntohs(inet->dport),
- inet->num, tp->snd_una, tp->snd_nxt);
+ &np->daddr, ntohs(inet->inet_dport),
+ inet->inet_num, tp->snd_una, tp->snd_nxt);
}
#endif
#endif
@@ -386,7 +417,25 @@ void tcp_retransmit_timer(struct sock *sk)
icsk->icsk_retransmits++;
out_reset_timer:
- icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is
+ * used to reset timer, set to 0. Recalculate 'icsk_rto' as this
+ * might be increased if the stream oscillates between thin and thick,
+ * thus the old value might already be too high compared to the value
+ * set by 'tcp_set_rto' in tcp_input.c which resets the rto without
+ * backoff. Limit to TCP_THIN_LINEAR_RETRIES before initiating
+ * exponential backoff behaviour to avoid continue hammering
+ * linear-timeout retransmissions into a black hole
+ */
+ if (sk->sk_state == TCP_ESTABLISHED &&
+ (tp->thin_lto || sysctl_tcp_thin_linear_timeouts) &&
+ tcp_stream_is_thin(tp) &&
+ icsk->icsk_retransmits <= TCP_THIN_LINEAR_RETRIES) {
+ icsk->icsk_backoff = 0;
+ icsk->icsk_rto = min(__tcp_set_rto(tp), TCP_RTO_MAX);
+ } else {
+ /* Use normal (exponential) backoff */
+ icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+ }
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
__sk_dst_reset(sk);
@@ -445,6 +494,12 @@ static void tcp_synack_timer(struct sock *sk)
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
+void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req)
+{
+ NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPTIMEOUTS);
+}
+EXPORT_SYMBOL(tcp_syn_ack_timeout);
+
void tcp_set_keepalive(struct sock *sk, int val)
{
if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index e9bbff746488..b612acf76183 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -165,9 +165,8 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
* every other rtt.
*/
if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
- if (veno->inc
- && tp->snd_cwnd <
- tp->snd_cwnd_clamp) {
+ if (veno->inc &&
+ tp->snd_cwnd < tp->snd_cwnd_clamp) {
tp->snd_cwnd++;
veno->inc = 0;
} else
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 66b6821b984e..a0f240358892 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -157,8 +157,8 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
if (queue > TCP_YEAH_ALPHA ||
rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) {
- if (queue > TCP_YEAH_ALPHA
- && tp->snd_cwnd > yeah->reno_count) {
+ if (queue > TCP_YEAH_ALPHA &&
+ tp->snd_cwnd > yeah->reno_count) {
u32 reduction = min(queue / TCP_YEAH_GAMMA ,
tp->snd_cwnd >> TCP_YEAH_EPSILON);
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 3959e0ca456a..3b3813cc80b9 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
+#include <linux/slab.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/protocol.h>
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0fa9f70e4b19..954bbfb39dff 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -95,6 +95,7 @@
#include <linux/mm.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
+#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
@@ -106,7 +107,7 @@
#include <net/xfrm.h>
#include "udp_impl.h"
-struct udp_table udp_table;
+struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
int sysctl_udp_mem[3] __read_mostly;
@@ -121,28 +122,30 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min);
atomic_t udp_memory_allocated;
EXPORT_SYMBOL(udp_memory_allocated);
-#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE)
+#define MAX_UDP_PORTS 65536
+#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN)
static int udp_lib_lport_inuse(struct net *net, __u16 num,
const struct udp_hslot *hslot,
unsigned long *bitmap,
struct sock *sk,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ const struct sock *sk2),
+ unsigned int log)
{
struct sock *sk2;
struct hlist_nulls_node *node;
sk_nulls_for_each(sk2, node, &hslot->head)
- if (net_eq(sock_net(sk2), net) &&
- sk2 != sk &&
- (bitmap || sk2->sk_hash == num) &&
- (!sk2->sk_reuse || !sk->sk_reuse) &&
- (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if
- || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(*saddr_comp)(sk, sk2)) {
if (bitmap)
- __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE,
+ __set_bit(udp_sk(sk2)->udp_port_hash >> log,
bitmap);
else
return 1;
@@ -150,18 +153,51 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
return 0;
}
+/*
+ * Note: we still hold spinlock of primary hash chain, so no other writer
+ * can insert/delete a socket with local_port == num
+ */
+static int udp_lib_lport_inuse2(struct net *net, __u16 num,
+ struct udp_hslot *hslot2,
+ struct sock *sk,
+ int (*saddr_comp)(const struct sock *sk1,
+ const struct sock *sk2))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ int res = 0;
+
+ spin_lock(&hslot2->lock);
+ udp_portaddr_for_each_entry(sk2, node, &hslot2->head)
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ (udp_sk(sk2)->udp_port_hash == num) &&
+ (!sk2->sk_reuse || !sk->sk_reuse) &&
+ (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ (*saddr_comp)(sk, sk2)) {
+ res = 1;
+ break;
+ }
+ spin_unlock(&hslot2->lock);
+ return res;
+}
+
/**
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
*
* @sk: socket struct in question
* @snum: port number to look up
* @saddr_comp: AF-dependent comparison of bound local IP addresses
+ * @hash2_nulladdr: AF-dependant hash value in secondary hash chains,
+ * with NULL address
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ const struct sock *sk2),
+ unsigned int hash2_nulladdr)
{
- struct udp_hslot *hslot;
+ struct udp_hslot *hslot, *hslot2;
struct udp_table *udptable = sk->sk_prot->h.udp_table;
int error = 1;
struct net *net = sock_net(sk);
@@ -180,13 +216,14 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
/*
* force rand to be an odd multiple of UDP_HTABLE_SIZE
*/
- rand = (rand | 1) * UDP_HTABLE_SIZE;
- for (last = first + UDP_HTABLE_SIZE; first != last; first++) {
- hslot = &udptable->hash[udp_hashfn(net, first)];
+ rand = (rand | 1) * (udptable->mask + 1);
+ last = first + udptable->mask + 1;
+ do {
+ hslot = udp_hashslot(udptable, net, first);
bitmap_zero(bitmap, PORTS_PER_CHAIN);
spin_lock_bh(&hslot->lock);
udp_lib_lport_inuse(net, snum, hslot, bitmap, sk,
- saddr_comp);
+ saddr_comp, udptable->log);
snum = first;
/*
@@ -196,25 +233,59 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum,
*/
do {
if (low <= snum && snum <= high &&
- !test_bit(snum / UDP_HTABLE_SIZE, bitmap))
+ !test_bit(snum >> udptable->log, bitmap))
goto found;
snum += rand;
} while (snum != first);
spin_unlock_bh(&hslot->lock);
- }
+ } while (++first != last);
goto fail;
} else {
- hslot = &udptable->hash[udp_hashfn(net, snum)];
+ hslot = udp_hashslot(udptable, net, snum);
spin_lock_bh(&hslot->lock);
- if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp))
+ if (hslot->count > 10) {
+ int exist;
+ unsigned int slot2 = udp_sk(sk)->udp_portaddr_hash ^ snum;
+
+ slot2 &= udptable->mask;
+ hash2_nulladdr &= udptable->mask;
+
+ hslot2 = udp_hashslot2(udptable, slot2);
+ if (hslot->count < hslot2->count)
+ goto scan_primary_hash;
+
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ if (!exist && (hash2_nulladdr != slot2)) {
+ hslot2 = udp_hashslot2(udptable, hash2_nulladdr);
+ exist = udp_lib_lport_inuse2(net, snum, hslot2,
+ sk, saddr_comp);
+ }
+ if (exist)
+ goto fail_unlock;
+ else
+ goto found;
+ }
+scan_primary_hash:
+ if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk,
+ saddr_comp, 0))
goto fail_unlock;
}
found:
- inet_sk(sk)->num = snum;
- sk->sk_hash = snum;
+ inet_sk(sk)->inet_num = snum;
+ udp_sk(sk)->udp_port_hash = snum;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
sk_nulls_add_node_rcu(sk, &hslot->head);
+ hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+ spin_lock(&hslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &hslot2->head);
+ hslot2->count++;
+ spin_unlock(&hslot2->lock);
}
error = 0;
fail_unlock:
@@ -229,13 +300,26 @@ static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
return (!ipv6_only_sock(sk2) &&
- (!inet1->rcv_saddr || !inet2->rcv_saddr ||
- inet1->rcv_saddr == inet2->rcv_saddr));
+ (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
+ inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+}
+
+static unsigned int udp4_portaddr_hash(struct net *net, __be32 saddr,
+ unsigned int port)
+{
+ return jhash_1word(saddr, net_hash_mix(net)) ^ port;
}
int udp_v4_get_port(struct sock *sk, unsigned short snum)
{
- return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal);
+ unsigned int hash2_nulladdr =
+ udp4_portaddr_hash(sock_net(sk), INADDR_ANY, snum);
+ unsigned int hash2_partial =
+ udp4_portaddr_hash(sock_net(sk), inet_sk(sk)->inet_rcv_saddr, 0);
+
+ /* precompute partial secondary hash */
+ udp_sk(sk)->udp_portaddr_hash = hash2_partial;
+ return udp_lib_get_port(sk, snum, ipv4_rcv_saddr_equal, hash2_nulladdr);
}
static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
@@ -244,23 +328,61 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
{
int score = -1;
- if (net_eq(sock_net(sk), net) && sk->sk_hash == hnum &&
+ if (net_eq(sock_net(sk), net) && udp_sk(sk)->udp_port_hash == hnum &&
!ipv6_only_sock(sk)) {
struct inet_sock *inet = inet_sk(sk);
score = (sk->sk_family == PF_INET ? 1 : 0);
- if (inet->rcv_saddr) {
- if (inet->rcv_saddr != daddr)
+ if (inet->inet_rcv_saddr) {
+ if (inet->inet_rcv_saddr != daddr)
+ return -1;
+ score += 2;
+ }
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
+ return -1;
+ score += 2;
+ }
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
+ return -1;
+ score += 2;
+ }
+ if (sk->sk_bound_dev_if) {
+ if (sk->sk_bound_dev_if != dif)
return -1;
score += 2;
}
- if (inet->daddr) {
- if (inet->daddr != saddr)
+ }
+ return score;
+}
+
+/*
+ * In this second variant, we check (daddr, dport) matches (inet_rcv_sadd, inet_num)
+ */
+#define SCORE2_MAX (1 + 2 + 2 + 2)
+static inline int compute_score2(struct sock *sk, struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif)
+{
+ int score = -1;
+
+ if (net_eq(sock_net(sk), net) && !ipv6_only_sock(sk)) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (inet->inet_rcv_saddr != daddr)
+ return -1;
+ if (inet->inet_num != hnum)
+ return -1;
+
+ score = (sk->sk_family == PF_INET ? 1 : 0);
+ if (inet->inet_daddr) {
+ if (inet->inet_daddr != saddr)
return -1;
score += 2;
}
- if (inet->dport) {
- if (inet->dport != sport)
+ if (inet->inet_dport) {
+ if (inet->inet_dport != sport)
return -1;
score += 2;
}
@@ -273,6 +395,51 @@ static inline int compute_score(struct sock *sk, struct net *net, __be32 saddr,
return score;
}
+
+/* called with read_rcu_lock() */
+static struct sock *udp4_lib_lookup2(struct net *net,
+ __be32 saddr, __be16 sport,
+ __be32 daddr, unsigned int hnum, int dif,
+ struct udp_hslot *hslot2, unsigned int slot2)
+{
+ struct sock *sk, *result;
+ struct hlist_nulls_node *node;
+ int score, badness;
+
+begin:
+ result = NULL;
+ badness = -1;
+ udp_portaddr_for_each_entry_rcu(sk, node, &hslot2->head) {
+ score = compute_score2(sk, net, saddr, sport,
+ daddr, hnum, dif);
+ if (score > badness) {
+ result = sk;
+ badness = score;
+ if (score == SCORE2_MAX)
+ goto exact_match;
+ }
+ }
+ /*
+ * if the nulls value we got at the end of this lookup is
+ * not the expected one, we must restart lookup.
+ * We probably met an item that was moved to another chain.
+ */
+ if (get_nulls_value(node) != slot2)
+ goto begin;
+
+ if (result) {
+exact_match:
+ if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
+ result = NULL;
+ else if (unlikely(compute_score2(result, net, saddr, sport,
+ daddr, hnum, dif) < badness)) {
+ sock_put(result);
+ goto begin;
+ }
+ }
+ return result;
+}
+
/* UDP is nearly always wildcards out the wazoo, it makes no sense to try
* harder than this. -DaveM
*/
@@ -283,11 +450,35 @@ static struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
struct sock *sk, *result;
struct hlist_nulls_node *node;
unsigned short hnum = ntohs(dport);
- unsigned int hash = udp_hashfn(net, hnum);
- struct udp_hslot *hslot = &udptable->hash[hash];
+ unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
+ struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness;
rcu_read_lock();
+ if (hslot->count > 10) {
+ hash2 = udp4_portaddr_hash(net, daddr, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, saddr, sport,
+ daddr, hnum, dif,
+ hslot2, slot2);
+ if (!result) {
+ hash2 = udp4_portaddr_hash(net, INADDR_ANY, hnum);
+ slot2 = hash2 & udptable->mask;
+ hslot2 = &udptable->hash2[slot2];
+ if (hslot->count < hslot2->count)
+ goto begin;
+
+ result = udp4_lib_lookup2(net, INADDR_ANY, sport,
+ daddr, hnum, dif,
+ hslot2, slot2);
+ }
+ rcu_read_unlock();
+ return result;
+ }
begin:
result = NULL;
badness = -1;
@@ -304,7 +495,7 @@ begin:
* not the expected one, we must restart lookup.
* We probably met an item that was moved to another chain.
*/
- if (get_nulls_value(node) != hash)
+ if (get_nulls_value(node) != slot)
goto begin;
if (result) {
@@ -354,12 +545,13 @@ static inline struct sock *udp_v4_mcast_next(struct net *net, struct sock *sk,
sk_nulls_for_each_from(s, node) {
struct inet_sock *inet = inet_sk(s);
- if (!net_eq(sock_net(s), net) ||
- s->sk_hash != hnum ||
- (inet->daddr && inet->daddr != rmt_addr) ||
- (inet->dport != rmt_port && inet->dport) ||
- (inet->rcv_saddr && inet->rcv_saddr != loc_addr) ||
- ipv6_only_sock(s) ||
+ if (!net_eq(sock_net(s), net) ||
+ udp_sk(s)->udp_port_hash != hnum ||
+ (inet->inet_daddr && inet->inet_daddr != rmt_addr) ||
+ (inet->inet_dport != rmt_port && inet->inet_dport) ||
+ (inet->inet_rcv_saddr &&
+ inet->inet_rcv_saddr != loc_addr) ||
+ ipv6_only_sock(s) ||
(s->sk_bound_dev_if && s->sk_bound_dev_if != dif))
continue;
if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif))
@@ -642,14 +834,14 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
} else {
if (sk->sk_state != TCP_ESTABLISHED)
return -EDESTADDRREQ;
- daddr = inet->daddr;
- dport = inet->dport;
+ daddr = inet->inet_daddr;
+ dport = inet->inet_dport;
/* Open fast path for connected socket.
Route will not be used, if at least one option is set.
*/
connected = 1;
}
- ipc.addr = inet->saddr;
+ ipc.addr = inet->inet_saddr;
ipc.oif = sk->sk_bound_dev_if;
err = sock_tx_timestamp(msg, sk, &ipc.shtx);
@@ -704,7 +896,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
.proto = sk->sk_protocol,
.flags = inet_sk_flowi_flags(sk),
.uli_u = { .ports =
- { .sport = inet->sport,
+ { .sport = inet->inet_sport,
.dport = dport } } };
struct net *net = sock_net(sk);
@@ -748,7 +940,7 @@ back_from_confirm:
inet->cork.fl.fl4_dst = daddr;
inet->cork.fl.fl_ip_dport = dport;
inet->cork.fl.fl4_src = saddr;
- inet->cork.fl.fl_ip_sport = inet->sport;
+ inet->cork.fl.fl_ip_sport = inet->inet_sport;
up->pending = AF_INET;
do_append_data:
@@ -862,6 +1054,7 @@ static unsigned int first_packet_length(struct sock *sk)
udp_lib_checksum_complete(skb)) {
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
+ atomic_inc(&sk->sk_drops);
__skb_unlink(skb, rcvq);
__skb_queue_tail(&list_kill, skb);
}
@@ -925,7 +1118,7 @@ int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct inet_sock *inet = inet_sk(sk);
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
struct sk_buff *skb;
- unsigned int ulen, copied;
+ unsigned int ulen;
int peeked;
int err;
int is_udplite = IS_UDPLITE(sk);
@@ -946,10 +1139,9 @@ try_again:
goto out;
ulen = skb->len - sizeof(struct udphdr);
- copied = len;
- if (copied > ulen)
- copied = ulen;
- else if (copied < ulen)
+ if (len > ulen)
+ len = ulen;
+ else if (len < ulen)
msg->msg_flags |= MSG_TRUNC;
/*
@@ -958,14 +1150,14 @@ try_again:
* coverage checksum (UDP-Lite), do it before the copy.
*/
- if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
+ if (len < ulen || UDP_SKB_CB(skb)->partial_cov) {
if (udp_lib_checksum_complete(skb))
goto csum_copy_err;
}
if (skb_csum_unnecessary(skb))
err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
- msg->msg_iov, copied);
+ msg->msg_iov, len);
else {
err = skb_copy_and_csum_datagram_iovec(skb,
sizeof(struct udphdr),
@@ -982,7 +1174,7 @@ try_again:
UDP_INC_STATS_USER(sock_net(sk),
UDP_MIB_INDATAGRAMS, is_udplite);
- sock_recv_timestamp(msg, sk, skb);
+ sock_recv_ts_and_drops(msg, sk, skb);
/* Copy the address. */
if (sin) {
@@ -994,7 +1186,7 @@ try_again:
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
- err = copied;
+ err = len;
if (flags & MSG_TRUNC)
err = ulen;
@@ -1023,15 +1215,15 @@ int udp_disconnect(struct sock *sk, int flags)
*/
sk->sk_state = TCP_CLOSE;
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
sk->sk_bound_dev_if = 0;
if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
inet_reset_saddr(sk);
if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) {
sk->sk_prot->unhash(sk);
- inet->sport = 0;
+ inet->inet_sport = 0;
}
sk_dst_reset(sk);
return 0;
@@ -1042,13 +1234,22 @@ void udp_lib_unhash(struct sock *sk)
{
if (sk_hashed(sk)) {
struct udp_table *udptable = sk->sk_prot->h.udp_table;
- unsigned int hash = udp_hashfn(sock_net(sk), sk->sk_hash);
- struct udp_hslot *hslot = &udptable->hash[hash];
+ struct udp_hslot *hslot, *hslot2;
+
+ hslot = udp_hashslot(udptable, sock_net(sk),
+ udp_sk(sk)->udp_port_hash);
+ hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock_bh(&hslot->lock);
if (sk_nulls_del_node_init_rcu(sk)) {
- inet_sk(sk)->num = 0;
+ hslot->count--;
+ inet_sk(sk)->inet_num = 0;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
}
spin_unlock_bh(&hslot->lock);
}
@@ -1057,25 +1258,22 @@ EXPORT_SYMBOL(udp_lib_unhash);
static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
- int is_udplite = IS_UDPLITE(sk);
- int rc;
+ int rc = sock_queue_rcv_skb(sk, skb);
+
+ if (rc < 0) {
+ int is_udplite = IS_UDPLITE(sk);
- if ((rc = sock_queue_rcv_skb(sk, skb)) < 0) {
/* Note that an ENOMEM error is charged twice */
- if (rc == -ENOMEM) {
+ if (rc == -ENOMEM)
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
- atomic_inc(&sk->sk_drops);
- }
- goto drop;
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ kfree_skb(skb);
+ return -1;
}
return 0;
-drop:
- UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- kfree_skb(skb);
- return -1;
}
/* returns:
@@ -1174,61 +1372,98 @@ int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
bh_lock_sock(sk);
if (!sock_owned_by_user(sk))
rc = __udp_queue_rcv_skb(sk, skb);
- else
- sk_add_backlog(sk, skb);
+ else if (sk_add_backlog(sk, skb)) {
+ bh_unlock_sock(sk);
+ goto drop;
+ }
bh_unlock_sock(sk);
return rc;
drop:
UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
+ atomic_inc(&sk->sk_drops);
kfree_skb(skb);
return -1;
}
+
+static void flush_stack(struct sock **stack, unsigned int count,
+ struct sk_buff *skb, unsigned int final)
+{
+ unsigned int i;
+ struct sk_buff *skb1 = NULL;
+ struct sock *sk;
+
+ for (i = 0; i < count; i++) {
+ sk = stack[i];
+ if (likely(skb1 == NULL))
+ skb1 = (i == final) ? skb : skb_clone(skb, GFP_ATOMIC);
+
+ if (!skb1) {
+ atomic_inc(&sk->sk_drops);
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_RCVBUFERRORS,
+ IS_UDPLITE(sk));
+ UDP_INC_STATS_BH(sock_net(sk), UDP_MIB_INERRORS,
+ IS_UDPLITE(sk));
+ }
+
+ if (skb1 && udp_queue_rcv_skb(sk, skb1) <= 0)
+ skb1 = NULL;
+ }
+ if (unlikely(skb1))
+ kfree_skb(skb1);
+}
+
/*
* Multicasts and broadcasts go to each listener.
*
- * Note: called only from the BH handler context,
- * so we don't need to lock the hashes.
+ * Note: called only from the BH handler context.
*/
static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
struct udphdr *uh,
__be32 saddr, __be32 daddr,
struct udp_table *udptable)
{
- struct sock *sk;
- struct udp_hslot *hslot = &udptable->hash[udp_hashfn(net, ntohs(uh->dest))];
+ struct sock *sk, *stack[256 / sizeof(struct sock *)];
+ struct udp_hslot *hslot = udp_hashslot(udptable, net, ntohs(uh->dest));
int dif;
+ unsigned int i, count = 0;
spin_lock(&hslot->lock);
sk = sk_nulls_head(&hslot->head);
dif = skb->dev->ifindex;
sk = udp_v4_mcast_next(net, sk, uh->dest, daddr, uh->source, saddr, dif);
- if (sk) {
- struct sock *sknext = NULL;
+ while (sk) {
+ stack[count++] = sk;
+ sk = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
+ daddr, uh->source, saddr, dif);
+ if (unlikely(count == ARRAY_SIZE(stack))) {
+ if (!sk)
+ break;
+ flush_stack(stack, count, skb, ~0);
+ count = 0;
+ }
+ }
+ /*
+ * before releasing chain lock, we must take a reference on sockets
+ */
+ for (i = 0; i < count; i++)
+ sock_hold(stack[i]);
- do {
- struct sk_buff *skb1 = skb;
-
- sknext = udp_v4_mcast_next(net, sk_nulls_next(sk), uh->dest,
- daddr, uh->source, saddr,
- dif);
- if (sknext)
- skb1 = skb_clone(skb, GFP_ATOMIC);
-
- if (skb1) {
- int ret = udp_queue_rcv_skb(sk, skb1);
- if (ret > 0)
- /* we should probably re-process instead
- * of dropping packets here. */
- kfree_skb(skb1);
- }
- sk = sknext;
- } while (sknext);
- } else
- consume_skb(skb);
spin_unlock(&hslot->lock);
+
+ /*
+ * do the slow work with no lock held
+ */
+ if (count) {
+ flush_stack(stack, count, skb, count - 1);
+
+ for (i = 0; i < count; i++)
+ sock_put(stack[i]);
+ } else {
+ kfree_skb(skb);
+ }
return 0;
}
@@ -1620,9 +1855,14 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
struct udp_iter_state *state = seq->private;
struct net *net = seq_file_net(seq);
- for (state->bucket = start; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) {
+ for (state->bucket = start; state->bucket <= state->udp_table->mask;
+ ++state->bucket) {
struct hlist_nulls_node *node;
struct udp_hslot *hslot = &state->udp_table->hash[state->bucket];
+
+ if (hlist_nulls_empty(&hslot->head))
+ continue;
+
spin_lock_bh(&hslot->lock);
sk_nulls_for_each(sk, node, &hslot->head) {
if (!net_eq(sock_net(sk), net))
@@ -1647,7 +1887,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
if (!sk) {
- if (state->bucket < UDP_HTABLE_SIZE)
+ if (state->bucket <= state->udp_table->mask)
spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
return udp_get_first(seq, state->bucket + 1);
}
@@ -1667,7 +1907,7 @@ static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos)
static void *udp_seq_start(struct seq_file *seq, loff_t *pos)
{
struct udp_iter_state *state = seq->private;
- state->bucket = UDP_HTABLE_SIZE;
+ state->bucket = MAX_UDP_PORTS;
return *pos ? udp_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
@@ -1689,7 +1929,7 @@ static void udp_seq_stop(struct seq_file *seq, void *v)
{
struct udp_iter_state *state = seq->private;
- if (state->bucket < UDP_HTABLE_SIZE)
+ if (state->bucket <= state->udp_table->mask)
spin_unlock_bh(&state->udp_table->hash[state->bucket].lock);
}
@@ -1744,12 +1984,12 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
int bucket, int *len)
{
struct inet_sock *inet = inet_sk(sp);
- __be32 dest = inet->daddr;
- __be32 src = inet->rcv_saddr;
- __u16 destp = ntohs(inet->dport);
- __u16 srcp = ntohs(inet->sport);
+ __be32 dest = inet->inet_daddr;
+ __be32 src = inet->inet_rcv_saddr;
+ __u16 destp = ntohs(inet->inet_dport);
+ __u16 srcp = ntohs(inet->inet_sport);
- seq_printf(f, "%4d: %08X:%04X %08X:%04X"
+ seq_printf(f, "%5d: %08X:%04X %08X:%04X"
" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %d%n",
bucket, src, srcp, dest, destp, sp->sk_state,
sk_wmem_alloc_get(sp),
@@ -1789,12 +2029,12 @@ static struct udp_seq_afinfo udp4_seq_afinfo = {
},
};
-static int udp4_proc_init_net(struct net *net)
+static int __net_init udp4_proc_init_net(struct net *net)
{
return udp_proc_register(net, &udp4_seq_afinfo);
}
-static void udp4_proc_exit_net(struct net *net)
+static void __net_exit udp4_proc_exit_net(struct net *net)
{
udp_proc_unregister(net, &udp4_seq_afinfo);
}
@@ -1815,21 +2055,60 @@ void udp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-void __init udp_table_init(struct udp_table *table)
+static __initdata unsigned long uhash_entries;
+static int __init set_uhash_entries(char *str)
{
- int i;
+ if (!str)
+ return 0;
+ uhash_entries = simple_strtoul(str, &str, 0);
+ if (uhash_entries && uhash_entries < UDP_HTABLE_SIZE_MIN)
+ uhash_entries = UDP_HTABLE_SIZE_MIN;
+ return 1;
+}
+__setup("uhash_entries=", set_uhash_entries);
- for (i = 0; i < UDP_HTABLE_SIZE; i++) {
+void __init udp_table_init(struct udp_table *table, const char *name)
+{
+ unsigned int i;
+
+ if (!CONFIG_BASE_SMALL)
+ table->hash = alloc_large_system_hash(name,
+ 2 * sizeof(struct udp_hslot),
+ uhash_entries,
+ 21, /* one slot per 2 MB */
+ 0,
+ &table->log,
+ &table->mask,
+ 64 * 1024);
+ /*
+ * Make sure hash table has the minimum size
+ */
+ if (CONFIG_BASE_SMALL || table->mask < UDP_HTABLE_SIZE_MIN - 1) {
+ table->hash = kmalloc(UDP_HTABLE_SIZE_MIN *
+ 2 * sizeof(struct udp_hslot), GFP_KERNEL);
+ if (!table->hash)
+ panic(name);
+ table->log = ilog2(UDP_HTABLE_SIZE_MIN);
+ table->mask = UDP_HTABLE_SIZE_MIN - 1;
+ }
+ table->hash2 = table->hash + (table->mask + 1);
+ for (i = 0; i <= table->mask; i++) {
INIT_HLIST_NULLS_HEAD(&table->hash[i].head, i);
+ table->hash[i].count = 0;
spin_lock_init(&table->hash[i].lock);
}
+ for (i = 0; i <= table->mask; i++) {
+ INIT_HLIST_NULLS_HEAD(&table->hash2[i].head, i);
+ table->hash2[i].count = 0;
+ spin_lock_init(&table->hash2[i].lock);
+ }
}
void __init udp_init(void)
{
unsigned long nr_pages, limit;
- udp_table_init(&udp_table);
+ udp_table_init(&udp_table, "UDP");
/* Set the pressure threshold up by the same strategy of TCP. It is a
* fraction of global memory that is up to 1/2 at 256 MB, decreasing
* toward zero with the amount of memory, with a floor of 128 pages.
diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c
index 95248d7f75ec..6610bf76369f 100644
--- a/net/ipv4/udplite.c
+++ b/net/ipv4/udplite.c
@@ -12,7 +12,7 @@
*/
#include "udp_impl.h"
-struct udp_table udplite_table;
+struct udp_table udplite_table __read_mostly;
EXPORT_SYMBOL(udplite_table);
static int udplite_rcv(struct sk_buff *skb)
@@ -64,7 +64,6 @@ static struct inet_protosw udplite4_protosw = {
.protocol = IPPROTO_UDPLITE,
.prot = &udplite_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
.no_check = 0, /* must checksum (RFC 3828) */
.flags = INET_PROTOSW_PERMANENT,
};
@@ -82,12 +81,12 @@ static struct udp_seq_afinfo udplite4_seq_afinfo = {
},
};
-static int udplite4_proc_init_net(struct net *net)
+static int __net_init udplite4_proc_init_net(struct net *net)
{
return udp_proc_register(net, &udplite4_seq_afinfo);
}
-static void udplite4_proc_exit_net(struct net *net)
+static void __net_exit udplite4_proc_exit_net(struct net *net)
{
udp_proc_unregister(net, &udplite4_seq_afinfo);
}
@@ -110,7 +109,7 @@ static inline int udplite4_proc_init(void)
void __init udplite4_register(void)
{
- udp_table_init(&udplite_table);
+ udp_table_init(&udplite_table, "UDP-Lite");
if (proto_register(&udplite_prot, 1))
goto out_register_err;
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index f9f922a0ba88..c791bb63203f 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -9,6 +9,7 @@
*
*/
+#include <linux/slab.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/netfilter.h>
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 3444f3b34eca..6f368413eb0e 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -4,6 +4,7 @@
* Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au>
*/
+#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 74fb2eb833ec..e4a1483fba77 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -15,7 +15,6 @@
#include <net/xfrm.h>
#include <net/ip.h>
-static struct dst_ops xfrm4_dst_ops;
static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos,
@@ -92,11 +91,12 @@ static int xfrm4_init_path(struct xfrm_dst *path, struct dst_entry *dst,
return 0;
}
-static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev)
+static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
+ struct flowi *fl)
{
struct rtable *rt = (struct rtable *)xdst->route;
- xdst->u.rt.fl = rt->fl;
+ xdst->u.rt.fl = *fl;
xdst->u.dst.dev = dev;
dev_hold(dev);
@@ -190,8 +190,10 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl, int reverse)
static inline int xfrm4_garbage_collect(struct dst_ops *ops)
{
- xfrm4_policy_afinfo.garbage_collect(&init_net);
- return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2);
+ struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
+
+ xfrm4_policy_afinfo.garbage_collect(net);
+ return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
}
static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -267,9 +269,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
#ifdef CONFIG_SYSCTL
static struct ctl_table xfrm4_policy_table[] = {
{
- .ctl_name = CTL_UNNUMBERED,
.procname = "xfrm4_gc_thresh",
- .data = &xfrm4_dst_ops.gc_thresh,
+ .data = &init_net.xfrm.xfrm4_dst_ops.gc_thresh,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
@@ -296,8 +297,6 @@ static void __exit xfrm4_policy_fini(void)
void __init xfrm4_init(int rt_max_size)
{
- xfrm4_state_init();
- xfrm4_policy_init();
/*
* Select a default value for the gc_thresh based on the main route
* table hash size. It seems to me the worst case scenario is when
@@ -309,6 +308,9 @@ void __init xfrm4_init(int rt_max_size)
* and start cleaning when were 1/2 full
*/
xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+
+ xfrm4_state_init();
+ xfrm4_policy_init();
#ifdef CONFIG_SYSCTL
sysctl_hdr = register_net_sysctl_table(&init_net, net_ipv4_ctl_path,
xfrm4_policy_table);