From 6b58e0a5f32dedb609438bb9c9c82aa6e23381f2 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:23 +0800 Subject: ipv4: Use binary search to choose tcp PMTU probe_size Current probe_size is chosen by doubling mss_cache, the probing process will end shortly with a sub-optimal mss size, and the link mtu will not be taken full advantage of, in return, this will make user to tweak tcp_base_mss with care. Use binary search to choose probe_size in a fine granularity manner, an optimal mss will be found to boost performance as its maxmium. In addition, introduce a sysctl_tcp_probe_threshold to control when probing will stop in respect to the width of search range. Test env: Docker instance with vxlan encapuslation(82599EB) iperf -c 10.0.0.24 -t 60 before this patch: 1.26 Gbits/sec After this patch: increase 26% 1.59 Gbits/sec Signed-off-by: Fan Du Acked-by: John Heffner Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a2dfed4783b..35790d977a2b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2460,6 +2460,7 @@ static int __net_init tcp_sk_init(struct net *net) } net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; + net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; return 0; fail: -- cgit v1.2.3 From 05cbc0db03e82128f2e7e353d4194dd24a1627fe Mon Sep 17 00:00:00 2001 From: Fan Du Date: Fri, 6 Mar 2015 11:18:24 +0800 Subject: ipv4: Create probe timer for tcp PMTU as per RFC4821 As per RFC4821 7.3. Selecting Probe Size, a probe timer should be armed once probing has converged. Once this timer expired, probing again to take advantage of any path PMTU change. The recommended probing interval is 10 minutes per RFC1981. Probing interval could be sysctled by sysctl_tcp_probe_interval. Eric Dumazet suggested to implement pseudo timer based on 32bits jiffies tcp_time_stamp instead of using classic timer for such rare event. Signed-off-by: Fan Du Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 35790d977a2b..f0c6fc32bfa8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2461,6 +2461,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD; + net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL; return 0; fail: -- cgit v1.2.3 From d4f06873b636519cedbe8d2eeae77c713c6a121c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Mar 2015 16:44:09 -0700 Subject: inet: get_openreq4() & get_openreq6() do not need listener ireq->ir_num contains local port, use it. Also, get_openreq4() dumping listen_sk->refcnt makes litle sense. inet_diag_fill_req() can also use ireq->ir_num Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f0c6fc32bfa8..70b0f701bbdb 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2204,7 +2204,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) } EXPORT_SYMBOL(tcp_proc_unregister); -static void get_openreq4(const struct sock *sk, const struct request_sock *req, +static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); @@ -2214,7 +2214,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", i, ireq->ir_loc_addr, - ntohs(inet_sk(sk)->inet_sport), + ireq->ir_num, ireq->ir_rmt_addr, ntohs(ireq->ir_rmt_port), TCP_SYN_RECV, @@ -2225,7 +2225,7 @@ static void get_openreq4(const struct sock *sk, const struct request_sock *req, from_kuid_munged(seq_user_ns(f), uid), 0, /* non standard timer */ 0, /* open_requests have no inode */ - atomic_read(&sk->sk_refcnt), + 0, req); } @@ -2332,7 +2332,7 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) get_tcp4_sock(v, seq, st->num); break; case TCP_SEQ_STATE_OPENREQ: - get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid); + get_openreq4(v, seq, st->num, st->uid); break; } out: -- cgit v1.2.3 From 3f66b083a5b7f1a63540c24df3679c24f2e935a9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 12 Mar 2015 16:44:10 -0700 Subject: inet: introduce ireq_family Before inserting request socks into general hash table, fill their socket family. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 70b0f701bbdb..1f514a0c5e60 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1228,6 +1228,7 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->no_srccheck = inet_sk(sk)->transparent; ireq->opt = tcp_v4_save_options(skb); + ireq->ireq_family = AF_INET; } static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, -- cgit v1.2.3 From f7e4eb03f9d9e2522bdd5107f37f9cf1af0bf0fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Mar 2015 21:12:13 -0700 Subject: inet: ip early demux should avoid request sockets When a request socket is created, we do not cache ip route dst entry, like for timewait sockets. Let's use sk_fullsock() helper. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1f514a0c5e60..80067d5858b4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1518,7 +1518,7 @@ void tcp_v4_early_demux(struct sk_buff *skb) if (sk) { skb->sk = sk; skb->destructor = sock_edemux; - if (sk->sk_state != TCP_TIME_WAIT) { + if (sk_fullsock(sk)) { struct dst_entry *dst = sk->sk_rx_dst; if (dst) -- cgit v1.2.3 From d1e559d0b1b0d02f76a6bd5b768a99dc834ae926 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Mar 2015 14:05:35 -0700 Subject: inet: add IPv6 support to sk_ehashfn() Intent is to converge IPv4 & IPv6 inet_hash functions to factorize code. IPv4 sockets initialize sk_rcv_saddr and sk_v6_daddr in this patch, thanks to new sk_daddr_set() and sk_rcv_saddr_set() helpers. __inet6_hash can now use sk_ehashfn() instead of a private inet6_sk_ehashfn() and will simply use __inet_hash() in a following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 80067d5858b4..ca207df4af1c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -189,7 +189,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet->inet_saddr) inet->inet_saddr = fl4->saddr; - inet->inet_rcv_saddr = inet->inet_saddr; + sk_rcv_saddr_set(sk, inet->inet_saddr); if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { /* Reset inherited state */ @@ -204,7 +204,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_fetch_timewait_stamp(sk, &rt->dst); inet->inet_dport = usin->sin_port; - inet->inet_daddr = daddr; + sk_daddr_set(sk, daddr); inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet_opt) @@ -1319,8 +1319,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newtp = tcp_sk(newsk); newinet = inet_sk(newsk); ireq = inet_rsk(req); - newinet->inet_daddr = ireq->ir_rmt_addr; - newinet->inet_rcv_saddr = ireq->ir_loc_addr; + sk_daddr_set(newsk, ireq->ir_rmt_addr); + sk_rcv_saddr_set(newsk, ireq->ir_loc_addr); newinet->inet_saddr = ireq->ir_loc_addr; inet_opt = ireq->opt; rcu_assign_pointer(newinet->inet_opt, inet_opt); -- cgit v1.2.3 From 08d2cc3b26554cae21f279b520ae5c2a3b2be421 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Mar 2015 14:05:38 -0700 Subject: inet: request sock should init IPv6/IPv4 addresses In order to be able to use sk_ehashfn() for request socks, we need to initialize their IPv6/IPv4 addresses. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ca207df4af1c..ddd0b1f25b96 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1219,14 +1219,14 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) #endif -static void tcp_v4_init_req(struct request_sock *req, struct sock *sk, +static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); - ireq->ir_loc_addr = ip_hdr(skb)->daddr; - ireq->ir_rmt_addr = ip_hdr(skb)->saddr; - ireq->no_srccheck = inet_sk(sk)->transparent; + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); + ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); ireq->ireq_family = AF_INET; } -- cgit v1.2.3 From 52452c542559ac980b48dbf22a30ee7fa0af507c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Mar 2015 19:04:19 -0700 Subject: inet: drop prev pointer handling in request sock When request sock are put in ehash table, the whole notion of having a previous request to update dl_next is pointless. Also, following patch will get rid of big purge timer, so we want to delete a request sock without holding listener lock. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ddd0b1f25b96..19c3770f1e97 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -458,12 +458,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req, **prev; + struct request_sock *req; case TCP_LISTEN: if (sock_owned_by_user(sk)) goto out; - req = inet_csk_search_req(sk, &prev, th->dest, + req = inet_csk_search_req(sk, th->dest, iph->daddr, iph->saddr); if (!req) goto out; @@ -484,7 +484,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) * created socket, and POSIX does not want network * errors returned from accept(). */ - inet_csk_reqsk_queue_drop(sk, req, prev); + inet_csk_reqsk_queue_drop(sk, req); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); goto out; @@ -1392,15 +1392,14 @@ EXPORT_SYMBOL(tcp_v4_syn_recv_sock); static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = tcp_hdr(skb); + const struct tcphdr *th = tcp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); + struct request_sock *req; struct sock *nsk; - struct request_sock **prev; - /* Find possible connection requests. */ - struct request_sock *req = inet_csk_search_req(sk, &prev, th->source, - iph->saddr, iph->daddr); + + req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); if (req) - return tcp_check_req(sk, skb, req, prev, false); + return tcp_check_req(sk, skb, req, false); nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); -- cgit v1.2.3 From fa76ce7328b289b6edd476e24eb52fd634261720 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Mar 2015 19:04:20 -0700 Subject: inet: get rid of central tcp/dccp listener timer One of the major issue for TCP is the SYNACK rtx handling, done by inet_csk_reqsk_queue_prune(), fired by the keepalive timer of a TCP_LISTEN socket. This function runs for awful long times, with socket lock held, meaning that other cpus needing this lock have to spin for hundred of ms. SYNACK are sent in huge bursts, likely to cause severe drops anyway. This model was OK 15 years ago when memory was very tight. We now can afford to have a timer per request sock. Timer invocations no longer need to lock the listener, and can be run from all cpus in parallel. With following patch increasing somaxconn width to 32 bits, I tested a listener with more than 4 million active request sockets, and a steady SYNFLOOD of ~200,000 SYN per second. Host was sending ~830,000 SYNACK per second. This is ~100 times more what we could achieve before this patch. Later, we will get rid of the listener hash and use ehash instead. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 19c3770f1e97..5554b8f33d41 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -475,6 +475,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + reqsk_put(req); goto out; } @@ -486,6 +487,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) */ inet_csk_reqsk_queue_drop(sk, req); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + reqsk_put(req); goto out; case TCP_SYN_SENT: @@ -1398,8 +1400,11 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) struct sock *nsk; req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr); - if (req) - return tcp_check_req(sk, skb, req, false); + if (req) { + nsk = tcp_check_req(sk, skb, req, false); + reqsk_put(req); + return nsk; + } nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb)); @@ -2208,7 +2213,7 @@ static void get_openreq4(const struct request_sock *req, struct seq_file *f, int i, kuid_t uid) { const struct inet_request_sock *ireq = inet_rsk(req); - long delta = req->expires - jiffies; + long delta = req->rsk_timer.expires - jiffies; seq_printf(f, "%4d: %08X:%04X %08X:%04X" " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK", -- cgit v1.2.3 From b282705336e03fc7b9377a278939594870a40f96 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:21 -0700 Subject: net: convert syn_wait_lock to a spinlock This is a low hanging fruit, as we'll get rid of syn_wait_lock eventually. We hold syn_wait_lock for such small sections, that it makes no sense to use a read/write lock. A spin lock is simply faster. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5554b8f33d41..8028ad5920a4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1909,13 +1909,13 @@ get_req: } sk = sk_nulls_next(st->syn_wait_sk); st->state = TCP_SEQ_STATE_LISTENING; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } else { icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) goto start_req; - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); sk = sk_nulls_next(sk); } get_sk: @@ -1927,7 +1927,7 @@ get_sk: goto out; } icsk = inet_csk(sk); - read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock); if (reqsk_queue_len(&icsk->icsk_accept_queue)) { start_req: st->uid = sock_i_uid(sk); @@ -1936,7 +1936,7 @@ start_req: st->sbucket = 0; goto get_req; } - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } spin_unlock_bh(&ilb->lock); st->offset = 0; @@ -2155,7 +2155,7 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) case TCP_SEQ_STATE_OPENREQ: if (v) { struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk); - read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); + spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock); } case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) -- cgit v1.2.3 From 26e3736090e1037ac929787df21c05497479b77f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 22 Mar 2015 10:22:22 -0700 Subject: ipv4: tcp: handle ICMP messages on TCP_NEW_SYN_RECV request sockets tcp_v4_err() can restrict lookups to ehash table, and not to listeners. Note this patch creates the infrastructure, but this means that ICMP messages for request sockets are ignored until complete conversion. New tcp_req_err() helper is exported so that we can use it in IPv6 in following patch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 69 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8028ad5920a4..a57615062b66 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -310,6 +310,34 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk) dst->ops->redirect(dst, sk, skb); } + +/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */ +void tcp_req_err(struct sock *sk, u32 seq) +{ + struct request_sock *req = inet_reqsk(sk); + struct net *net = sock_net(sk); + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + WARN_ON(req->sk); + + if (seq != tcp_rsk(req)->snt_isn) { + NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + } else { + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + inet_csk_reqsk_queue_drop(req->rsk_listener, req); + NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + } + reqsk_put(req); +} +EXPORT_SYMBOL(tcp_req_err); + /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should @@ -343,8 +371,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) int err; struct net *net = dev_net(icmp_skb->dev); - sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, - iph->saddr, th->source, inet_iif(icmp_skb)); + sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, + th->dest, iph->saddr, ntohs(th->source), + inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; @@ -353,6 +382,9 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) inet_twsk_put(inet_twsk(sk)); return; } + seq = ntohl(th->seq); + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_err(sk, seq); bh_lock_sock(sk); /* If too many ICMPs get dropped on busy @@ -374,7 +406,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) icsk = inet_csk(sk); tp = tcp_sk(sk); - seq = ntohl(th->seq); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = tp->fastopen_rsk; snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; @@ -458,38 +489,6 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) } switch (sk->sk_state) { - struct request_sock *req; - case TCP_LISTEN: - if (sock_owned_by_user(sk)) - goto out; - - req = inet_csk_search_req(sk, th->dest, - iph->daddr, iph->saddr); - if (!req) - goto out; - - /* ICMPs are not backlogged, hence we cannot get - an established socket here. - */ - WARN_ON(req->sk); - - if (seq != tcp_rsk(req)->snt_isn) { - NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); - reqsk_put(req); - goto out; - } - - /* - * Still in SYN_RECV, just remove it silently. - * There is no good way to pass the error to the newly - * created socket, and POSIX does not want network - * errors returned from accept(). - */ - inet_csk_reqsk_queue_drop(sk, req); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - reqsk_put(req); - goto out; - case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is -- cgit v1.2.3 From c69736696cf3742b37d850289dc0d7ead177bb14 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Mon, 23 Mar 2015 15:00:41 -0700 Subject: inet: fix double request socket freeing Eric Hugne reported following error : I'm hitting this warning on latest net-next when i try to SSH into a machine with eth0 added to a bridge (but i think the problem is older than that) Steps to reproduce: node2 ~ # brctl addif br0 eth0 [ 223.758785] device eth0 entered promiscuous mode node2 ~ # ip link set br0 up [ 244.503614] br0: port 1(eth0) entered forwarding state [ 244.505108] br0: port 1(eth0) entered forwarding state node2 ~ # [ 251.160159] ------------[ cut here ]------------ [ 251.160831] WARNING: CPU: 0 PID: 3 at include/net/request_sock.h:102 tcp_v4_err+0x6b1/0x720() [ 251.162077] Modules linked in: [ 251.162496] CPU: 0 PID: 3 Comm: ksoftirqd/0 Not tainted 4.0.0-rc3+ #18 [ 251.163334] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 251.164078] ffffffff81a8365c ffff880038a6ba18 ffffffff8162ace4 0000000000009898 [ 251.165084] 0000000000000000 ffff880038a6ba58 ffffffff8104da85 ffff88003fa437c0 [ 251.166195] ffff88003fa437c0 ffff88003fa74e00 ffff88003fa43bb8 ffff88003fad99a0 [ 251.167203] Call Trace: [ 251.167533] [] dump_stack+0x45/0x57 [ 251.168206] [] warn_slowpath_common+0x85/0xc0 [ 251.169239] [] warn_slowpath_null+0x15/0x20 [ 251.170271] [] tcp_v4_err+0x6b1/0x720 [ 251.171408] [] ? _raw_read_lock_irq+0x3/0x10 [ 251.172589] [] ? inet_del_offload+0x40/0x40 [ 251.173366] [] icmp_socket_deliver+0x65/0xb0 [ 251.174134] [] icmp_unreach+0xc2/0x280 [ 251.174820] [] icmp_rcv+0x2bd/0x3a0 [ 251.175473] [] ip_local_deliver_finish+0x82/0x1e0 [ 251.176282] [] ip_local_deliver+0x88/0x90 [ 251.177004] [] ip_rcv_finish+0xf0/0x310 [ 251.177693] [] ip_rcv+0x2dc/0x390 [ 251.178336] [] __netif_receive_skb_core+0x713/0xa20 [ 251.179170] [] __netif_receive_skb+0x1a/0x80 [ 251.179922] [] process_backlog+0x94/0x120 [ 251.180639] [] net_rx_action+0x1e2/0x310 [ 251.181356] [] __do_softirq+0xa7/0x290 [ 251.182046] [] run_ksoftirqd+0x19/0x30 [ 251.182726] [] smpboot_thread_fn+0x153/0x1d0 [ 251.183485] [] ? SyS_setgroups+0x130/0x130 [ 251.184228] [] kthread+0xee/0x110 [ 251.184871] [] ? kthread_create_on_node+0x1b0/0x1b0 [ 251.185690] [] ret_from_fork+0x58/0x90 [ 251.186385] [] ? kthread_create_on_node+0x1b0/0x1b0 [ 251.187216] ---[ end trace c947fc7b24e42ea1 ]--- [ 259.542268] br0: port 1(eth0) entered forwarding state Remove the double calls to reqsk_put() [edumazet] : I got confused because reqsk_timer_handler() _has_ to call reqsk_put(req) after calling inet_csk_reqsk_queue_drop(), as the timer handler holds a reference on req. Signed-off-by: Fan Du Signed-off-by: Eric Dumazet Reported-by: Erik Hugne Fixes: fa76ce7328b2 ("inet: get rid of central tcp/dccp listener timer") Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a57615062b66..4e90217003e8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -324,6 +324,7 @@ void tcp_req_err(struct sock *sk, u32 seq) if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); + reqsk_put(req); } else { /* * Still in SYN_RECV, just remove it silently. @@ -331,10 +332,9 @@ void tcp_req_err(struct sock *sk, u32 seq) * created socket, and POSIX does not want network * errors returned from accept(). */ - inet_csk_reqsk_queue_drop(req->rsk_listener, req); NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS); + inet_csk_reqsk_queue_drop(req->rsk_listener, req); } - reqsk_put(req); } EXPORT_SYMBOL(tcp_req_err); -- cgit v1.2.3 From ff74e23f7edb3759d1290b10f80222e3bbb6304b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Mar 2015 15:58:54 -0700 Subject: tcp: md5: input path is run under rcu protected sections It is guaranteed that both tcp_v4_rcv() and tcp_v6_rcv() run from rcu read locked sections : ip_local_deliver_finish() and ip6_input_finish() both use rcu_read_lock() Also align tcp_v6_inbound_md5_hash() on tcp_v4_inbound_md5_hash() by returning a boolean. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4e90217003e8..d339a0488f51 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1153,8 +1153,9 @@ clear_hash_noput: } EXPORT_SYMBOL(tcp_v4_md5_hash_skb); -static bool __tcp_v4_inbound_md5_hash(struct sock *sk, - const struct sk_buff *skb) +/* Called with rcu_read_lock() */ +static bool tcp_v4_inbound_md5_hash(struct sock *sk, + const struct sk_buff *skb) { /* * This gets called for each TCP segment that arrives @@ -1206,18 +1207,6 @@ static bool __tcp_v4_inbound_md5_hash(struct sock *sk, } return false; } - -static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) -{ - bool ret; - - rcu_read_lock(); - ret = __tcp_v4_inbound_md5_hash(sk, skb); - rcu_read_unlock(); - - return ret; -} - #endif static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, -- cgit v1.2.3 From 39f8e58e53be32ab758d30536e0bd2e6ce766462 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Mar 2015 15:58:55 -0700 Subject: tcp: md5: remove request sock argument of calc_md5_hash() Since request and established sockets now have same base, there is no need to pass two pointers to tcp_v4_md5_hash_skb() or tcp_v6_md5_hash_skb() Also add a const qualifier to their struct tcp_md5sig_key argument. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d339a0488f51..79d5c641688c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -648,7 +648,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) if (!key) goto release_sk1; - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb); + genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) goto release_sk1; } else { @@ -1102,8 +1102,8 @@ clear_hash_noput: return 1; } -int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, - const struct sock *sk, const struct request_sock *req, +int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { struct tcp_md5sig_pool *hp; @@ -1111,12 +1111,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key, const struct tcphdr *th = tcp_hdr(skb); __be32 saddr, daddr; - if (sk) { - saddr = inet_sk(sk)->inet_saddr; - daddr = inet_sk(sk)->inet_daddr; - } else if (req) { - saddr = inet_rsk(req)->ir_loc_addr; - daddr = inet_rsk(req)->ir_rmt_addr; + if (sk) { /* valid for establish/request sockets */ + saddr = sk->sk_rcv_saddr; + daddr = sk->sk_daddr; } else { const struct iphdr *iph = ip_hdr(skb); saddr = iph->saddr; @@ -1195,7 +1192,7 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, */ genhash = tcp_v4_md5_hash_skb(newhash, hash_expected, - NULL, NULL, skb); + NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n", -- cgit v1.2.3 From fd3a154a00fb991872680f19021f5edbb40b4dbe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Mar 2015 15:58:56 -0700 Subject: tcp: md5: get rid of tcp_v[46]_reqsk_md5_lookup() With request socks convergence, we no longer need different lookup methods. A request socket can use generic lookup function. Add const qualifier to 2nd tcp_v[46]_md5_lookup() parameter. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 79d5c641688c..fc8995a702a6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -898,10 +898,10 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, const union tcp_md5_addr *addr, int family) { - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; unsigned int size = sizeof(struct in_addr); - struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_info *md5sig; /* caller either holds rcu_read_lock() or socket lock */ md5sig = rcu_dereference_check(tp->md5sig_info, @@ -924,24 +924,15 @@ struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk, EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, - struct sock *addr_sk) + const struct sock *addr_sk) { union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr; + addr = (union tcp_md5_addr *)&sk->sk_daddr; return tcp_md5_do_lookup(sk, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); -static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk, - struct request_sock *req) -{ - union tcp_md5_addr *addr; - - addr = (union tcp_md5_addr *)&inet_rsk(req)->ir_rmt_addr; - return tcp_md5_do_lookup(sk, addr, AF_INET); -} - /* This can be called on a newly created socket, from other files */ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, const u8 *newkey, u8 newkeylen, gfp_t gfp) @@ -1247,7 +1238,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .mss_clamp = TCP_MSS_DEFAULT, #ifdef CONFIG_TCP_MD5SIG - .md5_lookup = tcp_v4_reqsk_md5_lookup, + .req_md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, #endif .init_req = tcp_v4_init_req, -- cgit v1.2.3 From 0144a81cccf7532bead90f0542f517bd028d3b3c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Mar 2015 21:45:56 -0700 Subject: tcp: fix ipv4 mapped request socks ss should display ipv4 mapped request sockets like this : tcp SYN-RECV 0 0 ::ffff:192.168.0.1:8080 ::ffff:192.0.2.1:35261 and not like this : tcp SYN-RECV 0 0 192.168.0.1:8080 192.0.2.1:35261 We should init ireq->ireq_family based on listener sk_family, not the actual protocol carried by SYN packet. This means we can set ireq_family in inet_reqsk_alloc() Fixes: 3f66b083a5b7 ("inet: introduce ireq_family") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fc8995a702a6..e073517b2cc7 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1206,7 +1206,6 @@ static void tcp_v4_init_req(struct request_sock *req, struct sock *sk_listener, sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); ireq->no_srccheck = inet_sk(sk_listener)->transparent; ireq->opt = tcp_v4_save_options(skb); - ireq->ireq_family = AF_INET; } static struct dst_entry *tcp_v4_route_req(struct sock *sk, struct flowi *fl, -- cgit v1.2.3 From 41d25fe0927aabb1d4b671871a99a55bcd203257 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 25 Mar 2015 15:08:47 -0700 Subject: tcp: tcp_syn_flood_action() can be static After commit 1fb6f159fd21 ("tcp: add tcp_conn_request"), tcp_syn_flood_action() is no longer used from IPv6. We can make it static, by moving it above tcp_conn_request() Signed-off-by: Eric Dumazet Reviewed-by: Octavian Purdila Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 29 ----------------------------- 1 file changed, 29 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e073517b2cc7..5aababa20a21 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -856,35 +856,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -/* - * Return true if a syncookie should be sent - */ -bool tcp_syn_flood_action(struct sock *sk, - const struct sk_buff *skb, - const char *proto) -{ - const char *msg = "Dropping request"; - bool want_cookie = false; - struct listen_sock *lopt; - -#ifdef CONFIG_SYN_COOKIES - if (sysctl_tcp_syncookies) { - msg = "Sending cookies"; - want_cookie = true; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES); - } else -#endif - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - - lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; - if (!lopt->synflood_warned && sysctl_tcp_syncookies != 2) { - lopt->synflood_warned = 1; - pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", - proto, ntohs(tcp_hdr(skb)->dest), msg); - } - return want_cookie; -} -EXPORT_SYMBOL(tcp_syn_flood_action); #ifdef CONFIG_TCP_MD5SIG /* -- cgit v1.2.3 From 51456b2914a34d16b1255b7c55d5cbf6a681d306 Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Fri, 3 Apr 2015 09:17:26 +0100 Subject: ipv4: coding style: comparison for equality with NULL The ipv4 code uses a mixture of coding styles. In some instances check for NULL pointer is done as x == NULL and sometimes as !x. !x is preferred according to checkpatch and this patch makes the code consistent by adopting the latter form. No changes detected by objdiff. Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 69f9cf684744..9ff311cf00f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -122,7 +122,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) and use initial timestamp retrieved from peer table. */ if (tcptw->tw_ts_recent_stamp && - (twp == NULL || (sysctl_tcp_tw_reuse && + (!twp || (sysctl_tcp_tw_reuse && get_seconds() - tcptw->tw_ts_recent_stamp > 1))) { tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; if (tp->write_seq == 0) @@ -494,7 +494,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) /* Only in fast or simultaneous open. If a fast open socket is * is already accepted it is treated as a connected one below. */ - if (fastopen && fastopen->sk == NULL) + if (fastopen && !fastopen->sk) break; if (!sock_owned_by_user(sk)) { @@ -1390,7 +1390,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || - dst->ops->check(dst, 0) == NULL) { + !dst->ops->check(dst, 0)) { dst_release(dst); sk->sk_rx_dst = NULL; } -- cgit v1.2.3 From 00db41243e8d5032c2e0f5bf6063bb19324bfdb3 Mon Sep 17 00:00:00 2001 From: Ian Morris Date: Fri, 3 Apr 2015 09:17:27 +0100 Subject: ipv4: coding style: comparison for inequality with NULL The ipv4 code uses a mixture of coding styles. In some instances check for non-NULL pointer is done as x != NULL and sometimes as x. x is preferred according to checkpatch and this patch makes the code consistent by adopting the latter form. No changes detected by objdiff. Signed-off-by: Ian Morris Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9ff311cf00f3..560f9571f7c4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1305,7 +1305,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Copy over the MD5 key from the original socket */ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr, AF_INET); - if (key != NULL) { + if (key) { /* * We're using one, so create a matching key * on the newsk structure. If we fail to get @@ -1797,7 +1797,7 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - BUG_ON(tp->fastopen_rsk != NULL); + BUG_ON(tp->fastopen_rsk); /* If socket is aborted during connect operation */ tcp_free_fastopen_req(tp); -- cgit v1.2.3 From b52e69217b5a02469f8431934f59c0d7103dd32f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Apr 2015 14:36:42 -0700 Subject: tcp: md5: fix a typo in tcp_v4_md5_lookup() Lookup key for tcp_md5_do_lookup() has to be taken from addr_sk, not sk (which can be the listener) Fixes: fd3a154a00fb ("tcp: md5: get rid of tcp_v[46]_reqsk_md5_lookup()") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 560f9571f7c4..37578d52897e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -897,9 +897,9 @@ EXPORT_SYMBOL(tcp_md5_do_lookup); struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk, const struct sock *addr_sk) { - union tcp_md5_addr *addr; + const union tcp_md5_addr *addr; - addr = (union tcp_md5_addr *)&sk->sk_daddr; + addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr; return tcp_md5_do_lookup(sk, addr, AF_INET); } EXPORT_SYMBOL(tcp_v4_md5_lookup); -- cgit v1.2.3 From 789f558cfb3680aeb52de137418637f6b04b7d22 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 12 Apr 2015 18:51:09 -0700 Subject: tcp/dccp: get rid of central timewait timer Using a timer wheel for timewait sockets was nice ~15 years ago when memory was expensive and machines had a single processor. This does not scale, code is ugly and source of huge latencies (Typically 30 ms have been seen, cpus spinning on death_lock spinlock.) We can afford to use an extra 64 bytes per timewait sock and spread timewait load to all cpus to have better behavior. Tested: On following test, /proc/sys/net/ipv4/tcp_tw_recycle is set to 1 on the target (lpaa24) Before patch : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 419594 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 437171 While test is running, we can observe 25 or even 33 ms latencies. lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20601ms rtt min/avg/max/mdev = 0.020/0.217/25.771/1.535 ms, pipe 2 lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 20702ms rtt min/avg/max/mdev = 0.019/0.183/33.761/1.441 ms, pipe 2 After patch : About 90% increase of throughput : lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 810442 lpaa23:~# ./super_netperf 200 -H lpaa24 -t TCP_CC -l 60 -- -p0,0 800992 And latencies are kept to minimal values during this load, even if network utilization is 90% higher : lpaa24:~# ping -c 1000 -i 0.02 -qn lpaa23 ... 1000 packets transmitted, 1000 received, 0% packet loss, time 19991ms rtt min/avg/max/mdev = 0.023/0.064/0.360/0.042 ms Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 37578d52897e..3571f2be4470 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1685,7 +1685,7 @@ do_time_wait: iph->daddr, th->dest, inet_iif(skb)); if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_deschedule(inet_twsk(sk)); inet_twsk_put(inet_twsk(sk)); sk = sk2; goto process; @@ -2242,9 +2242,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) static void get_timewait4_sock(const struct inet_timewait_sock *tw, struct seq_file *f, int i) { + long delta = tw->tw_timer.expires - jiffies; __be32 dest, src; __u16 destp, srcp; - s32 delta = tw->tw_ttd - inet_tw_time_stamp(); dest = tw->tw_daddr; src = tw->tw_rcv_saddr; -- cgit v1.2.3