From efe4208f47f907b86f528788da711e8ab9dea44d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Oct 2013 15:42:29 -0700 Subject: ipv6: make lookups simpler and faster TCP listener refactoring, part 4 : To speed up inet lookups, we moved IPv4 addresses from inet to struct sock_common Now is time to do the same for IPv6, because it permits us to have fast lookups for all kind of sockets, including upcoming SYN_RECV. Getting IPv6 addresses in TCP lookups currently requires two extra cache lines, plus a dereference (and memory stall). inet6_sk(sk) does the dereference of inet_sk(__sk)->pinet6 This patch is way bigger than its IPv4 counter part, because for IPv4, we could add aliases (inet_daddr, inet_rcv_saddr), while on IPv6, it's not doable easily. inet6_sk(sk)->daddr becomes sk->sk_v6_daddr inet6_sk(sk)->rcv_saddr becomes sk->sk_v6_rcv_saddr And timewait socket also have tw->tw_v6_daddr & tw->tw_v6_rcv_saddr at the same offset. We get rid of INET6_TW_MATCH() as INET6_MATCH() is now the generic macro. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 52f3c6b971d2..27535fd5ea10 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -240,7 +240,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) { - struct inet6_timewait_sock *tw6; struct tcp_metrics_block *tm; struct inetpeer_addr addr; unsigned int hash; @@ -253,9 +252,8 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock hash = (__force unsigned int) addr.addr.a4; break; case AF_INET6: - tw6 = inet6_twsk((struct sock *)tw); - *(struct in6_addr *)addr.addr.a6 = tw6->tw_v6_daddr; - hash = ipv6_addr_hash(&tw6->tw_v6_daddr); + *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; + hash = ipv6_addr_hash(&tw->tw_v6_daddr); break; default: return NULL; @@ -289,8 +287,8 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, hash = (__force unsigned int) addr.addr.a4; break; case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet6_sk(sk)->daddr; - hash = ipv6_addr_hash(&inet6_sk(sk)->daddr); + *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; + hash = ipv6_addr_hash(&sk->sk_v6_daddr); break; default: return NULL; -- cgit v1.2.3 From c2bb06db59eaf92eb5ca9c6faed590597c6ceccb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2013 03:05:48 -0700 Subject: net: fix build errors if ipv6 is disabled CONFIG_IPV6=n is still a valid choice ;) It appears we can remove dead code. Reported-by: Wu Fengguang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 27535fd5ea10..8fcc2cb9dba4 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -251,10 +251,12 @@ static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock addr.addr.a4 = tw->tw_daddr; hash = (__force unsigned int) addr.addr.a4; break; +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: *(struct in6_addr *)addr.addr.a6 = tw->tw_v6_daddr; hash = ipv6_addr_hash(&tw->tw_v6_daddr); break; +#endif default: return NULL; } @@ -286,10 +288,12 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, addr.addr.a4 = inet_sk(sk)->inet_daddr; hash = (__force unsigned int) addr.addr.a4; break; +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: *(struct in6_addr *)addr.addr.a6 = sk->sk_v6_daddr; hash = ipv6_addr_hash(&sk->sk_v6_daddr); break; +#endif default: return NULL; } -- cgit v1.2.3 From 634fb979e8f3a70f04c1f2f519d0cd1142eb5c1a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Oct 2013 15:21:29 -0700 Subject: inet: includes a sock_common in request_sock TCP listener refactoring, part 5 : We want to be able to insert request sockets (SYN_RECV) into main ehash table instead of the per listener hash table to allow RCU lookups and remove listener lock contention. This patch includes the needed struct sock_common in front of struct request_sock This means there is no more inet6_request_sock IPv6 specific structure. Following inet_request_sock fields were renamed as they became macros to reference fields from struct sock_common. Prefix ir_ was chosen to avoid name collisions. loc_port -> ir_loc_port loc_addr -> ir_loc_addr rmt_addr -> ir_rmt_addr rmt_port -> ir_rmt_port iif -> ir_iif Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 8fcc2cb9dba4..4a2a84110dfb 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -215,13 +215,15 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, addr.family = req->rsk_ops->family; switch (addr.family) { case AF_INET: - addr.addr.a4 = inet_rsk(req)->rmt_addr; + addr.addr.a4 = inet_rsk(req)->ir_rmt_addr; hash = (__force unsigned int) addr.addr.a4; break; +#if IS_ENABLED(CONFIG_IPV6) case AF_INET6: - *(struct in6_addr *)addr.addr.a6 = inet6_rsk(req)->rmt_addr; - hash = ipv6_addr_hash(&inet6_rsk(req)->rmt_addr); + *(struct in6_addr *)addr.addr.a6 = inet_rsk(req)->ir_v6_rmt_addr; + hash = ipv6_addr_hash(&inet_rsk(req)->ir_v6_rmt_addr); break; +#endif default: return NULL; } -- cgit v1.2.3 From c968601d174739cb1e7100c95e0eb3d2f7e91bc9 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 29 Oct 2013 10:09:05 -0700 Subject: tcp: temporarily disable Fast Open on SYN timeout Fast Open currently has a fall back feature to address SYN-data being dropped but it requires the middle-box to pass on regular SYN retry after SYN-data. This is implemented in commit aab487435 ("net-tcp: Fast Open client - detecting SYN-data drops") However some NAT boxes will drop all subsequent packets after first SYN-data and blackholes the entire connections. An example is in commit 356d7d8 "netfilter: nf_conntrack: fix tcp_in_window for Fast Open". The sender should note such incidents and fall back to use the regular TCP handshake on subsequent attempts temporarily as well: after the second SYN timeouts the original Fast Open SYN is most likely lost. When such an event recurs Fast Open is disabled based on the number of recurrences exponentially. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4a2a84110dfb..2ab09cbae74d 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -671,8 +671,9 @@ void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; write_seqlock_bh(&fastopen_seqlock); - tfom->mss = mss; - if (cookie->len > 0) + if (mss) + tfom->mss = mss; + if (cookie && cookie->len > 0) tfom->cookie = *cookie; if (syn_lost) { ++tfom->syn_loss; -- cgit v1.2.3 From dccf76ca6b626c0c4a4e09bb221adee3270ab0ef Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Nov 2013 15:00:46 -0800 Subject: net-tcp: fix panic in tcp_fastopen_cache_set() We had some reports of crashes using TCP fastopen, and Dave Jones gave a nice stack trace pointing to the error. Issue is that tcp_get_metrics() should not be called with a NULL dst Fixes: 1fe4c481ba637 ("net-tcp: Fast Open client - cookie cache") Signed-off-by: Eric Dumazet Reported-by: Dave Jones Cc: Yuchung Cheng Acked-by: Yuchung Cheng Tested-by: Dave Jones Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 2ab09cbae74d..d3ee2e0c28b6 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -663,10 +663,13 @@ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost) { + struct dst_entry *dst = __sk_dst_get(sk); struct tcp_metrics_block *tm; + if (!dst) + return; rcu_read_lock(); - tm = tcp_get_metrics(sk, __sk_dst_get(sk), true); + tm = tcp_get_metrics(sk, dst, true); if (tm) { struct tcp_fastopen_metrics *tfom = &tm->tcpm_fastopen; -- cgit v1.2.3 From 4534de8305b3f1460a527a0cda0e3dc2224c6f0c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Nov 2013 17:14:46 +0100 Subject: genetlink: make all genl_ops users const Now that genl_ops are no longer modified in place when registering, they can be made const. This patch was done mostly with spatch: @@ identifier ops; @@ +const struct genl_ops ops[] = { ... }; (except the struct thing in net/openvswitch/datapath.c) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index d3ee2e0c28b6..8c121b523eee 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -991,7 +991,7 @@ static int tcp_metrics_nl_cmd_del(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_ops tcp_metrics_nl_ops[] = { +static const struct genl_ops tcp_metrics_nl_ops[] = { { .cmd = TCP_METRICS_CMD_GET, .doit = tcp_metrics_nl_cmd_get, -- cgit v1.2.3 From c53ed7423619b4e8108914a9f31b426dd58ad591 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Nov 2013 15:19:31 +0100 Subject: genetlink: only pass array to genl_register_family_with_ops() As suggested by David Miller, make genl_register_family_with_ops() a macro and pass only the array, evaluating ARRAY_SIZE() in the macro, this is a little safer. The openvswitch has some indirection, assing ops/n_ops directly in that code. This might ultimately just assign the pointers in the family initializations, saving the struct genl_family_and_ops and code (once mcast groups are handled differently.) Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 8c121b523eee..06493736fbc8 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -1082,8 +1082,7 @@ void __init tcp_metrics_init(void) if (ret < 0) goto cleanup; ret = genl_register_family_with_ops(&tcp_metrics_nl_family, - tcp_metrics_nl_ops, - ARRAY_SIZE(tcp_metrics_nl_ops)); + tcp_metrics_nl_ops); if (ret < 0) goto cleanup_subsys; return; -- cgit v1.2.3 From 77f99ad16a07aa062c2d30fae57b1fee456f6ef6 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Thu, 16 Jan 2014 20:01:21 +0100 Subject: tcp: metrics: Avoid duplicate entries with the same destination-IP Because the tcp-metrics is an RCU-list, it may be that two soft-interrupts are inside __tcp_get_metrics() for the same destination-IP at the same time. If this destination-IP is not yet part of the tcp-metrics, both soft-interrupts will end up in tcpm_new and create a new entry for this IP. So, we will have two tcp-metrics with the same destination-IP in the list. This patch checks twice __tcp_get_metrics(). First without holding the lock, then while holding the lock. The second one is there to confirm that the entry has not been added by another soft-irq while waiting for the spin-lock. Fixes: 51c5d0c4b169b (tcp: Maintain dynamic metrics in local cache.) Signed-off-by: Christoph Paasch Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_metrics.c | 51 +++++++++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 19 deletions(-) (limited to 'net/ipv4/tcp_metrics.c') diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 06493736fbc8..098b3a29f6f3 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -22,6 +22,9 @@ int sysctl_tcp_nometrics_save __read_mostly; +static struct tcp_metrics_block *__tcp_get_metrics(const struct inetpeer_addr *addr, + struct net *net, unsigned int hash); + struct tcp_fastopen_metrics { u16 mss; u16 syn_loss:10; /* Recurring Fast Open SYN losses */ @@ -130,16 +133,41 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst, } } +#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) + +static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) +{ + if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) + tcpm_suck_dst(tm, dst, false); +} + +#define TCP_METRICS_RECLAIM_DEPTH 5 +#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL + static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct inetpeer_addr *addr, - unsigned int hash, - bool reclaim) + unsigned int hash) { struct tcp_metrics_block *tm; struct net *net; + bool reclaim = false; spin_lock_bh(&tcp_metrics_lock); net = dev_net(dst->dev); + + /* While waiting for the spin-lock the cache might have been populated + * with this entry and so we have to check again. + */ + tm = __tcp_get_metrics(addr, net, hash); + if (tm == TCP_METRICS_RECLAIM_PTR) { + reclaim = true; + tm = NULL; + } + if (tm) { + tcpm_check_stamp(tm, dst); + goto out_unlock; + } + if (unlikely(reclaim)) { struct tcp_metrics_block *oldest; @@ -169,17 +197,6 @@ out_unlock: return tm; } -#define TCP_METRICS_TIMEOUT (60 * 60 * HZ) - -static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) -{ - if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) - tcpm_suck_dst(tm, dst, false); -} - -#define TCP_METRICS_RECLAIM_DEPTH 5 -#define TCP_METRICS_RECLAIM_PTR (struct tcp_metrics_block *) 0x1UL - static struct tcp_metrics_block *tcp_get_encode(struct tcp_metrics_block *tm, int depth) { if (tm) @@ -282,7 +299,6 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct inetpeer_addr addr; unsigned int hash; struct net *net; - bool reclaim; addr.family = sk->sk_family; switch (addr.family) { @@ -304,13 +320,10 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, hash = hash_32(hash, net->ipv4.tcp_metrics_hash_log); tm = __tcp_get_metrics(&addr, net, hash); - reclaim = false; - if (tm == TCP_METRICS_RECLAIM_PTR) { - reclaim = true; + if (tm == TCP_METRICS_RECLAIM_PTR) tm = NULL; - } if (!tm && create) - tm = tcpm_new(dst, &addr, hash, reclaim); + tm = tcpm_new(dst, &addr, hash); else tcpm_check_stamp(tm, dst); -- cgit v1.2.3