diff options
Diffstat (limited to 'net/ipv4/tcp_input.c')
| -rw-r--r-- | net/ipv4/tcp_input.c | 830 |
1 files changed, 472 insertions, 358 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c53b7f35c51d..00a41499d52c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -73,7 +73,7 @@ #include <net/inet_common.h> #include <linux/ipsec.h> #include <asm/unaligned.h> -#include <net/netdma.h> +#include <linux/errqueue.h> int sysctl_tcp_timestamps __read_mostly = 1; int sysctl_tcp_window_scaling __read_mostly = 1; @@ -200,28 +200,25 @@ static inline bool tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } -static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +static void tcp_ecn_queue_cwr(struct tcp_sock *tp) { if (tp->ecn_flags & TCP_ECN_OK) tp->ecn_flags |= TCP_ECN_QUEUE_CWR; } -static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) +static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb) { if (tcp_hdr(skb)->cwr) tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) { tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; } -static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) { - if (!(tp->ecn_flags & TCP_ECN_OK)) - return; - switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) { case INET_ECN_NOT_ECT: /* Funny extension: if ECT is not set on a segment, @@ -232,30 +229,43 @@ static inline void TCP_ECN_check_ce(struct tcp_sock *tp, const struct sk_buff *s tcp_enter_quickack_mode((struct sock *)tp); break; case INET_ECN_CE: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE); + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode((struct sock *)tp); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } - /* fallinto */ + tp->ecn_flags |= TCP_ECN_SEEN; + break; default: + if (tcp_ca_needs_ecn((struct sock *)tp)) + tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; + break; } } -static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb) +{ + if (tp->ecn_flags & TCP_ECN_OK) + __tcp_ecn_check_ce(tp, skb); +} + +static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) +static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) { if ((tp->ecn_flags & TCP_ECN_OK) && (!th->ece || !th->cwr)) tp->ecn_flags &= ~TCP_ECN_OK; } -static bool TCP_ECN_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) +static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) { if (th->ece && !th->syn && (tp->ecn_flags & TCP_ECN_OK)) return true; @@ -652,7 +662,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) } icsk->icsk_ack.lrcvtime = now; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (skb->len >= 128) tcp_grow_window(sk, skb); @@ -667,10 +677,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) * To save cycles in the RFC 1323 implementation it was better to break * it up into three procedures. -- erics */ -static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) +static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) { struct tcp_sock *tp = tcp_sk(sk); - long m = mrtt; /* RTT */ + long m = mrtt_us; /* RTT */ + u32 srtt = tp->srtt_us; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -688,14 +699,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ /* This is similar to one of Eifel findings. * Eifel blocks mdev updates when rtt decreases. * This solution is a bit different: we use finer gain @@ -707,27 +716,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (m > 0) m >>= 3; } else { - m -= (tp->mdev >> 2); /* similar update on mdev */ + m -= (tp->mdev_us >> 2); /* similar update on mdev */ } - tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ - if (tp->mdev > tp->mdev_max) { - tp->mdev_max = tp->mdev; - if (tp->mdev_max > tp->rttvar) - tp->rttvar = tp->mdev_max; + tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev_us > tp->mdev_max_us) { + tp->mdev_max_us = tp->mdev_us; + if (tp->mdev_max_us > tp->rttvar_us) + tp->rttvar_us = tp->mdev_max_us; } if (after(tp->snd_una, tp->rtt_seq)) { - if (tp->mdev_max < tp->rttvar) - tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2; + if (tp->mdev_max_us < tp->rttvar_us) + tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = tcp_rto_min(sk); + tp->mdev_max_us = tcp_rto_min_us(sk); } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ - tp->mdev = m << 1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); + srtt = m << 3; /* take the measured time to be rtt */ + tp->mdev_us = m << 1; /* make sure rto = 3*rtt */ + tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk)); + tp->mdev_max_us = tp->rttvar_us; tp->rtt_seq = tp->snd_nxt; } + tp->srtt_us = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -742,18 +753,12 @@ static void tcp_update_pacing_rate(struct sock *sk) u64 rate; /* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */ - rate = (u64)tp->mss_cache * 2 * (HZ << 3); + rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3); rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), - * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) - * We probably need usec resolution in the future. - * Note: This also takes care of possible srtt=0 case, - * when tcp_rtt_estimator() was not yet called. - */ - if (tp->srtt > 8 + 2) - do_div(rate, tp->srtt); + if (likely(tp->srtt_us)) + do_div(rate, tp->srtt_us); /* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate * without any lock. We want to make sure compiler wont store @@ -766,7 +771,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -void tcp_set_rto(struct sock *sk) +static void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1111,7 +1116,7 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } /* D-SACK for already forgotten data... Do dumb counting. */ - if (dup_sack && tp->undo_marker && tp->undo_retrans && + if (dup_sack && tp->undo_marker && tp->undo_retrans > 0 && !after(end_seq_0, prior_snd_una) && after(end_seq_0, tp->undo_marker)) tp->undo_retrans--; @@ -1120,10 +1125,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb, } struct tcp_sacktag_state { - int reord; - int fack_count; - int flag; - s32 rtt; /* RTT measured by SACKing never-retransmitted data */ + int reord; + int fack_count; + long rtt_us; /* RTT measured by SACKing never-retransmitted data */ + int flag; }; /* Check if skb is fully within the SACK block. In presence of GSO skbs, @@ -1167,12 +1172,12 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, unsigned int new_len = (pkt_len / mss) * mss; if (!in_sack && new_len < pkt_len) { new_len += mss; - if (new_len > skb->len) + if (new_len >= skb->len) return 0; } pkt_len = new_len; } - err = tcp_fragment(sk, skb, pkt_len, mss); + err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC); if (err < 0) return err; } @@ -1184,14 +1189,15 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, u32 xmit_time) + int dup_sack, int pcount, + const struct skb_mstamp *xmit_time) { struct tcp_sock *tp = tcp_sk(sk); int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { - if (tp->undo_marker && tp->undo_retrans && + if (tp->undo_marker && tp->undo_retrans > 0 && after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) @@ -1225,8 +1231,13 @@ static u8 tcp_sacktag_one(struct sock *sk, if (!after(end_seq, tp->high_seq)) state->flag |= FLAG_ORIG_SACK_ACKED; /* Pick the earliest sequence sacked for RTT */ - if (state->rtt < 0) - state->rtt = tcp_time_stamp - xmit_time; + if (state->rtt_us < 0) { + struct skb_mstamp now; + + skb_mstamp_get(&now); + state->rtt_us = skb_mstamp_us_delta(&now, + xmit_time); + } } if (sacked & TCPCB_LOST) { @@ -1285,7 +1296,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, start_seq, end_seq, dup_sack, pcount, - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1293,9 +1304,9 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; - skb_shinfo(prev)->gso_segs += pcount; - BUG_ON(skb_shinfo(skb)->gso_segs < pcount); - skb_shinfo(skb)->gso_segs -= pcount; + tcp_skb_pcount_add(prev, pcount); + BUG_ON(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK @@ -1308,7 +1319,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* CHECKME: To clear or not to clear? Mimics normal skb currently */ - if (skb_shinfo(skb)->gso_segs <= 1) { + if (tcp_skb_pcount(skb) <= 1) { skb_shinfo(skb)->gso_size = 0; skb_shinfo(skb)->gso_type = 0; } @@ -1563,7 +1574,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), - TCP_SKB_CB(skb)->when); + &skb->skb_mstamp); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1620,7 +1631,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, - u32 prior_snd_una, s32 *sack_rtt) + u32 prior_snd_una, long *sack_rtt_us) { struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + @@ -1638,7 +1649,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, state.flag = 0; state.reord = tp->packets_out; - state.rtt = -1; + state.rtt_us = -1L; if (!tp->sacked_out) { if (WARN_ON(tp->fackets_out)) @@ -1822,7 +1833,7 @@ out: WARN_ON((int)tp->retrans_out < 0); WARN_ON((int)tcp_packets_in_flight(tp) < 0); #endif - *sack_rtt = state.rtt; + *sack_rtt_us = state.rtt_us; return state.flag; } @@ -1886,33 +1897,34 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static void tcp_clear_retrans_partial(struct tcp_sock *tp) +void tcp_clear_retrans(struct tcp_sock *tp) { tp->retrans_out = 0; tp->lost_out = 0; - tp->undo_marker = 0; - tp->undo_retrans = 0; + tp->undo_retrans = -1; + tp->fackets_out = 0; + tp->sacked_out = 0; } -void tcp_clear_retrans(struct tcp_sock *tp) +static inline void tcp_init_undo(struct tcp_sock *tp) { - tcp_clear_retrans_partial(tp); - - tp->fackets_out = 0; - tp->sacked_out = 0; + tp->undo_marker = tp->snd_una; + /* Retransmission still in flight may cause DSACKs later. */ + tp->undo_retrans = tp->retrans_out ? : -1; } -/* Enter Loss state. If "how" is not zero, forget all SACK information +/* Enter Loss state. If we detect SACK reneging, forget all SACK information * and reset tags completely, otherwise preserve SACKs. If receiver * dropped its ofo queue, we will know this due to reneging detection. */ -void tcp_enter_loss(struct sock *sk, int how) +void tcp_enter_loss(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; bool new_recovery = false; + bool is_reneg; /* is receiver reneging on SACKs? */ /* Reduce ssthresh if it has not yet been made inside this window. */ if (icsk->icsk_ca_state <= TCP_CA_Disorder || @@ -1922,18 +1934,22 @@ void tcp_enter_loss(struct sock *sk, int how) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); + tcp_init_undo(tp); } tp->snd_cwnd = 1; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tcp_clear_retrans_partial(tp); + tp->retrans_out = 0; + tp->lost_out = 0; if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - tp->undo_marker = tp->snd_una; - if (how) { + skb = tcp_write_queue_head(sk); + is_reneg = skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED); + if (is_reneg) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); tp->sacked_out = 0; tp->fackets_out = 0; } @@ -1943,10 +1959,8 @@ void tcp_enter_loss(struct sock *sk, int how) if (skb == tcp_send_head(sk)) break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || is_reneg) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); @@ -1964,7 +1978,7 @@ void tcp_enter_loss(struct sock *sk, int how) sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); + tcp_ecn_queue_cwr(tp); /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous * loss recovery is underway except recurring timeout(s) on @@ -1979,19 +1993,21 @@ void tcp_enter_loss(struct sock *sk, int how) * remembered SACKs do not reflect real state of receiver i.e. * receiver _host_ is heavily congested (or buggy). * - * Do processing similar to RTO timeout. + * To avoid big spurious retransmission bursts due to transient SACK + * scoreboard oddities that look like reneging, we give the receiver a + * little time (max(RTT/2, 10ms)) to send us some more ACKs that will + * restore sanity to the SACK scoreboard. If the apparent reneging + * persists until this RTO then we'll clear the SACK scoreboard. */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { if (flag & FLAG_SACK_RENEGING) { - struct inet_connection_sock *icsk = inet_csk(sk); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSACKRENEGING); + struct tcp_sock *tp = tcp_sk(sk); + unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), + msecs_to_jiffies(10)); - tcp_enter_loss(sk, 1); - icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - icsk->icsk_rto, TCP_RTO_MAX); + delay, TCP_RTO_MAX); return true; } return false; @@ -2032,10 +2048,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * available, or RTO is scheduled to fire first. */ if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || - (flag & FLAG_ECE) || !tp->srtt) + (flag & FLAG_ECE) || !tp->srtt_us) return false; - delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); + delay = max(usecs_to_jiffies(tp->srtt_us >> 5), + msecs_to_jiffies(2)); + if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; @@ -2237,7 +2255,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) break; mss = skb_shinfo(skb)->gso_size; - err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss); + err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, + mss, GFP_ATOMIC); if (err < 0) break; cnt = packets; @@ -2351,7 +2370,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; - TCP_ECN_withdraw_cwr(tp); + tcp_ecn_withdraw_cwr(tp); } } else { tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); @@ -2470,7 +2489,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) * losses and/or application stalls), do not perform any further cwnd * reductions, but instead slow start up to ssthresh. */ -static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) +static void tcp_init_cwnd_reduction(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2480,9 +2499,8 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; tp->prr_out = 0; - if (set_ssthresh) - tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); - TCP_ECN_queue_cwr(tp); + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tcp_ecn_queue_cwr(tp); } static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked, @@ -2523,14 +2541,14 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) } /* Enter CWR state. Disable cwnd undo since congestion is proven with ECN */ -void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) +void tcp_enter_cwr(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); tp->prior_ssthresh = 0; if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { tp->undo_marker = 0; - tcp_init_cwnd_reduction(sk, set_ssthresh); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); } } @@ -2559,7 +2577,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked) tp->retrans_stamp = 0; if (flag & FLAG_ECE) - tcp_enter_cwr(sk, 1); + tcp_enter_cwr(sk); if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { tcp_try_keep_open(sk); @@ -2659,13 +2677,12 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) NET_INC_STATS_BH(sock_net(sk), mib_idx); tp->prior_ssthresh = 0; - tp->undo_marker = tp->snd_una; - tp->undo_retrans = tp->retrans_out; + tcp_init_undo(tp); if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { if (!ece_ack) tp->prior_ssthresh = tcp_current_ssthresh(sk); - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); } tcp_set_ca_state(sk, TCP_CA_Recovery); } @@ -2675,18 +2692,16 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) */ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) { - struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); bool recovered = !before(tp->snd_una, tp->high_seq); if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ - if (flag & FLAG_ORIG_SACK_ACKED) { - /* Step 3.b. A timeout is spurious if not all data are - * lost, i.e., never-retransmitted data are (s)acked. - */ - tcp_try_undo_loss(sk, true); + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED)) return; - } + if (after(tp->snd_nxt, tp->high_seq) && (flag & FLAG_DATA_SACKED || is_dupack)) { tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ @@ -2702,12 +2717,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) if (recovered) { /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ - icsk->icsk_retransmits = 0; tcp_try_undo_recovery(sk); return; } - if (flag & FLAG_DATA_ACKED) - icsk->icsk_retransmits = 0; if (tcp_is_reno(tp)) { /* A Reno DUPACK means new data in F-RTO step 2.b above are * delivered. Lower inflight to clock out (re)tranmissions. @@ -2882,7 +2894,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, } static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, - s32 seq_rtt, s32 sack_rtt) + long seq_rtt_us, long sack_rtt_us) { const struct tcp_sock *tp = tcp_sk(sk); @@ -2892,10 +2904,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * is acked (RFC6298). */ if (flag & FLAG_RETRANS_DATA_ACKED) - seq_rtt = -1; + seq_rtt_us = -1L; - if (seq_rtt < 0) - seq_rtt = sack_rtt; + if (seq_rtt_us < 0) + seq_rtt_us = sack_rtt_us; /* RTTM Rule: A TSecr value received in a segment is used to * update the averaged RTT measurement only if the segment @@ -2903,14 +2915,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, * left edge of the send window. * See draft-ietf-tcplw-high-performance-00, section 3.3. */ - if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && flag & FLAG_ACKED) - seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr); - if (seq_rtt < 0) + if (seq_rtt_us < 0) return false; - tcp_rtt_estimator(sk, seq_rtt); + tcp_rtt_estimator(sk, seq_rtt_us); tcp_set_rto(sk); /* RFC6298: only reset backoff on valid RTT measurement. */ @@ -2922,22 +2934,23 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag, static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp) { struct tcp_sock *tp = tcp_sk(sk); - s32 seq_rtt = -1; + long seq_rtt_us = -1L; if (synack_stamp && !tp->total_retrans) - seq_rtt = tcp_time_stamp - synack_stamp; + seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp); /* If the ACK acks both the SYNACK and the (Fast Open'd) data packets * sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack() */ - if (!tp->srtt) - tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1); + if (!tp->srtt_us) + tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, acked, in_flight); + + icsk->icsk_ca_ops->cong_avoid(sk, ack, acked); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } @@ -2963,7 +2976,8 @@ void tcp_rearm_rto(struct sock *sk) if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); - const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; + const u32 rto_time_stamp = + tcp_skb_timestamp(skb) + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. @@ -3020,26 +3034,32 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) * arrived at the other end. */ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, - u32 prior_snd_una, s32 sack_rtt) + u32 prior_snd_una, long sack_rtt_us) { - struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - u32 now = tcp_time_stamp; + struct skb_mstamp first_ackt, last_ackt, now; + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_sacked = tp->sacked_out; + u32 reord = tp->packets_out; bool fully_acked = true; - int flag = 0; + long ca_seq_rtt_us = -1L; + long seq_rtt_us = -1L; + struct sk_buff *skb; u32 pkts_acked = 0; - u32 reord = tp->packets_out; - u32 prior_sacked = tp->sacked_out; - s32 seq_rtt = -1; - s32 ca_seq_rtt = -1; - ktime_t last_ackt = net_invalid_timestamp(); bool rtt_update; + int flag = 0; + + first_ackt.v64 = 0; while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { + struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u32 acked_pcount; u8 sacked = scb->sacked; + u32 acked_pcount; + + if (unlikely(shinfo->tx_flags & SKBTX_ACK_TSTAMP) && + between(shinfo->tskey, prior_snd_una, tp->snd_una - 1)) + __skb_tstamp_tx(skb, NULL, sk, SCM_TSTAMP_ACK); /* Determine how many packets and what bytes were acked, tso and else */ if (after(scb->end_seq, tp->snd_una)) { @@ -3061,11 +3081,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->retrans_out -= acked_pcount; flag |= FLAG_RETRANS_DATA_ACKED; } else { - ca_seq_rtt = now - scb->when; - last_ackt = skb->tstamp; - if (seq_rtt < 0) { - seq_rtt = ca_seq_rtt; - } + last_ackt = skb->skb_mstamp; + WARN_ON_ONCE(last_ackt.v64 == 0); + if (!first_ackt.v64) + first_ackt = last_ackt; + if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); if (!after(scb->end_seq, tp->high_seq)) @@ -3111,7 +3131,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) flag |= FLAG_SACK_RENEGING; - rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt); + skb_mstamp_get(&now); + if (first_ackt.v64) { + seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt); + ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt); + } + + rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us); if (flag & FLAG_ACKED) { const struct tcp_congestion_ops *ca_ops @@ -3139,25 +3165,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->fackets_out -= min(pkts_acked, tp->fackets_out); - if (ca_ops->pkts_acked) { - s32 rtt_us = -1; - - /* Is the ACK triggering packet unambiguous? */ - if (!(flag & FLAG_RETRANS_DATA_ACKED)) { - /* High resolution needed and available? */ - if (ca_ops->flags & TCP_CONG_RTT_STAMP && - !ktime_equal(last_ackt, - net_invalid_timestamp())) - rtt_us = ktime_us_delta(ktime_get_real(), - last_ackt); - else if (ca_seq_rtt >= 0) - rtt_us = jiffies_to_usecs(ca_seq_rtt); - } + if (ca_ops->pkts_acked) + ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us); - ca_ops->pkts_acked(sk, pkts_acked, rtt_us); - } - } else if (skb && rtt_update && sack_rtt >= 0 && - sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) { + } else if (skb && rtt_update && sack_rtt_us >= 0 && + sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) { /* Do not re-arm RTO if the sack RTT is measured from data sent * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. @@ -3205,9 +3217,10 @@ static void tcp_ack_probe(struct sock *sk) * This function is not for random using! */ } else { + unsigned long when = inet_csk_rto_backoff(icsk, TCP_RTO_MAX); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX), - TCP_RTO_MAX); + when, TCP_RTO_MAX); } } @@ -3348,7 +3361,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) tp->tlp_high_seq = 0; /* Don't reduce cwnd if DSACK arrives for TLP retrans. */ if (!(flag & FLAG_DSACKING_ACK)) { - tcp_init_cwnd_reduction(sk, true); + tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); tcp_try_keep_open(sk); @@ -3358,6 +3371,14 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } +static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (icsk->icsk_ca_ops->in_ack_event) + icsk->icsk_ca_ops->in_ack_event(sk, flags); +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3367,12 +3388,11 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 ack_seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; bool is_dupack = false; - u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt; u32 prior_fackets; int prior_packets = tp->packets_out; const int prior_unsacked = tp->packets_out - tp->sacked_out; int acked = 0; /* Number of packets newly acked */ - s32 sack_rtt = -1; + long sack_rtt_us = -1L; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3396,11 +3416,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) + if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; + icsk->icsk_retransmits = 0; + } prior_fackets = tp->fackets_out; - prior_in_flight = tcp_packets_in_flight(tp); /* ts_recent update must be made after we are sure that the packet * is in window. @@ -3417,10 +3438,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tp->snd_una = ack; flag |= FLAG_WIN_UPDATE; - tcp_ca_event(sk, CA_EVENT_FAST_ACK); + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { + u32 ack_ev_flags = CA_ACK_SLOWPATH; + if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3430,12 +3453,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); - if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { flag |= FLAG_ECE; + ack_ev_flags |= CA_ACK_ECE; + } - tcp_ca_event(sk, CA_EVENT_SLOW_ACK); + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + + tcp_in_ack_event(sk, ack_ev_flags); } /* We passed data and got it acked, remove any soft error @@ -3449,12 +3477,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* See if we can take anything off of the retransmit queue. */ acked = tp->packets_out; - flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt); + flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, + sack_rtt_us); acked -= tp->packets_out; /* Advance cwnd if state allows */ if (tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, acked, prior_in_flight); + tcp_cong_avoid(sk, ack, acked); if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); @@ -3472,8 +3501,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); - if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd) - tcp_update_pacing_rate(sk); + tcp_update_pacing_rate(sk); return 1; no_queue: @@ -3502,7 +3530,7 @@ old_ack: */ if (TCP_SKB_CB(skb)->sacked) { flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, - &sack_rtt); + &sack_rtt_us); tcp_fastretrans_alert(sk, acked, prior_unsacked, is_dupack, flag); } @@ -3686,7 +3714,7 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th) int opcode = *ptr++; int opsize; - switch(opcode) { + switch (opcode) { case TCPOPT_EOL: return NULL; case TCPOPT_NOP: @@ -4046,7 +4074,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) WARN_ON(before(tp->rcv_nxt, sp->end_seq)); /* Zap this SACK, by moving forward any other SACKS. */ - for (i=this_sack+1; i < num_sacks; i++) + for (i = this_sack+1; i < num_sacks; i++) tp->selective_acks[i-1] = tp->selective_acks[i]; num_sacks--; continue; @@ -4057,6 +4085,44 @@ static void tcp_sack_remove(struct tcp_sock *tp) tp->rx_opt.num_sacks = num_sacks; } +/** + * tcp_try_coalesce - try to merge skb to prior one + * @sk: socket + * @to: prior buffer + * @from: buffer to add in queue + * @fragstolen: pointer to boolean + * + * Before queueing skb @from after @to, try to merge them + * to reduce overall memory use and queue lengths, if cost is small. + * Packets in ofo or receive queues can stay a long time. + * Better try to coalesce them right now to avoid future collapses. + * Returns true if caller should free @from instead of queueing it + */ +static bool tcp_try_coalesce(struct sock *sk, + struct sk_buff *to, + struct sk_buff *from, + bool *fragstolen) +{ + int delta; + + *fragstolen = false; + + /* Its possible this segment overlaps with prior segment in queue */ + if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) + return false; + + if (!skb_try_coalesce(to, from, fragstolen, &delta)) + return false; + + atomic_add(delta, &sk->sk_rmem_alloc); + sk_mem_charge(sk, delta); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); + TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; + TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; + TCP_SKB_CB(to)->tcp_flags |= TCP_SKB_CB(from)->tcp_flags; + return true; +} + /* This one checks to see if we can put data from the * out_of_order queue into the receive_queue. */ @@ -4064,7 +4130,8 @@ static void tcp_ofo_queue(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); __u32 dsack_high = tp->rcv_nxt; - struct sk_buff *skb; + struct sk_buff *skb, *tail; + bool fragstolen, eaten; while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) @@ -4077,9 +4144,9 @@ static void tcp_ofo_queue(struct sock *sk) tcp_dsack_extend(sk, TCP_SKB_CB(skb)->seq, dsack); } + __skb_unlink(skb, &tp->out_of_order_queue); if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { SOCK_DEBUG(sk, "ofo packet was already received\n"); - __skb_unlink(skb, &tp->out_of_order_queue); __kfree_skb(skb); continue; } @@ -4087,11 +4154,15 @@ static void tcp_ofo_queue(struct sock *sk) tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); - __skb_unlink(skb, &tp->out_of_order_queue); - __skb_queue_tail(&sk->sk_receive_queue, skb); + tail = skb_peek_tail(&sk->sk_receive_queue); + eaten = tail && tcp_try_coalesce(sk, tail, skb, &fragstolen); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if (tcp_hdr(skb)->fin) + if (!eaten) + __skb_queue_tail(&sk->sk_receive_queue, skb); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); + if (eaten) + kfree_skb_partial(skb, fragstolen); } } @@ -4118,53 +4189,13 @@ static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, return 0; } -/** - * tcp_try_coalesce - try to merge skb to prior one - * @sk: socket - * @to: prior buffer - * @from: buffer to add in queue - * @fragstolen: pointer to boolean - * - * Before queueing skb @from after @to, try to merge them - * to reduce overall memory use and queue lengths, if cost is small. - * Packets in ofo or receive queues can stay a long time. - * Better try to coalesce them right now to avoid future collapses. - * Returns true if caller should free @from instead of queueing it - */ -static bool tcp_try_coalesce(struct sock *sk, - struct sk_buff *to, - struct sk_buff *from, - bool *fragstolen) -{ - int delta; - - *fragstolen = false; - - if (tcp_hdr(from)->fin) - return false; - - /* Its possible this segment overlaps with prior segment in queue */ - if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq) - return false; - - if (!skb_try_coalesce(to, from, fragstolen, &delta)) - return false; - - atomic_add(delta, &sk->sk_rmem_alloc); - sk_mem_charge(sk, delta); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPRCVCOALESCE); - TCP_SKB_CB(to)->end_seq = TCP_SKB_CB(from)->end_seq; - TCP_SKB_CB(to)->ack_seq = TCP_SKB_CB(from)->ack_seq; - return true; -} - static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb1; u32 seq, end_seq; - TCP_ECN_check_ce(tp, skb); + tcp_ecn_check_ce(tp, skb); if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); @@ -4303,24 +4334,19 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb = NULL; - struct tcphdr *th; + struct sk_buff *skb; bool fragstolen; if (size == 0) return 0; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); + skb = alloc_skb(size, sk->sk_allocation); if (!skb) goto err; - if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto err_free; - th = (struct tcphdr *)skb_put(skb, sizeof(*th)); - skb_reset_transport_header(skb); - memset(th, 0, sizeof(*th)); - if (memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size)) goto err_free; @@ -4328,7 +4354,7 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + size; TCP_SKB_CB(skb)->ack_seq = tcp_sk(sk)->snd_una - 1; - if (tcp_queue_rcv(sk, skb, sizeof(*th), &fragstolen)) { + if (tcp_queue_rcv(sk, skb, 0, &fragstolen)) { WARN_ON_ONCE(fragstolen); /* should not happen */ __kfree_skb(skb); } @@ -4342,7 +4368,6 @@ err: static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - const struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; bool fragstolen = false; @@ -4351,9 +4376,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) goto drop; skb_dst_drop(skb); - __skb_pull(skb, th->doff * 4); + __skb_pull(skb, tcp_hdr(skb)->doff * 4); - TCP_ECN_accept_cwr(tp, skb); + tcp_ecn_accept_cwr(tp, skb); tp->rx_opt.dsack = 0; @@ -4395,7 +4420,7 @@ queue_and_out: tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (skb->len) tcp_event_data_recv(sk, skb); - if (th->fin) + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) tcp_fin(sk); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -4416,7 +4441,7 @@ queue_and_out: if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } @@ -4510,7 +4535,7 @@ restart: * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && + if (!(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start))) { end_of_skbs = false; @@ -4529,30 +4554,18 @@ restart: /* Decided to skip this, advance start seq. */ start = TCP_SKB_CB(skb)->end_seq; } - if (end_of_skbs || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) + if (end_of_skbs || + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; while (before(start, end)) { + int copy = min_t(int, SKB_MAX_ORDER(0, 0), end - start); struct sk_buff *nskb; - unsigned int header = skb_headroom(skb); - int copy = SKB_MAX_ORDER(header, 0); - /* Too big header? This can happen with IPv6. */ - if (copy < 0) - return; - if (end - start < copy) - copy = end - start; - nskb = alloc_skb(copy + header, GFP_ATOMIC); + nskb = alloc_skb(copy, GFP_ATOMIC); if (!nskb) return; - skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); - skb_set_network_header(nskb, (skb_network_header(skb) - - skb->head)); - skb_set_transport_header(nskb, (skb_transport_header(skb) - - skb->head)); - skb_reserve(nskb, header); - memcpy(nskb->head, skb->head, header); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_queue_before(list, skb, nskb); @@ -4576,8 +4589,7 @@ restart: skb = tcp_collapse_one(sk, skb, list); if (!skb || skb == tail || - tcp_hdr(skb)->syn || - tcp_hdr(skb)->fin) + (TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN))) return; } } @@ -4706,28 +4718,6 @@ static int tcp_prune_queue(struct sock *sk) return -1; } -/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. - * As additional protections, we do not touch cwnd in retransmission phases, - * and if application hit its sndbuf limit recently. - */ -void tcp_cwnd_application_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open && - sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - /* Limited by application or receiver window. */ - u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; - } - tp->snd_cwnd_used = 0; - } - tp->snd_cwnd_stamp = tcp_time_stamp; -} - static bool tcp_should_expand_sndbuf(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -4917,7 +4907,7 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, const struct tcphdr *t BUG(); tp->urg_data = TCP_URG_VALID | tmp; if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); } } } @@ -4967,53 +4957,6 @@ static inline bool tcp_checksum_complete_user(struct sock *sk, __tcp_checksum_complete_user(sk, skb); } -#ifdef CONFIG_NET_DMA -static bool tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, - int hlen) -{ - struct tcp_sock *tp = tcp_sk(sk); - int chunk = skb->len - hlen; - int dma_cookie; - bool copied_early = false; - - if (tp->ucopy.wakeup) - return false; - - if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = net_dma_find_channel(); - - if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { - - dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, - skb, hlen, - tp->ucopy.iov, chunk, - tp->ucopy.pinned_list); - - if (dma_cookie < 0) - goto out; - - tp->ucopy.dma_cookie = dma_cookie; - copied_early = true; - - tp->ucopy.len -= chunk; - tp->copied_seq += chunk; - tcp_rcv_space_adjust(sk); - - if ((tp->ucopy.len == 0) || - (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || - (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } - } else if (chunk > 0) { - tp->ucopy.wakeup = 1; - sk->sk_data_ready(sk, 0); - } -out: - return copied_early; -} -#endif /* CONFIG_NET_DMA */ - /* Does PAWS and seqno based validation of an incoming segment, flags will * play significant role here. */ @@ -5193,27 +5136,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; - int copied_early = 0; bool fragstolen = false; - if (tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len) { -#ifdef CONFIG_NET_DMA - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && - tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { - copied_early = 1; - eaten = 1; - } -#endif - if (tp->ucopy.task == current && - sock_owned_by_user(sk) && !copied_early) { - __set_current_state(TASK_RUNNING); + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) - eaten = 1; - } - if (eaten) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -5229,9 +5160,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; } - if (copied_early) - tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -5268,17 +5198,11 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, goto no_ack; } - if (!copied_early || tp->rcv_nxt != tp->rcv_wup) - __tcp_ack_snd_check(sk, 0); + __tcp_ack_snd_check(sk, 0); no_ack: -#ifdef CONFIG_NET_DMA - if (copied_early) - __skb_queue_tail(&sk->sk_async_wait_queue, skb); - else -#endif if (eaten) kfree_skb_partial(skb, fragstolen); - sk->sk_data_ready(sk, 0); + sk->sk_data_ready(sk); return; } } @@ -5398,9 +5322,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, break; } tcp_rearm_rto(sk); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVEFAIL); return true; } tp->syn_data_acked = tp->syn_data; + if (tp->syn_data_acked) + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE); return false; } @@ -5466,7 +5393,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * state to ESTABLISHED..." */ - TCP_ECN_rcv_synack(tp, th); + tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_ack(sk, skb, FLAG_SLOWPATH); @@ -5585,7 +5512,7 @@ discard: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - TCP_ECN_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -5900,3 +5827,190 @@ discard: return 0; } EXPORT_SYMBOL(tcp_rcv_state_process); + +static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) +{ + struct inet_request_sock *ireq = inet_rsk(req); + + if (family == AF_INET) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"), + &ireq->ir_rmt_addr, port); +#if IS_ENABLED(CONFIG_IPV6) + else if (family == AF_INET6) + LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI6/%u\n"), + &ireq->ir_v6_rmt_addr, port); +#endif +} + +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set + * + * If we receive a SYN packet with these bits set, it means a + * network is playing bad games with TOS bits. In order to + * avoid possible false congestion notifications, we disable + * TCP ECN negociation. + * + * Exception: tcp_ca wants ECN. This is required for DCTCP + * congestion control; it requires setting ECT on all packets, + * including SYN. We inverse the test in this case: If our + * local socket wants ECN, but peer only set ece/cwr (but not + * ECT in IP header) its probably a non-DCTCP aware sender. + */ +static void tcp_ecn_create_request(struct request_sock *req, + const struct sk_buff *skb, + const struct sock *listen_sk) +{ + const struct tcphdr *th = tcp_hdr(skb); + const struct net *net = sock_net(listen_sk); + bool th_ecn = th->ece && th->cwr; + bool ect, need_ecn; + + if (!th_ecn) + return; + + ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); + need_ecn = tcp_ca_needs_ecn(listen_sk); + + if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) + inet_rsk(req)->ecn_ok = 1; + else if (ect && need_ecn) + inet_rsk(req)->ecn_ok = 1; +} + +int tcp_conn_request(struct request_sock_ops *rsk_ops, + const struct tcp_request_sock_ops *af_ops, + struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct request_sock *req; + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = NULL; + __u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn; + bool want_cookie = false, fastopen; + struct flowi fl; + struct tcp_fastopen_cookie foc = { .len = -1 }; + int err; + + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if ((sysctl_tcp_syncookies == 2 || + inet_csk_reqsk_queue_is_full(sk)) && !isn) { + want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name); + if (!want_cookie) + goto drop; + } + + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); + goto drop; + } + + req = inet_reqsk_alloc(rsk_ops); + if (!req) + goto drop; + + tcp_rsk(req)->af_specific = af_ops; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = af_ops->mss_clamp; + tmp_opt.user_mss = tp->rx_opt.user_mss; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc); + + if (want_cookie && !tmp_opt.saw_tstamp) + tcp_clear_options(&tmp_opt); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb, sk); + + af_ops->init_req(req, sk, skb); + + if (security_inet_conn_request(sk, skb, req)) + goto drop_and_free; + + if (!want_cookie || tmp_opt.tstamp_ok) + tcp_ecn_create_request(req, skb, sk); + + if (want_cookie) { + isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); + req->cookie_ts = tmp_opt.tstamp_ok; + } else if (!isn) { + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. + */ + if (tcp_death_row.sysctl_tw_recycle) { + bool strict; + + dst = af_ops->route_req(sk, &fl, req, &strict); + + if (dst && strict && + !tcp_peer_is_proven(req, dst, true, + tmp_opt.saw_tstamp)) { + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); + goto drop_and_release; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst, false, + tmp_opt.saw_tstamp)) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + pr_drop_req(req, ntohs(tcp_hdr(skb)->source), + rsk_ops->family); + goto drop_and_release; + } + + isn = af_ops->init_seq(skb); + } + if (!dst) { + dst = af_ops->route_req(sk, &fl, req, NULL); + if (!dst) + goto drop_and_free; + } + + tcp_rsk(req)->snt_isn = isn; + tcp_openreq_init_rwin(req, sk, dst); + fastopen = !want_cookie && + tcp_try_fastopen(sk, skb, req, &foc, dst); + err = af_ops->send_synack(sk, dst, &fl, req, + skb_get_queue_mapping(skb), &foc); + if (!fastopen) { + if (err || want_cookie) + goto drop_and_free; + + tcp_rsk(req)->listener = NULL; + af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); + } + + return 0; + +drop_and_release: + dst_release(dst); +drop_and_free: + reqsk_free(req); +drop: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); + return 0; +} +EXPORT_SYMBOL(tcp_conn_request); |
