summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/ipv4/tcp_input.c')
-rw-r--r--net/ipv4/tcp_input.c175
1 files changed, 101 insertions, 74 deletions
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 12b98e257c5f..4e3bf801a72e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -102,6 +102,7 @@ int sysctl_tcp_thin_dupack __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
int sysctl_tcp_early_retrans __read_mostly = 3;
int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_default_init_rwnd __read_mostly = TCP_INIT_CWND * 2;
#define FLAG_DATA 0x01 /* Incoming frame contained data. */
#define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */
@@ -117,6 +118,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */
#define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */
#define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */
+#define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -175,24 +177,27 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
}
}
-static void tcp_incr_quickack(struct sock *sk)
+static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
unsigned int quickacks = tcp_sk(sk)->rcv_wnd / (2 * icsk->icsk_ack.rcv_mss);
if (quickacks == 0)
quickacks = 2;
+ quickacks = min(quickacks, max_quickacks);
if (quickacks > icsk->icsk_ack.quick)
- icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
- tcp_incr_quickack(sk);
+
+ tcp_incr_quickack(sk, max_quickacks);
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -224,8 +229,10 @@ static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
-static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
switch (TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK) {
case INET_ECN_NOT_ECT:
/* Funny extension: if ECT is not set on a segment,
@@ -233,31 +240,31 @@ static void __tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
* it is probably a retransmit.
*/
if (tp->ecn_flags & TCP_ECN_SEEN)
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 2);
break;
case INET_ECN_CE:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_IS_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
/* Better not delay acks, sender can have a very low cwnd */
- tcp_enter_quickack_mode((struct sock *)tp);
+ tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
- if (tcp_ca_needs_ecn((struct sock *)tp))
- tcp_ca_event((struct sock *)tp, CA_EVENT_ECN_NO_CE);
+ if (tcp_ca_needs_ecn(sk))
+ tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_check_ce(struct tcp_sock *tp, const struct sk_buff *skb)
+static void tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb)
{
- if (tp->ecn_flags & TCP_ECN_OK)
- __tcp_ecn_check_ce(tp, skb);
+ if (tcp_sk(sk)->ecn_flags & TCP_ECN_OK)
+ __tcp_ecn_check_ce(sk, skb);
}
static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
@@ -556,8 +563,8 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
void tcp_rcv_space_adjust(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 copied;
int time;
- int copied;
time = tcp_time_stamp - tp->rcvq_space.time;
if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0)
@@ -579,12 +586,13 @@ void tcp_rcv_space_adjust(struct sock *sk)
if (sysctl_tcp_moderate_rcvbuf &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- int rcvwin, rcvmem, rcvbuf;
+ int rcvmem, rcvbuf;
+ u64 rcvwin;
/* minimal window to cope with packet losses, assuming
* steady state. Add some cushion because of small variations.
*/
- rcvwin = (copied << 1) + 16 * tp->advmss;
+ rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
/* If rate increased by 25%,
* assume slow start, rcvwin = 3 * copied
@@ -604,12 +612,13 @@ void tcp_rcv_space_adjust(struct sock *sk)
while (tcp_win_from_space(rcvmem) < tp->advmss)
rcvmem += 128;
- rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
+ do_div(rcvwin, tp->advmss);
+ rcvbuf = min_t(u64, rcvwin * rcvmem, sysctl_tcp_rmem[2]);
if (rcvbuf > sk->sk_rcvbuf) {
sk->sk_rcvbuf = rcvbuf;
/* Make the window clamp follow along. */
- tp->window_clamp = rcvwin;
+ tp->window_clamp = tcp_win_from_space(rcvbuf);
}
}
tp->rcvq_space.space = copied;
@@ -647,7 +656,7 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* The _first_ data packet received, initialize
* delayed ACK engine.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else {
int m = now - icsk->icsk_ack.lrcvtime;
@@ -663,13 +672,13 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
/* Too long gap. Apparently sender failed to
* restart window, so that we send ACKs quickly.
*/
- tcp_incr_quickack(sk);
+ tcp_incr_quickack(sk, TCP_MAX_QUICKACKS);
sk_mem_reclaim(sk);
}
}
icsk->icsk_ack.lrcvtime = now;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (skb->len >= 128)
tcp_grow_window(sk, skb);
@@ -1134,13 +1143,14 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
*/
if (pkt_len > mss) {
unsigned int new_len = (pkt_len / mss) * mss;
- if (!in_sack && new_len < pkt_len) {
+ if (!in_sack && new_len < pkt_len)
new_len += mss;
- if (new_len >= skb->len)
- return 0;
- }
pkt_len = new_len;
}
+
+ if (pkt_len >= skb->len && !in_sack)
+ return 0;
+
err = tcp_fragment(sk, skb, pkt_len, mss, GFP_ATOMIC);
if (err < 0)
return err;
@@ -2164,8 +2174,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt, oldcnt;
- int err;
+ int cnt, oldcnt, lost;
unsigned int mss;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
@@ -2205,9 +2214,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
mss = tcp_skb_mss(skb);
- err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
- mss, GFP_ATOMIC);
- if (err < 0)
+ /* If needed, chop off the prefix to mark as lost. */
+ lost = (packets - oldcnt) * mss;
+ if (lost < skb->len &&
+ tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
break;
cnt = packets;
}
@@ -2324,10 +2334,9 @@ static void DBGUNDO(struct sock *sk, const char *msg)
}
#if IS_ENABLED(CONFIG_IPV6)
else if (sk->sk_family == AF_INET6) {
- struct ipv6_pinfo *np = inet6_sk(sk);
pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n",
msg,
- &np->daddr, ntohs(inet->inet_dport),
+ &sk->sk_v6_daddr, ntohs(inet->inet_dport),
tp->snd_cwnd, tcp_left_out(tp),
tp->snd_ssthresh, tp->prior_ssthresh,
tp->packets_out);
@@ -2503,8 +2512,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
/* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */
- if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR ||
- (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) {
+ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH &&
+ (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) {
tp->snd_cwnd = tp->snd_ssthresh;
tp->snd_cwnd_stamp = tcp_time_stamp;
}
@@ -3028,8 +3037,7 @@ void tcp_rearm_rto(struct sock *sk)
/* delta may not be positive if the socket is locked
* when the retrans timer fires and is rescheduled.
*/
- if (delta > 0)
- rto = delta;
+ rto = max(delta, 1);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
TCP_RTO_MAX);
@@ -3216,11 +3224,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (tcp_is_reno(tp)) {
tcp_remove_reno_sacks(sk, pkts_acked);
+
+ /* If any of the cumulatively ACKed segments was
+ * retransmitted, non-SACK case cannot confirm that
+ * progress was due to original transmission due to
+ * lack of TCPCB_SACKED_ACKED bits even if some of
+ * the packets may have been never retransmitted.
+ */
+ if (flag & FLAG_RETRANS_DATA_ACKED)
+ flag &= ~FLAG_ORIG_SACK_ACKED;
} else {
int delta;
/* Non-retransmitted hole got filled? That's reordering */
- if (reord < prior_fackets)
+ if (reord < prior_fackets && reord <= tp->fackets_out)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
@@ -3544,7 +3561,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (before(ack, prior_snd_una)) {
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - tp->max_window)) {
- tcp_send_challenge_ack(sk, skb);
+ if (!(flag & FLAG_NO_CHALLENGE_ACK))
+ tcp_send_challenge_ack(sk, skb);
return -1;
}
goto old_ack;
@@ -3868,11 +3886,8 @@ const u8 *tcp_parse_md5sig_option(const struct tcphdr *th)
int length = (th->doff << 2) - sizeof(*th);
const u8 *ptr = (const u8 *)(th + 1);
- /* If the TCP option is too short, we can short cut */
- if (length < TCPOLEN_MD5SIG)
- return NULL;
-
- while (length > 0) {
+ /* If not enough data remaining, we can short cut */
+ while (length >= TCPOLEN_MD5SIG) {
int opcode = *ptr++;
int opsize;
@@ -4126,7 +4141,7 @@ static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq &&
before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOST);
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
if (tcp_is_sack(tp) && sysctl_tcp_dsack) {
u32 end_seq = TCP_SKB_CB(skb)->end_seq;
@@ -4354,7 +4369,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
struct sk_buff *skb1;
u32 seq, end_seq;
- tcp_ecn_check_ce(tp, skb);
+ tcp_ecn_check_ce(sk, skb);
if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
@@ -4628,7 +4643,7 @@ queue_and_out:
tcp_dsack_set(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
out_of_window:
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_schedule_ack(sk);
drop:
__kfree_skb(skb);
@@ -4639,8 +4654,6 @@ drop:
if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp)))
goto out_of_window;
- tcp_enter_quickack_mode(sk);
-
if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {
/* Partial packet, seq < rcv_next < end_seq */
SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n",
@@ -4779,6 +4792,7 @@ restart:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 range_truesize, sum_tiny = 0;
struct sk_buff *skb = skb_peek(&tp->out_of_order_queue);
struct sk_buff *head;
u32 start, end;
@@ -4788,6 +4802,7 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
+ range_truesize = skb->truesize;
head = skb;
for (;;) {
@@ -4802,15 +4817,26 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, &tp->out_of_order_queue,
- head, skb, start, end);
+ /* Do not attempt collapsing tiny skbs */
+ if (range_truesize != head->truesize ||
+ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+ tcp_collapse(sk, &tp->out_of_order_queue,
+ head, skb, start, end);
+ } else {
+ sum_tiny += range_truesize;
+ if (sum_tiny > sk->sk_rcvbuf >> 3)
+ return;
+ }
+
head = skb;
if (!skb)
break;
/* Start new segment */
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
+ range_truesize = skb->truesize;
} else {
+ range_truesize += skb->truesize;
if (before(TCP_SKB_CB(skb)->seq, start))
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
@@ -4865,6 +4891,9 @@ static int tcp_prune_queue(struct sock *sk)
else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue,
@@ -4943,7 +4972,7 @@ static void tcp_check_space(struct sock *sk)
if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) {
sock_reset_flag(sk, SOCK_QUEUE_SHRUNK);
/* pairs with tcp_poll() */
- smp_mb__after_atomic();
+ smp_mb();
if (sk->sk_socket &&
test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
tcp_new_space(sk);
@@ -5436,6 +5465,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
struct inet_connection_sock *icsk = inet_csk(sk);
tcp_set_state(sk, TCP_ESTABLISHED);
+ icsk->icsk_ack.lrcvtime = tcp_time_stamp;
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
@@ -5464,10 +5494,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
else
tp->pred_flags = 0;
- if (!sock_flag(sk, SOCK_DEAD)) {
- sk->sk_state_change(sk);
- sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
- }
}
static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
@@ -5531,6 +5557,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_cookie foc = { .len = -1 };
int saved_clamp = tp->rx_opt.mss_clamp;
+ bool fastopen_fail;
tcp_parse_options(skb, &tp->rx_opt, 0, &foc);
if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr)
@@ -5633,10 +5660,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
tcp_finish_connect(sk, skb);
- if ((tp->syn_fastopen || tp->syn_data) &&
- tcp_rcv_fastopen_synack(sk, skb, &foc))
- return -1;
+ fastopen_fail = (tp->syn_fastopen || tp->syn_data) &&
+ tcp_rcv_fastopen_synack(sk, skb, &foc);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
+ }
+ if (fastopen_fail)
+ return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
icsk->icsk_ack.pingpong) {
@@ -5648,8 +5680,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
* to stand against the temptation 8) --ANK
*/
inet_csk_schedule_ack(sk);
- icsk->icsk_ack.lrcvtime = tcp_time_stamp;
- tcp_enter_quickack_mode(sk);
+ tcp_enter_quickack_mode(sk, TCP_MAX_QUICKACKS);
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
TCP_DELACK_MAX, TCP_RTO_MAX);
@@ -5831,13 +5862,17 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
/* step 5: check the ACK field */
acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
- FLAG_UPDATE_TS_RECENT) > 0;
+ FLAG_UPDATE_TS_RECENT |
+ FLAG_NO_CHALLENGE_ACK) > 0;
+ if (!acceptable) {
+ if (sk->sk_state == TCP_SYN_RECV)
+ return 1; /* send one RST */
+ tcp_send_challenge_ack(sk, skb);
+ goto discard;
+ }
switch (sk->sk_state) {
case TCP_SYN_RECV:
- if (!acceptable)
- return 1;
-
if (!tp->srtt_us)
tcp_synack_rtt_meas(sk, req);
@@ -5906,14 +5941,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
* our SYNACK so stop the SYNACK timer.
*/
if (req) {
- /* Return RST if ack_seq is invalid.
- * Note that RFC793 only says to generate a
- * DUPACK for it but for TCP Fast Open it seems
- * better to treat this case like TCP_SYN_RECV
- * above.
- */
- if (!acceptable)
- return 1;
/* We no longer need the request sock. */
reqsk_fastopen_remove(sk, req, false);
tcp_rearm_rto(sk);
@@ -6106,7 +6133,7 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
struct inet_request_sock *ireq = inet_rsk(req);
kmemcheck_annotate_bitfield(ireq, flags);
- ireq->opt = NULL;
+ ireq->ireq_opt = NULL;
atomic64_set(&ireq->ir_cookie, 0);
ireq->ireq_state = TCP_NEW_SYN_RECV;
write_pnet(&ireq->ireq_net, sock_net(sk_listener));