From 8336886f786fdacbc19b719c1f7ea91eb70706d4 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 31 Aug 2012 12:29:12 +0000 Subject: tcp: TCP Fast Open Server - support TFO listeners This patch builds on top of the previous patch to add the support for TFO listeners. This includes - 1. allocating, properly initializing, and managing the per listener fastopen_queue structure when TFO is enabled 2. changes to the inet_csk_accept code to support TFO. E.g., the request_sock can no longer be freed upon accept(), not until 3WHS finishes 3. allowing a TCP_SYN_RECV socket to properly poll() and sendmsg() if it's a TFO socket 4. properly closing a TFO listener, and a TFO socket before 3WHS finishes 5. supporting TCP_FASTOPEN socket option 6. modifying tcp_check_req() to use to check a TFO socket as well as request_sock 7. supporting TCP's TFO cookie option 8. adding a new SYN-ACK retransmit handler to use the timer directly off the TFO socket rather than the listener socket. Note that TFO server side will not retransmit anything other than SYN-ACK until the 3WHS is completed. The patch also contains an important function "reqsk_fastopen_remove()" to manage the somewhat complex relation between a listener, its request_sock, and the corresponding child socket. See the comment above the function for the detail. Signed-off-by: H.K. Jerry Chu Cc: Yuchung Cheng Cc: Neal Cardwell Cc: Eric Dumazet Cc: Tom Herbert Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 61 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 11 deletions(-) (limited to 'net/ipv4/tcp_minisocks.c') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 6ff7f10dce9d..e965319d610b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -507,6 +507,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; TCP_ECN_openreq_child(newtp, req); + newtp->fastopen_rsk = NULL; TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } @@ -515,13 +516,18 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, EXPORT_SYMBOL(tcp_create_openreq_child); /* - * Process an incoming packet for SYN_RECV sockets represented - * as a request_sock. + * Process an incoming packet for SYN_RECV sockets represented as a + * request_sock. Normally sk is the listener socket but for TFO it + * points to the child socket. + * + * XXX (TFO) - The current impl contains a special check for ack + * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct request_sock **prev) + struct request_sock **prev, + bool fastopen) { struct tcp_options_received tmp_opt; const u8 *hash_location; @@ -530,6 +536,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; + BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN)); + tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); @@ -565,6 +573,9 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. + * + * Note that even if there is new data in the SYN packet + * they will be thrown away too. */ req->rsk_ops->rtx_syn_ack(sk, req, NULL); return NULL; @@ -622,9 +633,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * - * Invalid ACK: reset will be sent by listening socket + * Invalid ACK: reset will be sent by listening socket. + * Note that the ACK validity check for a Fast Open socket is done + * elsewhere and is checked directly against the child socket rather + * than req because user data may have been sent out. */ - if ((flg & TCP_FLAG_ACK) && + if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) return sk; @@ -637,7 +651,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, - tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) { + tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST)) req->rsk_ops->send_ack(sk, skb, req); @@ -648,7 +662,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ - if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1)) + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) req->ts_recent = tmp_opt.rcv_tsval; if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { @@ -667,10 +681,19 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. + * + * XXX (TFO) - if we ever allow "data after SYN", the + * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; + /* For Fast Open no more processing is needed (sk is the + * child socket). + */ + if (fastopen) + return sk; + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { @@ -706,11 +729,21 @@ listen_overflow: } embryonic_reset: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); - if (!(flg & TCP_FLAG_RST)) + if (!(flg & TCP_FLAG_RST)) { + /* Received a bad SYN pkt - for TFO We try not to reset + * the local connection unless it's really necessary to + * avoid becoming vulnerable to outside attack aiming at + * resetting legit local connections. + */ req->rsk_ops->send_reset(sk, skb); - - inet_csk_reqsk_queue_drop(sk, req, prev); + } else if (fastopen) { /* received a valid RST pkt */ + reqsk_fastopen_remove(sk, req, true); + tcp_reset(sk); + } + if (!fastopen) { + inet_csk_reqsk_queue_drop(sk, req, prev); + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); + } return NULL; } EXPORT_SYMBOL(tcp_check_req); @@ -719,6 +752,12 @@ EXPORT_SYMBOL(tcp_check_req); * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. + * + * For the vast majority of cases child->sk_state will be TCP_SYN_RECV + * when entering. But other states are possible due to a race condition + * where after __inet_lookup_established() fails but before the listener + * locked is obtained, other packets cause the same connection to + * be created. */ int tcp_child_process(struct sock *parent, struct sock *child, -- cgit v1.2.3 From 4308fc58dceda40fac8a8d9c05b7cfba0a6bbed3 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 19 Sep 2012 15:46:06 +0100 Subject: tcp: Document use of undefined variable. Both tcp_timewait_state_process and tcp_check_req use the same basic construct of struct tcp_options received tmp_opt; tmp_opt.saw_tstamp = 0; then call tcp_parse_options However if they are fed a frame containing a TCP_SACK then tbe code behaviour is undefined because opt_rx->sack_ok is undefined data. This ought to be documented if it is intentional. Signed-off-by: Alan Cox Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/tcp_minisocks.c') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e965319d610b..5792577b5bc5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -85,6 +85,8 @@ static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. --ANK + * + * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, @@ -522,6 +524,8 @@ EXPORT_SYMBOL(tcp_create_openreq_child); * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? + * + * We don't need to initialize tmp_opt.sack_ok as we don't use the results */ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, -- cgit v1.2.3 From 072539880156fafd86c753070e598c23e8e74586 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 22 Sep 2012 04:18:56 +0000 Subject: tcp: TCP Fast Open Server - note timestamps and retransmits for SYNACK RTT Previously, when using TCP Fast Open a server would return from tcp_check_req() before updating snt_synack based on TCP timestamp echo replies and whether or not we've retransmitted the SYNACK. The result was that (a) for TFO connections using timestamps we used an incorrect baseline SYNACK send time (tcp_time_stamp of SYNACK send instead of rcv_tsecr), and (b) for TFO connections that do not have TCP timestamps but retransmit the SYNACK we took a SYNACK RTT sample when we should not take a sample. This fix merely moves the snt_synack update logic a bit earlier in the function, so that connections using TCP Fast Open will properly do these updates when the ACK for the SYNACK arrives. Moving this snt_synack update logic means that with TCP_DEFER_ACCEPT enabled we do a few instructions of wasted work on each bare ACK, but that seems OK. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4/tcp_minisocks.c') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5792577b5bc5..27536ba16c9d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -692,6 +692,12 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + /* Got ACK for our SYNACK, so update baseline for SYNACK RTT sample. */ + if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) + tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; + else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ + tcp_rsk(req)->snt_synack = 0; + /* For Fast Open no more processing is needed (sk is the * child socket). */ @@ -705,10 +711,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } - if (tmp_opt.saw_tstamp && tmp_opt.rcv_tsecr) - tcp_rsk(req)->snt_synack = tmp_opt.rcv_tsecr; - else if (req->retrans) /* don't take RTT sample if retrans && ~TS */ - tcp_rsk(req)->snt_synack = 0; /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all -- cgit v1.2.3