假定客户端主动打开,发送syn包到服务器;服务器创建连接请求控制块并加入到队列,进入TCP_NEW_SYN_RECV状态,向客户端发送syn+ack并启动定时器,等待客户端回复三次握手的最后一个ack。
tcp_v4_rcv收到包后会判断连接状态:当状态为TCP_NEW_SYN_RECV时,期望得到对端发来的ack,以完成三次握手、正式建立连接。函数调用tcp_check_req处理该ack,成功时返回新建的子控制块,然后调用tcp_child_process做进一步处理,包括将状态更新为已连接状态、通知正在等待的应用程序等。
/*
 * tcp_v4_rcv() -- abridged excerpt showing only the branch taken when the
 * looked-up sock is a request sock in TCP_NEW_SYN_RECV state.
 *
 * The declarations of sk and refcounted, the labels lookup, discard_it and
 * discard_and_relse, and the rest of the function are in the omitted parts.
 * In this branch the segment is handed to tcp_check_req(); if that returns a
 * new child socket, the child is passed on to tcp_child_process().
 */
1 int tcp_v4_rcv(struct sk_buff *skb)
2 {
3     /* (unrelated code omitted) */
4
5     if (sk->sk_state == TCP_NEW_SYN_RECV) {
6         struct request_sock *req = inet_reqsk(sk);
7         struct sock *nsk;
8
9         /* From here on sk is the listener that the request belongs to */
10         sk = req->rsk_listener;
11         if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
12             sk_drops_add(sk, skb);
13             reqsk_put(req);
14             goto discard_it;
15         }
16
17         /* The listener is no longer in TCP_LISTEN state */
18         if (unlikely(sk->sk_state != TCP_LISTEN)) {
19             /* Drop the request from the listener's queue and release it (going by the helper's name) */
20             inet_csk_reqsk_queue_drop_and_put(sk, req);
21
22             /* Jump to the lookup label (in the omitted code); presumably the socket lookup is redone for this skb */
23             goto lookup;
24         }
25         /* We own a reference on the listener, increase it again
26          * as we might lose it too soon.
27          */
28         sock_hold(sk);
29         refcounted = true;
30
31         /* Check the segment against the request; a newly created child socket is returned on success */
32         nsk = tcp_check_req(sk, skb, req, false);
33
34         /* NULL: the segment was consumed or rejected, release req and discard */
35         if (!nsk) {
36             reqsk_put(req);
37             goto discard_and_relse;
38         }
39
40         /* The listener itself came back (no child was created): release req and continue with the omitted code below */
41         if (nsk == sk) {
42             reqsk_put(req);
43         }
44         /* A child was created: let tcp_child_process() feed the segment to it */
45         else if (tcp_child_process(sk, nsk, skb)) {
46             /* Non-zero result: send a reset */
47             tcp_v4_send_reset(nsk, skb);
48             goto discard_and_relse;
49         } else {
50             sock_put(sk);
51             return 0;
52         }
53     }
54
55     /* (unrelated code omitted) */
56 }
tcp_check_req是处理ack的核心流程。除了各种状态检查之外,最主要的是在检查通过之后:(1)调用child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req);创建子控制块。这里需要注意,子控制块的状态为TCP_SYN_RECV,这与刚收到syn时建立的控制块状态不一样,那时创建的控制块状态为TCP_NEW_SYN_RECV;(2)将请求控制块从未完成连接队列中删除,加入到已完成连接队列中。
/*
 * tcp_check_req() -- process a segment that arrived for a pending
 * connection request.
 *
 * @sk:       the listening socket; for Fast Open the in-code comment states
 *            that sk is the child socket
 * @skb:      the incoming segment
 * @req:      the request sock the segment belongs to
 * @fastopen: true when called for a Fast Open request
 *
 * Return:
 *   NULL - nothing more for the caller to do: retransmitted SYN answered,
 *          PAWS or out-of-window segment, no ACK flag, bare ACK dropped
 *          under TCP_DEFER_ACCEPT, child creation failed, or the embryonic
 *          connection was reset;
 *   sk   - non-Fast-Open segment carrying an ACK whose ack_seq is not
 *          snt_isn + 1, or a Fast Open request that passed all checks;
 *   otherwise the value returned by inet_csk_complete_hashdance() for the
 *   newly created child.
 */
1 struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
2                            struct request_sock *req,
3                            bool fastopen)
4 {
5     struct tcp_options_received tmp_opt;
6     struct sock *child;
7     const struct tcphdr *th = tcp_hdr(skb);
8     __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
9     bool paws_reject = false;
10     bool own_req;
11
12     tmp_opt.saw_tstamp = 0;
13
14     /* Header is longer than the basic tcphdr, so TCP options are present */
15     if (th->doff > (sizeof(struct tcphdr)>>2)) {
16
17         /* Parse the options */
18         tcp_parse_options(skb, &tmp_opt, 0, NULL);
19
20         /* A timestamp option was seen */
21         if (tmp_opt.saw_tstamp) {
22             tmp_opt.ts_recent = req->ts_recent;
23             if (tmp_opt.rcv_tsecr)
24                 tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off;
25             /* We do not store true stamp, but it is not required,
26              * it can be estimated (approximately)
27              * from another data.
28              */
29             /* PAWS check: estimate ts_recent_stamp, then let tcp_paws_reject() decide */
30             tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->num_timeout);
31             paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
32         }
33     }
34
35     /* Check for pure retransmitted SYN. */
36     /* i.e. the peer retransmitted its SYN: seq equals rcv_isn and SYN is the only flag of interest */
37     if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
38         flg == TCP_FLAG_SYN &&
39         !paws_reject) {
40         /*
41          * RFC793 draws (Incorrectly! It was fixed in RFC1122)
42          * this case on figure 6 and figure 8, but formal
43          * protocol description says NOTHING.
44          * To be more exact, it says that we should send ACK,
45          * because this segment (at least, if it has no data)
46          * is out of window.
47          *
48          * CONCLUSION: RFC793 (even with RFC1122) DOES NOT
49          * describe SYN-RECV state. All the description
50          * is wrong, we cannot believe to it and should
51          * rely only on common sense and implementation
52          * experience.
53          *
54          * Enforce "SYN-ACK" according to figure 8, figure 6
55          * of RFC793, fixed by RFC1122.
56          *
57          * Note that even if there is new data in the SYN packet
58          * they will be thrown away too.
59          *
60          * Reset timer after retransmitting SYNACK, similar to
61          * the idea of fast retransmit in recovery.
62          */
63         /* Rate-limit check first */
64         if (!tcp_oow_rate_limited(sock_net(sk), skb,
65                                   LINUX_MIB_TCPACKSKIPPEDSYNRECV,
66                                   &tcp_rsk(req)->last_oow_ack_time) &&
67             /* then retransmit the SYN-ACK; the body runs only if that returns 0 */
68             !inet_rtx_syn_ack(sk, req)) {
69
70             /* Compute the new expiry time and adjust the request timer */
71             unsigned long expires = jiffies;
72
73             expires += min(TCP_TIMEOUT_INIT << req->num_timeout,
74                            TCP_RTO_MAX);
75             if (!fastopen)
76                 mod_timer_pending(&req->rsk_timer, expires);
77             else
78                 req->rsk_timer.expires = expires;
79         }
80
81         /* Handled here; the caller has nothing more to do */
82         return NULL;
83     }
84
85     /* Further reproduces section "SEGMENT ARRIVES"
86        for state SYN-RECEIVED of RFC793.
87        It is broken, however, it does not work only
88        when SYNs are crossed.
89
90        You would think that SYN crossing is impossible here, since
91        we should have a SYN_SENT socket (from connect()) on our end,
92        but this is not true if the crossed SYNs were sent to both
93        ends by a malicious third party. We must defend against this,
94        and to do that we first verify the ACK (as per RFC793, page
95        36) and reset if it is invalid. Is this a true full defense?
96        To convince ourselves, let us consider a way in which the ACK
97        test can still pass in this 'malicious crossed SYNs' case.
98        Malicious sender sends identical SYNs (and thus identical sequence
99        numbers) to both A and B:
100
101         A: gets SYN, seq=7
102         B: gets SYN, seq=7
103
104        By our good fortune, both A and B select the same initial
105        send sequence number of seven :-)
106
107         A: sends SYN|ACK, seq=7, ack_seq=8
108         B: sends SYN|ACK, seq=7, ack_seq=8
109
110        So we are now A eating this SYN|ACK, ACK test passes. So
111        does sequence test, SYN is truncated, and thus we consider
112        it a bare ACK.
113
114        If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this
115        bare ACK. Otherwise, we create an established connection. Both
116        ends (listening sockets) accept the new incoming connection and try
117        to talk to each other. 8-)
118
119        Note: This case is both harmless, and rare. Possibility is about the
120        same as us discovering intelligent life on another plant tomorrow.
121
122        But generally, we should (RFC lies!) to accept ACK
123        from SYNACK both here and in tcp_rcv_state_process().
124        tcp_rcv_state_process() does not, hence, we do not too.
125
126        Note that the case is absolutely generic:
127        we cannot optimize anything here without
128        violating protocol. All the checks must be made
129        before attempt to create socket.
130      */
131
132     /* RFC793 page 36: "If the connection is in any non-synchronized state ...
133      * and the incoming segment acknowledges something not yet
134      * sent (the segment carries an unacceptable ACK) ...
135      * a reset is sent."
136      *
137      * Invalid ACK: reset will be sent by listening socket.
138      * Note that the ACK validity check for a Fast Open socket is done
139      * elsewhere and is checked directly against the child socket rather
140      * than req because user data may have been sent out.
141      */
142     /* ACK flag set but ack_seq is not snt_isn + 1: hand sk itself back to the caller (see the note above about the reset) */
143     if ((flg & TCP_FLAG_ACK) && !fastopen &&
144         (TCP_SKB_CB(skb)->ack_seq !=
145          tcp_rsk(req)->snt_isn + 1))
146         return sk;
147
148     /* Also, it would be not so bad idea to check rcv_tsecr, which
149      * is essentially ACK extension and too early or too late values
150      * should cause reset in unsynchronized states.
151      */
152
153     /* RFC793: "first check sequence number". */
154     /* Rejected by PAWS, OR the segment lies outside the receive window */
155     if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
156                                       tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
157         /* Out of window: send ACK and drop. */
158         /* The ACK is sent only if the segment is not a RST and the rate limit allows it */
159         if (!(flg & TCP_FLAG_RST) &&
160             !tcp_oow_rate_limited(sock_net(sk), skb,
161                                   LINUX_MIB_TCPACKSKIPPEDSYNRECV,
162                                   &tcp_rsk(req)->last_oow_ack_time))
163             req->rsk_ops->send_ack(sk, skb, req);
164         if (paws_reject)
165             __NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
166         return NULL;
167     }
168
169     /* In sequence, PAWS is OK. */
170
171     /* Timestamp present and seq is not after rcv_nxt: remember the peer's timestamp value */
172     if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt))
173         req->ts_recent = tmp_opt.rcv_tsval;
174
175     /* seq equals the peer's initial sequence number: clear the SYN flag from flg */
176     if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
177         /* Truncate SYN, it is out of window starting
178            at tcp_rsk(req)->rcv_isn + 1. */
179         flg &= ~TCP_FLAG_SYN;
180     }
181
182     /* RFC793: "second check the RST bit" and
183      * "fourth, check the SYN bit"
184      */
185     /*
186        RST is set, or SYN is still set after the truncation above.
187        The pure retransmitted-SYN case was already handled earlier,
188        so count a failed attempt and reset the embryonic connection.
189      */
190     if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) {
191         __TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
192         goto embryonic_reset;
193     }
194
195     /* ACK sequence verified above, just make sure ACK is
196      * set. If ACK not set, just silently drop the packet.
197      *
198      * XXX (TFO) - if we ever allow "data after SYN", the
199      * following check needs to be removed.
200      */
201
202     /* No ACK flag: drop silently (the earlier check validated ack_seq only when ACK was set) */
203     if (!(flg & TCP_FLAG_ACK))
204         return NULL;
205
206     /* For Fast Open no more processing is needed (sk is the
207      * child socket).
208      */
209     if (fastopen)
210         return sk;
211
212     /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
213     /* A bare ACK has end_seq == rcv_isn + 1; mark the request acked and drop. Presumably a later data segment completes the connection */
214     if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
215         TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
216         inet_rsk(req)->acked = 1;
217         __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
218         return NULL;
219     }
220
221     /* OK, ACK is valid, create big socket and
222      * feed this segment to it. It will repeat all
223      * the tests. THIS SEGMENT MUST MOVE SOCKET TO
224      * ESTABLISHED STATE. If it will be dropped after
225      * socket is created, wait for troubles.
226      */
227     /* ACK is valid: create the child through the address-family syn_recv_sock hook */
228     child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
229                                                      req, &own_req);
230     /* Child creation failed */
231     if (!child)
232         goto listen_overflow;
233
234     sock_rps_save_rxhash(child, skb);
235     /* Take the SYN-ACK round-trip measurement (going by the helper's name) */
236     tcp_synack_rtt_meas(child, req);
237     /* Presumably moves req from the pending queue to the accept queue; the helper's body is not shown here */
238     return inet_csk_complete_hashdance(sk, child, req, own_req);
239
240 listen_overflow:
241     /* Unless sysctl_tcp_abort_on_overflow is set, only mark the request acked and drop; otherwise fall through to the reset path */
242     if (!sysctl_tcp_abort_on_overflow) {
243         inet_rsk(req)->acked = 1;
244         return NULL;
245     }
246
247 embryonic_reset:
248
249     /* The segment is not itself a RST: answer it with a reset */
250     if (!(flg & TCP_FLAG_RST)) {
251         /* Received a bad SYN pkt - for TFO We try not to reset
252          * the local connection unless it's really necessary to
253          * avoid becoming vulnerable to outside attack aiming at
254          * resetting legit local connections.
255          */
256         req->rsk_ops->send_reset(sk, skb);
257     } else if (fastopen) { /* received a valid RST pkt */
258         reqsk_fastopen_remove(sk, req, true);
259         tcp_reset(sk);
260     }
261
262     /* Non-Fast-Open: drop req from the listener's request queue and count the embryonic reset */
263     if (!fastopen) {
264         inet_csk_reqsk_queue_drop(sk, req);
265         __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS);
266     }
267     return NULL;
268 }
tcp_child_process对新控制块进行进一步处理:在控制块未被用户进程锁定的情况下,调用tcp_rcv_state_process进行相关初始化,并将连接状态更新为TCP_ESTABLISHED已连接状态,之后通知等待进程;如果控制块被用户进程锁住,则将该数据包加入到控制块的后备队列中,延后处理。
/*
 * tcp_child_process() -- hand the segment to a child socket.
 *
 * @parent: the socket whose sk_data_ready() is called once the child leaves
 *          TCP_SYN_RECV
 * @child:  the child socket; its bh lock is released and one reference is
 *          dropped before returning
 * @skb:    the segment to process or to queue on the child's backlog
 *
 * Return: the result of tcp_rcv_state_process() when the child is not owned
 * by the user, 0 when the segment was put on the backlog instead.
 */
1 /*
2  * Queue segment on the new socket if the new socket is active,
3  * otherwise we just shortcircuit this and continue with
4  * the new socket.
5  *
6  * For the vast majority of cases child->sk_state will be TCP_SYN_RECV
7  * when entering. But other states are possible due to a race condition
8  * where after __inet_lookup_established() fails but before the listener
9  * locked is obtained, other packets cause the same connection to
10  * be created.
11  */
12
13 int tcp_child_process(struct sock *parent, struct sock *child,
14                       struct sk_buff *skb)
15 {
16     int ret = 0;
17     int state = child->sk_state;
18
19     /* record NAPI ID of child */
20     sk_mark_napi_id(child, skb);
21
22     /* Account this skb's segments on the child (going by the helper's name) */
23     tcp_segs_in(tcp_sk(child), skb);
24
25     /* The child is not owned (locked) by the user */
26     if (!sock_owned_by_user(child)) {
27
28         /* Run the segment through the child's state processing */
29         ret = tcp_rcv_state_process(child, skb);
30         /* Wakeup parent, send SIGIO */
31         /* Done only when the child entered in TCP_SYN_RECV and has now left that state */
32         if (state == TCP_SYN_RECV && child->sk_state != state)
33             parent->sk_data_ready(parent);
34     }
35     /* Owned by the user: put the skb on the child's backlog for later processing */
36     else {
37         /* Alas, it is possible again, because we do lookup
38          * in main socket hash table and lock on listening
39          * socket does not protect us more.
40          */
41         __sk_add_backlog(child, skb);
42     }
43
44     bh_unlock_sock(child);
45     sock_put(child);
46     return ret;
47 }
tcp_rcv_state_process对TCP_SYN_RECV状态的处理,主要是完成连接建立之前的必要初始化,将连接状态更新为TCP_ESTABLISHED,通知进程可以写入数据,并判断和标记快速路径/慢速路径等;函数前后的公共流程这里没有给出。
/*
 * tcp_rcv_state_process() -- abridged excerpt showing only the ACK check and
 * the TCP_SYN_RECV case.
 *
 * The declarations of tp, icsk, req, th and acceptable, the common code
 * before and after the switch, and the end of the function are not part of
 * this listing. In the part shown, an unacceptable ACK makes the function
 * return 1; otherwise the socket is moved to TCP_ESTABLISHED and its
 * send-window fields are initialised from the incoming segment.
 */
1 int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
2 {
3     /* step 5: check the ACK field */
4     acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH |
5                                   FLAG_UPDATE_TS_RECENT) > 0;
6
7     switch (sk->sk_state) {
8     case TCP_SYN_RECV:
9
10         /* The ACK was not acceptable */
11         if (!acceptable)
12             return 1;
13
14         /* No smoothed RTT yet: take the SYN-ACK RTT measurement */
15         if (!tp->srtt_us)
16             tcp_synack_rtt_meas(sk, req);
17
18         /* Once we leave TCP_SYN_RECV, we no longer need req
19          * so release it.
20          */
21         if (req) {
22             inet_csk(sk)->icsk_retransmits = 0;
23             reqsk_fastopen_remove(sk, req, false);
24         } else {
25             /* Make sure socket is routed, for correct metrics. */
26             /* Done through the address-family rebuild_header hook */
27             icsk->icsk_af_ops->rebuild_header(sk);
28             /* Initialise congestion control */
29             tcp_init_congestion_control(sk);
30             /* Initialise path MTU discovery */
31             tcp_mtup_init(sk);
32             /* Start the user's read position (copied_seq) at rcv_nxt */
33             tp->copied_seq = tp->rcv_nxt;
34             /* Set up buffer space (going by the helper's name) */
35             tcp_init_buffer_space(sk);
36         }
37         smp_mb();
38
39         /* Move the connection to the established state */
40         tcp_set_state(sk, TCP_ESTABLISHED);
41         sk->sk_state_change(sk);
42
43         /* Note, that this wakeup is only for marginal crossed SYN case.
44          * Passively open sockets are not waked up, because
45          * sk->sk_sleep == NULL and sk->sk_socket == NULL.
46          */
47         /* A socket is attached: send an async POLL_OUT wakeup */
48         if (sk->sk_socket)
49             sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);
50
51         /* Initialise the send-window fields from this segment */
52         tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
53         tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
54         tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
55
56         /* Timestamps in use: reduce advmss by the aligned timestamp option length */
57         if (tp->rx_opt.tstamp_ok)
58             tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;
59
60         if (req) {
61             /* Re-arm the timer because data may have been sent out.
62              * This is similar to the regular data transmission case
63              * when new data has just been ack'ed.
64              *
65              * (TFO) - we could try to be more aggressive and
66              * retransmitting any data sooner based on when they
67              * are sent out.
68              */
69             tcp_rearm_rto(sk);
70         }
71         /* No req: initialise the socket's metrics (going by the helper's name) */
72         else
73             tcp_init_metrics(sk);
74
75         if (!inet_csk(sk)->icsk_ca_ops->cong_control)
76             tcp_update_pacing_rate(sk);
77
78         /* Prevent spurious tcp_cwnd_restart() on first data packet */
79         tp->lsndtime = tcp_time_stamp;
80
81         /* Initialise rcv_mss */
82         tcp_initialize_rcv_mss(sk);
83
84         /* Turn the fast path on (going by the helper's name) */
85         tcp_fast_path_on(tp);
86         break;
87     }