tcp客户端与服务器端建立连接需要经过三次握手过程,本文主要分析客户端主动打开中的第一次握手部分,即客户端发送syn段到服务器端;
tcp_v4_connect为发起连接的主流程:首先对必要参数进行检查,然后查找路由,将连接状态改为SYN_SENT,再调用inet_hash_connect绑定端口并将控制块加入到ehash,最后调用tcp_connect发送syn;
1 /* This will initiate an outgoing connection. */ 2 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 3 { 4 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; 5 struct inet_sock *inet = inet_sk(sk); 6 struct tcp_sock *tp = tcp_sk(sk); 7 __be16 orig_sport, orig_dport; 8 __be32 daddr, nexthop; 9 struct flowi4 *fl4; 10 struct rtable *rt; 11 int err; 12 struct ip_options_rcu *inet_opt; 13 14 /* timewait控制块结构 */ 15 struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; 16 17 /* 地址长度不合法 */ 18 if (addr_len < sizeof(struct sockaddr_in)) 19 return -EINVAL; 20 21 /* 地址族不合法 */ 22 if (usin->sin_family != AF_INET) 23 return -EAFNOSUPPORT; 24 25 /* 设置下一跳和目的地址 */ 26 nexthop = daddr = usin->sin_addr.s_addr; 27 28 /* 获取ip选项 */ 29 inet_opt = rcu_dereference_protected(inet->inet_opt, 30 lockdep_sock_is_held(sk)); 31 32 /* 使用了源路由选项 */ 33 if (inet_opt && inet_opt->opt.srr) { 34 if (!daddr) 35 return -EINVAL; 36 /* 下一跳地址设置为选项中的地址 */ 37 nexthop = inet_opt->opt.faddr; 38 } 39 40 /* 获取源端口目的端口 */ 41 orig_sport = inet->inet_sport; 42 orig_dport = usin->sin_port; 43 44 /* 查找路由 */ 45 fl4 = &inet->cork.fl.u.ip4; 46 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 47 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 48 IPPROTO_TCP, 49 orig_sport, orig_dport, sk); 50 /* 查找失败 */ 51 if (IS_ERR(rt)) { 52 err = PTR_ERR(rt); 53 if (err == -ENETUNREACH) 54 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); 55 return err; 56 } 57 58 59 /* 查找成功 */ 60 61 /* 路由是组播或者广播 */ 62 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { 63 ip_rt_put(rt); 64 return -ENETUNREACH; 65 } 66 67 /* 选项为空或者未启用源路由选项 */ 68 /* 设置目的地址为路由缓存中地址 */ 69 if (!inet_opt || !inet_opt->opt.srr) 70 daddr = fl4->daddr; 71 72 /* 源地址为空 */ 73 /* 使用路由缓存中的源地址 */ 74 if (!inet->inet_saddr) 75 inet->inet_saddr = fl4->saddr; 76 /* 设置接收地址为源地址 */ 77 sk_rcv_saddr_set(sk, inet->inet_saddr); 78 79 /* 控制块中的时间戳存在&& 目的地址不是当前地址 */ 80 /* 控制块被使用过,重新初始化 */ 81 if (tp->rx_opt.ts_recent_stamp && 
inet->inet_daddr != daddr) { 82 /* Reset inherited state */ 83 tp->rx_opt.ts_recent = 0; 84 tp->rx_opt.ts_recent_stamp = 0; 85 if (likely(!tp->repair)) 86 tp->write_seq = 0; 87 } 88 89 /* 设置目的端口 */ 90 inet->inet_dport = usin->sin_port; 91 /* 设置目的地址 */ 92 sk_daddr_set(sk, daddr); 93 94 /* 获取ip选项长度 */ 95 inet_csk(sk)->icsk_ext_hdr_len = 0; 96 if (inet_opt) 97 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; 98 99 /* 设置mss */ 100 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; 101 102 /* Socket identity is still unknown (sport may be zero). 103 * However we set state to SYN-SENT and not releasing socket 104 * lock select source port, enter ourselves into the hash tables and 105 * complete initialization after this. 106 */ 107 /* 设置连接状态为TCP_SYN_SENT */ 108 tcp_set_state(sk, TCP_SYN_SENT); 109 110 /* 端口绑定,加入ehash */ 111 err = inet_hash_connect(tcp_death_row, sk); 112 if (err) 113 goto failure; 114 115 /* 设置hash值 */ 116 sk_set_txhash(sk); 117 118 /* 119 如果源端口或者目的端口发生变化, 120 重新获取路由,并更新sk的路由缓存 121 */ 122 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 123 inet->inet_sport, inet->inet_dport, sk); 124 if (IS_ERR(rt)) { 125 err = PTR_ERR(rt); 126 rt = NULL; 127 goto failure; 128 } 129 /* OK, now commit destination to socket. 
*/ 130 sk->sk_gso_type = SKB_GSO_TCPV4; 131 132 /* 存储目的路由缓存和网络设备特性到控制块 */ 133 sk_setup_caps(sk, &rt->dst); 134 rt = NULL; 135 136 if (likely(!tp->repair)) { 137 /* 获取发送序号 */ 138 if (!tp->write_seq) 139 tp->write_seq = secure_tcp_seq(inet->inet_saddr, 140 inet->inet_daddr, 141 inet->inet_sport, 142 usin->sin_port); 143 /* 时间戳偏移 */ 144 tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr, 145 inet->inet_daddr); 146 } 147 148 /* 设置ip首部的id */ 149 inet->inet_id = tp->write_seq ^ jiffies; 150 151 /* fastopen */ 152 if (tcp_fastopen_defer_connect(sk, &err)) 153 return err; 154 if (err) 155 goto failure; 156 157 /* 发送syn */ 158 err = tcp_connect(sk); 159 160 if (err) 161 goto failure; 162 163 return 0; 164 165 failure: 166 /* 167 * This unhashes the socket and releases the local port, 168 * if necessary. 169 */ 170 tcp_set_state(sk, TCP_CLOSE); 171 ip_rt_put(rt); 172 sk->sk_route_caps = 0; 173 inet->inet_dport = 0; 174 return err; 175 }
__inet_hash_connect将端口检查通过的控制块加入到ehash;函数根据是否已设置端口分别处理:若未设置端口,则需要先查找一个可用端口;函数还会调用check_established检查是否可以复用处于TIME_WAIT状态的控制块,并调用inet_ehash_nolisten将端口对应的控制块加入到ehash;
1 int __inet_hash_connect(struct inet_timewait_death_row *death_row, 2 struct sock *sk, u32 port_offset, 3 int (*check_established)(struct inet_timewait_death_row *, 4 struct sock *, __u16, struct inet_timewait_sock **)) 5 { 6 struct inet_hashinfo *hinfo = death_row->hashinfo; 7 struct inet_timewait_sock *tw = NULL; 8 struct inet_bind_hashbucket *head; 9 int port = inet_sk(sk)->inet_num; 10 struct net *net = sock_net(sk); 11 struct inet_bind_bucket *tb; 12 u32 remaining, offset; 13 int ret, i, low, high; 14 static u32 hint; 15 16 /* 存在端口 */ 17 if (port) { 18 head = &hinfo->bhash[inet_bhashfn(net, port, 19 hinfo->bhash_size)]; 20 21 /* 找到端口绑定信息 */ 22 tb = inet_csk(sk)->icsk_bind_hash; 23 spin_lock_bh(&head->lock); 24 25 /* 当前端口绑定的只有当前控制块 */ 26 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { 27 /* 将控制块加入只ehash */ 28 inet_ehash_nolisten(sk, NULL); 29 spin_unlock_bh(&head->lock); 30 return 0; 31 } 32 spin_unlock(&head->lock); 33 /* No definite answer... Walk to established hash table */ 34 /* 检查复用情况 */ 35 ret = check_established(death_row, sk, port, NULL); 36 local_bh_enable(); 37 return ret; 38 } 39 40 41 /* 没有确定端口,则随机端口 */ 42 43 inet_get_local_port_range(net, &low, &high); 44 high++; /* [32768, 60999] -> [32768, 61000[ */ 45 remaining = high - low; 46 if (likely(remaining > 1)) 47 remaining &= ~1U; 48 49 offset = (hint + port_offset) % remaining; 50 /* In first pass we try ports of @low parity. 51 * inet_csk_get_port() does the opposite choice. 52 */ 53 offset &= ~1U; 54 other_parity_scan: 55 port = low + offset; 56 57 /* 遍历端口 */ 58 for (i = 0; i < remaining; i += 2, port += 2) { 59 if (unlikely(port >= high)) 60 port -= remaining; 61 /* 保留端口 */ 62 if (inet_is_local_reserved_port(net, port)) 63 continue; 64 65 /* 找到端口对应的绑定hash桶 */ 66 head = &hinfo->bhash[inet_bhashfn(net, port, 67 hinfo->bhash_size)]; 68 spin_lock_bh(&head->lock); 69 70 /* Does not bother with rcv_saddr checks, because 71 * the established check is already unique enough. 
72 */ 73 /* 遍历绑定的链表中的节点 */ 74 inet_bind_bucket_for_each(tb, &head->chain) { 75 76 /* 找到端口相同节点 */ 77 if (net_eq(ib_net(tb), net) && tb->port == port) { 78 79 /* 设置被重用了,继续找,随机端口不能重用 */ 80 if (tb->fastreuse >= 0 || 81 tb->fastreuseport >= 0) 82 goto next_port; 83 WARN_ON(hlist_empty(&tb->owners)); 84 85 /* 检查timewait复用情况 */ 86 if (!check_established(death_row, sk, 87 port, &tw)) 88 goto ok; 89 goto next_port; 90 } 91 } 92 93 /* 遍历没有重复 */ 94 95 /* 创建该端口的绑定信息节点,加入绑定hash */ 96 tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, 97 net, head, port); 98 if (!tb) { 99 spin_unlock_bh(&head->lock); 100 return -ENOMEM; 101 } 102 103 /* 设置默认重用标记 */ 104 tb->fastreuse = -1; 105 tb->fastreuseport = -1; 106 goto ok; 107 next_port: 108 spin_unlock_bh(&head->lock); 109 cond_resched(); 110 } 111 112 /* 继续从下一半端口中找 */ 113 offset++; 114 if ((offset & 1) && remaining > 1) 115 goto other_parity_scan; 116 117 return -EADDRNOTAVAIL; 118 119 ok: 120 hint += i + 2; 121 122 /* Head lock still held and bh's disabled */ 123 124 /* 控制块加入该端口的使用者列表 */ 125 inet_bind_hash(sk, tb, port); 126 127 /* 初始化源端口,加入到ehash */ 128 if (sk_unhashed(sk)) { 129 inet_sk(sk)->inet_sport = htons(port); 130 inet_ehash_nolisten(sk, (struct sock *)tw); 131 } 132 /*有timewait控制块则从bind列表中移除 */ 133 if (tw) 134 inet_twsk_bind_unhash(tw, hinfo); 135 spin_unlock(&head->lock); 136 137 /* 调度销毁timewait控制块 */ 138 if (tw) 139 inet_twsk_deschedule_put(tw); 140 local_bh_enable(); 141 return 0; 142 }
__inet_check_established用于检查ehash中是否已存在相同四元组的控制块;若存在且处于TIME_WAIT状态,则进一步判断该控制块是否可以复用;
1 /* called with local bh disabled */ 2 static int __inet_check_established(struct inet_timewait_death_row *death_row, 3 struct sock *sk, __u16 lport, 4 struct inet_timewait_sock **twp) 5 { 6 struct inet_hashinfo *hinfo = death_row->hashinfo; 7 struct inet_sock *inet = inet_sk(sk); 8 __be32 daddr = inet->inet_rcv_saddr; 9 __be32 saddr = inet->inet_daddr; 10 int dif = sk->sk_bound_dev_if; 11 INET_ADDR_COOKIE(acookie, saddr, daddr); 12 const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport); 13 struct net *net = sock_net(sk); 14 unsigned int hash = inet_ehashfn(net, daddr, lport, 15 saddr, inet->inet_dport); 16 struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); 17 spinlock_t *lock = inet_ehash_lockp(hinfo, hash); 18 struct sock *sk2; 19 const struct hlist_nulls_node *node; 20 struct inet_timewait_sock *tw = NULL; 21 22 spin_lock(lock); 23 24 /* 遍历链表 */ 25 sk_nulls_for_each(sk2, node, &head->chain) { 26 27 /* hash不等 */ 28 if (sk2->sk_hash != hash) 29 continue; 30 31 /* 找到节点 */ 32 if (likely(INET_MATCH(sk2, net, acookie, 33 saddr, daddr, ports, dif))) { 34 /* 节点连接处于timewait状态 */ 35 if (sk2->sk_state == TCP_TIME_WAIT) { 36 tw = inet_twsk(sk2); 37 38 /* 可以复用 */ 39 if (twsk_unique(sk, sk2, twp)) 40 break; 41 } 42 43 /* 不处于tw,或者不能复用 */ 44 goto not_unique; 45 } 46 } 47 48 /* Must record num and sport now. Otherwise we will see 49 * in hash table socket with a funny identity. 50 */ 51 /* 设置端口和hash */ 52 inet->inet_num = lport; 53 inet->inet_sport = htons(lport); 54 sk->sk_hash = hash; 55 WARN_ON(!sk_unhashed(sk)); 56 57 /* 节点加入ehash */ 58 __sk_nulls_add_node_rcu(sk, &head->chain); 59 if (tw) { 60 /* 删除tw节点 */ 61 sk_nulls_del_node_init_rcu((struct sock *)tw); 62 __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED); 63 } 64 spin_unlock(lock); 65 66 /* 增加使用计数 */ 67 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 68 69 /* 设置能复用的控制块 */ 70 if (twp) { 71 *twp = tw; 72 } else if (tw) { 73 /* Silly. Should hash-dance instead... 
*/ 74 inet_twsk_deschedule_put(tw); 75 } 76 return 0; 77 78 not_unique: 79 spin_unlock(lock); 80 return -EADDRNOTAVAIL; 81 }
inet_ehash_nolisten用于将控制块加入ehash,并根据结果做不同处理;
1 /* 添加到ehash中 */ 2 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk) 3 { 4 /* 添加到ehash中 */ 5 bool ok = inet_ehash_insert(sk, osk); 6 7 if (ok) { 8 /* 成功增加计数 */ 9 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); 10 } else { 11 /* 增加孤儿数量 */ 12 percpu_counter_inc(sk->sk_prot->orphan_count); 13 /* 标识连接关闭状态 */ 14 sk->sk_state = TCP_CLOSE; 15 /* 设置销毁标记 */ 16 sock_set_flag(sk, SOCK_DEAD); 17 /* 销毁控制块 */ 18 inet_csk_destroy_sock(sk); 19 } 20 return ok; 21 }
tcp_connect用于构造syn包并发送之,发送之后需要设置syn包的重传定时器;
/*
 * tcp_connect - build a SYN and send it off.
 * @sk: socket in the process of connecting
 *
 * Returns 0 once the SYN has been queued and the retransmit timer armed
 * (or immediately in repair mode), -EHOSTUNREACH if rebuilding the header
 * fails, -ENOBUFS if no skb can be allocated, or -ECONNREFUSED if the
 * transmit reports it. Any other transmit result is not returned: the
 * retransmit timer is armed and 0 is returned.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	/* Check/rebuild the route via the address-family ops. */
	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	/* Initialise the connection-related members of the socket. */
	tcp_connect_init(sk);

	/* Repair mode: finish the connect without sending anything. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	/* Allocate the skb for the SYN. */
	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* Initialise the data-less skb as a SYN; this consumes write_seq. */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);

	/* Record the time the SYN is sent. */
	tp->retrans_stamp = tcp_time_stamp;

	/* Queue the skb on the socket. */
	tcp_connect_queue_skb(sk, buff);

	/* ECN (explicit congestion notification) setup for the SYN. */
	tcp_ecn_send_syn(sk, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}