1、TCP基本概念
传输控制协议(TCP,Transmission Control Protocol)是一种面向连接的、可靠的、基于字节流的传输层通信协议。TCP层位于IP层之上、应用层之下。
2、TCP连接时三次握手示意
3、TCP协议栈从上到下提供的接口
创建socket
创建TCP socket调用接口
在创建socket套接字描述符时,sys_socket内核函数会根据指定的协议(例如创建TCP套接字时的socket(PF_INET, SOCK_STREAM, IPPROTO_TCP))挂载对应的协议处理函数
250 static int inet_create(struct net *net, struct socket *sock, int protocol,int kern) 251{ ... 262 /* Look for the requested type/protocol pair. */ 263 lookup_protocol: 264 err = -ESOCKTNOSUPPORT; 265 rcu_read_lock(); // TCP套接字、UDP套接字、原始套接字的inet_protosw实 例都在inetsw_array数组中定义, //这些实例会调inet_register_protosw()注册到inetsw中 //根据protocol查找要创建的套接字对应的四层传输协议。 266 list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { 268 ... 283 } 284 //如果没有找到,则调用request_module()来尝试加载协议所属的模块,正常情况下不会发生。 285 if (unlikely(err)) { 286 if (try_loading_module < 2) { 287 rcu_read_unlock(); ... }
三次握手
结构体变量struct proto tcp_prot指定了TCP协议栈的访问接口函数
首先客户端发送SYN报文
调用tcp_v4_connect函数建立与服务器联系并发送SYN段:
tcp_v4_connect函数
140/* This will initiate an outgoing connection. */ 141int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 142{ ... 171 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr, 172 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, 173 IPPROTO_TCP, 174 orig_sport, orig_dport, sk); ... 214215 /* Socket identity is still unknown (sport may be zero). 216 * However we set state to SYN-SENT and not releasing socket 217 * lock select source port, enter ourselves into the hash tables and 218 * complete initialization after this. 219 */ 220 tcp_set_state(sk, TCP_SYN_SENT); ... 227 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport, 228 inet->inet_sport, inet->inet_dport, sk); ... 246 err = tcp_connect(sk); ... } 265EXPORT_SYMBOL(tcp_v4_connect);
此函数前面部分是确定socket的源端口,目的ip及端口。目的IP和目的端口是由connect系统调用的入参指定。tcp_connect函数用于构建并发送一个SYN请求。
tcp_connect函数
- 构造一个携带SYN标志位的TCP头,tcp_init_nondata_skb函数实现
- 发送带有SYN的TCP报文,tcp_transmit_skb函数实现
- 设置计时器超时重发,inet_csk_reset_xmit_timer函数实现
3090/* Build a SYN and send it off. */ 3091int tcp_connect(struct sock *sk) 3092{ ... 3108 /* Reserve space for headers. */ 3109 skb_reserve(buff, MAX_TCP_HEADER); 3110 3111 tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN); 3112 tp->retrans_stamp = tcp_time_stamp; 3113 tcp_connect_queue_skb(sk, buff); 3114 tcp_ecn_send_syn(sk, buff); 3115 3116 /* Send off SYN; include data in Fast Open. */ 3117 err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) : 3118 tcp_transmit_skb(sk, buff, 1, sk->sk_allocation); ... 3129 /* Timer for repeating the SYN until an answer. */ 3130 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, 3131 inet_csk(sk)->icsk_rto, TCP_RTO_MAX); ... } 3134EXPORT_SYMBOL(tcp_connect);
tcp_transmit_skb函数
__tcp_transmit_skb函数的主要任务是向ip层发送数据包,其中包括
初始化TCP协议头等数据结构
查看clone_it,判断是否要克隆Socket Buffer:因为Socket Buffer可能正被其他进程使用,这种情况下就要克隆一份
构建TCP协议选项
拥塞控制,确定网络中同时在途的数据包数量为多少最合适
构建TCP协议头主要的数据域:源端口、目的端口、数据段初始序列号,计算窗口大小,如果是SYN请求包就不需要计算窗口大小
发送数据包到IP层。关于发送过程中的状态机切换:从前面tcp_v4_connect的代码可以看到,套接字状态在调用tcp_connect发送SYN包之前,就已经通过tcp_set_state(sk, TCP_SYN_SENT)切换为SYN_SENT
// net/ipv4/tcp_output.c static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) { return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tcp_sk(sk)->rcv_nxt); }
tcp_transmit_skb是对__tcp_transmit_skb的封装,继续调用,进入__tcp_transmit_skb发送SYN报文
__tcp_transmit_skb函数
// net/ipv4/tcp_output.c static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask, u32 rcv_nxt) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; u64 prior_wstamp; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); if (clone_it) { Socket Buffer TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { if (unlikely(skb_cloned(oskb))) skb = pskb_copy(oskb, gfp_mask); else skb = skb_clone(oskb, gfp_mask); } tcp_skb_tsorted_restore(oskb); if (unlikely(!skb)) return -ENOBUFS; } prior_wstamp = tp->tcp_wstamp_ns; tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); if (unlikely(tcb->tcp_flags & TCPHDR_SYN)) tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); else tcp_options_size = tcp_established_options(sk, skb, &opts, &md5); tcp_header_size = tcp_options_size + sizeof(struct tcphdr); /* if no packet is in qdisc/device queue, then allow XPS to select * another queue. We can be called from tcp_tsq_handler() * which holds one reference to sk. * * TODO: Ideally, in-flight pure ACK packets should not matter here. * One way to get this would be to set skb->truesize = 2 on them. */ skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1); /* If we had to use memory reserve to allocate this skb, * this might cause drops if packet is looped back : * Other socket might not have SOCK_MEMALLOC. * Packets not looped back do not care about pfmemalloc. */ skb->pfmemalloc = 0; skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); skb->sk = sk; skb->destructor = skb_is_tcp_pure_ack(skb) ? 
__sock_wfree : tcp_wfree; skb_set_hash_from_sk(skb, sk); refcount_add(skb->truesize, &sk->sk_wmem_alloc); skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm); /* Build TCP header and checksum it. */ th = (struct tcphdr *)skb->data; th->source = inet->inet_sport; th->dest = inet->inet_dport; th->seq = htonl(tcb->seq); th->ack_seq = htonl(rcv_nxt); *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->tcp_flags); th->check = 0; th->urg_ptr = 0; /* The urg_mode check is necessary during a below snd_una win probe */ if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) { if (before(tp->snd_up, tcb->seq + 0x10000)) { th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) { th->urg_ptr = htons(0xFFFF); th->urg = 1; } } tcp_options_write((__be32 *)(th + 1), tp, &opts); skb_shinfo(skb)->gso_type = sk->sk_gso_type; if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) { th->window = htons(tcp_select_window(sk)); tcp_ecn_send(sk, skb, th, tcp_header_size); } else { /* RFC1323: The window in SYN & SYN/ACK segments * is never scaled. 
*/ th->window = htons(min(tp->rcv_wnd, 65535U)); } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { sk_nocaps_add(sk, NETIF_F_GSO_MASK); tp->af_specific->calc_md5_hash(opts.hash_location, md5, sk, skb); } #endif icsk->icsk_af_ops->send_check(sk, skb); if (likely(tcb->tcp_flags & TCPHDR_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); if (skb->len != tcp_header_size) { tcp_event_data_sent(tp, sk); tp->data_segs_out += tcp_skb_pcount(skb); tp->bytes_sent += skb->len - tcp_header_size; } if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb)); tp->segs_out += tcp_skb_pcount(skb); /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */ skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb); skb_shinfo(skb)->gso_size = tcp_skb_mss(skb); /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */ /* Cleanup our debris for IP stacks */ memset(skb->cb, 0, max(sizeof(struct inet_skb_parm), sizeof(struct inet6_skb_parm))); err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); if (unlikely(err > 0)) { tcp_enter_cwr(sk); err = net_xmit_eval(err); } if (!err && oskb) { tcp_update_skb_after_send(sk, oskb, prior_wstamp); tcp_rate_skb_sent(sk, oskb); } return err; }
至此,客户端TCP层完成了SYN包的发送,报文再经由下层协议栈传递到网卡。之后服务端接收客户端发来的TCP报文,并回复SYN+ACK。