zoukankan      html  css  js  c++  java
  • 深入理解TCP协议及其源代码——TCP三次握手中客户端TCP层SYN包的发送

    1、TCP基本概念

    传输控制协议TCP是一种面向连接的、可靠的、基于字节流的运输层通信协议。TCP层是位于IP层之上,应用层之下的传输层。

    2、TCP连接时三次握手示意

    3、TCP协议栈从上到下提供的接口

    创建socket

    创建TCP socket调用接口

     在创建socket套接字描述符时,sys_socket内核函数会根据指定的协议(例如socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP))挂载对应的协议处理函数

    250 static int inet_create(struct net *net, struct socket *sock, int protocol,int kern)
    251{
    ...
    262     /* Look for the requested type/protocol pair. */
    263     lookup_protocol:
    264     err = -ESOCKTNOSUPPORT;
    265     rcu_read_lock();
    
               // TCP套接字、UDP套接字、原始套接字的inet_protosw实例都在inetsw_array数组中定义,
               //这些实例会调inet_register_protosw()注册到inetsw中
              //根据protocol查找要创建的套接字对应的四层传输协议。
    266     list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
    268           ...
    283     }
    284
               //如果没有找到,则调用request_module()来尝试加载协议所属的模块,正常情况下不会发生。
    285     if (unlikely(err)) {
    286             if (try_loading_module < 2) {
    287                     rcu_read_unlock();
    ...
    }
    

      三次握手

     结构体变量struct proto tcp_prot指定了TCP协议栈的访问接口函数

     首先客户端发送SYN报文

    调用tcp_v4_connect函数建立与服务器联系并发送SYN段:

    tcp_v4_connect函数

    140/* This will initiate an outgoing connection. */
    141int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
    142{
    ...
    171    rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
    172                          RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
    173                          IPPROTO_TCP,
    174                          orig_sport, orig_dport, sk);
    ...
    215    /* Socket identity is still unknown (sport may be zero).
    216     * However we set state to SYN-SENT and not releasing socket
    217     * lock select source port, enter ourselves into the hash tables and
    218     * complete initialization after this.
    219     */
    220   tcp_set_state(sk, TCP_SYN_SENT);
    ...
    227    rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
    228                           inet->inet_sport, inet->inet_dport, sk);
    ...
    246    err = tcp_connect(sk);
    ...
    }
    265EXPORT_SYMBOL(tcp_v4_connect);

    此函数前面部分是确定socket的源端口,目的ip及端口。目的IP和目的端口是由connect系统调用的入参指定。tcp_connect函数用于构建并发送一个SYN请求。

    tcp_connect函数

    • 构造一个携带SYN标志位的TCP头,tcp_init_nondata_skb函数实现
    • 发送带有SYN的TCP报文,tcp_transmit_skb函数实现
    • 设置计时器超时重发,inet_csk_reset_xmit_timer函数实现

    3090/* Build a SYN and send it off. */
    3091int tcp_connect(struct sock *sk)
    3092{
    ...
    3108       /* Reserve space for headers. */
    3109       skb_reserve(buff, MAX_TCP_HEADER);
    3110
    3111       tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
    3112       tp->retrans_stamp = tcp_time_stamp;
    3113       tcp_connect_queue_skb(sk, buff);
    3114       tcp_ecn_send_syn(sk, buff);
    3115
    3116       /* Send off SYN; include data in Fast Open. */
    3117       err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
    3118             tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
    ...
    3129       /* Timer for repeating the SYN until an answer. */
    3130      inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
    3131                                 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
    ...
    }
    3134EXPORT_SYMBOL(tcp_connect);

    tcp_transmit_skb函数

    __tcp_transmit_skb函数的主要任务是向ip层发送数据包,其中包括

    初始化TCP协议头等数据结构

    根据clone_it判断是否需要克隆Socket Buffer:由于Socket Buffer可能正被其他进程使用,此时就要克隆一份

    构建TCP协议选项

    拥塞控制,确定网络上有多少数据包最好

    构建TCP协议头主要的数据域:源端口、目的端口、数据段初始序列号,计算窗口大小;如果是SYN请求包,则不调用tcp_select_window计算窗口,而是直接使用未经缩放的rcv_wnd(上限为65535)

    发送数据包到IP层。注意状态机的切换并不在本函数中完成:在tcp_v4_connect中,调用tcp_connect发送SYN包之前,就已经通过tcp_set_state(sk, TCP_SYN_SENT)将状态置为SYN_SENT

    // net/ipv4/tcp_output.c
    /*
     * Thin wrapper around __tcp_transmit_skb(): forwards every argument
     * unchanged and supplies the socket's current rcv_nxt as the final
     * (rcv_nxt) argument. Returns whatever __tcp_transmit_skb() returns.
     */
    static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
                    gfp_t gfp_mask)
    {
        const struct tcp_sock *tp = tcp_sk(sk);

        return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, tp->rcv_nxt);
    }

    tcp_transmit_skb是对__tcp_transmit_skb的封装,继续调用,进入__tcp_transmit_skb发送SYN报文

    __tcp_transmit_skb函数

    // net/ipv4/tcp_output.c
    /*
     * Build the TCP header in front of @skb's payload and pass the segment to
     * the network layer through icsk->icsk_af_ops->queue_xmit().
     *
     * @sk:       socket the segment belongs to
     * @skb:      segment to send; its control block (TCP_SKB_CB) supplies the
     *            sequence numbers and TCP flags
     * @clone_it: when non-zero, a clone (or a private copy, if @skb is already
     *            cloned) is transmitted and the original is kept as oskb for
     *            the post-send bookkeeping at the end of the function
     * @gfp_mask: allocation flags handed to pskb_copy()/skb_clone()
     * @rcv_nxt:  value written into the header's ack_seq field
     *
     * Returns -ENOBUFS when the clone/copy cannot be allocated. Otherwise
     * returns the result of queue_xmit(); a positive result first triggers
     * tcp_enter_cwr() and is then mapped through net_xmit_eval().
     */
    static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
                      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
    {
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct inet_sock *inet;
        struct tcp_sock *tp;
        struct tcp_skb_cb *tcb;
        struct tcp_out_options opts;
        unsigned int tcp_options_size, tcp_header_size;
        struct sk_buff *oskb = NULL;
        struct tcp_md5sig_key *md5;
        struct tcphdr *th;
        u64 prior_wstamp;
        int err;

        BUG_ON(!skb || !tcp_skb_pcount(skb));
        tp = tcp_sk(sk);

        if (clone_it) {
            /* Clone the socket buffer: record the in-flight byte count on
             * the original, remember it as oskb, and send a clone (or a
             * private copy when the original is already cloned).
             */
            TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
                - tp->snd_una;
            oskb = skb;

            tcp_skb_tsorted_save(oskb) {
                if (unlikely(skb_cloned(oskb)))
                    skb = pskb_copy(oskb, gfp_mask);
                else
                    skb = skb_clone(oskb, gfp_mask);
            } tcp_skb_tsorted_restore(oskb);

            if (unlikely(!skb))
                return -ENOBUFS;
        }

        /* Keep the previous write timestamp for the post-send update below,
         * then advance it so that it never falls behind the cached clock.
         */
        prior_wstamp = tp->tcp_wstamp_ns;
        tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);

        skb->skb_mstamp_ns = tp->tcp_wstamp_ns;

        inet = inet_sk(sk);
        tcb = TCP_SKB_CB(skb);
        memset(&opts, 0, sizeof(opts));

        /* SYN segments get their options from tcp_syn_options(); every other
         * segment uses tcp_established_options().
         */
        if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
            tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
        else
            tcp_options_size = tcp_established_options(sk, skb, &opts,
                                   &md5);
        tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

        /* if no packet is in qdisc/device queue, then allow XPS to select
         * another queue. We can be called from tcp_tsq_handler()
         * which holds one reference to sk.
         *
         * TODO: Ideally, in-flight pure ACK packets should not matter here.
         * One way to get this would be to set skb->truesize = 2 on them.
         */
        skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

        /* If we had to use memory reserve to allocate this skb,
         * this might cause drops if packet is looped back :
         * Other socket might not have SOCK_MEMALLOC.
         * Packets not looped back do not care about pfmemalloc.
         */
        skb->pfmemalloc = 0;

        /* Make room for the TCP header (fixed part plus options) in front of
         * the payload and mark that position as the transport header.
         */
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);

        /* Re-attach the skb to this socket and charge its truesize to the
         * socket's sk_wmem_alloc counter.
         */
        skb_orphan(skb);
        skb->sk = sk;
        skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
        skb_set_hash_from_sk(skb, sk);
        refcount_add(skb->truesize, &sk->sk_wmem_alloc);

        skb_set_dst_pending_confirm(skb, sk->sk_dst_pending_confirm);

        /* Build TCP header and checksum it. */
        th = (struct tcphdr *)skb->data;
        th->source      = inet->inet_sport;
        th->dest        = inet->inet_dport;
        th->seq         = htonl(tcb->seq);
        th->ack_seq     = htonl(rcv_nxt);
        /* 16-bit word at byte offset 12 of the header: the header length in
         * 32-bit words goes into the top 4 bits, the TCP flags into the rest.
         */
        *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
                        tcb->tcp_flags);

        th->check       = 0;
        th->urg_ptr     = 0;

        /* The urg_mode check is necessary during a below snd_una win probe */
        if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
            if (before(tp->snd_up, tcb->seq + 0x10000)) {
                th->urg_ptr = htons(tp->snd_up - tcb->seq);
                th->urg = 1;
            } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
                th->urg_ptr = htons(0xFFFF);
                th->urg = 1;
            }
        }

        /* Options are written immediately after the fixed header. */
        tcp_options_write((__be32 *)(th + 1), tp, &opts);
        skb_shinfo(skb)->gso_type = sk->sk_gso_type;
        if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
            th->window      = htons(tcp_select_window(sk));
            tcp_ecn_send(sk, skb, th, tcp_header_size);
        } else {
            /* RFC1323: The window in SYN & SYN/ACK segments
             * is never scaled.
             */
            th->window  = htons(min(tp->rcv_wnd, 65535U));
        }
    #ifdef CONFIG_TCP_MD5SIG
        /* Calculate the MD5 hash, as we have all we need now */
        if (md5) {
            sk_nocaps_add(sk, NETIF_F_GSO_MASK);
            tp->af_specific->calc_md5_hash(opts.hash_location,
                               md5, sk, skb);
        }
    #endif

        icsk->icsk_af_ops->send_check(sk, skb);

        if (likely(tcb->tcp_flags & TCPHDR_ACK))
            tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

        /* skb->len larger than the header means the segment carries payload. */
        if (skb->len != tcp_header_size) {
            tcp_event_data_sent(tp, sk);
            tp->data_segs_out += tcp_skb_pcount(skb);
            tp->bytes_sent += skb->len - tcp_header_size;
        }

        if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
            TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
                      tcp_skb_pcount(skb));

        tp->segs_out += tcp_skb_pcount(skb);
        /* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
        skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
        skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

        /* Leave earliest departure time in skb->tstamp (skb->skb_mstamp_ns) */

        /* Cleanup our debris for IP stacks */
        memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
                       sizeof(struct inet6_skb_parm)));

        err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);
        /* A positive result makes the socket enter CWR; the code is then
         * mapped through net_xmit_eval() before being returned.
         */
        if (unlikely(err > 0)) {
            tcp_enter_cwr(sk);
            err = net_xmit_eval(err);
        }
        /* Only when the send succeeded and a clone was transmitted: update
         * the bookkeeping on the original skb.
         */
        if (!err && oskb) {
            tcp_update_skb_after_send(sk, oskb, prior_wstamp);
            tcp_rate_skb_sent(sk, oskb);
        }
        return err;
    }

    至此,客户端TCP层完成了SYN包的发送,报文再经过下层传输到网卡。之后服务端接收客户端发来的TCP报文,并回复SYN+ACK。

     
  • 相关阅读:
    [LeetCode] Course Schedule
    [Algorithms] Topological Sort
    [Algorithms] Graph Traversal (BFS and DFS)
    [LeetCode] One Edit Distance
    [LeetCode] Summary Ranges
    [LeetCode] Missing Ranges
    [LeetCode] Fraction to Recurring Decimal
    17.Docker之使用dockerfile创建jdk镜像
    16.Docker之使用dockerfile创建nginx镜像
    7.Docker之dockerfile指令简介
  • 原文地址:https://www.cnblogs.com/zzydexiaowu/p/12103552.html
Copyright © 2011-2022 走看看