zoukankan      html  css  js  c++  java
  • TCP/IP源码学习(52)——TCP的连接过程的实现(1)

    http://blog.chinaunix.net/uid-23629988-id-3178006.html


    作者:gfree.wind@gmail.com
    博客:blog.focus-linux.net   linuxfocus.blog.chinaunix.net
     
     
    本文的copyleft归gfree.wind@gmail.com所有,使用GPL发布,可以自由拷贝,转载。但转载请保持文档的完整性,注明原作者及原链接,严禁用于任何商业用途。
    ======================================================================================================
    在以前的文章中,学习了UDP数据包的接收和发送。今天开始研究一下TCP数据包的接收。与UDP数据包类似,当IP数据包到达ip_local_deliver_finish函数时,根据四层协议从inet_protos数组中得到TCP协议对应的tcp_protocol。
        /*
         * Protocol descriptor for TCP over IPv4.
         *
         * .handler (tcp_v4_rcv) is the receive entry point for TCP segments.
         * The remaining hooks are grouped below by their name prefixes; their
         * exact contracts are defined by struct net_protocol, which is not
         * shown here.
         */
        static const struct net_protocol tcp_protocol = {
            /* flags, both enabled */
            .netns_ok        = 1,
            .no_policy       = 1,

            /* receive and error entry points */
            .handler         = tcp_v4_rcv,
            .err_handler     = tcp_v4_err,

            /* GSO hooks */
            .gso_send_check  = tcp_v4_gso_send_check,
            .gso_segment     = tcp_tso_segment,

            /* GRO hooks */
            .gro_receive     = tcp4_gro_receive,
            .gro_complete    = tcp4_gro_complete,
        };

    那么TCP数据包的接收函数入口即为tcp_v4_rcv
        /*
         * tcp_v4_rcv - receive entry point for IPv4 TCP segments.
         *
         * Checks the packet type, header length and checksum, fills in the
         * skb's TCP control block, looks up the owning socket, and then either
         * processes the segment (tcp_prequeue() / tcp_v4_do_rcv()) or queues
         * it on the socket backlog.
         *
         * NOTE: this listing is an excerpt. The tail of the function, including
         * every label targeted by the gotos below, is elided.
         */
        int tcp_v4_rcv(struct sk_buff *skb)
        {
            const struct iphdr *iph;
            const struct tcphdr *th;
            struct sock *sk;
            int ret;
            struct net *net = dev_net(skb->dev);
    
         
            /* Only packets addressed to this host (PACKET_HOST) are accepted. */
    
            if (skb->pkt_type != PACKET_HOST)
                goto discard_it;
    
            /* Count it even if it's bad */
            TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
    
         
            /* Drop unless at least sizeof(struct tcphdr) bytes can be pulled. */
    
            if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
                goto discard_it;
    
            th = tcp_hdr(skb);
    
    
            /*
             * Validate the TCP header: doff must cover at least the fixed
             * header, and the whole header (doff * 4 bytes) must be pullable.
             */
    
            if (th->doff < sizeof(struct tcphdr) / 4)
                goto bad_packet;
            if (!pskb_may_pull(skb, th->doff * 4))
                goto discard_it;
    
            /* An explanation is required here, I think.
             * Packet length and doff are validated by header prediction,
             * provided case of th->doff==0 is eliminated.
             * So, we defer the checks. */
            if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
                goto bad_packet;
    
            /*
             * Record seq, end_seq, ack_seq, etc. in the skb's TCP control
             * block (TCP_SKB_CB(skb)). The header pointers are re-read first.
             */
    
            th = tcp_hdr(skb);
            iph = ip_hdr(skb);
            TCP_SKB_CB(skb)->seq = ntohl(th->seq);
            TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
                         skb->len - th->doff * 4);
            TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
            TCP_SKB_CB(skb)->when     = 0;
            TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
            TCP_SKB_CB(skb)->sacked     = 0;
    
         
            /*
             * Look up the socket for this segment in tcp_hashinfo using the
             * skb and the source/destination ports.
             * Per the article (not visible in this excerpt): the lookup keys on
             * source IP, destination IP, both ports and the receiving
             * interface, and consults two hash tables -- one for established
             * TCP sessions and one for listening ones. These are analysed in a
             * later article.
             */
    
            sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
            if (!sk)
                goto no_tcp_socket;
    
        process:
            /* TIME_WAIT handling is left for a later article. */
            if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;
    
            /* Drop packets whose TTL is below the socket's min_ttl. */
            if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
                NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
                goto discard_and_relse;
            }
    
            /* IPsec (xfrm) inbound policy check. */
    
            if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
            nf_reset(skb);
    
         
            /* Socket filter: a non-zero result drops the packet. */
    
            if (sk_filter(sk, skb))
                goto discard_and_relse;
    
            skb->dev = NULL;
    
            bh_lock_sock_nested(sk);
            ret = 0;
    
            /*
             * If the socket is not currently owned by a user context
             * (sock_owned_by_user() is false), the skb is processed now;
             * otherwise it is appended to the socket's backlog.
             * Per the article: this differs from UDP because TCP carries
             * internal state. Processing a second segment in the middle of
             * handling the first could change that state and make the
             * interrupted processing fail, so handling of one segment must not
             * be interrupted.
             */
    
            if (!sock_owned_by_user(sk)) {
        #ifdef CONFIG_NET_DMA
                struct tcp_sock *tp = tcp_sk(sk);
                if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
                    tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
                if (tp->ucopy.dma_chan)
                    ret = tcp_v4_do_rcv(sk, skb);
                else
        #endif
                {
                    /* Fall back to tcp_v4_do_rcv() when tcp_prequeue() returns 0. */
                    if (!tcp_prequeue(sk, skb))
                        ret = tcp_v4_do_rcv(sk, skb);
                }
            } else if (unlikely(sk_add_backlog(sk, skb))) {
                /* Backlog insertion failed: unlock, count the drop, discard. */
                bh_unlock_sock(sk);
                NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
                goto discard_and_relse;
            }
    
            ...... ......

    进入tcp_v4_do_rcv

        /*
         * tcp_v4_do_rcv - process one received segment for socket @sk.
         *
         * Dispatches on sk->sk_state: established sockets go to
         * tcp_rcv_established(), listening sockets first go through
         * tcp_v4_hnd_req(), and everything else (including a listener whose
         * tcp_v4_hnd_req() returned the listener itself) goes to
         * tcp_rcv_state_process().
         *
         * Returns 0 on the paths shown. The reset, discard and csum_err labels
         * are elided from this excerpt.
         */
        int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
        {
            struct sock *rsk;
        #ifdef CONFIG_TCP_MD5SIG
            /*
             * We really want to reject the packet as early as possible
             * if:
             * o We're expecting an MD5'd packet and this is no MD5 tcp option
             * o There is an MD5 option and we're not expecting one
             */
            if (tcp_v4_inbound_md5_hash(sk, skb))
                goto discard;
        #endif
    
            if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
                /* Connection already established; left for a later article. */
                sock_rps_save_rxhash(sk, skb);
                if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
                    rsk = sk;
                    goto reset;
                }
                return 0;
            }
    
         
    
            /* Reject segments shorter than their TCP header or failing the checksum. */
            if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
                goto csum_err;
    
            if (sk->sk_state == TCP_LISTEN) {
                /*
                 * Per the article: tcp_v4_hnd_req() handles a TCP connection
                 * request, i.e. a segment asking to connect to a local TCP
                 * port, and returns the socket that should process this skb.
                 * For the first SYN the returned nsk is sk itself.
                 */
                struct sock *nsk = tcp_v4_hnd_req(sk, skb);
                if (!nsk)
                    goto discard;
    
    
                /* For the first SYN nsk == sk, so this branch is skipped and
                 * execution continues below. */
    
                if (nsk != sk) {
                    sock_rps_save_rxhash(nsk, skb);
                    if (tcp_child_process(sk, nsk, skb)) {
                        rsk = nsk;
                        goto reset;
                    }
                    return 0;
                }
            } else
                sock_rps_save_rxhash(sk, skb);
    
    
            /* A non-zero result from the state machine leads to the reset path. */
            if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
                rsk = sk;
                goto reset;
            }
            return 0;
    
            ...... ...... 
        }

    进入tcp_rcv_state_process
        /*
         * tcp_rcv_state_process - per-state handling of a received segment.
         *
         * Only the TCP_CLOSE and TCP_LISTEN cases are shown; the rest of the
         * switch and the discard label are elided from this excerpt.
         *
         * In TCP_LISTEN: returns 1 if the segment carries ACK or if the
         * conn_request() hook returns a negative value; returns 0 after a SYN
         * has been handed to conn_request() and the skb freed.
         */
        int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
                     const struct tcphdr *th, unsigned int len)
        {
            struct tcp_sock *tp = tcp_sk(sk);
            struct inet_connection_sock *icsk = inet_csk(sk);
            int queued = 0;
            int res;
    
            tp->rx_opt.saw_tstamp = 0;
    
            switch (sk->sk_state) {
            case TCP_CLOSE:
                goto discard;
    
            case TCP_LISTEN:
                /* Focus of this article: the first SYN ends up here. */
                 
                /* A segment with ACK set is not valid in LISTEN, which only
                 * handles SYNs. */
                if (th->ack)
                    return 1;
    
                if (th->rst)
                    goto discard;
    
                if (th->syn) {
                    /* First SYN: pass it to the address-family specific
                     * conn_request hook. */
                    if (icsk->icsk_af_ops->conn_request(sk, skb) < 0)
                        return 1;
    
                    /* Now we have several options: In theory there is
                     * nothing else in the frame. KA9Q has an option to
                     * send data with the syn, BSD accepts data with the
                     * syn up to the [to be] advertised window and
                     * Solaris 2.1 gives you a protocol error. For now
                     * we just ignore it, that fits the spec precisely
                     * and avoids incompatibilities. It would be nice in
                     * future to drop through and process the data.
                     *
                     * Now that TTCP is starting to be used we ought to
                     * queue this data.
                     * But, this leaves one open to an easy denial of
                     * service attack, and SYN cookies can't defend
                     * against this problem. So, we drop the data
                     * in the interest of security over speed unless
                     * it's still in use.
                     */
                    kfree_skb(skb);
                    return 0;
                }
                goto discard;
    
         ......  ......
         ......  ......
    
        }

    对于IPv4的TCP数据包,conn_request为tcp_v4_conn_request

        int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        {
            struct tcp_extend_values tmp_ext;
            struct tcp_options_received tmp_opt;
            const u8 *hash_location;
            struct request_sock *req;
            struct inet_request_sock *ireq;
            struct tcp_sock *tp = tcp_sk(sk);
            struct dst_entry *dst = NULL;
            __be32 saddr = ip_hdr(skb)->saddr;
            __be32 daddr = ip_hdr(skb)->daddr;
            __u32 isn = TCP_SKB_CB(skb)->when;
            int want_cookie = 0;
    
            /* Never answer to SYNs send to broadcast or multicast */
            if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
                goto drop;
    
            /* TW buckets are converted to open requests without
             * limitations, they conserve resources and peer is
             * evidently real one.
             */
            //检查syn queue是否已满,即request queue是否已满
            if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
                /* 是否使用sync cookie */
                want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
                if (!want_cookie)
                    goto drop;
            }
    
            /* Accept backlog is full. If we have already queued enough
             * of warm entries in syn queue, drop request. It is better than
             * clogging syn queue with openreqs with exponentially increasing
             * timeout.
             */
            //检查accept queue是否已满
            if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
                goto drop;
    
         
         //申请一个新的request_sock
    
            req = inet_reqsk_alloc(&tcp_request_sock_ops);
            if (!req)
                goto drop;
    
        #ifdef CONFIG_TCP_MD5SIG
            tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
        #endif
    
         //解析TCP的option
    
            tcp_clear_options(&tmp_opt);
            tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
            tmp_opt.user_mss = tp->rx_opt.user_mss;
            tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
    
    
            if (tmp_opt.cookie_plus > 0 &&
             tmp_opt.saw_tstamp &&
             !tp->rx_opt.cookie_out_never &&
             (sysctl_tcp_cookie_size > 0 ||
             (tp->cookie_values != NULL &&
             tp->cookie_values->cookie_desired > 0))) {
                /* 
                不太确定这部分代码的用途,看上去跟sync cookie相关
                貌似是为了检查sync-cookie。
                */
                u8 *c;
                u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
                int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
    
                if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
                    goto drop_and_release;
    
                /* Secret recipe starts with IP addresses */
                *mess++ ^= (__force u32)daddr;
                *mess++ ^= (__force u32)saddr;
    
                /* plus variable length Initiator Cookie */
                c = (u8 *)mess;
                while (l-- > 0)
                    *c++ ^= *hash_location++;
    
                want_cookie = 0;    /* not our kind of cookie */
                tmp_ext.cookie_out_never = 0; /* false */
                tmp_ext.cookie_plus = tmp_opt.cookie_plus;
            } else if (!tp->rx_opt.cookie_in_always) {
                /* redundant indications, but ensure initialization. */
                tmp_ext.cookie_out_never = 1; /* true */
                tmp_ext.cookie_plus = 0;
            } else {
                goto drop_and_release;
            }
            tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
    
            if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);
    
            tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
            tcp_openreq_init(req, &tmp_opt, skb);
    
            ireq = inet_rsk(req);
            ireq->loc_addr = daddr;
            ireq->rmt_addr = saddr;
            ireq->no_srccheck = inet_sk(sk)->transparent;
            ireq->opt = tcp_v4_save_options(sk, skb);
    
            if (security_inet_conn_request(sk, skb, req))
                goto drop_and_free;
    
            if (!want_cookie || tmp_opt.tstamp_ok)
                TCP_ECN_create_request(req, tcp_hdr(skb));
    
            if (want_cookie) {
                /* 生成sync cookie使用的Initial sequence numnber */
                isn = cookie_v4_init_sequence(sk, skb, &req->mss);
                req->cookie_ts = tmp_opt.tstamp_ok;
            } else if (!isn) {
                struct inet_peer *peer = NULL;
                struct flowi4 fl4;
    
                /* VJ's idea. We save last timestamp seen
                 * from the destination in peer table, when entering
                 * state TIME-WAIT, and check against it before
                 * accepting new connection request.
                 *
                 * If "isn" is not zero, this request hit alive
                 * timewait bucket, so that all the necessary checks
                 * are made in the function processing timewait state.
                 */
                /* 还是不懂这块的检查是为了什么。。。*/
                if (tmp_opt.saw_tstamp &&
                 tcp_death_row.sysctl_tw_recycle &&
                 (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
                 fl4.daddr == saddr &&
                 (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
                    inet_peer_refcheck(peer);
                    if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                     (s32)(peer->tcp_ts - req->ts_recent) >
                                    TCP_PAWS_WINDOW) {
                        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
                        goto drop_and_release;
                    }
                }
                /* Kill the following clause, if you dislike this way. */
                else if (!sysctl_tcp_syncookies &&
                     (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
                     (sysctl_max_syn_backlog >> 2)) &&
                     (!peer || !peer->tcp_ts_stamp) &&
                     (!dst || !dst_metric(dst, RTAX_RTT))) {
                    /* Without syncookies last quarter of
                     * backlog is filled with destinations,
                     * proven to be alive.
                     * It means that we continue to communicate
                     * to destinations, already remembered
                     * to the moment of synflood.
                     */
                    LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u
    ",
                         &saddr, ntohs(tcp_hdr(skb)->source));
                    goto drop_and_release;
                }
    
    
             /* 生成Initial Sequence Number */
    
                isn = tcp_v4_init_sequence(skb);
            }
            tcp_rsk(req)->snt_isn = isn;
            tcp_rsk(req)->snt_synack = tcp_time_stamp;
    
         /* 回复syn+ack包 */
    
            if (tcp_v4_send_synack(sk, dst, req,
                     (struct request_values *)&tmp_ext) ||
             want_cookie)
                goto drop_and_free;
    
         /* 将该request_sock添加到父socket的icsk_accept_queue中的listen_opt上 */
    
            inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
            return 0;
    
        drop_and_release:
            dst_release(dst);
        drop_and_free:
            reqsk_free(req);
        drop:
            return 0;
        }

    今天仅仅学习了一下TCP处理第一个SYN包的过程,就发现了很多不明白的地方,还需要继续努力啊。争取早日把TCP的这些细节搞懂。


  • 相关阅读:
    PHP-------抽象和接口
    MySQL函数
    MySQL索引
    MySQL语法
    MySQL视图、事务
    最简洁粗暴版的虚拟用户配置FTP
    linux网络bond技术
    CentOS 7服务
    Nginx 基础
    shell 截取指定的字符串
  • 原文地址:https://www.cnblogs.com/ztguang/p/12645549.html
Copyright © 2011-2022 走看看