zoukankan      html  css  js  c++  java
  • TCP连接建立系列 — 服务端发送SYNACK段

    本文主要分析:服务器端如何构造和发送SYNACK段。

    内核版本:3.6

    Author:zhangskd @ csdn blog

    发送入口

    tcp_v4_send_synack()用于发送SYNACK段,在tcp_v4_conn_request()中被调用。

    首先调用tcp_make_synack()构造SYNACK段,主要是构造TCP报头和初始化skb中的一些字段。

    然后调用ip_build_and_send_pkt()添加IP报头后发送出去。

    /* Send a SYN-ACK after having received a SYN.
     * This still operates on a request_sock only, not on a big socket.
     *
     * Returns -1 when neither a route nor an skb could be obtained; otherwise
     * the result of ip_build_and_send_pkt() passed through net_xmit_eval().
     * The nocache argument is not used by this function.
     */
    static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req,
         struct request_values *rvp, u16 queue_mapping, bool nocache)
    {
        const struct inet_request_sock *ireq = inet_rsk(req);
        struct flowi4 fl4;
        struct sk_buff *skb;
        int rc;

        /* Look up a route only when the caller did not hand one in. */
        if (dst == NULL) {
            dst = inet_csk_route_req(sk, &fl4, req);
            if (dst == NULL)
                return -1;
        }

        /* Build the SYN-ACK segment: TCP header plus skb bookkeeping fields. */
        skb = tcp_make_synack(sk, dst, req, rvp);
        if (skb == NULL)
            return -1;

        /* Compute the TCP checksum and record the transmit queue. */
        __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
        skb_set_queue_mapping(skb, queue_mapping);

        /* Prepend the IP header and send the SYN-ACK out. */
        rc = ip_build_and_send_pkt(skb, sk, ireq->loc_addr, ireq->rmt_addr, ireq->opt);

        return net_xmit_eval(rc);
    }
    

    构造SYNACK段

    构造一个SYNACK段,初始化TCP报头和skb中的一些字段。

    /**
     * tcp_make_synack - Prepare a SYN-ACK.
     * @sk: listener socket
     * @dst: dst entry attached to the SYNACK
     * @req: request_sock pointer
     * @rvp: request_values pointer
     *
     * Allocate one skb and build a SYNACK packet.
     * @dst is consumed: Caller should not use it again.
     *
     * Returns the prepared skb, or NULL when the skb allocation fails (in
     * which case @dst has already been released).
     */
    struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req,
         struct request_values *rvp)
    {
        struct tcp_out_options opts;
        struct tcp_extend_values *xvp = tcp_xv(rvp);
        struct inet_request_sock *ireq = inet_rsk(req);
        struct tcp_sock *tp = tcp_sk(sk);
        const struct tcp_cookie_values *cvp = tp->cookie_values;
        struct tcphdr *th;
        struct sk_buff *skb;
        struct tcp_md5sig_key *md5;
        int tcp_header_size;
        int mss;
        int s_data_desired = 0;

        if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
            s_data_desired = cvp->s_data_desired;

        /* Allocate the skb that will carry the SYN-ACK. */
        skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, sk_gfp_atomic(sk, GFP_ATOMIC));
        if (unlikely(! skb)) {
            dst_release(dst);
            return NULL;
        }

        /* Reserve space for headers.
         * This grows the headroom so the MAC, IP and TCP headers can be pushed later.
         */
        skb_reserve(skb, MAX_TCP_HEADER);
        skb_dst_set(skb, dst); /* remember the route on the skb */

        /* Take the advertised MSS from the route ... */
        mss = dst_metric_advmss(dst);
        /* ... unless the user configured a smaller one (TCP_MAXSEG). */
        if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
            mss = tp->rx_opt.user_mss;

        /* ignored for retransmitted syns.
         * Pick the initial receive window, the window scale factor and the
         * upper bound of the advertised window.
         */
        if (req->rcv_wnd == 0) {
            __u8 rcv_wscale;
            /* Largest window we may advertise. */
            req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);

            /* Honour a receive-buffer limit set by the user (SO_RCVBUF). */
            if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
                 (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
                req->window_clamp = tcp_full_space(sk); /* author's note: 3/4 * sk->sk_rcvbuf */

            /* Compute initial window, window scale and window clamp. */
            tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND));
            ireq->rcv_wscale = rcv_wscale;
        }
        memset(&opts, 0, sizeof(opts));

    #ifdef CONFIG_SYN_COOKIES
        /* When the SYN-ACK uses a SYN cookie together with the timestamp
         * option, the TCP option information is stored in the low 6 bits
         * of the tsval sent in the SYN-ACK.
         */
        if (unlikely(req->cookie_ts))
            TCP_SKB_CB(skb)->when = cookie_init_timestamp(req);
        else
    #endif
        TCP_SKB_CB(skb)->when = tcp_time_stamp;

        /* Fill in opts and obtain the total length of TCP header plus options. */
        tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, xvp) + sizeof(*th);

        /* Extend the data area into the headroom to hold TCP header and options. */
        skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);

        th = tcp_hdr(skb);
        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        /* Sets th->ece when the connection supports ECN. */
        TCP_ECN_make_synack(req, th);
        th->source = ireq->loc_port; /* source port */
        th->dest = ireq->rmt_port; /* destination port */
        /* Initialise the control fields kept in the skb. */
        tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPHDR_SYN | TCPHDR_ACK);

        /* TCPCT: dropped from newer kernels; body elided in this article. */
        if (OPTION_COOKIE_EXTENSION & opts.options) {
            ...
        }

        th->seq = htonl(TCP_SKB_CB(skb)->seq); /* sequence number */
        /* Acknowledge the peer's initial sequence number, taken from the request sock. */
        th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(min(req->rcv_wnd, 65535U)); /* initial receive window */

        /* Serialise the collected options right behind the TCP header. */
        tcp_options_write((__u32 *) (th + 1), tp, &opts);
        th->doff = (tcp_header_size >> 2); /* header length in 32-bit words */
        TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS, tcp_skb_pcount(skb));

    #ifdef CONFIG_TCP_MD5SIG
        /* Okay, we have all we need - do the md5 hash if needed */
        if (md5)
            tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location, md5, NULL, req, skb);
    #endif

        return skb;
    }
    
    /* Maximum combined length of the MAC, IP and TCP headers and options. */
    #define MAX_TCP_HEADER (128 + MAX_HEADER)
    /* Option-space budget that tcp_synack_options() starts from. */
    #define MAX_TCP_OPTION_SPACE 40
    
    /* Build the allocation mask for an atomic, socket-related allocation:
     * GFP_ATOMIC plus the socket's __GFP_MEMALLOC bit, if set.
     * Note that the gfp_mask argument is not consulted.
     */
    static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask)
    {
        /* Allow access to emergency reserves */
        gfp_t memalloc_bit = sk->sk_allocation & __GFP_MEMALLOC;

        return GFP_ATOMIC | memalloc_bit;
    }
    

    如果SYNACK段使用SYN Cookie,并且使用时间戳选项,则把TCP选项信息保存在SYNACK段中tsval的低6位。

    /* When syncookies are in effect and tcp timestamps are enabled we encode tcp options
     * in the lower bits of the timestamp value that will be sent in the syn-ack.
     * Since subsequent timestamps use the normal tcp_time_stamp value, we must make
     * sure that the resulting initial timestamp is <= tcp_time_stamp.
     */
    __u32 cookie_init_timestamp(struct request_sock *req)
    {
        struct inet_request_sock *ireq;
        u32 ts, ts_now = tcp_time_stamp;
        u32 options = 0;
        ireq = inet_rsk(req);
    
        options = ireq->wscale_ok ? ireq->snd_wscale : 0xf;
        options |= ireq->sack_ok << 4;
        options |= ireq->ecn_ok << 5;
    
        ts = ts_now & ~TSMASK;
        ts |= options;
    
        if (ts > ts_now) {
            ts >>= TSBITS;
            ts--;
            ts <<= TSBITS;
            ts |= options;
        }
        return ts;
    }
    
    /* Number of low-order timestamp bits that cookie_init_timestamp() replaces with option flags. */
    #define TSBITS 6
    /* Mask selecting those low TSBITS bits. */
    #define TSMASK (((__u32) 1 << TSBITS) - 1)
    

    TCP Cookie Transaction (TCPCT) 选项的功能类似于SYN Cookie,于2009年加入内核,2013年3月从内核中移除,本文不对其进行分析。

    /* Flag bits for the options field of struct tcp_out_options. */
    #define OPTION_SACK_ADVERTISE (1 << 0)
    #define OPTION_TS (1 << 1)
    #define OPTION_MD5 (1 << 2)
    #define OPTION_WSCALE (1 << 3)
    #define OPTION_COOKIE_EXTENSION (1 << 4)
    #define OPTION_FAST_OPEN_COOKIE (1 << 8)
    
    /* Set of TCP options collected for an outgoing segment. */
    struct tcp_out_options {
        u16 options; /* bit field of OPTION_* */
        u16 mss; /* 0 to disable; Max Segment Size option */
        u8 ws; /* window scale, 0 to disable; Window Scaling option */
        u8 num_sack_blocks; /* number of SACK blocks to include */
        u8 hash_size; /* bytes in hash_location */
        __u8 *hash_location; /* temporary pointer, overloaded */
        __u32 tsval, tsecr; /* need to include OPTION_TS; timestamp values */
        struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie; Fast Open option */
    };
    

    赋值TCP选项实例tcp_out_options,用于构造SYNACK段。

    /* Set up TCP options for SYN-ACKs. */
    static unsigned int tcp_synack_options(struct sock *sk, struct request_sock *req,
        unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts,
        struct tcp_md5sig_key **md5, struct tcp_extend_values *xvp)
    {
        struct inet_request_sock *ireq = inet_rsk(req);
        unsigned int remaining = MAX_TCP_OPTION_SPACE;
        u8 cookie_plus = (xvp != NULL && ! xvp->cookie_out_never) ? xvp->cookie_plus : 0;
    
    #define CONFIG_TCP_MD5SIG
        *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
        if (*md5) {
            opts->options |= OPTION_MD5;
            remaining -= TCPOLEN_MD5SIG_ALIGNED;
           
            /* We can't fit any SACK blocks in a packet with MD5 + TS options.
             * There was discussion about disabling SACK rather than TS in order to
             * fit in better with old, buggy kernels, but that was deemed to be unnecessary.
             */
            ireq->tstamp_ok &= ! ireq->sack_ok;
        }
    #else
        *md5 = NULL;
    #endif
    
        /* We always send an MSS option. */
        opt->mss = mss; /* Max Segment Size选项 */
        remaining -= TCPOLEN_MSS_ALIGNED;
    
        if (likely(ireq->wscale_ok)) { /* Window Scaling选项 */
            opts->ws = ireq->rcv_wscale;
            opts->options |= OPTION_WSCALE;
            remaining -= TCPOLEN_WSCALE_ALIGNED;
        }
    
        if (likely(ireq->tstamp_ok)) { /* 时间戳选项 */
            opts->options |= OPTION_TS;
            opts->tsval = TCP_SKB_CB(skb)->when;
            opts->tsecr = req->ts_recent;
            remaining -= TCPOLEN_TSTAMP_ALIGNED;
        }
    
        if (likely(ireq->sack_ok)) { /* SACK Permit选项 */
            opts->options |= OPTION_SACK_ADVERTISE;
            if (unlikely(! ireq->tstamp_ok))
                remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }
    
        /* TCPCT选项,新内核以废弃 */
        if (*md5 == NULL && ireq->tstamp_ok && cookie_plus > TCPOLEN_COOKIE_BASE) {
            ...
        }
    
        return MAX_TCP_OPTION_SPACE - remaining; /* TCP选项长度 */
    }
    

    初始化不携带数据skb的一些控制字段。

    /* Constructs common control bits of non-data skb. If SYN/FIN is present,
     * auto increment end seqno.
     */
    static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
    {
        /* A SYN or FIN takes up one unit of sequence space. */
        const u32 end_seq = (flags & (TCPHDR_SYN | TCPHDR_FIN)) ? seq + 1 : seq;

        /* Checksum state: CHECKSUM_PARTIAL, no accumulated sum yet. */
        skb->ip_summed = CHECKSUM_PARTIAL;
        skb->csum = 0;

        /* A single segment with no GSO parameters. */
        skb_shinfo(skb)->gso_segs = 1;
        skb_shinfo(skb)->gso_size = 0;
        skb_shinfo(skb)->gso_type = 0;

        /* TCP control block: flags, SACK state and sequence range. */
        TCP_SKB_CB(skb)->tcp_flags = flags;
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_SKB_CB(skb)->seq = seq;
        TCP_SKB_CB(skb)->end_seq = end_seq;
    }

    发送到IP层

    TCP报头中的校验和字段还没赋值,用__tcp_v4_send_check()来计算。

    /* Fill in the checksum field of the TCP header carried by skb, using
     * saddr/daddr for the pseudo-header.
     */
    static void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
    {
        struct tcphdr *th = tcp_hdr(skb);

        if (skb->ip_summed != CHECKSUM_PARTIAL) {
            /* Full checksum here: csum_partial() sums the TCP header on top of
             * skb->csum (the payload sum, accumulated while copying from user
             * space per the original author's note), and tcp_v4_check() adds
             * the pseudo-header and yields the final value.
             */
            th->check = tcp_v4_check(skb->len, saddr, daddr, csum_partial(th, th->doff << 2, skb->csum));
            return;
        }

        /* Only the pseudo-header is summed here; summing the TCP header and
         * payload is left to the hardware.
         */
        th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
        /* Offset from skb->head at which the remaining summation starts. */
        skb->csum_start = skb_transport_header(skb) - skb->head;
        /* Offset of the checksum field within the TCP header. */
        skb->csum_offset = offsetof(struct tcphdr, check);
    }
    

    给skb添加一个IP报头,然后发送出去。

    /* Add an ip header to a skbuff and send it out.
     *
     * saddr/daddr become the source and destination addresses; when opt
     * requests source routing (opt->opt.srr), opt->opt.faddr is used as the
     * destination instead. Returns the result of ip_local_out().
     */
    int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, __be32 saddr, __be32 daddr,
             struct ip_options_rcu *opt)
    {
        struct inet_sock *inet = inet_sk(sk);
        struct rtable *rt = skb_rtable(skb);
        struct iphdr *iph;

        /* Extend the data area into the headroom to hold the IP header and options. */
        skb_push(skb, sizeof(struct iphdr) + (opt ? opt->opt.optlen : 0));
        skb_reset_network_header(skb);

        iph = ip_hdr(skb);
        iph->version = 4;
        iph->ihl = 5;
        iph->tos = inet->tos;

        if (ip_dont_fragment(sk, &rt->dst))
            iph->frag_off = htons(IP_DF); /* set the Don't Fragment flag */
        else
            iph->frag_off = 0;

        iph->ttl = ip_select_ttl(inet, &rt->dst);
        iph->daddr = (opt && opt->opt.srr ? opt->opt.faddr : daddr);
        iph->saddr = saddr;
        iph->protocol = sk->sk_protocol;
        ip_select_ident(iph, &rt->dst, sk);

        if (opt && opt->opt.optlen) {
            /* Account for the options in the header length (32-bit words). */
            iph->ihl += opt->opt.optlen >> 2;
            ip_options_build(skb, &opt->opt, daddr, rt, 0);
        }

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /* Send it out. */
        return ip_local_out(skb); /* author's note: sits ahead of NF_INET_LOCAL_OUT */

    }
    
    


     

  • 相关阅读:
    HiKey软硬件开发环境及其调试
    Android/Linux boot time分析优化
    Suspend to RAM和Suspend to Idle分析,以及在HiKey上性能对比
    使用Minicom基于串口调试HiKey
    系统级性能分析工具perf的介绍与使用
    Workload Automation分析及其使用
    重度使用示波器进行优化分析——一个DSDA项目回顾
    Linux CGroup之freezer分析与应用
    编译自己的Ubuntu内核
    Android中关于cpu/cpuset/schedtune的应用
  • 原文地址:https://www.cnblogs.com/aiwz/p/6333306.html
Copyright © 2011-2022 走看看