zoukankan      html  css  js  c++  java
  • TCP连接建立系列 — 服务端接收ACK段(二)

    本文主要分析:三次握手中最后一个ACK段到达时,服务器端的处理路径。

    内核版本:3.6

    Author:zhangskd @ csdn blog

    创建新sock

    协议族相关的操作函数,我们要看的是TCP/IPv4的实例ipv4_specific。

    /* AF-specific connection ops for TCP over IPv4 (excerpt; "..." elides the
     * remaining members). These hooks are invoked via icsk->icsk_af_ops.
     */
    const struct inet_connection_sock_af_ops ipv4_specific = {
        ...
        .conn_request = tcp_v4_conn_request, /* handle an incoming SYN segment */
        .syn_recv_sock = tcp_v4_syn_recv_sock, /* create and initialize a new sock */
        ...
    };
    

    三次握手完成以后,要为新的连接创建一个传输控制块,并初始化传输控制块。

    一个TCP传输控制块是由多层组成的,包括:

    tcp_sock

    inet_connection_sock

    inet_sock

    sock

    sock_common

    所以,初始化要做的工作比较多。

    /* The three way handshake has completed - we got a valid synack - 
     * now create the new socket.
     */
    
    /* tcp_v4_syn_recv_sock - create the child socket once the three-way
     * handshake has completed (the final ACK arrived at a listening socket).
     * @sk:  the listening socket
     * @skb: the ACK segment that completed the handshake
     * @req: the request_sock created when the SYN arrived
     * @dst: route entry, or NULL (syncookie path passes one in)
     *
     * Returns the new child sock on success, NULL on failure (queue
     * overflow, allocation failure, routing failure or port inheritance
     * failure).
     *
     * Fix vs. the transcribed original: the error branch after
     * __inet_inherit_port() jumped to a misspelled label "put_and_eixt"
     * that does not exist; it must be "put_and_exit".
     */
    struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req,
        struct dst_entry *dst)
    {
        struct inet_request_sock *ireq;
        struct inet_sock *newinet;
        struct tcp_sock *newtp;
        struct sock *newsk;
    #ifdef CONFIG_TCP_MD5SIG
        struct tcp_md5sig_key *key;
    #endif
        struct ip_options_rcu *inet_opt;

        /* If the accept (fully-established) queue is full, drop: return NULL. */
        if (sk_acceptq_is_full(sk))
            goto exit_overflow;

        /* Clone a transport control block for the new connection from the
         * listener and the request block, and initialize it.
         */
        newsk = tcp_create_openreq_child(sk, req, skb);
        if (!newsk)
            goto exit_nonewsk;

        newsk->sk_gso_type = SKB_GSO_TCPV4;
        inet_sk_rx_dst_set(newsk, skb); /* cache the receive route */

        newtp = tcp_sk(newsk);
        newinet = inet_sk(newsk);
        ireq = inet_rsk(req);
        newinet->inet_daddr = ireq->rmt_addr; /* destination IP */
        newinet->inet_rcv_saddr = ireq->loc_addr;
        newinet->inet_saddr = ireq->loc_addr; /* source IP */
        inet_opt = ireq->opt;
        rcu_assign_pointer(newinet->inet_opt, inet_opt); /* IP options move to newsk */
        ireq->opt = NULL;

        newinet->mc_index = inet_iif(skb);
        newinet->mc_ttl = ip_hdr(skb)->ttl;
        newinet->rcv_tos = ip_hdr(skb)->tos;
        inet_csk(newsk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
            inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = newtp->write_seq ^ jiffies;

        if (!dst) {
            dst = inet_csk_route_child_sock(sk, newsk, req);
            if (!dst)
                goto put_and_exit;
        } else {
            /* syncookie case: see end of cookie_v4_check() */
        }
        sk_setup_caps(newsk, dst);

        tcp_mtup_init(newsk); /* MTU probe init */
        tcp_sync_mss(newsk, dst_mtu(dst));
        newtp->advmss = dst_metric_advmss(dst);
        if (tcp_sk(sk)->rx_opt.user_mss && tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
            newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
        tcp_initialize_rcv_mss(newsk);

        if (tcp_rsk(req)->snt_synack) /* first RTT sample */
            tcp_valid_rtt_meas(newsk, tcp_time_stamp - tcp_rsk(req)->snt_synack);
        newtp->total_retrans = req->retrans;

    #ifdef CONFIG_TCP_MD5SIG
        /* Copy over the MD5 key from the original socket */
        key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *) &newinet->inet_daddr, AF_INET);
        if (key != NULL) {
            /* We're using one, so create a matching key on the newsk structure.
             * If we fail to get memory, then we end up not copying the key across. Shucks.
             */
            tcp_md5_do_add(newsk, (union tcp_md5_addr *) &newinet->inet_daddr, AF_INET,
                key->key, key->keylen, GFP_ATOMIC);
            sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
    #endif

        /* Link newsk into the bound-port hash chain; update port statistics. */
        if (__inet_inherit_port(sk, newsk) < 0)
            goto put_and_exit;

        /* Link newsk into the ESTABLISHED-state hash table. */
        __inet_hash_nolisten(newsk, NULL);

        return newsk;

    exit_overflow:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
    exit_nonewsk:
        dst_release(dst);
    exit:
        NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
        return NULL;

    put_and_exit:
        tcp_clear_xmit_timers(newsk);
        tcp_cleanup_congestion_control(newsk);
        bh_unlock_sock(newsk);
        sock_put(newsk);
        goto exit;
    }
    

    根据监听传输控制块sock、连接请求块req,为新的连接创建一个传输控制块sock。

    初始化此传输控制块对应的inet_sock、inet_connection_sock、tcp_sock结构中的变量。

    struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
    {
        /* 克隆一个传输控制块,并对新的传输控制块上锁 */
        struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC);
    
        if (newsk != NULL) {
            cosnt struct inet_request_sock *ireq = inet_rsk(req);
            struct tcp_request_sock *treq = tcp_rsk(req);
            struct inet_connection_sock *newicsk = inet_csk(newsk);
            struct tcp_sock *newtp = tcp_sk(newsk);
            struct tcp_sock *oldtp = tcp_sk(sk);
            struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
    
            /* 由于TCPCT选项已被废弃,此处不做分析 */
            if (oldcvp != NULL) { ... }
    
            /* Now setup tcp_sock,初始化tcp_sock实例 */
            newtp->pred_flags = 0;
    
            /* 接收序号、发送序号相关变量初始化 */
            newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1;
            newtp->snd_sml = newtp->snd_una = newtp->snd_nxt = newtp->snd_up
                                            = treq->snt_isn + 1 + tcp_s_data_size(oldtp);
            
            tcp_prequeue_init(newtp); /* prequeue队列初始化 */
            INIT_LIST_HEAD(&newtp->tsq_node);
            tcp_init_wl(newtp, treq->rcv_isn); /* 上次更新发送窗口的ACK段序号 */
    
            /* 时延相关变量初始化 */
            newtp->srtt = 0;
            newtp->mdev = TCP_TIMEOUT_INIT;
            newicsk->icsk_rto = TCP_TIMEOUT_INIT;
    
            /* 拥塞控制相关变量初始化 */
            newtp->packets_out = 0;
            newtp->retrans_out = 0;
            newtp->sacked_out = 0;
            newtp->fackets_out = 0;
            newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
            tcp_enable_early_retrans(newtp);
    
            newtp->snd_cwnd = TCP_INIT_CWND;
            newtp->snd_cwnd_cnt = 0;
            newtp->bytes_acked = 0;
            newtp->frto_counter = 0;
            newtp->frto_highmark = 0;
    
            /* 如果拥塞控制算法不为Reno,则把使用的拥塞控制算法模块引用计数加1。
             * 如果该模块还没插入内核,则使用Reno。
             */
            if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && 
                 ! try_module_get(newicsk->icsk_ca_ops->owner))
                newicsk->icsk_ca_ops = &tcp_init_congestion_ops;
    
            tcp_set_ca_state(newsk, TCP_CA_Open);
            tcp_init_xmit_timers(newsk); /* 初始化几个定时器 */
            skb_queue_head_init(&newtp->out_of_order_queue);
            newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1 + tcp_s_data_size(oldtp);
    
            /* TCP选项相关 */
            newtp->rx_opt.saw_tstamp = 0;
            newtp->rx_opt.dsack = 0;
            newtp->rx_opt.num_sacks = 0;
            newtp->urg_data = 0;
    
            /* 如果用户设置了SO_KEEPALIVE选项 */
            if (sock_flag(newsk, SOCK_KEEPOPEN))
                inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp));
    
            newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
            if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
                if (sysctl_tcp_fack)
                    tcp_enable_fack(newtp);
            }
    
            newtp->window_clamp = req->window_clamp;
            newtp->rcv_ssthresh = req->rcv_wnd;
            newtp->rcv_wnd = req->rcv_wnd;
            newtp->rx_opt.wscale_ok = ireq->wscale_ok;
            if (newtp->rx_opt.wscale_ok) {
                newtp->rx_opt.snd_wscale = ireq->snd_wscale;
                newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
            } else {
                newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
                newtp->window_clamp = min(newtp->window_clamp, 65535U);
            }
    
            newtp->snd_wnd = (ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale);
            newtp->max_window = newtp->snd_wnd;
    
            if (newtp->rx_opt.tstamp_ok) {
                newtp->rx_opt.ts_recent = req->ts_recent;
                newtp->rx_opt.ts_recent_stamp = get_seconds();
                newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
            } else {
                newtp->rx_opt.ts_recent_stamp = 0;
                newtp->tcp_header_len = sizeof(struct tcphdr);
            }
    
    #ifdef CONFIG_TCP_MD5SIG
            newtp->md5sig_info = NULL;
            if (newtp->af_specific->md5_lookup(sk, newsk))
                newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
    #endif
    
            if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
                newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
    
            newtp->rx_opt.mss_clamp = req->mss;
            TCP_ECN_openreq_child(newtp, req);
    
            TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
        }
    
        return newsk;
    }
    

    克隆一个传输控制块,并对新的传输控制块上锁。

    /* inet_csk_clone_lock - clone an inet socket, and lock its clone.
     * @sk: the socket to clone
     * @req: request_sock
     * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
     *
     * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
     */
    struct sock *inet_csk_clone_lock(cons struct sock *sk, const struct request_sock *req, const gfp_t priority)
    {
        struct sock *newsk = sk_clone_lock(sk, priority); /* 从缓存中分配一个sock,并克隆sk */
    
        if (newsk != NULL) {
            struct inet_connection_sock *newicsk = inet_csk(newsk);
            newsk->sk_state = TCP_SYN_RECV; /* 新sock的状态为SYN_RECV */
            newicsk->icsk_bind_hash = NULL;  /* 端口绑定的哈希桶 */
    
            inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; /* 目的端口 */
            inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); /* 源端口 */
            inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; /* 源端口 */
            newsk->sk_write_space = sk_stream_write_space; /* write_space callback */
    
            newicsk->icsk_retransmits = 0;
            newicsk->icsk_backoff = 0;
            newicsk->icsk_probes_out = 0;
    
            memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue));
            security_inet_csk_clone(newsk, req);
        }
    
        return newsk;
    }
    

    把newsk链入使用端口的哈希链表中,更新端口的统计信息。

    /* __inet_inherit_port - make @child inherit the local port bound by the
     * listener @sk: find (or create) the bind bucket for the port and link
     * the child into it. Returns 0 on success, -ENOMEM if a new bind bucket
     * could not be allocated.
     */
    int __inet_inherit_port(struct sock *sk, struct sock *child)
    {
        struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; /* points to tcp_hashinfo for TCP */
        unsigned short port = inet_sk(child)->inet_num; /* local port to inherit */
        const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); /* bind-hash value */
        struct inet_bind_hashbucket *head = &table->bhash[bhash]; /* hash bucket for this port */
        struct inet_bind_bucket *tb; /* per-port bind bucket */

        spin_lock(&head->lock); /* lock the bucket chain */
        tb = inet_csk(sk)->icsk_bind_hash;

        if (tb->port != port) {
            /* NOTE: using tproxy and redirecting skbs to a proxy on a different listener port
             * breaks the assumption that the listener socket's icsk_bind_hash is the same
             * as that of the child socket. We have to look up or create a new bind bucket for
             * the child here.
             */
            struct hlist_node *node;

            /* Search this chain for an existing bucket for (net, port). */
            inet_bind_bucket_for_each(tb, node, &head->chain) {
                if (net_eq(ib_net(tb), sock_net(sk)) && tb->port == port)
                    break;
            }

            if (! node) {
                /* No match found: allocate and initialize a new inet_bind_bucket. */
                tb = inet_bind_bucket_create(table->bind_bucket_cachep, sock_net(sk), head, port);
                if (! tb) {
                    spin_unlock(&head->lock);
                    return -ENOMEM;
                }
            }
        }

        inet_bind_hash(child, tb, port); /* chain child into the port's owner list, update counters */
        spin_unlock(&head->lock);

        return 0;
    }
    
    /* Bind @sk to port @snum through bucket @tb: record the port number,
     * link the sock into the bucket's owner list and update the counters.
     * The caller (e.g. __inet_inherit_port()) holds the bucket lock.
     */
    void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, const unsigned short snum)
    {
        struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; /* tcp_hashinfo for TCP */

        atomic_inc(&table->bsockets);      /* one more bound socket overall */

        inet_sk(sk)->inet_num = snum;      /* remember the bound port */
        sk_add_bind_node(sk, &tb->owners); /* chain sk into tb->owners */
        tb->num_owners++;                  /* one more owner of this port */
        inet_csk(sk)->icsk_bind_hash = tb; /* back-pointer from icsk to the bucket */
    }
    

    把newsk链入ESTABLISHED状态的哈希表中。

    /* __inet_hash_nolisten - insert @sk into the established-connections
     * hash table (ehash); if @tw is given, unhash the matching timewait
     * sock under the same bucket lock. Returns the timewait refcount to
     * be dropped by the caller (0 if @tw is NULL).
     */
    int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
    {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
        struct hlist_nulls_head *list;
        spinlock_t *lock;
        struct inet_ehash_bucket *head;
        int twrefcnt = 0;

        WARN_ON(! sk_unhashed(sk)); /* sk must not already be hashed */
        sk->sk_hash = inet_sk_ehashfn(sk); /* hash of the connection 4-tuple */

        head = inet_ehash_bucket(hashinfo, sk->sk_hash); /* target hash bucket */
        list = &head->chain;
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
        
        spin_lock(lock);
        __sk_nulls_add_node_rcu(sk, list); /* link sk into the bucket chain (RCU) */

        if (tw) {
            /* The replaced timewait sock must hash to the same bucket. */
            WARN_ON(sk->sk_hash != tw->tw_hash);
            twrefcnt = inet_twsk_unhash(tw);
        }
        spin_unlock(lock);

        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);

        return twrefcnt;
    }
    
    /* Compute the ehash value of @sk's connection 4-tuple
     * (local addr/port, foreign addr/port) within its network namespace.
     */
    static inline int inet_sk_ehashfn(const struct sock *sk)
    {
        const struct inet_sock *inet = inet_sk(sk);

        return inet_ehashfn(sock_net(sk),
                            inet->inet_rcv_saddr, inet->inet_num,
                            inet->inet_daddr, inet->inet_dport);
    }
    

    唤醒监听进程

    调用tcp_child_process()来做最后的处理:

    1. tcp_ack()处理接收到的ACK,更新child的状态为ESTABLISHED。

        唤醒child上的等待进程,初始化子传输控制块的一些字段。

    2. 唤醒监听sock上的等待进程,以便监听进程执行accept()。

    3. 如果child被用户进程占用,则先把ACK段添加到backlog队列中。

    /* Queue segment on the new socket if the new socket is active,
     * otherwise we just shortcircuit this and continue with the new socket.
     */
    
    int tcp_child_process(struct sock *parent, struct sock *child, sk_buff *skb)
    {
        int ret = 0;
        int state = child->sk_state;
    
        /* child没被用户进程占用 */
        if (! sock_owned_by_user(child)) {
            /* 调用tcp_ack()处理接收的ACK,设置新状态ESTABLISHED,唤醒child上的等待进程,
             * 初始化child的一些字段。
             */
            ret = tcp_rcv_state_process(child, skb, tcp_hdr(skb), skb->len);
            
            /* Wakeup parent, send SIGIO.
             * 实例为sock_def_readable,唤醒调用accept()的进程。
             */
            if (state == TCP_SYN_RECV && child->sk_state != state)
                parent->sk_data_ready(parent, 0);
    
        } else { /* 如果child被用户进程占用,则先把skb添加到backlog队列中 */
            __sk_add_backlog(child, skb); 
        }
    
        bh_unlock_sock(child);
        sock_put(child);
        return ret;
    }

    把数据包添加到backlog队列中。

    /* Append @skb to @sk's backlog queue (singly linked through skb->next).
     * The dst reference is pinned first because we are about to leave the
     * RCU read-side section.
     */
    static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
    {
        skb_dst_force(skb); /* keep skb's dst refcounted past rcu unlock */

        skb->next = NULL;
        if (sk->sk_backlog.tail != NULL)
            sk->sk_backlog.tail->next = skb; /* append after current tail */
        else
            sk->sk_backlog.head = skb; /* queue was empty */
        sk->sk_backlog.tail = skb;
    }
    

    子传输控制块调用tcp_ack()处理收到的ACK,把子传输控制块的状态从TCP_SYN_RECV更新为TCP_ESTABLISHED,

    并唤醒子传输控制块上的等待进程,更新子传输控制块的一些字段。

    /* tcp_rcv_state_process - state-machine processing for non-ESTABLISHED
     * socks (excerpt: "..." elides other states/steps). For a SYN_RECV
     * child, the incoming ACK moves it to ESTABLISHED and wakes waiters.
     * Returns 0 when the segment was consumed, nonzero to have the caller
     * send a reset.
     * NOTE(review): this excerpt is not balanced C (elided switch arms and
     * a trailing brace from the omitted discard path); kept verbatim.
     */
    int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len)
    {
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        int queued = 0;

        tp->rx_opt.saw_tstamp = 0;

        switch(sk->sk_state) {
            ...
        }

        if (! tcp_validate_incoming(sk, skb, th, 0))
            return 0;

        /* step 5: check the ACK field */
        if (th->ack) {
            int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0; /* enter the ACK-processing path */

            switch (sk->sk_state) {
                case TCP_SYN_RECV:
                    if (acceptable) {
                        tp->copied_seq = tp->rcv_nxt;
                        smp_mb();

                        /* Only here does the state change from TCP_SYN_RECV to TCP_ESTABLISHED. */
                        tcp_set_state(sk, TCP_ESTABLISHED);

                        sk->sk_state_change(sk); /* default sock_def_wakeup(): wake tasks waiting on sk */

                        /* Note, that this wakeup is only for marginal crossed SYN case.
                         * Passively Open sockets are not waked up, because sk->sk_sleep == NULL
                         * and sk->sk_socket == NULL.
                         */
                        if (sk->sk_socket)
                            sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT);

                        tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
                        tp->snd_wnd = ntohs(th->window) << tp->rx_opt.snd_wscale;
                        tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
                        if (tp->rx_opt.tstamp_ok)
                            tp->advmss -= TCPOLEN_TSTAMP_ALIGNED;

                        /* Make sure socket is routed, for correct metrics. */
                        icsk->icsk_af_ops->rebuild_header(sk);

                        tcp_init_metrics(sk); /* init the control block from route-cache metrics */
                        tcp_init_congestion_control(sk); /* initialize the congestion-control algorithm */

                        /* Prevent spurious tcp_cwnd_restart() on first data packet. */
                        tp->lsndtime = tcp_time_stamp;

                        tcp_mtup_init(sk);
                        tcp_initialize_rcv_mss(sk);
                        tcp_init_buffer_space(sk);
                        tcp_fast_path_on(tp);
                    } else
                        return 1;

                    break;
                    ...
            } 
        } else
            goto discard;
        ...
    discard:
            __kfree_skb(skb);
        }
        return 0;
    }
    /* Default sk->sk_state_change callback: wake every task sleeping on
     * the socket's wait queue. The wait-queue pointer is read under RCU.
     */
    static void sock_def_wakeup(struct sock *sk)
    {
        struct socket_wq *wq;

        rcu_read_lock();
        wq = rcu_dereference(sk->sk_wq);
        if (wq_has_sleeper(wq)) /* only bother if someone is waiting */
            wake_up_interruptible_all(&wq->wait); /* wake all waiters */
        rcu_read_unlock();
    }
    
    /* check if there are any waiting processes. */
    static inline bool wq_has_sleeper(struct socket_wq *wq)
    {
        smp_mb();
        return wq && waitqueue_active(&wq->wait);
    }
    
    /* Nonzero (1) when the waitqueue's task list is not empty. */
    static inline int waitqueue_active(wait_queue_head_t *q)
    {
        return list_empty(&q->task_list) ? 0 : 1;
    }
    
    /* Wake every TASK_INTERRUPTIBLE waiter on queue x; nr_exclusive == 0
     * means no exclusive-wakeup limit (see __wake_up_common()).
     */
    #define wake_up_interruptible_all(x) __wake_up(x, TASK_INTERRUPTIBLE, 0, NULL)
    
    /* Wake tasks blocked on @q, with interrupts disabled around the queue
     * lock. @mode selects which task states to wake; @nr_exclusive bounds
     * the number of exclusive waiters woken; @key is passed through to the
     * per-entry wakeup function.
     */
    void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key)
    {
        unsigned long irqflags;

        spin_lock_irqsave(&q->lock, irqflags);
        __wake_up_common(q, mode, nr_exclusive, 0, key);
        spin_unlock_irqrestore(&q->lock, irqflags);
    }
    
    /* Core wakeup loop: invoke each entry's wakeup function in turn and
     * stop once @nr_exclusive exclusive waiters have been woken. Uses the
     * _safe iterator because a woken entry may remove itself. Caller holds
     * q->lock.
     */
    static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive,
        int wake_flags, void *key)
    {
        wait_queue_t *entry, *tmp;

        list_for_each_entry_safe(entry, tmp, &q->task_list, task_list) {
            /* Sample the flags before the callback, which may free entry. */
            unsigned entry_flags = entry->flags;

            if (entry->func(entry, mode, wake_flags, key) &&
                (entry_flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                break;
        }
    }
    


     

  • 相关阅读:
    mysql排行榜sql的实现
    MYSQL 简单的循环存储过程
    Git学习笔记
    LeetCode-树-简单-108-110-111
    Android开发连接mysql云数据库中遇到的的一些问题
    使用mybatis遇到报错Invalid bound statement (not found)
    ajax使用时碰到的一些坑
    关于Echarts的常见报错
    deepin系统桌面图标和菜单栏突然消失
    SOA架构理解
  • 原文地址:https://www.cnblogs.com/aiwz/p/6333302.html
Copyright © 2011-2022 走看看