zoukankan      html  css  js  c++  java
  • socket connect tcp_v4_connect

    tcp_v4_connect

    /*
     * tcp_v4_connect - initiate an outgoing IPv4 TCP connection.
     * @sk:       socket to connect
     * @uaddr:    destination address, interpreted as a struct sockaddr_in
     * @addr_len: size of @uaddr in bytes
     *
     * Validates the destination, looks up a route with ip_route_connect(),
     * records source/destination address and destination port in the inet
     * socket, switches the socket to TCP_SYN_SENT, binds and hashes it through
     * inet_hash_connect(), chooses an initial write sequence number when none
     * is set, and finally calls tcp_connect() (or tcp_repair_connect() when
     * tp->repair is set).
     *
     * Returns 0 on success, or a negative errno: -EINVAL (address too short,
     * or source routing with a zero destination), -EAFNOSUPPORT (family is not
     * AF_INET), -ENETUNREACH (multicast/broadcast route), or whatever the
     * routing, hashing or connect helpers return.  Once the state has been set
     * to TCP_SYN_SENT, any failure puts the socket back to TCP_CLOSE.
     */
    int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
    {
        struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
        struct inet_sock *inet = inet_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        __be16 orig_sport, orig_dport;
        __be32 daddr, nexthop;
        struct flowi4 *fl4;
        struct rtable *rt;
        int err;
        struct ip_options_rcu *inet_opt;
    
        if (addr_len < sizeof(struct sockaddr_in))
            return -EINVAL;
    
        if (usin->sin_family != AF_INET)
            return -EAFNOSUPPORT;
        // Check below whether a source-route (SRR) IP option is set.
    
        nexthop = daddr = usin->sin_addr.s_addr;
        inet_opt = rcu_dereference_protected(inet->inet_opt,
                             sock_owned_by_user(sk));
        if (inet_opt && inet_opt->opt.srr) {
            /* With source routing the first hop comes from the option. */
            if (!daddr)
                return -EINVAL;
            nexthop = inet_opt->opt.faddr;
        }
        /*
         * Look up a route with ip_route_connect() using the next hop, the
         * current source address, the bound device and the source/destination
         * ports.  Routes flagged multicast or broadcast are rejected below.
         */
        orig_sport = inet->inet_sport;
        orig_dport = usin->sin_port;
        fl4 = &inet->cork.fl.u.ip4;
        rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
                      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
                      IPPROTO_TCP,
                      orig_sport, orig_dport, sk, true);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            if (err == -ENETUNREACH)
                IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
            return err;
        }
    
        if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
            ip_rt_put(rt);
            return -ENETUNREACH;
        }
    
        /* Without source routing, use the destination held in the flow. */
        if (!inet_opt || !inet_opt->opt.srr)
            daddr = fl4->daddr;
    
        /* Adopt the flow's source address if the socket has none yet. */
        if (!inet->inet_saddr)
            inet->inet_saddr = fl4->saddr;
        inet->inet_rcv_saddr = inet->inet_saddr;
    
        if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
            /* Reset inherited state */
            tp->rx_opt.ts_recent       = 0;
            tp->rx_opt.ts_recent_stamp = 0;
            if (likely(!tp->repair))
                tp->write_seq       = 0;
        }
        // Fetch a remembered timestamp for this destination via
        // tcp_fetch_timewait_stamp() when tw_recycle is enabled and the socket
        // has no ts_recent_stamp of its own.
    
        if (tcp_death_row.sysctl_tw_recycle &&
            !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
            tcp_fetch_timewait_stamp(sk, &rt->dst);
    
        inet->inet_dport = usin->sin_port;
        inet->inet_daddr = daddr;
    
        /* Extra header length equals the IP options length, if any. */
        inet_csk(sk)->icsk_ext_hdr_len = 0;
        if (inet_opt)
            inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
    
        tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
    
        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
         * lock select source port, enter ourselves into the hash tables and
         * complete initialization after this.
         * tcp_set_state() moves the socket to TCP_SYN_SENT; then
         * inet_hash_connect() picks a local port if none is bound and adds
         * sk to the connection hash tables.
         */
        tcp_set_state(sk, TCP_SYN_SENT);
        // Bind a local port (if needed) and insert sk into the connection
        // hash tables.
        err = inet_hash_connect(&tcp_death_row, sk);
        if (err)
            goto failure;
    
        /* The source port may have just been chosen; refresh the route. */
        rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
                       inet->inet_sport, inet->inet_dport, sk);
        if (IS_ERR(rt)) {
            err = PTR_ERR(rt);
            /* Nothing valid to release in the failure path. */
            rt = NULL;
            goto failure;
        }
        /* OK, now commit destination to socket.  */
        sk->sk_gso_type = SKB_GSO_TCPV4;
        sk_setup_caps(sk, &rt->dst);
    
        /* Choose an initial sequence number unless one is already set. */
        if (!tp->write_seq && likely(!tp->repair))
            tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
                                   inet->inet_daddr,
                                   inet->inet_sport,
                                   usin->sin_port);
    
        inet->inet_id = tp->write_seq ^ jiffies;
        /*
         * With the initial sequence number in place, hand over to
         * tcp_connect() to carry on with connection establishment.
         * NOTE(review): tcp_connect() is not shown here; presumably it builds
         * the SYN segment, queues it and transmits it - confirm.
         */
        if (likely(!tp->repair))
            err = tcp_connect(sk);
        else
            err = tcp_repair_connect(sk);
    
        /*
         * NOTE(review): presumably sk_setup_caps() attached the route to the
         * socket, so the failure path must not put it again - confirm.
         */
        rt = NULL;
        if (err)
            goto failure;
    
        return 0;
    
    failure:
        /*
         * This unhashes the socket and releases the local port,
         * if necessary.
         */
        tcp_set_state(sk, TCP_CLOSE);
        ip_rt_put(rt);
        sk->sk_route_caps = 0;
        inet->inet_dport = 0;
        return err;
    }
    
    /*
     * Bind a port for a connect operation and hash it.
     */
    int inet_hash_connect(struct inet_timewait_death_row *death_row,
                  struct sock *sk)
    {
        return __inet_hash_connect(death_row, sk, inet_sk_port_offset(sk),
                __inet_check_established, __inet_hash_nolisten);
    }
    
    
    /*
     * __inet_hash_connect - choose/validate the local port for connect() and
     * insert the socket into the hash tables.
     * @death_row:         timewait death row; its hashinfo is the table used
     * @sk:                socket being connected
     * @port_offset:       offset added to the static hint to pick the first
     *                     candidate port
     * @check_established: callback that checks a candidate port for a
     *                     conflicting connection; returns 0 when the port
     *                     can be used
     * @hash:              callback that inserts @sk into the connection hash
     *                     table; its return value is added to the number of
     *                     timewait references dropped here
     *
     * If the socket has no local port (inet_num == 0), walks the local port
     * range looking for a usable port, binds the socket to it and hashes it.
     * If a port is already bound, either hashes the socket directly (when it
     * is the only owner of its bind bucket) or asks @check_established.
     *
     * Returns 0 on success, -EADDRNOTAVAIL when no port in the range could be
     * used, or the result of @check_established on the already-bound path.
     */
    int __inet_hash_connect(struct inet_timewait_death_row *death_row,
            struct sock *sk, u32 port_offset,
            int (*check_established)(struct inet_timewait_death_row *,
                struct sock *, __u16, struct inet_timewait_sock **),
            int (*hash)(struct sock *sk, struct inet_timewait_sock *twp))
    {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        const unsigned short snum = inet_sk(sk)->inet_num;
        struct inet_bind_hashbucket *head;
        struct inet_bind_bucket *tb;
        int ret;
        struct net *net = sock_net(sk);
        int twrefcnt = 1;
    
        if (!snum) {// no local port bound yet
            int i, remaining, low, high, port;
            /* Persists across calls; advanced on every successful pick. */
            static u32 hint;
            u32 offset = hint + port_offset;
            struct hlist_node *node;
            struct inet_timewait_sock *tw = NULL;
    
            inet_get_local_port_range(&low, &high);
            remaining = (high - low) + 1;
    
            /* bh stays disabled until "out:" on success. */
            local_bh_disable();
            for (i = 1; i <= remaining; i++) {
                port = low + (i + offset) % remaining;
                if (inet_is_reserved_local_port(port))
                    continue;
                head = &hinfo->bhash[inet_bhashfn(net, port,
                        hinfo->bhash_size)];
                spin_lock(&head->lock);
    
                /* Does not bother with rcv_saddr checks,
                 * because the established check is already
                 * unique enough.
                 * A socket may have been bound to a port either by the
                 * bind() system call or by this function during connect().
                 */
                inet_bind_bucket_for_each(tb, node, &head->chain) {
                    if (net_eq(ib_net(tb), net) &&
                        tb->port == port) {
                        /*
                         * Buckets with fastreuse >= 0 are skipped; buckets
                         * created below by this function get -1.
                         */
                        if (tb->fastreuse >= 0)
                            goto next_port;
                        WARN_ON(hlist_empty(&tb->owners));
                        if (!check_established(death_row, sk,
                                    port, &tw))
                            goto ok;
                        goto next_port;
                    }
                }
                // No bind bucket exists for this port: create one.
                tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
                        net, head, port);
                if (!tb) {
                    /* Allocation failed: give up and report no port. */
                    spin_unlock(&head->lock);
                    break;
                }
                tb->fastreuse = -1;
                goto ok;
    
            next_port:
                spin_unlock(&head->lock);
            }
            local_bh_enable();
    
            return -EADDRNOTAVAIL;
    
    ok:
            hint += i;
    
            /* Head lock still held and bh's disabled
             * Attach the socket to the bind bucket tb of the chosen port,
             * i.e. associate this socket with the port.
             */
            inet_bind_hash(sk, tb, port);
            if (sk_unhashed(sk)) { // socket is not in the connection hash table yet
                inet_sk(sk)->inet_sport = htons(port);
                twrefcnt += hash(sk, tw);// insert the socket into the connection hash table
            }
            if (tw)
                twrefcnt += inet_twsk_bind_unhash(tw, hinfo);
            spin_unlock(&head->lock);
    
            if (tw) {
                /* Retire the timewait socket and drop the counted refs. */
                inet_twsk_deschedule(tw, death_row);
                while (twrefcnt) {
                    twrefcnt--;
                    inet_twsk_put(tw);
                }
            }
    
            ret = 0;
            goto out;
        }
    
        head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
        tb  = inet_csk(sk)->icsk_bind_hash;// bind bucket already recorded on the socket
        spin_lock_bh(&head->lock);
        // When the condition below is false, the else branch still has to check
        // the port for conflicts, so a successful bind() does not by itself
        // guarantee that the port can be used for this connection.
        if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {// this socket is the only owner of the port: no conflict check needed
            hash(sk, NULL);// insert the socket into the connection hash table
            spin_unlock_bh(&head->lock);
            return 0;
        } else {
            /* Plain unlock: bh stays disabled for check_established(). */
            spin_unlock(&head->lock);
            /* No definite answer... Walk to established hash table */
            ret = check_established(death_row, sk, snum, NULL);
    out:
            local_bh_enable();
            return ret;
        }
    }

    创建一个套接字,设置SO_REUSEADDR选项,建立连接后立即关闭,关闭后立即又重复同样的过程,发现在第二次调用connect()的时候返回EADDRNOTAVAIL错误。
    可以看到__inet_check_established()返回EADDRNOTAVAIL错误的有两种情况:
       1、在TIME_WAIT传输控制块中找到匹配的端口,并且twsk_unique()返回false时;
       2、在除TIME_WAIT和LISTEN状态外的传输控制块中存在匹配的端口。
      第二种情况很容易理解,只要状态为FIN_WAIT_1、ESTABLISHED等的传输控制块使用的端口和要查找的匹配,就会返回EADDRNOTAVAIL错误。
    第一种情况还要取决于twsk_unique()的返回值。


    __inet_hash_connect的主要功能与bind系统调用中的inet_csk_get_port类似,都是:
    1、如果没有选取端口则选定一个;

    2、将socket与端口绑定;

    3、将socket加入到连接表中(这个功能inet_csk_get_port没有)。

      另外一点不同是:inet_csk_get_port进行冲突检查时关注的是绑定冲突,
    而__inet_hash_connect检查的是当前socket是否与“已建立连接的socket”冲突。
    __inet_hash_connect检查冲突的函数是__inet_check_established:

    /*
     * __inet_check_established - check that using local port @lport would not
     * collide with an existing connection, and hash the socket if it is unique.
     * @death_row: timewait death row; its hashinfo is the table searched
     * @sk:        socket being connected
     * @lport:     candidate local port
     * @twp:       if non-NULL, receives the matching timewait socket that is
     *             being replaced (or NULL when there was none); if NULL, a
     *             replaced timewait socket is descheduled and released here
     *
     * Returns 0 when the connection identity is unique (the socket has then
     * been added to the hash chain with inet_num/inet_sport set), or
     * -EADDRNOTAVAIL when a conflicting entry exists.
     *
     * called with local bh disabled
     */
    static int __inet_check_established(struct inet_timewait_death_row *death_row,
                        struct sock *sk, __u16 lport,
                        struct inet_timewait_sock **twp)
    {
        struct inet_hashinfo *hinfo = death_row->hashinfo;
        struct inet_sock *inet = inet_sk(sk);
        /*
         * Note the swap: "daddr" holds our local receive address and "saddr"
         * the peer address.
         * NOTE(review): presumably named from the viewpoint of an incoming
         * packet, matching what the lookup macros expect - confirm.
         */
        __be32 daddr = inet->inet_rcv_saddr;
        __be32 saddr = inet->inet_daddr;
        int dif = sk->sk_bound_dev_if;
        INET_ADDR_COOKIE(acookie, saddr, daddr)
        const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
        struct net *net = sock_net(sk);
        unsigned int hash = inet_ehashfn(net, daddr, lport,
                         saddr, inet->inet_dport);
        struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);// locate the bucket in the connection hash table
        spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
        struct sock *sk2;
        const struct hlist_nulls_node *node;
        struct inet_timewait_sock *tw;
        int twrefcnt = 0;
    
        spin_lock(lock);
    
        /* Check TIME-WAIT sockets first.
         * The TIME-WAIT chain is checked first, then the established chain;
         * a conflict in either one is not allowed, unless twsk_unique()
         * accepts reuse of the matching TIME-WAIT entry.
         */
        sk_nulls_for_each(sk2, node, &head->twchain) {
            tw = inet_twsk(sk2);
    
            if (INET_TW_MATCH(sk2, net, hash, acookie,
                        saddr, daddr, ports, dif)) {
                if (twsk_unique(sk, sk2, twp))
                    goto unique;
                else
                    goto not_unique;
            }
        }
        /* No TIME-WAIT match: make sure the "unique" path sees tw == NULL. */
        tw = NULL;
    
        /* And established part... */
        sk_nulls_for_each(sk2, node, &head->chain) {
            if (INET_MATCH(sk2, net, hash, acookie,
                        saddr, daddr, ports, dif))
                goto not_unique;
        }
    
    unique:
        /* Must record num and sport now. Otherwise we will see
         * in hash table socket with a funny identity. */
        inet->inet_num = lport;
        inet->inet_sport = htons(lport);
        sk->sk_hash = hash;
        WARN_ON(!sk_unhashed(sk));
        __sk_nulls_add_node_rcu(sk, &head->chain);
        if (tw) {
            /* Replace the matching TIME-WAIT entry. */
            twrefcnt = inet_twsk_unhash(tw);
            NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
        }
        spin_unlock(lock);
        if (twrefcnt)
            inet_twsk_put(tw);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
    
        if (twp) {
            /* Caller takes over descheduling/releasing the timewait socket. */
            *twp = tw;
        } else if (tw) {
            /* Silly. Should hash-dance instead... */
            inet_twsk_deschedule(tw, death_row);
    
            inet_twsk_put(tw);
        }
        return 0;
    
    not_unique:
        spin_unlock(lock);
        return -EADDRNOTAVAIL;
    }
     在listen系统调用中,inet_hash函数会将socket加入到listen连接表中:
    
    /*
     * __inet_hash - insert a socket into the hash table that fits its state.
     * @sk: socket to hash
     *
     * A socket in TCP_LISTEN state is added to the listening hash bucket
     * selected by inet_sk_listen_hashfn(), under that bucket's lock, and the
     * protocol in-use counter is incremented.  A socket in any other state is
     * handed to __inet_hash_nolisten() with no timewait socket.
     */
    static void __inet_hash(struct sock *sk)
    {
        struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
        struct inet_listen_hashbucket *bucket;

        if (sk->sk_state == TCP_LISTEN) {
            WARN_ON(!sk_unhashed(sk));
            bucket = &hinfo->listening_hash[inet_sk_listen_hashfn(sk)];

            spin_lock(&bucket->lock);
            __sk_nulls_add_node_rcu(sk, &bucket->head);
            sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
            spin_unlock(&bucket->lock);
            return;
        }

        /* Not listening: goes into the non-listening (ehash) table. */
        __inet_hash_nolisten(sk, NULL);
    }
    
    
    int __inet_hash_nolisten(struct sock *sk, struct inet_timewait_sock *tw)
    {
        struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
        struct hlist_nulls_head *list;
        spinlock_t *lock;
        struct inet_ehash_bucket *head;
        int twrefcnt = 0;
    
        WARN_ON(!sk_unhashed(sk));
    
        sk->sk_hash = inet_sk_ehashfn(sk);
        head = inet_ehash_bucket(hashinfo, sk->sk_hash);
        list = &head->chain;
        lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
    
        spin_lock(lock);
        __sk_nulls_add_node_rcu(sk, list);
        if (tw) {
            WARN_ON(sk->sk_hash != tw->tw_hash);
            twrefcnt = inet_twsk_unhash(tw);
        }
        spin_unlock(lock);
        sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
        return twrefcnt;
    }
    
    
    /*
     * inet_ehash_bucket - map a hash value to its ehash bucket.
     * @hashinfo: hash tables holding the ehash array and its mask
     * @hash:     hash value of the connection
     *
     * Returns a pointer to the bucket at index (hash & ehash_mask).
     */
    static inline struct inet_ehash_bucket *inet_ehash_bucket(
        struct inet_hashinfo *hashinfo,
        unsigned int hash)
    {
        const unsigned int slot = hash & hashinfo->ehash_mask;

        return hashinfo->ehash + slot;
    }
    /*
     * So a server-side socket is added to
     * sk->sk_prot->h.hashinfo->listening_hash by the listen system call, and a
     * client-side socket is added to sk->sk_prot->h.hashinfo->ehash by the
     * connect system call.  For both TCPv4 and TCPv6,
     * sk->sk_prot->h.hashinfo points to tcp_hashinfo.
     */
  • 相关阅读:
    php反射
    html video api控件总结
    linux CentOS7.2安装ffmpeg-3.0.2
    2019年7月12日星期五(C语言)
    2019年7月11日星期四(C语言)
    2019年7月10日星期三(C语言)
    2019年7月9日星期二(C语言)
    2019年7月8日星期一(C语言)
    2019年7月5日星期五(C语言)
    2019年7月4日星期四(C语言及LINUX命令)
  • 原文地址:https://www.cnblogs.com/codestack/p/11896557.html
Copyright © 2011-2022 走看看