zoukankan      html  css  js  c++  java
  • TCP拥塞控制算法内核实现剖析(一)

    内核版本:2.6.37

    主要源文件:linux-2.6.37/net/ipv4/tcp_cong.c

    本文主要分析RENO及TCP拥塞控制基础的实现

    ======================================================================================================

    struct sock *sk 和 struct tcp_sock *tp 的转换

    在include/linux/tcp.h中,
    /*
     * View a generic socket pointer as a TCP socket pointer.
     *
     * This is a plain pointer cast: the address is returned unchanged and
     * nothing is checked at run time.
     */
    static inline struct tcp_sock *tcp_sk(const struct sock *sk)
    {
            struct tcp_sock *tp = (struct tcp_sock *)sk;

            return tp;
    }
    
    给出struct sock *sk,
    struct tcp_sock *tp = tcp_sk(sk) ;

    tcp_sock结构

    /*
     * Excerpt of struct tcp_sock: only the window and congestion-control
     * members discussed here are listed; "..." marks members left out.
     */
    struct tcp_sock
    {
            ...
            u32 window_clamp ; /* Maximal window to advertise */
            u32 rcv_ssthresh ; /* Current window clamp */
            u32 rcv_wnd ; /* Current receiver window */
            ...
            /*
             * snd_wl1 records the first sequence number of the segment that
             * caused the most recent send-window update. It is consulted the
             * next time to decide whether the send window must be updated.
             */
            u32 snd_wl1 ; /* Sequence for window update */
            /* Size of the send window, taken directly from the TCP header of
             * segments received from the peer.
             */
            u32 snd_wnd ;
            /* Maximal window ever seen from peer: the largest window the peer
             * has advertised so far.
             */
            u32 max_window ;
            /* First byte we want an ack for: the left edge of the send window. */
            u32 snd_una ;
            ...
            /*
             * Slow start and congestion control
             */
            u32 snd_ssthresh ; /* Slow start size threshold */
            u32 snd_cwnd ; /* Sending congestion window */
            /*
             * Linear increase counter. The article describes it as the number
             * of segments already sent within the current congestion window;
             * tcp_cong_avoid_ai() and tcp_slow_start() use it as the credit
             * accumulated towards the next snd_cwnd increment.
             */
            u32 snd_cwnd_cnt ;
            u32 snd_cwnd_clamp ; /* Do not allow snd_cwnd to grow above this */
            ...
            u32 mss_cache ; /* cached effective mss , not including SACKS */
            u32 bytes_acked ; /* Appropriate Byte Counting - RFC3465 */
            ...
    };
    


    拥塞避免算法关键部分

    /*
     * Additive increase for congestion avoidance.
     *
     * snd_cwnd_cnt is bumped on every call until it has reached w; the call
     * that finds it at or above w resets it and grows snd_cwnd by one, unless
     * snd_cwnd has already reached snd_cwnd_clamp. The net effect approximates
     * tp->snd_cwnd += 1 / w per call.
     */
    void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
    {
            /* Not enough credit yet: just count this call. */
            if (tp->snd_cwnd_cnt < w) {
                    tp->snd_cwnd_cnt++;
                    return;
            }

            /* Credit reached w: start over and open the window by one. */
            tp->snd_cwnd_cnt = 0;
            if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                    tp->snd_cwnd++;
    }
    EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai) ;
    


    慢启动算法

    /*
     * Slow start: grow the congestion window in response to an ACK.
     *
     * A per-call credit (cnt) is added to snd_cwnd_cnt, and every full
     * snd_cwnd's worth of accumulated credit raises snd_cwnd by one, never
     * beyond snd_cwnd_clamp. bytes_acked is reset once the ACK is consumed.
     */
    void tcp_slow_start(struct tcp_sock *tp)
    {
            int cnt; /* increase in packets */

            /* RFC3465 : ABC slow start
             * Increase only after a full MSS of bytes is acked
             *
             * TCP sender SHOULD increase cwnd by the number of
             * previously unacknowledged bytes ACKed by each incoming
             * acknowledgment , provided the increase is not more than L
             */
            /* Less than one MSS has been acked so far: nothing to do yet. */
            if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
                    return;

            /*
             * Author's open question: shouldn't congestion avoidance take over
             * at this point?
             * NOTE(review): this test is against sysctl_tcp_max_ssthresh, not
             * snd_ssthresh, and it only caps the per-call credit; presumably
             * this is RFC 3742 limited slow start -- confirm.
             */
            if (sysctl_tcp_max_ssthresh > 0 &&
                tp->snd_cwnd > sysctl_tcp_max_ssthresh)
                    cnt = sysctl_tcp_max_ssthresh >> 1; /* limited slow start */
            else
                    cnt = tp->snd_cwnd; /* exponential increase */

            /* RFC3465 : ABC
             * We MAY increase by 2 if discovered delayed ack
             */
            /* With delayed ACKs on the receiver, this ACK covers two MSS-sized
             * segments, so the credit is doubled.
             */
            if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
                    cnt <<= 1;

            tp->bytes_acked = 0;
            /* Add this call's credit on top of whatever was left over. */
            tp->snd_cwnd_cnt += cnt;

            /* Turn credit into window growth: each snd_cwnd's worth of credit
             * is consumed and opens the window by one, up to the clamp.
             */
            while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
                    tp->snd_cwnd_cnt -= tp->snd_cwnd;
                    if (tp->snd_cwnd < tp->snd_cwnd_clamp)
                            tp->snd_cwnd++;
            }
    }
    EXPORT_SYMBOL_GPL( tcp_slow_start ) ;


    代表拥塞算法的结构体

    #define TCP_CA_NAME_MAX 16struct tcp_congestion_ops {
            struct list_head list ;
            unsigned long flags ;
            /* initialize private data (optional) */
            void (*init) (struct sock *sk) ;
            /* cleanup private data (optional) */ 
            void (*release) (struct sock *sk) ;
            /* return slow start threshold (required) */
            u32 (*ssthresh) (struct sock *sk) ;
            /* lower bound for congestion window (optional) */
            u32 (*min_cwnd) (const struct sock *sk) ;
            /* do new cwnd calculation (required) */
            void (*cong_avoid) (struct sock *sk , u32 ack , u32 in_flight ) ;
            /* call before changing ca_state (optional) */
            void (*set_state) (struct sock *sk , u8 new_state) ;
            /* call when cwnd event occurs (optional) */
            void (*cwnd_event) (struct sock *sk , enum tcp_ca_event ev) ;
            /* new value of cwnd after loss (optional) */
            u32 (*undo_cwnd) (struct sock *sk) ;
            /* hook for packet ack accounting (optional) */
            void (*pkts_acked) (struct sock *sk , u32 num_acked , s32 rtt_us) ;
            /* get info for inet_diag (optional) */
            void (*get_info) (struct sock *sk , u32 ext , struct sk_buff *skb) ;
            char name[TCP_CA_NAME_MAX] ;
            struct module *owner ;
    }

    在tcp_cong.c中,有全局变量:

    /* Limit consulted by tcp_slow_start(): when it is > 0 and snd_cwnd exceeds
     * it, the per-call credit becomes half of this value. Starts out as 0, in
     * which case that branch is never taken.
     */
    int sysctl_tcp_max_ssthresh = 0 ;

    /* For reference: #define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) */

    /* Held by tcp_register_congestion_control() around the lookup in and the
     * insertion into tcp_cong_list.
     */
    static DEFINE_SPINLOCK( tcp_cong_list_lock ) ;

    static LIST_HEAD( tcp_cong_list ) ; // list of TCP congestion control algorithms; its elements are struct tcp_congestion_ops

    /*

     BUG_ON( ) ; 如果BUG_ON中的条件为真就调用BUG,它输出一些信息,然后调用panic函数挂起系统。

     char *strncpy( char *dest , const char *src , size_t n ) ;

     它与strcpy的不同之处在于最多只复制n个字符,而不是一直复制到src结尾的'\0'(含)为止。

      当src的长度小于n时,dest中剩余的空间用'\0'填充;否则只复制n个字符到dest,不会追加'\0'。因此调用者需要自行保证dest以'\0'结尾。

     rcu_read_lock() // 读者在读取由RCU保护的共享数据时使用该函数标记它进入读端临界区。

     rcu_read_unlock() // 该函数与rcu_read_lock配对使用,用以标记读者退出读端临界区。

    */

    对拥塞控制算法的一些操作(读写增减注册等)

    /*
     * Report the current default congestion control algorithm, i.e. the
     * first entry of tcp_cong_list.
     *
     * name receives the algorithm name via strncpy() with a length of
     * TCP_CA_NAME_MAX, so it must have room for that many bytes.
     */
    void tcp_get_default_congestion_control(char *name)
    {
            struct tcp_congestion_ops *head_ca;

            /* The list is expected to always contain at least reno. */
            BUG_ON(list_empty(&tcp_cong_list));

            rcu_read_lock();
            head_ca = list_entry(tcp_cong_list.next,
                                 struct tcp_congestion_ops, list);
            strncpy(name, head_ca->name, TCP_CA_NAME_MAX);
            rcu_read_unlock();
    }
    

    struct sock——representation of sockets

     struct inet_sock——representation of INET sockets

     struct inet_connection_sock——INET connection oriented sockets

     struct tcp_sock——tcp sockets

     以上几种socket越分越细,比如inet_connection_sock是在inet_sock上的扩展,具有自己特有的属性。

     tcp_sock是TCP协议专用的socket表示,它在struct inet_connection_sock的基础上进行扩展,主要增加了滑动窗口、拥塞避免算法等TCP专有的属性。

    /*
     * Excerpt of struct inet_connection_sock: only the congestion-control
     * members are listed; "..." marks members left out.
     */
    struct inet_connection_sock {
            ...
            /* Pluggable congestion control hook */
            const struct tcp_congestion_ops *icsk_ca_ops ;
            ...

            /* 16 u32 words; ICSK_CA_PRIV_SIZE is the size of this array in
             * bytes. Presumably private per-connection state of the attached
             * algorithm -- confirm against the kernel headers.
             */
            u32 icsk_ca_priv[16] ;
    #define ICSK_CA_PRIV_SIZE (16*sizeof(u32))
    };
    

    举例://假设已有一个初始化好的struct sock *sk

    struct inet_connection_sock *icsk = inet_csk( sk ) ;

    printk(KERN_INFO "%s" , icsk->icsk_ca_ops->name) ; // name of the congestion control algorithm used by this connection

    /*
     * Excerpt of struct inet_sock: only the address and port members are
     * listed; "..." marks members left out.
     */
    struct inet_sock {
            ...
            /* Socket demultiplex comparisons on incoming packets */
            __be32 inet_daddr ;
            __be16 inet_dport ;
            __be32 inet_saddr ;
            __be16 inet_sport ;
            __be16 inet_num ; /* local port */
            __be32 inet_rcv_saddr ; /* Bound local IPv4 addr */
            ...
    };
    


     

    /*
     * Build the list of non-restricted congestion control algorithms.
     *
     * Writes into buf (capacity maxlen bytes) the names of all entries of
     * tcp_cong_list that carry TCP_CONG_NON_RESTRICTED, separated by single
     * spaces. Output stops at the first name that no longer fits.
     */
    void tcp_get_allowed_congestion_control(char *buf, size_t maxlen)
    {
            struct tcp_congestion_ops *ca;
            size_t offs = 0;

            /* Needed: if no entry passes the filter below, nothing else
             * writes to buf.
             */
            *buf = '\0';

            rcu_read_lock();
            /* The list is updated with list_add_tail_rcu(), so a reader under
             * rcu_read_lock() uses the _rcu iterator, as tcp_ca_find() does.
             */
            list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
                    /* Only entries flagged as non-restricted are listed. */
                    if (!(ca->flags & TCP_CONG_NON_RESTRICTED))
                            continue;
                    offs += snprintf(buf + offs, maxlen - offs, "%s%s",
                                     offs == 0 ? "" : " ", ca->name);
                    /* snprintf() returns the length it wanted to write, so on
                     * truncation offs reaches or passes maxlen; stop before
                     * maxlen - offs wraps around.
                     */
                    if (offs >= maxlen)
                            break;
            }
            rcu_read_unlock();
    }


     

    /*
     * Look up a congestion control algorithm by name.
     *
     * Walks tcp_cong_list front to back and returns the first entry whose
     * name equals the given string, or NULL when none matches. The scan is
     * linear; the list is not expected to hold many entries.
     */
    static struct tcp_congestion_ops *tcp_ca_find(const char *name)
    {
            struct tcp_congestion_ops *found = NULL;
            struct tcp_congestion_ops *pos;

            list_for_each_entry_rcu(pos, &tcp_cong_list, list) {
                    if (!strcmp(pos->name, name)) {
                            found = pos;
                            break;
                    }
            }

            return found;
    }


     

    /*
     * Add a congestion control algorithm to the tail of the list of
     * available algorithms.
     *
     * Returns 0 on success, -EINVAL when ca lacks the ssthresh or cong_avoid
     * hook, and -EEXIST when an algorithm of the same name is already
     * registered.
     */
    int tcp_register_congestion_control(struct tcp_congestion_ops *ca)
    {
            int err;

            /* Every algorithm has to provide both ssthresh and cong_avoid. */
            if (!(ca->ssthresh && ca->cong_avoid)) {
                    printk(KERN_ERR "TCP %s does not implement required ops\n",
                           ca->name);
                    return -EINVAL;
            }

            spin_lock(&tcp_cong_list_lock);
            if (!tcp_ca_find(ca->name)) {
                    list_add_tail_rcu(&ca->list, &tcp_cong_list);
                    printk(KERN_INFO "TCP %s registered\n", ca->name);
                    err = 0;
            } else {
                    printk(KERN_NOTICE "TCP %s already registered\n", ca->name);
                    /* Record the error rather than returning from here: the
                     * spinlock must be released first.
                     */
                    err = -EEXIST;
            }
            spin_unlock(&tcp_cong_list_lock);

            return err;
    }


    ======================================================================================================

  • 相关阅读:
    HDU1258 Sum It Up(DFS)
    hdu 1078 FatMouse and Cheese(记忆化搜索)
    HDU1072 Nightmare (bfs+贪心)
    HDU 2102 A计划 经典搜索
    hdu 1180诡异的楼梯(bfs)
    HDU 1065.I Think I Need a Houseboat
    559_N叉树的最大深度
    236_二叉树的最近公共祖先
    589_N叉树的前序遍历
    每天进步一点点
  • 原文地址:https://www.cnblogs.com/aiwz/p/6333404.html
Copyright © 2011-2022 走看看