主要源文件:linux-2.6.37/net/ipv4/tcp_cong.c
本文主要分析RENO及TCP拥塞控制基础的实现
======================================================================================================
struct sock *sk 和 struct tcp_sock *tp 的转换
在 include/linux/tcp.h 中定义了:static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk ; } 因此给定 struct sock *sk,可以通过 struct tcp_sock *tp = tcp_sk(sk) ; 得到对应的 tcp_sock 指针。
tcp_sock结构
/*
 * Excerpt of struct tcp_sock: only the window-management and
 * congestion-control members discussed in this article are shown;
 * "..." marks members that were left out.
 */
struct tcp_sock {
	...
	u32 window_clamp ;	/* Maximal window to advertise */
	u32 rcv_ssthresh ;	/* Current window clamp */
	u32 rcv_wnd ;		/* Current receiver window */
	...
	/*
	 * snd_wll records the first sequence number of the segment that
	 * caused the most recent send-window update. It is mainly used the
	 * next time to decide whether the send window needs updating.
	 */
	u32 snd_wll ;		/* Sequence for window update */
	u32 snd_wnd ;		/* Send window size, taken directly from the TCP header of segments received from the peer */
	/* Maximal window ever seen from peer: the largest window the peer has advertised */
	u32 max_window ;
	/* First byte we want an ack for: the left edge of the send window */
	u32 snd_una ;
	...
	/*
	 * Slow start and congestion control
	 */
	u32 snd_ssthresh ;	/* Slow start size threshold */
	u32 snd_cwnd ;		/* Sending congestion window */
	/*
	 * Author's note: the number of segments already sent within the
	 * current congestion window.
	 * NOTE(review): tcp_cong_avoid_ai() bumps this once per call and
	 * resets it when cwnd grows -- confirm the wording above.
	 */
	u32 snd_cwnd_cnt ;	/* Linear increase counter */
	u32 snd_cwnd_clamp ;	/* Do not allow snd_cwnd to grow above this */
	...
	u32 mss_cache ;		/* cached effective mss , not including SACKS */
	u32 bytes_acked ;	/* Appropriate Byte Counting - RFC3465 */
	...
}
拥塞避免算法关键部分
/*
 * Additive increase for congestion avoidance.
 *
 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w):
 * snd_cwnd_cnt is bumped once per call until it reaches @w, at which
 * point the window grows by one segment (never beyond snd_cwnd_clamp)
 * and the counter starts over.
 *
 * @tp: TCP socket whose congestion window is being adjusted
 * @w:  number of calls that must accumulate before cwnd grows by one
 */
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
{
	/* Not enough credit yet: just count this call. */
	if (tp->snd_cwnd_cnt < w) {
		tp->snd_cwnd_cnt++;
		return;
	}

	/* A full round of credit: open the window, honouring the clamp. */
	if (tp->snd_cwnd < tp->snd_cwnd_clamp)
		tp->snd_cwnd++;
	tp->snd_cwnd_cnt = 0;
}
EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
慢启动算法
/*
 * Slow start: grow the congestion window in response to an ACK.
 *
 * Credit is accumulated in tp->snd_cwnd_cnt; every full snd_cwnd worth
 * of credit opens the window by one segment, never beyond
 * snd_cwnd_clamp. With the default exponential increase each call adds
 * snd_cwnd credits, i.e. cwnd grows by one segment per call.
 *
 * @tp: TCP socket whose congestion window is being adjusted
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int cnt;	/* increase in packets */

	/* RFC3465: ABC slow start
	 * Increase only after a full MSS of bytes is acked
	 *
	 * TCP sender SHOULD increase cwnd by the number of
	 * previously unacknowledged bytes ACKed by each incoming
	 * acknowledgment, provided the increase is not more than L
	 */
	/* Less than one MSS has been acked so far: nothing to do yet. */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	/*
	 * Author's question: shouldn't we be in congestion avoidance here?
	 * Not in this function: once cwnd exceeds sysctl_tcp_max_ssthresh
	 * the credit per call is capped at max_ssthresh / 2 instead of
	 * cwnd, which slows the growth down ("limited slow start") while
	 * still staying in slow start.
	 */
	if (sysctl_tcp_max_ssthresh > 0 &&
	    tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		cnt = sysctl_tcp_max_ssthresh >> 1;	/* limited slow start */
	else
		cnt = tp->snd_cwnd;			/* exponential increase */

	/* RFC3465: ABC
	 * We MAY increase by 2 if discovered delayed ack
	 */
	/*
	 * If the receiver uses delayed ACKs, this ACK stands for two
	 * MSS-sized segments, so double the credit.
	 */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
		cnt <<= 1;
	tp->bytes_acked = 0;

	/*
	 * snd_cwnd_cnt now holds the remainder left over from earlier
	 * calls plus cnt; convert every full snd_cwnd of credit into one
	 * more segment of window.
	 */
	tp->snd_cwnd_cnt += cnt;
	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt -= tp->snd_cwnd;
		if (tp->snd_cwnd < tp->snd_cwnd_clamp)
			tp->snd_cwnd++;
	}
}
EXPORT_SYMBOL_GPL(tcp_slow_start);
代表拥塞算法的结构体
#define TCP_CA_NAME_MAX 16struct tcp_congestion_ops { struct list_head list ; unsigned long flags ; /* initialize private data (optional) */ void (*init) (struct sock *sk) ; /* cleanup private data (optional) */ void (*release) (struct sock *sk) ; /* return slow start threshold (required) */ u32 (*ssthresh) (struct sock *sk) ; /* lower bound for congestion window (optional) */ u32 (*min_cwnd) (const struct sock *sk) ; /* do new cwnd calculation (required) */ void (*cong_avoid) (struct sock *sk , u32 ack , u32 in_flight ) ; /* call before changing ca_state (optional) */ void (*set_state) (struct sock *sk , u8 new_state) ; /* call when cwnd event occurs (optional) */ void (*cwnd_event) (struct sock *sk , enum tcp_ca_event ev) ; /* new value of cwnd after loss (optional) */ u32 (*undo_cwnd) (struct sock *sk) ; /* hook for packet ack accounting (optional) */ void (*pkts_acked) (struct sock *sk , u32 num_acked , s32 rtt_us) ; /* get info for inet_diag (optional) */ void (*get_info) (struct sock *sk , u32 ext , struct sk_buff *skb) ; char name[TCP_CA_NAME_MAX] ; struct module *owner ; }
在 tcp_cong.c 中,有以下全局变量:
/* Limited-slow-start threshold; tcp_slow_start() only applies it when it is > 0. */
int sysctl_tcp_max_ssthresh = 0 ;
/* DEFINE_SPINLOCK(x) is defined as: spinlock_t x = __SPIN_LOCK_UNLOCKED(x) */
static DEFINE_SPINLOCK( tcp_cong_list_lock ) ;
static LIST_HEAD( tcp_cong_list ) ; // list of TCP congestion control algorithms; its entries are struct tcp_congestion_ops
/*
BUG_ON( ) ; 如果BUG_ON中的条件为真就调用BUG,它输出一些信息,然后调用panic函数挂起系统。
char *strncpy( char *dest , const char *src , size_t n ) ;
它与strcpy的不同之处在于最多只复制n个字符,而不是一直复制到src结尾的'\0'(含)为止。
当src的长度小于n时,dest中剩余的空间用'\0'填充;否则只复制n个字符到dest,并且不会在末尾追加'\0',因此调用者需要自行保证dest以'\0'结尾。
rcu_read_lock() // 读者在读取由RCU保护的共享数据时使用该函数标记它进入读端临界区。
rcu_read_unlock() // 该函数与rcu_read_lock配对使用,用以标记读者退出读端临界区。
*/
对拥塞控制算法的一些操作(读写增减注册等)
/*
 * Get current default congestion control.
 *
 * Copies the name of the first entry of tcp_cong_list into @name.
 *
 * @name: destination buffer; up to TCP_CA_NAME_MAX bytes are written.
 *
 * NOTE(review): strncpy() does not append a terminating NUL when the
 * source fills all TCP_CA_NAME_MAX bytes, so the result is only a valid
 * string if the registered name is shorter than that.
 */
void tcp_get_default_congestion_control(char *name)
{
	struct tcp_congestion_ops *first_ca;

	/* We will always have reno */
	BUG_ON(list_empty(&tcp_cong_list));

	/* The list head is read inside an RCU read-side critical section. */
	rcu_read_lock();
	first_ca = list_entry(tcp_cong_list.next,
			      struct tcp_congestion_ops, list);
	strncpy(name, first_ca->name, TCP_CA_NAME_MAX);
	rcu_read_unlock();
}
struct sock——representation of sockets
struct inet_sock——representation of INET sockets
struct inet_connection_sock——INET connection oriented sockets
struct tcp_sock——tcp sockets
以上几种socket越分越细,比如inet_connection_sock是在inet_sock上的扩展,具有自己特有的属性。
tcp_sock是TCP协议专用的socket表示,它在struct inet_connection_sock的基础上进行扩展,主要增加了滑动窗口协议、拥塞避免算法等TCP专有的属性。
/*
 * Excerpt of struct inet_connection_sock: only the congestion-control
 * related members are shown; "..." marks members that were left out.
 */
struct inet_connection_sock {
	...
	// Pluggable congestion control hook
	const struct tcp_congestion_ops *icsk_ca_ops ;
	...
	// 16 u32 words; presumably the per-connection private storage used by
	// the congestion control algorithm -- TODO confirm
	u32 icsk_ca_priv[16] ;
// Size in bytes of icsk_ca_priv[]
#define ICSK_CA_PRIV_SIZE (16*sizeof(u32))
}
举例://假设有一个已经初始化的struct sock *sk
/* Example: given a struct sock *sk, log the name of its congestion control algorithm. */
struct inet_connection_sock *icsk = inet_csk( sk ) ;
printk(KERN_INFO "%s" , icsk->icsk_ca_ops->name) ; // name of the congestion control algorithm used by this connection
/*
 * Excerpt of struct inet_sock: only the addressing members used to match
 * incoming packets to a socket are shown; "..." marks members that were
 * left out.
 */
struct inet_sock {
	...
	/* Socket demultiplex comparisons on incoming packets */
	__be32 inet_daddr ;
	__be16 inet_dport ;
	__be32 inet_saddr ;
	__be16 inet_sport ;
	__be16 inet_num ; // local port
	__be32 inet_rcv_saddr ; // Bound local IPv4 addr
	...
}
/* Built list of non-restricted congestion control values*/ void tcp_get_allowed_congestion_control( char *buf , size_t maxlen) { struct tcp_congestion_ops *ca ; size_t offs = 0 ; *buf = '\0' ; //有必要? rcu_read_lock() ; list_for_each_entry( ca , &tcp_cong_list , list ) { if( !( ca->flags & TCP_CONG_NON_RESTRICTED)) //排除有限制的。限制和非限制区别? continue; offs += snprintf( buf+offs , maxlen-offs , "%s%s" , offs == 0?"" : " " , ca->name) ; } rcu_read_unlock() ; }
/*
 * Look up a registered congestion control algorithm by name.
 *
 * Simple linear search, don't expect many entries!
 *
 * @name: NUL-terminated algorithm name to look for
 *
 * Returns the matching entry of tcp_cong_list, or NULL if no entry has
 * that name.
 */
static struct tcp_congestion_ops *tcp_ca_find(const char *name)
{
	struct tcp_congestion_ops *ca;
	struct tcp_congestion_ops *match = NULL;

	list_for_each_entry_rcu(ca, &tcp_cong_list, list) {
		if (!strcmp(ca->name, name)) {
			match = ca;
			break;
		}
	}

	return match;
}
/* * Attach new congestion control algorithm to the list * of available options. */ int tcp_register_congestion_control( struct tcp_congestion_ops *ca ) { int ret = 0 ; /* all algorithms must implement ssthresh and cong_avoid ops */ if ( !ca->ssthresh || !ca->cong_avoid ) { printk(KERN_ERR "TCP %s does not implement required ops\n", ca->name) ; return -EINVAL ; } spin_lock(&tcp_cong_list_lock) ; if( tcp_ca_find (ca->name)) { printk(KERN_NOTICE "TCP %s already registered\n", ca->name) ; ret = -EEXIST; //不能直接return,不然会造成死锁 } else { list_add_tail_rcu( &ca->list , &tcp_cong_list) ; printk(KERN_INFO "TCP %s registered\n", ca->name) ; } spin_unlock(&tcp_cong_list_lock) ; return ret ; }
======================================================================================================