1、连接建立定时器:
tcp 连接建立时,client 会发出 syn 然后等待 ack;server 收到 syn 后会回复 ack,同时也会带上自己的 syn,然后等待客户端回复 ack。如果 server 一直没有收到 ack,server 会超时重发几次 synack;若最终仍未收到 ack,连接建立过程将终止。
创建 request_sock 并进入 TCP_NEW_SYN_RECV 状态后,将其插入 ehash 表中,发送 synack,并初始化 rsk_timer 定时器,为重传 synack 做好准备。
/*
 * Hash a request_sock into the ehash table and arm the SYN-ACK
 * retransmission timer.
 *
 * Called once the listener has sent a SYN-ACK and the request has entered
 * TCP_NEW_SYN_RECV: rsk_timer will fire reqsk_timer_handler() to
 * retransmit the SYN-ACK if the client's final ACK does not arrive
 * within @timeout jiffies.
 */
static void reqsk_queue_hash_req(struct request_sock *req,
				 unsigned long timeout)
{
	req->num_retrans = 0;	/* no SYN-ACK retransmissions yet */
	req->num_timeout = 0;	/* no timer expirations yet */
	req->sk = NULL;		/* child socket not created yet */
	setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
	mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
	/* short-lived connections churn the establish hash table frequently */
	inet_ehash_insert(req_to_sk(req), NULL);
	/* before letting lookups find us, make sure all req fields
	 * are committed to memory and refcnt initialized.
	 */
	smp_wmb();
	atomic_set(&req->rsk_refcnt, 2 + 1);
}

/*
 * Start the SYN-ACK timer. This is the activation point of the SYN-ACK
 * timer during the three-way handshake.
 */
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
				   unsigned long timeout)
{
	reqsk_queue_hash_req(req, timeout);
	inet_csk_reqsk_queue_added(sk);
}

/* Bump the listener's pending-request counters for a new request. */
static inline void inet_csk_reqsk_queue_added(struct sock *sk)
{
	reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
}

static inline void reqsk_queue_added(struct request_sock_queue *queue)
{
	atomic_inc(&queue->young);	/* requests whose SYN-ACK was never retransmitted */
	atomic_inc(&queue->qlen);	/* requests still completing the handshake */
}
/* * 在TCP传输控制块中有一个用于存放连接请求块(处于SYN_RECV状态以及 * 已连接但未被accept的传输控制块)的容器 */ //该结构在inet_connection_sock中的icsk_accept_queue struct request_sock_queue { spinlock_t rskq_lock; /* * 保存相关套接字TCP层的选项TCP_DEFER_ACCEPT的值,参见 * TCP_DEFER_ACCEPT * 保存的是启用TCP_DEFER_ACCEPT时允许重传SYN+ACK段的次数。 * 注意:如果启用了TCP_DEFER_ACCEPT选项,将使用rskq_defer_accept * 作为允许重传的最大次数,不再是sysctl_tcp_synack_retries, * 参见inet_csk_reqsk_queue_prune()。 */ u8 rskq_defer_accept; u32 synflood_warned; atomic_t qlen; atomic_t young; /* * rskq_accept_head和rskq_accept_tail表示的链表保存的是 * 已完成连接建立过程的连接请求块 服务器端accept的时候 struct sock是从这个队列上面取出来的 已经建立连接的连接的节点添加到这里, 这些链表的节点信息结构体是tcp_request_sock。 当应用程序调用accept函数后,会从这里面取走这个tcp_request_sock 当应用程序accept的时候, 会调用reqsk_queue_get_child取走这个新创建的sock, 同时就需要把这个取出的tcp_request_sock释放掉 */ struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. */ };
/*
 * SYN-ACK (connection-establishment) timer handler.
 * On each expiry, decides whether to retransmit the SYN-ACK with
 * exponential backoff, or to drop the half-open request for good.
 */
static void reqsk_timer_handler(unsigned long data)
{
	struct request_sock *req = (struct request_sock *)data;
	struct sock *sk_listener = req->rsk_listener;
	struct net *net = sock_net(sk_listener);
	struct inet_connection_sock *icsk = inet_csk(sk_listener);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	int qlen, expire = 0, resend = 0;
	int max_retries, thresh;
	u8 defer_accept;

	if (sk_state_load(sk_listener) != TCP_LISTEN)
		goto drop;

	/* The TCP_SYNCNT socket option takes precedence over the sysctl. */
	max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
	thresh = max_retries;	/* default is typically 3-5 */
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to established socket) for first timeout.
	 * If synack was not acknowledged for 1 second, it means
	 * one of the following things: synack was lost, ack was lost,
	 * rtt is high or nobody planned to ack (i.e. synflood).
	 * When server is a bit loaded, queue is populated with old
	 * open requests, reducing effective size of queue.
	 * When server is well loaded, queue size reduces to zero
	 * after several minutes of work. It is not synflood,
	 * it is normal operation. The solution is pruning
	 * too old entries overriding normal timeout, when
	 * situation becomes dangerous.
	 *
	 * Essentially, we reserve half of room for young
	 * embrions; and abort old ones without pity, if old
	 * ones are about to clog our table.
	 */
	qlen = reqsk_queue_len(queue);	/* half-open requests currently queued */
	if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
		/* Half-open requests exceed half of the backlog limit.
		 * "young" = requests whose SYN-ACK was never retransmitted;
		 * multiply by 2 rather than dividing qlen by 2. */
		int young = reqsk_queue_len_young(queue) << 1;

		while (thresh > 2) {
			if (qlen < young)
				/* more than half the queue is still young
				 * (waiting for the client's third ACK without
				 * having timed out): queue is healthy, keep
				 * the current threshold */
				break;
			thresh--;
			young <<= 1;
		}
	}
	/* retry count specified by TCP_DEFER_ACCEPT, if enabled */
	defer_accept = READ_ONCE(queue->rskq_defer_accept);
	if (defer_accept)
		max_retries = defer_accept;
	/* decide whether this request expired and whether to resend SYN-ACK */
	syn_ack_recalc(req, thresh, max_retries, defer_accept,
		       &expire, &resend);
	req->rsk_ops->syn_ack_timeout(req);
	if (!expire &&		/* retry limit not exceeded (for defer_accept: ACK seen but no data yet) */
	    (!resend ||		/* no retransmission needed, or */
	     !inet_rtx_syn_ack(sk_listener, req) ||	/* retransmit succeeded (tcp_v4_send_synack), or */
	     inet_rsk(req)->acked)) {	/* retransmit failed but ACK was received: defer_accept case */
		unsigned long timeo;

		if (req->num_timeout++ == 0)
			atomic_dec(&queue->young);	/* first timeout: no longer "young" */
		/* exponential backoff of the timeout, capped at TCP_RTO_MAX */
		timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
		mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
		return;
	}
drop:	/* retry limit exceeded: remove req from ehash and kill the timer */
	inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
/* Decide when to expire the request and when to resend SYN-ACK */ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, const int max_retries, const u8 rskq_defer_accept, int *expire, int *resend) { if (!rskq_defer_accept) { //不考虑延时accept的情况下,实现逻辑 超时次数已经大于限定的阈值,说明已经超时,需要销毁此请求套接口 *expire = req->num_timeout >= thresh; *resend = 1; return; } //如果当前的超时次数大于阈值thresh,并且大于最大重传次数(即延时accept--max_retries的次数),判定为超时;同时 //同时acked等于0(即未接收到单独的ACK报文)也判定为超时,其它情况下判定未超时 *expire = req->num_timeout >= thresh && (!inet_rsk(req)->acked || req->num_timeout >= max_retries); /* * Do not resend while waiting for data after ACK, * start to resend on end of deferring period to give * last chance for data or ACK to create established socket. 重传resend,如果未接收到单独的ACK报文或者是已到延时accept的最后*/ *resend = !inet_rsk(req)->acked || req->num_timeout >= rskq_defer_accept - 1; }
延时ACCEPT功能
用户层可通过setsockopt设置延时accept功能
开启此功能,处理逻辑位于函数tcp_check_req中。如果仅仅是接收到客户端回复的第三个握手ACK报文,无数据,不进行处理,设置acked为1。反之如果接收到数据和ACK,进行正常处理,忽略延时accept功能。
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen) { /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL }
注意:
/*
 * Hash a request_sock into the established hash table and arm the
 * SYN-ACK retransmission timer (reqsk_timer_handler).
 * Runs when the request enters TCP_NEW_SYN_RECV after the SYN-ACK is sent.
 */
static void reqsk_queue_hash_req(struct request_sock *req,
unsigned long timeout)
{
/* fresh request: nothing retransmitted or timed out yet, no child socket */
req->num_retrans = 0;
req->num_timeout = 0;
req->sk = NULL;
/* arm the timer that retransmits the SYN-ACK on expiry */
setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
inet_ehash_insert(req_to_sk(req), NULL);//short-lived connections churn the establish hash table frequently
/* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized.
*/
smp_wmb();
atomic_set(&req->rsk_refcnt, 2 + 1);
}
/* insert a socket into ehash, and eventually remove another one
 * (the other one can be a SYN_RECV or TIMEWAIT socket)
 */
bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{/* --> tcp_hashinfo. Note that tcp_hashinfo.ehash holds not only
 * established TCP sockets but sockets in every state except LISTEN. */
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list;
struct inet_ehash_bucket *head;
spinlock_t *lock;
bool ret = true;
WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
/* in the current Linux TCP implementation each hash bucket has its own
 * spinlock; on multi-core CPUs the add/remove paths still contend for it */
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock(lock);
if (osk) {
WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
/* unhash the old SYN_RECV/TIMEWAIT entry before adding the new one */
ret = sk_nulls_del_node_init_rcu(osk);
}
if (ret)
__sk_nulls_add_node_rcu(sk, list);
spin_unlock(lock);
return ret;
}