重传队列实际上是发送队列(sk->sk_write_queue)中位于发送头(tcp_send_head(sk))之前的那一部分,保存着已经发送但尚未被确认的数据段。
当有新的数据段被确认时,需要把这些段从重传队列中删除,同时更新一些变量,包括
packets_out、sacked_out、lost_out、retrans_out等。
对于非重复的ACK,会进行RTT采样,用于更新srtt和rto等时延信息。
本文主要内容:tcp_clean_rtx_queue()的实现。
内核版本:3.2.12
Author:zhangskd @ csdn
函数实现
Q:什么是重传队列?
A:重传队列实际上是发送队列(sk->sk_write_queue)中位于发送头(tcp_send_head(sk))之前的那一部分,保存着已经发送但尚未被确认的数据段。
The retransmit queue is implemented using a linked list to hold all the packets currently in
flight to the receiver.
Q:tcp_clean_rtx_queue()是干什么的?
A:tcp_clean_rtx_queue() is called to remove and free the acknowledged packets from the
retransmit queue, and packets_out is decremented by the number of freed packets.
/*
 * Remove acknowledged frames from the retransmission queue.
 *
 * If our packet is before the ack sequence we can discard it as it's
 * confirmed to have arrived at the other end.
 *
 * @sk:            socket whose write queue is cleaned.
 * @prior_fackets: compared against the position of the first acked,
 *                 never-retransmitted, never-SACKed segment to detect
 *                 reordering (presumably fackets_out as it was before this
 *                 ACK was processed -- confirm against the caller).
 * @prior_snd_una: used together with the current tp->snd_una to decide
 *                 whether the urgent pointer must be pulled forward.
 *
 * tp->snd_una is expected to be already updated by the caller: everything
 * from the queue head up to snd_una is what this ACK acknowledges.
 *
 * Returns the FLAG_* bits accumulated while walking the queue.
 */
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
			       u32 prior_snd_una)
{
	struct tcp_sock *tp = tcp_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct sk_buff *skb;
	u32 now = tcp_time_stamp;	/* current time, used for RTT samples */
	int fully_acked = 1;		/* cleared when a TSO skb is only partly acked */
	int flag = 0;
	u32 pkts_acked = 0;
	u32 reord = tp->packets_out;
	u32 prior_sacked = tp->sacked_out;
	s32 seq_rtt = -1;
	s32 ca_seq_rtt = -1;
	ktime_t last_ackt = net_invalid_timestamp();	/* starts as the "invalid" stamp */

	/*
	 * Walk the write queue from its head and stop at the send head.
	 * The walk ends at the first segment that is not covered by snd_una,
	 * so when snd_una did not advance the loop exits immediately.
	 */
	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
		u32 acked_pcount;
		u32 sacked = scb->sacked;	/* scoreboard bits of this skb */

		/* Determine how many packets and what bytes were acked, tso and else. */
		if (after(scb->end_seq, tp->snd_una)) {
			/*
			 * Not fully acked. Stop unless this is a multi-segment
			 * (TSO) skb whose head lies before snd_una.
			 */
			if (tcp_skb_pcount(skb) == 1 ||
			    !after(tp->snd_una, scb->seq))
				break;

			/* Trim the acked part and count the segments it covered. */
			acked_pcount = tcp_tso_acked(sk, skb);
			if (!acked_pcount)	/* trimming failed or nothing acked */
				break;

			fully_acked = 0;
		} else {
			acked_pcount = tcp_skb_pcount(skb);
		}

		if (sacked & TCPCB_RETRANS) {
			/* The segment was retransmitted at some point. */
			if (sacked & TCPCB_SACKED_RETRANS)
				tp->retrans_out -= acked_pcount;
			flag |= FLAG_RETRANS_DATA_ACKED;
			/* The RTT of a retransmitted segment is ambiguous: drop samples. */
			ca_seq_rtt = -1;
			seq_rtt = -1;
			if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1))
				flag |= FLAG_NONHEAD_RETRANS_ACKED;
		} else {
			/* Never retransmitted: usable as an RTT sample. */
			ca_seq_rtt = now - scb->when;
			/* Keep the skb timestamp for the high-resolution sample below. */
			last_ackt = skb->tstamp;
			if (seq_rtt < 0) {
				seq_rtt = ca_seq_rtt;
			}
			/*
			 * Segment was not SACKed before: remember the smallest
			 * number of packets that were acked ahead of such a hole.
			 */
			if (!(sacked & TCPCB_SACKED_ACKED))
				reord = min(pkts_acked, reord);
		}

		if (sacked & TCPCB_SACKED_ACKED)
			tp->sacked_out -= acked_pcount;
		if (sacked & TCPCB_LOST)
			tp->lost_out -= acked_pcount;

		tp->packets_out -= acked_pcount;
		pkts_acked += acked_pcount;

		/*
		 * Initial outgoing SYN's get put onto the write_queue just like
		 * anything else we transmit. It is not true data, and if we
		 * misinform our callers that this ACK acks real data, we will
		 * erroneously exit connection startup slow start one packet too
		 * quickly. This is severely frowned upon behavior.
		 */
		if (!(scb->flags & TCPHDR_SYN)) {
			flag |= FLAG_DATA_ACKED;
		} else {
			flag |= FLAG_SYN_ACKED;
			tp->retrans_stamp = 0;	/* Clear the stamp of the first SYN */
		}

		/* A partly acked TSO skb stays on the queue; nothing beyond it is acked. */
		if (!fully_acked)
			break;

		tcp_unlink_write_queue(skb, sk);
		sk_wmem_free_skb(sk, skb);
		/* Drop cached hints that may point at the skb just released. */
		tp->scoreboard_skb_hint = NULL;
		if (skb == tp->retransmit_skb_hint)
			tp->retransmit_skb_hint = NULL;
		if (skb == tp->lost_skb_hint)
			tp->lost_skb_hint = NULL;
	}

	/* Pull the urgent pointer forward if it fell inside the acked range. */
	if (likely(between(tp->snd_up, prior_snd_una, tp->snd_una)))
		tp->snd_up = tp->snd_una;

	/*
	 * The first segment left on the queue is still marked as SACKed even
	 * though the cumulative ACK stopped before it: report SACK reneging.
	 */
	if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
		flag |= FLAG_SACK_RENEGING;

	if (flag & FLAG_ACKED) {
		const struct tcp_congestion_ops *ca_ops
			= inet_csk(sk)->icsk_ca_ops;

		/* The path-MTU probe segment has been acknowledged. */
		if (unlikely(icsk->icsk_mtup.probe_size &&
			     !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
			tcp_mtup_probe_success(sk);
		}

		/* Feed the RTT sample to the estimator, then re-arm the RTO timer. */
		tcp_ack_update_rtt(sk, flag, seq_rtt);
		tcp_rearm_rto(sk);

		if (tcp_is_reno(tp)) {
			/* Reno: let the helper adjust its emulated SACK accounting. */
			tcp_remove_reno_sacks(sk, pkts_acked);
		} else {
			int delta;

			/*
			 * Non-retransmitted hole got filled? That's reordering.
			 * With prior_fackets == 0 this test can never be true.
			 */
			if (reord < prior_fackets)
				tcp_update_reordering(sk, tp->fackets_out - reord, 0);

			delta = tcp_is_fack(tp) ? pkts_acked :
						  prior_sacked - tp->sacked_out;
			/* Fixed: was the ill-formed "= -=", which does not compile. */
			tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
		}

		/* Clamp so that fackets_out never drops below zero. */
		tp->fackets_out -= min(pkts_acked, tp->fackets_out);

		/* Notify the congestion control module if it registered a hook. */
		if (ca_ops->pkts_acked) {
			s32 rtt_us = -1;

			/* Is the ACK triggering packet unambiguous? */
			if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
				/* High resolution needed and available? */
				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
				    !ktime_equal(last_ackt,
						 net_invalid_timestamp()))
					rtt_us = ktime_us_delta(ktime_get_real(),
								last_ackt);
				else if (ca_seq_rtt >= 0)
					/* Jiffies-based sample converted to microseconds. */
					rtt_us = jiffies_to_usecs(ca_seq_rtt);
			}

			ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
		}
	}

#if FASTRETRANS_DEBUG > 0
	WARN_ON((int)tp->sacked_out < 0);
	WARN_ON((int)tp->lost_out < 0);
	WARN_ON((int)tp->retrans_out < 0);
	/* Nothing in flight with SACK enabled: any leftover counter is a leak. */
	if (!tp->packets_out && tcp_is_sack(tp)) {
		icsk = inet_csk(sk);
		if (tp->lost_out) {
			printk(KERN_DEBUG "Leak l=%u %d ",
			       tp->lost_out, icsk->icsk_ca_state);
			tp->lost_out = 0;
		}
		if (tp->sacked_out) {
			printk(KERN_DEBUG "Leak s=%u %d ",
			       tp->sacked_out, icsk->icsk_ca_state);
			tp->sacked_out = 0;
		}
		if (tp->retrans_out) {
			printk(KERN_DEBUG "Leak r=%u %d ",
			       tp->retrans_out, icsk->icsk_ca_state);
			tp->retrans_out = 0;
		}
	}
#endif
	return flag;
}
ktime_t
/*
 * ktime_t
 *
 * On 64-bit CPUs a single 64-bit variable is used to store the hrtimes
 * internal representation of time values in scalar nanoseconds. The design
 * plays out best on 64-bit CPUs, where most conversions are NOPs and most
 * arithmetic ktime_t operations are plain arithmetic operations.
 *
 * On 32-bit CPUs an optimized representation of the timespec structure is
 * used to avoid expensive conversions from and to timespecs. The
 * endian-aware order of the tv struct members is chosen to allow
 * mathematical operations on the tv64 member of the union too, which for
 * certain operations produces better code.
 *
 * For architectures with efficient support for 64/32-bit conversions the
 * plain scalar nanosecond based representation can be selected by the
 * config switch CONFIG_KTIME_SCALAR.
 */
union ktime {
	s64 tv64;
#if BITS_PER_LONG != 64 && !defined(CONFIG_KTIME_SCALAR)
	/* Split view; member order depends on endianness (see above). */
	struct {
#ifdef __BIG_ENDIAN
		s32 sec, nsec;
#else
		s32 nsec, sec;
#endif
	} tv;
#endif
};

typedef union ktime ktime_t;

/* Returns the ktime_t built by ktime_set(0, 0), used as the "invalid" stamp. */
static inline ktime_t net_invalid_timestamp(void)
{
	return ktime_set(0, 0);
}
TSO
当TSO段只有一部分被确认时,调用tcp_trim_head()截掉已确认的头部,并返回本次确认的段数;如果截断失败,则返回0。
/*
 * Handle a TSO skb that the cumulative ACK covers only in part.
 *
 * If we get here, the whole TSO packet has not been acked: the acked head
 * is trimmed off the skb and the number of segments that disappeared is
 * reported.
 *
 * Returns the count of newly acked segments, or 0 when trimming failed or
 * no whole segment was removed.
 */
static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	u32 pcount_before;
	u32 pcount_acked;

	/* Callers must only pass an skb that extends past snd_una. */
	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));

	/* Segment count of the skb before the acked bytes are cut away. */
	pcount_before = tcp_skb_pcount(skb);

	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq) != 0)
		return 0;

	/* Whatever is no longer counted in the skb has been acknowledged. */
	pcount_acked = pcount_before - tcp_skb_pcount(skb);
	if (pcount_acked == 0)
		return 0;

	/* Something must remain, and it must span a non-empty sequence range. */
	BUG_ON(tcp_skb_pcount(skb) == 0);
	BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));

	return pcount_acked;
}