/* NOTE: uploaded as-is first; to be tidied up later. */
/*
* This routine is called by the ICMP module when it gets some
* sort of error condition. If err < 0 then the socket should
* be closed and the error returned to the user. If err > 0
* it's just the icmp type << 8 | icmp code. After adjustment
* header points to the first 8 bytes of the tcp header. We need
* to find the appropriate port.
*
* The locking strategy used here is very "optimistic". When
* someone else accesses the socket the ICMP is just dropped
* and for some paths there is no check at all.
* A more general error queue to queue errors for later handling
* is probably better.
*
*/
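/*
* Four types of ICMP error messages - destination unreachable, source
* quench, time exceeded and parameter problem - are all handled by the
* same function, icmp_unreach(). Destination unreachable and source
* quench need some special processing because certain information has
* to be extracted from them; the other types do not. In every case the
* transport-layer error handler is then called based on the information
* carried in the error message. See the book "Linux Kernel Source Code
* Analysis" (Linux内核源码剖析), p. 348.
*
* The data part of an ICMP error message consists of the IP header of
* the original datagram followed by the first 8 bytes of its payload
* (2-byte source port + 2-byte destination port + 4-byte sequence number).
*/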
/*
 * tcp_v4_err - TCP/IPv4 handler for ICMP errors that quote one of our segments.
 * @icmp_skb: skb whose data is read as the quoted original IP header,
 *            immediately followed by the start of the quoted TCP header.
 * @info:     only forwarded to do_pmtu_discovery() for ICMP_FRAG_NEEDED
 *            (presumably the next-hop MTU - confirm against the caller).
 *
 * Looks up the socket that sent the quoted segment, validates the quoted
 * sequence number, converts the ICMP type/code into an errno value and
 * reports it to the socket as a hard (sk_err) or soft (sk_err_soft) error
 * depending on the socket state and on whether a user context owns the
 * socket. The socket reference obtained by the lookup is always released
 * before returning.
 */
void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
{
	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
	struct inet_connection_sock *icsk;
	struct tcp_sock *tp;
	struct inet_sock *inet;
	const int type = icmp_hdr(icmp_skb)->type;
	const int code = icmp_hdr(icmp_skb)->code;
	struct sock *sk;
	struct sk_buff *skb;
	__u32 seq;
	__u32 remaining;
	int err;
	struct net *net = dev_net(icmp_skb->dev);

	/*
	 * The ICMP payload must hold the complete quoted IP header plus the
	 * first 8 bytes that follow it; otherwise th cannot be trusted.
	 */
	if (icmp_skb->len < (iph->ihl << 2) + 8) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}

	/*
	 * Find the socket that sent the quoted segment, using the addresses
	 * and ports taken from the quoted headers. No match is counted as an
	 * ICMP input error. A TIME_WAIT socket needs no further processing:
	 * just drop the reference taken by the lookup.
	 */
	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
			 iph->saddr, th->source, inet_iif(icmp_skb));
	if (!sk) {
		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
		return;
	}
	if (sk->sk_state == TCP_TIME_WAIT) {
		inet_twsk_put(inet_twsk(sk));
		return;
	}

	bh_lock_sock(sk);
	/* If too many ICMPs get dropped on busy
	 * servers this needs to be solved differently.
	 *
	 * Only the statistic is bumped here; each path below checks
	 * sock_owned_by_user() again before modifying socket state.
	 */
	if (sock_owned_by_user(sk))
		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);

	if (sk->sk_state == TCP_CLOSE)
		goto out;

	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
		goto out;
	}

	/*
	 * For a non-listening socket the quoted sequence number must fall
	 * between snd_una and snd_nxt; anything else is counted as an
	 * out-of-window ICMP and ignored.
	 */
	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
	if (sk->sk_state != TCP_LISTEN &&
	    !between(seq, tp->snd_una, tp->snd_nxt)) {
		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
		goto out;
	}

	switch (type) {
	case ICMP_SOURCE_QUENCH:
		/* Just silently ignore these. */
		goto out;
	case ICMP_PARAMETERPROB:
		err = EPROTO;
		break;
	/*
	 * Destination unreachable: reject codes above NR_ICMP_UNREACH,
	 * hand ICMP_FRAG_NEEDED to do_pmtu_discovery(), and map every
	 * other code to an errno through icmp_err_convert[].
	 */
	case ICMP_DEST_UNREACH:
		if (code > NR_ICMP_UNREACH)
			goto out;

		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
			if (!sock_owned_by_user(sk))
				do_pmtu_discovery(sk, iph, info);
			goto out;
		}

		err = icmp_err_convert[code].errno;
		/* check if icmp_skb allows revert of backoff
		 * (see draft-zimmermann-tcp-lcd) */
		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
			break;
		if (seq != tp->snd_una || !icsk->icsk_retransmits ||
		    !icsk->icsk_backoff)
			break;

		if (sock_owned_by_user(sk))
			break;

		/*
		 * Fetch the head of the write queue before touching any
		 * timer state. An empty queue here means an invariant was
		 * violated: warn and skip the revert rather than halting
		 * the machine, and leave backoff/RTO untouched.
		 */
		skb = tcp_write_queue_head(sk);
		if (WARN_ON(!skb))
			break;

		/* Undo one backoff step and recompute the bounded RTO. */
		icsk->icsk_backoff--;
		icsk->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff;
		tcp_bound_rto(sk);

		/*
		 * Part of the reverted RTO that has not yet elapsed since
		 * the head skb's timestamp; 0 when it has already expired.
		 */
		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
				tcp_time_stamp - TCP_SKB_CB(skb)->when);

		if (remaining) {
			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
						  remaining, TCP_RTO_MAX);
		} else {
			/* RTO revert clocked out retransmission.
			 * Will retransmit now */
			tcp_retransmit_timer(sk);
		}

		break;
	case ICMP_TIME_EXCEEDED:
		err = EHOSTUNREACH;
		break;
	default:
		goto out;
	}

	switch (sk->sk_state) {
		struct request_sock *req, **prev;
	case TCP_LISTEN:
		/* Do nothing while a user context owns the socket. */
		if (sock_owned_by_user(sk))
			goto out;

		/*
		 * Listening socket: search for the pending connection
		 * request matching the quoted destination port and the
		 * quoted address pair. Nothing to do if there is none.
		 */
		req = inet_csk_search_req(sk, &prev, th->dest,
					  iph->daddr, iph->saddr);
		if (!req)
			goto out;

		/* ICMPs are not backlogged, hence we cannot get
		   an established socket here.
		 */
		WARN_ON(req->sk);

		/*
		 * The quoted sequence number must equal the ISN recorded
		 * in the request (snt_isn); otherwise count it as an
		 * out-of-window ICMP and ignore it.
		 */
		if (seq != tcp_rsk(req)->snt_isn) {
			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
			goto out;
		}

		/*
		 * Still in SYN_RECV, just remove it silently.
		 * There is no good way to pass the error to the newly
		 * created socket, and POSIX does not want network
		 * errors returned from accept().
		 */
		inet_csk_reqsk_queue_drop(sk, req, prev);
		goto out;

	case TCP_SYN_SENT:
	case TCP_SYN_RECV:  /* Cannot happen.
			       It can f.e. if SYNs crossed.
			     */
		/*
		 * Not owned by a user context: store the error in sk_err,
		 * call the socket's error-report hook and finish the
		 * socket with tcp_done(). Otherwise only record it in
		 * sk_err_soft.
		 */
		if (!sock_owned_by_user(sk)) {
			sk->sk_err = err;

			sk->sk_error_report(sk);

			tcp_done(sk);
		} else {
			sk->sk_err_soft = err;
		}
		goto out;
	}

	/* If we've already connected we will keep trying
	 * until we time out, or the user gives up.
	 *
	 * rfc1122 4.2.3.9 allows to consider as hard errors
	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
	 * but it is obsoleted by pmtu discovery).
	 *
	 * Note, that in modern internet, where routing is unreliable
	 * and in each dark corner broken firewalls sit, sending random
	 * errors ordered by their masters even this two messages finally lose
	 * their original sense (even Linux sends invalid PORT_UNREACHs)
	 *
	 * Now we are in compliance with RFCs.
	 *							--ANK (980905)
	 */

	/*
	 * Reaching this point means the socket is not in LISTEN, SYN_SENT
	 * or SYN_RECV. Report a hard error only when no user context owns
	 * the socket and inet->recverr is set; otherwise just record the
	 * error in sk_err_soft.
	 */
	inet = inet_sk(sk);
	if (!sock_owned_by_user(sk) && inet->recverr) {
		sk->sk_err = err;
		sk->sk_error_report(sk);
	} else	{ /* Only an error on timeout */
		sk->sk_err_soft = err;
	}

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}