/* Uploaded as-is for now; to be tidied up later. */
/* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the tcp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ /* * 目的不可达、源端被关闭、超时、参数错误这四种类型 * 的差错ICMP报文,都是由同一个函数icmp_unreach()来处理的, * 对其中目的不可达、源端被关闭这两种类型ICMP报文 * 因要提取某些信息而需作一些特殊的处理,而另外 * 一些则不需要,根据差错报文中的信息直接调用 * 传输层的错误处理例程。参见<Linux内核源码剖析348页> CMP差错报文的数据部分包括:原始数据报的IP首部再加上前8个字节的数据部分(2字节源端口+2字节目的端口+4字节序号) */ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) { struct iphdr *iph = (struct iphdr *)icmp_skb->data; struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2)); struct inet_connection_sock *icsk; struct tcp_sock *tp; struct inet_sock *inet; const int type = icmp_hdr(icmp_skb)->type; const int code = icmp_hdr(icmp_skb)->code; struct sock *sk; struct sk_buff *skb; __u32 seq; __u32 remaining; int err; struct net *net = dev_net(icmp_skb->dev); /* * 检测ICMP报文长度是否包含了原始IP首部和原始IP数据包中 * 前8字节数据,如果不完整则返回 */ if (icmp_skb->len < (iph->ihl << 2) + 8) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } /* * 通过从ICMP报文数据中获取的原始TCP首部中源端口号和IP首部 * 中源地址,得到发送该TCP报文的传输控制块。如果获取失败, * 则说明ICMP报文有误或该套接字已关闭;如果获取传输控制块 * 的TCP状态为TIME_WAIT,则说明套接字即将关闭,这两种情况 * 都无需进一步处理 */ sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest, iph->saddr, th->source, inet_iif(icmp_skb)); if (!sk) { ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS); return; } if (sk->sk_state == TCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); return; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. 
*//* * 如果此时该传输控制块被用户进程锁定(如用户进程正在调用 * send等系统调用),则需累计相关SNMP的统计量 */ if (sock_owned_by_user(sk)) NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); goto out; } /* * 如果传输控制块不再侦听状态,且序号不再已发送未确认的区间内,则 * ICMP报文异常,无需进一步处理 */ icsk = inet_csk(sk); tp = tcp_sk(sk); seq = ntohl(th->seq); if (sk->sk_state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } switch (type) { case ICMP_SOURCE_QUENCH: /* Just silently ignore these. */ goto out; case ICMP_PARAMETERPROB: err = EPROTO; break; /* * 处理目的不可达类型,首先检测代码的合法性,然后根据 * 代码具体处理:如果需要分片而设置了不可分片,则调用 * do_pmtu_discovery()探测路径MTU;其他编码,则获取 * 对应的错误码 */ case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) goto out; if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ if (!sock_owned_by_user(sk)) do_pmtu_discovery(sk, iph, info); goto out; } err = icmp_err_convert[code].errno; /* check if icmp_skb allows revert of backoff * (see draft-zimmermann-tcp-lcd) */ if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) break; if (seq != tp->snd_una || !icsk->icsk_retransmits || !icsk->icsk_backoff) break; if (sock_owned_by_user(sk)) break; icsk->icsk_backoff--; inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) << icsk->icsk_backoff; tcp_bound_rto(sk); skb = tcp_write_queue_head(sk); BUG_ON(!skb); remaining = icsk->icsk_rto - min(icsk->icsk_rto, tcp_time_stamp - TCP_SKB_CB(skb)->when); if (remaining) { inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, remaining, TCP_RTO_MAX); } else { /* RTO revert clocked out retransmission. 
* Will retransmit now */ tcp_retransmit_timer(sk); } break; case ICMP_TIME_EXCEEDED: err = EHOSTUNREACH; break; default: goto out; } switch (sk->sk_state) { struct request_sock *req, **prev; case TCP_LISTEN: /* * 如果传输控制块被用户进程锁定,则不作进一步处理 */ if (sock_owned_by_user(sk)) goto out; /* * 由于处于监听状态,因此根据目的端口号、源地址和目的地址查找 * 正在连接的对端套接字,如果查找失败则不作进一步处理 */ req = inet_csk_search_req(sk, &prev, th->dest, iph->daddr, iph->saddr); if (!req) goto out; /* ICMPs are not backlogged, hence we cannot get an established socket here. */ WARN_ON(req->sk); /* * 如果发送出去TCP段的序号不等于对端套接字中的发送序号, * 则说明序号有误,不作进一步处理 */ if (seq != tcp_rsk(req)->snt_isn) { NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } /* * Still in SYN_RECV, just remove it silently. * There is no good way to pass the error to the newly * created socket, and POSIX does not want network * errors returned from accept(). */ /* * 删除并释放连接过程中的传输控制块 */ inet_csk_reqsk_queue_drop(sk, req, prev); goto out; case TCP_SYN_SENT: case TCP_SYN_RECV: /* Cannot happen. It can f.e. if SYNs crossed. *//* * 如果传输控制块没有被用户进程锁定,则将错误码设置到sk_err, * 调用该套接字的错误报告借口函数,关闭套接字;否则将错误码 * 设置到sk_err_soft,在这种情况下用户进程可使用SO_ERROR套接 * 字选项获取错误码 */ if (!sock_owned_by_user(sk)) { sk->sk_err = err; sk->sk_error_report(sk); tcp_done(sk); } else { sk->sk_err_soft = err; } goto out; } /* If we've already connected we will keep trying * until we time out, or the user gives up. * * rfc1122 4.2.3.9 allows to consider as hard errors * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, * but it is obsoleted by pmtu discovery). * * Note, that in modern internet, where routing is unreliable * and in each dark corner broken firewalls sit, sending random * errors ordered by their masters even this two messages finally lose * their original sense (even Linux sends invalid PORT_UNREACHs) * * Now we are in compliance with RFCs. 
* --ANK (980905) */ /* * 到这一步,则传输控制块一定不再LISTEN、SYN_SENT或SYN_RECV状态, * 此时如果控制块没有被用户进程锁定,并且允许接收扩展的可靠错误 * 信息,则设置得到的错误码,然后通知错误;否则将错误码设置到sk_err_soft */ inet = inet_sk(sk); if (!sock_owned_by_user(sk) && inet->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } out: bh_unlock_sock(sk); sock_put(sk); }