ip层收包流程概述:
(1) 在inet_init中,为ETH_P_IP类型的数据包注册了接收回调ip_rcv
(2) 当二层数据包接收完毕,会调用netif_receive_skb根据协议进行向上层分发
(3) 类型为ETH_P_IP的数据包被传递到三层,调用ip_rcv函数
(4) ip_rcv完成基本的校验和处理工作后,经过PRE_ROUTING钩子点
(5) 经过PRE_ROUTING钩子点之后,调用ip_rcv_finish完成数据包接收,包括选项处理,路由查询,并且根据路由决定数据包是发往本机还是转发
以下为源码分析:
1 static struct packet_type ip_packet_type __read_mostly = { 2 .type = cpu_to_be16(ETH_P_IP), 3 .func = ip_rcv, 4 };
1 /* 2 * Main IP Receive routine. 3 */ 4 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) 5 { 6 const struct iphdr *iph; 7 struct net *net; 8 u32 len; 9 10 /* When the interface is in promisc. mode, drop all the crap 11 * that it receives, do not try to analyse it. 12 */ 13 /* 混杂模式下,非本机包 */ 14 if (skb->pkt_type == PACKET_OTHERHOST) 15 goto drop; 16 17 18 /* 获取net */ 19 net = dev_net(dev); 20 __IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len); 21 22 /* 检查skb共享 */ 23 skb = skb_share_check(skb, GFP_ATOMIC); 24 if (!skb) { 25 __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); 26 goto out; 27 } 28 29 /* 测试是否可以取得ip头 */ 30 if (!pskb_may_pull(skb, sizeof(struct iphdr))) 31 goto inhdr_error; 32 33 /* 取ip头 */ 34 iph = ip_hdr(skb); 35 36 /* 37 * RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum. 38 * 39 * Is the datagram acceptable? 40 * 41 * 1. Length at least the size of an ip header 42 * 2. Version of 4 43 * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] 44 * 4. 
Doesn't have a bogus length 45 */ 46 47 /* 头部长度不足20 或者版本不是4 */ 48 if (iph->ihl < 5 || iph->version != 4) 49 goto inhdr_error; 50 51 BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1); 52 BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0); 53 BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE); 54 __IP_ADD_STATS(net, 55 IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK), 56 max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); 57 58 /* 测试实际应取的ip头 */ 59 if (!pskb_may_pull(skb, iph->ihl*4)) 60 goto inhdr_error; 61 62 /* 取ip头 */ 63 iph = ip_hdr(skb); 64 65 /* 校验和错误 */ 66 if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) 67 goto csum_error; 68 69 /* 取总长度 */ 70 len = ntohs(iph->tot_len); 71 72 /* skb长度比ip包总长度小 */ 73 if (skb->len < len) { 74 __IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS); 75 goto drop; 76 } 77 /* 比头部长度还小 */ 78 else if (len < (iph->ihl*4)) 79 goto inhdr_error; 80 81 /* Our transport medium may have padded the buffer out. Now we know it 82 * is IP we can trim to the true length of the frame. 83 * Note this now means skb->len holds ntohs(iph->tot_len). 84 */ 85 /* 设置总长度为ip包的长度 */ 86 if (pskb_trim_rcsum(skb, len)) { 87 __IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS); 88 goto drop; 89 } 90 91 /* 取得传输层头部 */ 92 skb->transport_header = skb->network_header + iph->ihl*4; 93 94 /* Remove any debris in the socket control block */ 95 /* 重置cb */ 96 memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); 97 98 /* 保存输入设备信息 */ 99 IPCB(skb)->iif = skb->skb_iif; 100 101 /* Must drop socket now because of tproxy. */ 102 skb_orphan(skb); 103 104 /* 经过PRE_ROUTING钩子点 */ 105 return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, 106 net, NULL, skb, dev, NULL, 107 ip_rcv_finish); 108 109 csum_error: 110 __IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS); 111 inhdr_error: 112 __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS); 113 drop: 114 kfree_skb(skb); 115 out: 116 return NET_RX_DROP; 117 }
1 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) 2 { 3 const struct iphdr *iph = ip_hdr(skb); 4 struct rtable *rt; 5 struct net_device *dev = skb->dev; 6 void (*edemux)(struct sk_buff *skb); 7 8 /* if ingress device is enslaved to an L3 master device pass the 9 * skb to its handler for processing 10 */ 11 skb = l3mdev_ip_rcv(skb); 12 if (!skb) 13 return NET_RX_SUCCESS; 14 15 /* 16 启用了early_demux 17 skb路由缓存为空 18 skb的sock为空 19 不是分片包 20 */ 21 if (net->ipv4.sysctl_ip_early_demux && 22 !skb_dst(skb) && 23 !skb->sk && 24 !ip_is_fragment(iph)) { 25 const struct net_protocol *ipprot; 26 27 /* 找到上层协议 */ 28 int protocol = iph->protocol; 29 30 /* 获取协议对应的prot */ 31 ipprot = rcu_dereference(inet_protos[protocol]); 32 33 /* 找到early_demux函数,如tcp_v4_early_demux */ 34 if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { 35 36 /* 调用该函数,将路由信息缓存到skb->refdst */ 37 edemux(skb); 38 /* must reload iph, skb->head might have changed */ 39 /* 重新取ip头 */ 40 iph = ip_hdr(skb); 41 } 42 } 43 44 /* 45 * Initialise the virtual path cache for the packet. It describes 46 * how the packet travels inside Linux networking. 
47 */ 48 /* 校验路由失败 */ 49 if (!skb_valid_dst(skb)) { 50 /* 查路由 */ 51 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr, 52 iph->tos, dev); 53 if (unlikely(err)) { 54 if (err == -EXDEV) 55 __NET_INC_STATS(net, LINUX_MIB_IPRPFILTER); 56 goto drop; 57 } 58 } 59 60 #ifdef CONFIG_IP_ROUTE_CLASSID 61 if (unlikely(skb_dst(skb)->tclassid)) { 62 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 63 u32 idx = skb_dst(skb)->tclassid; 64 st[idx&0xFF].o_packets++; 65 st[idx&0xFF].o_bytes += skb->len; 66 st[(idx>>16)&0xFF].i_packets++; 67 st[(idx>>16)&0xFF].i_bytes += skb->len; 68 } 69 #endif 70 71 /* 处理ip选项 */ 72 if (iph->ihl > 5 && ip_rcv_options(skb)) 73 goto drop; 74 75 /* 找到路由缓存项 */ 76 rt = skb_rtable(skb); 77 if (rt->rt_type == RTN_MULTICAST) { 78 __IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len); 79 } else if (rt->rt_type == RTN_BROADCAST) { 80 __IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len); 81 } else if (skb->pkt_type == PACKET_BROADCAST || 82 skb->pkt_type == PACKET_MULTICAST) { 83 struct in_device *in_dev = __in_dev_get_rcu(dev); 84 85 /* RFC 1122 3.3.6: 86 * 87 * When a host sends a datagram to a link-layer broadcast 88 * address, the IP destination address MUST be a legal IP 89 * broadcast or IP multicast address. 90 * 91 * A host SHOULD silently discard a datagram that is received 92 * via a link-layer broadcast (see Section 2.4) but does not 93 * specify an IP multicast or broadcast destination address. 94 * 95 * This doesn't explicitly say L2 *broadcast*, but broadcast is 96 * in a way a form of multicast and the most common use case for 97 * this is 802.11 protecting against cross-station spoofing (the 98 * so-called "hole-196" attack) so do it for both. 99 */ 100 if (in_dev && 101 IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) 102 goto drop; 103 } 104 105 /* 调用路由项的input函数,可能为ip_local_deliver或者ip_forward */ 106 return dst_input(skb); 107 108 drop: 109 kfree_skb(skb); 110 return NET_RX_DROP; 111 }