1.4.1:状态定时器回调neigh_timer_handler
设置定时器来处理那些需要定时器处理的状态,定时器回调函数为neigh_timer_handler;函数会根据状态机变换规则对状态进行切换,切换状态后,如果需要更新输出函数则更新,并更新定时器下一次超时时间;其中NUD_INCOMPLETE | NUD_PROBE状态需要发送邻居请求,如果超过最大次数,则释放缓存中的数据包;主要包含邻居项状态的转换以及邻居项solicit请求相关的函数
在申请邻居项的内存函数neigh_alloc里,会创建该定时器,并会将定时器的超时处理函数设置为neigh_timer_handler。
/* Called when a timer expires for a neighbour entry. */ /* 对于处于reach状态的邻居项: 1、如果当前时间距确认时间confirmed,还未到超时时限reachable_time,则将定时器时间设置为邻居项的超时时限reachable_time 2、当前时间已晚于确认时间加上超时时限,当未超过邻居项使用时间加上delay_probe_time,则将状态设置为DELAY。 这个状态的改变条件,我感觉设置的很巧妙。 一般是进入stale状态的邻居项,在超时前有数据时,则进入Delay状态。 3、当前时间晚于used+delay_probe_time,说明在confirmed+reachable_time超时前的短暂时间 内没有数据发送,此时即将状态设置为STALE, 对于Delay状态的邻居项: 1、当前时间小于connect_time+delay_time时,说明邻居项可能在定时器超时函数刚执行时 即已经更新了connect_time时间,此时即可以在邻居项的状态设置为reach (connect_time会在neigh_update里被更新) 2、说明该邻居项在delay_time超时后,还没有被外部确认,此时就需要将邻居项的状态设置为probe,准备发送solict请求 对于probe与incomplete状态的邻居项,此时需要将定时器的下一次超时时间设置为retrain,如果在下一次超时前,还没有得到确认,则还会执行该定时器处理函数 对于probe与incomplete状态的邻居项: 1、如果已经超过了最大发包次数,则将邻居项的状态设置FAILED,并调neigh_invalidate,发送错误报告,并释放缓存的数据包 2、如果还没有超过最大发包次数,则调用solicit,发送邻居项solicit请求。 */ 参考:https://blog.csdn.net/lickylin/article/details/22228047 static void neigh_timer_handler(unsigned long arg) { unsigned long now, next; struct neighbour *neigh = (struct neighbour *)arg; unsigned int state; int notify = 0; write_lock(&neigh->lock); state = neigh->nud_state; now = jiffies; next = now + HZ; /* 非定时器状态 */ if (!(state & NUD_IN_TIMER)) goto out; if (state & NUD_REACHABLE) {/* REACHABLE状态 */ if (time_before_eq(now, neigh->confirmed + neigh->parms->reachable_time)) {/* 确认时间未超时,设置下次超时时间 */ NEIGH_PRINTK2("neigh %p is still alive. ", neigh); next = neigh->confirmed + neigh->parms->reachable_time; } else if (time_before_eq(now, neigh->used + neigh->parms->delay_probe_time)) { /* 确认时间已经超时了,但是闲置时间未达到 */ NEIGH_PRINTK2("neigh %p is delayed. ", neigh); neigh->nud_state = NUD_DELAY; /* 进入DELAY状态 */ neigh->updated = jiffies; neigh_suspect(neigh); /* 更新output函数 */ next = now + neigh->parms->delay_probe_time; } else {/* 确认时间和闲置时间都超时了 */ NEIGH_PRINTK2("neigh %p is suspected. ", neigh); neigh->nud_state = NUD_STALE; /* 进入STALE状态 */ neigh->updated = jiffies; neigh_suspect(neigh); /* 更新输出函数 */ notify = 1; } } else if (state & NUD_DELAY) {/* DELAY状态 */ if (time_before_eq(now, neigh->confirmed + neigh->parms->delay_probe_time)) {//其间收到了应答报文/* 最后一次确认时间没达到超时时间 */ NEIGH_PRINTK2("neigh %p is now reachable. ", neigh); neigh->nud_state = NUD_REACHABLE;/* 进入REACHABLE状态,更新输出函数 */ neigh->updated = jiffies; neigh_connect(neigh); notify = 1; next = neigh->confirmed + neigh->parms->reachable_time; } else { /* 最后确认时间已经达到了超时时间,进入PROBE状态 */ NEIGH_PRINTK2("neigh %p is probed. ", neigh); neigh->nud_state = NUD_PROBE; neigh->updated = jiffies; atomic_set(&neigh->probes, 0); notify = 1; next = now + neigh->parms->retrans_time; } } else { /* NUD_PROBE|NUD_INCOMPLETE */ next = now + neigh->parms->retrans_time; } /* NUD_PROBE|NUD_INCOMPLETE状态,达到了最大尝试次数 */ if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { neigh->nud_state = NUD_FAILED; notify = 1; neigh_invalidate(neigh); } /* 定时器处理状态,则更新定时器 */ if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + HZ/2)) next = jiffies + HZ/2; if (!mod_timer(&neigh->timer, next)) neigh_hold(neigh); } if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { neigh_probe(neigh); } else { out: write_unlock(&neigh->lock); } /* 通知关心的模块 */ if (notify) neigh_update_notify(neigh); neigh_release(neigh); }
邻居项状态的更新函数3
第三个邻居项状态的更新函数,通过__neigh_event_send;
首先说明一下 Routing与 Neighboring subsystem的关联
1、在路由过程中,需要寻找或创建 struct dst_entry (另一种形式是 struct rtable)。 dst_entry 通过neighbour 域与 struct neighbour 关联。
每个 dst_entry 对应一个 neighbour,这样在路由之后,立刻能找到对应的 neighbour,此后,数据包通过 neighbour->output 送到链路层。
以 UDP 包的发送过程为例,这个过程如下
Udp_sendmsg() ==> ip_route_output()
==> udp_push_pending_frames()==》udp_send_skb==》ip_send_skb==》ip_local_out==》dst_output==》skb->dst->output
Ip_route_output_slow() : 当查不到路由 cache 后,根据 route rule ,通过 dst_alloc() 创建一个 dst_entry 结构,这同时也是一个 rtable 结构,然后将 dst_entry 的 output 指向 ip_output();
此后,udp_sendmsg 继续调用 ip_send_skb() 来发包;
rth->u.dst.output=ip_output;
Udp_sendmsg() ==> udp_push_pending_frames ==> udp_send_skb==> ip_send_skb==>ip_local_out==》skb->dst->output()//这里的 output 就是 ip_output()
ip_output ==> __ip_finish_output() ==> ip_finish_output2() ==> dst_neigh_output()
因此,最终数据包是通过dst_neigh_output 也就是 neighbour->output() 往下送的。
IPv4 代码实现:ip_route_output在路由 cache 中查不到路由结果后,查找__mkroute_output->rt_dst_alloc-> route rule ,如果没有合适的路由规则,则失败返回。否则,通过 dst_alloc() 创建一个 dst_entry 结构,这同时也是一个 rtable 结构,此 rtable 结构被挂入 hash 表中。这时候我们已经有了下一跳的 L3地址。(也可能没有,例如绑定 interface 的情况,需要看代码是如何处理的)。
static struct dst_ops ipv4_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .check = ipv4_dst_check, .default_advmss = ipv4_default_advmss, .mtu = ipv4_mtu, .cow_metrics = ipv4_cow_metrics, .destroy = ipv4_dst_destroy, .ifdown = ipv4_dst_ifdown, .negative_advice = ipv4_negative_advice, .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .redirect = ip_do_redirect, .local_out = __ip_local_out, .neigh_lookup = ipv4_neigh_lookup,//rtable 和 neigh_table绑定 }; static struct rtable *rt_dst_alloc(struct net_device *dev, bool nopolicy, bool noxfrm, bool will_cache) { return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) | (nopolicy ? DST_NOPOLICY : 0) | (noxfrm ? DST_NOXFRM : 0)); } void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) return NULL; } dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); dst->ops = ops;// 赋值ops dst_init_metrics(dst, dst_default_metrics, true); dst->expires = 0UL; dst->path = dst; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard//创建时 dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif atomic_set(&dst->__refcnt, initial_ref); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; dst->pending_confirm = 0; dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); return dst; }
下一步,要通过ip_finish_output2 将 rtable 与 neighbour 进行绑定
arp_bind_neighbour() 根据给定的下一跳 L3 地址,到 arp hash 表中找 neighbour,找到的话,dst->neighbour 就有了归宿;找不到,只好调用 neighbour_create() 创建一个新的 neighbour,这是在__neigh_lookup_errno() 中完成的
1.1 ip_finish_output2()
1.2 nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr)
1.3 __ipv4_neigh_lookup_noref(dev, nexthop)
if (!neigh)
neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
1.4 if(neigh)
dst_neigh_output(dst, neigh, skb);
static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, struct sk_buff *skb) { struct hh_cache *hh; if (unlikely(dst->pending_confirm)) { n->confirmed = jiffies; dst->pending_confirm = 0; } hh = &n->hh; if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) return neigh_hh_output(hh, skb); else return n->output(n, skb); }
/*neigh_alloc() 用于分配 neighbour 结构 neigh_create() 进一步设置此结构,对于 ARP 来说,它调用 arp_constructor() ,在这个函数里面,对 neighbour 的 ops 域和 output 域进行设置。 Ops 域,根据底层 driver 的类型进行不同的设置, 对于没有链路层地址的,指向arp_direct_ops 对于没有链路层 cache 的,指向arp_generic_ops 对于有链路层 cache 的, 指向arp_hh_ops */
static int arp_constructor(struct neighbour *neigh) { __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev == NULL) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type(dev_net(dev), addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) {//haed ops 不存在直接赋值 neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ #if 1 /* So... these "amateur" devices are hopeless. The only thing, that I can say now: It is very sad that we need to keep ugly obsolete code to make them happy. They should be moved to more reasonable state, now they use rebuild_header INSTEAD OF hard_start_xmit!!! Besides that, they are sort of out of date (a lot of redundant clones/copies, useless in 2.1), I wonder why people believe that they work. */ switch (dev->type) { default: break; case ARPHRD_ROSE: #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: #endif neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; #else break; #endif } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } /* 设置neigh 接口**/ if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0;
static int arp_constructor(struct neighbour *neigh) { __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev == NULL) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type(dev_net(dev), addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) {//haed ops 不存在直接赋值 neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ #if 1 /* So... these "amateur" devices are hopeless. The only thing, that I can say now: It is very sad that we need to keep ugly obsolete code to make them happy. They should be moved to more reasonable state, now they use rebuild_header INSTEAD OF hard_start_xmit!!! Besides that, they are sort of out of date (a lot of redundant clones/copies, useless in 2.1), I wonder why people believe that they work. */ switch (dev->type) { default: break; case ARPHRD_ROSE: #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: #endif neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; #else break; #endif } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } /* 设置neigh 接口**/ if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; }
对于以太网驱动程序,它的 net_device 结构在初始化的时候,已经有了默认的 hard_header 和 hard_header_cache 函数
ether_setup() dev->hard_header = eth_header; dev->hard_header_cache = eth_header_cache;
默认情况下,它的 ops 指向 arp_hh_ops()
对于arp来说:其ops函数有如下:
static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; static const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_compat_output, .connected_output = neigh_compat_output, };
对于 output 域,关键是看 neighbour 的状态,如果是有效状态,则设置为 ops->connected_output(),这样可以加快速度,
否则设置为 ops->output(),这样,需要进行 neighbor discovery 的处理
Neighbor Discovery 的 过程
从上面的状态机可以看到,当 neighbour 处于 INCOMPLETE、PROBE 状态的时候,会发送 Neighbor Solicit 包:
例如,通过 neigh_resolve_output() 导致新创建一个 neighbour 结构后,最后会调用 neigh->ops->solicit() 来发送 NS 包,对于 ARP 来说,就是 arp_solicit():
/*neigh_resolve_output() ==> neigh_event_send() ==>
__neigh_event_send() ==>neigh_probe--> neigh->ops->solicit(neigh, skb); ==> arp_solicit()*/
arp_solicit 调用 arp_send() 构造并发送 ARP request:
对于 INCOMPLETE 状态,需要发送一个新的 ARP 请求,它的目的 MAC 地址是广播地址,这样链路上所有节点都能收到此广播包;
对于 PROBE 状态, neighbour 中已经有了对端的 MAC 地址,此时发 ARP request 的目的只是验证这个映射还是有效的,因此此时发出的 ARP 包的目的 MAC 地址可以从 neighbour 中取到,是一个单播的 ARP 包。
neigh_resolve_output 分析:
int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); int rc = 0; if (!dst) goto discard; if (!neigh_event_send(neigh, skb)) {//其返回值很重要 /* 检测邻居项状态有效性 */ int err; struct net_device *dev = neigh->dev; unsigned int seq; if (dev->header_ops->cache && !neigh->hh.hh_len) /* 有二层头缓存函数,则缓存之 */ neigh_hh_init(neigh, dst); do { /* 填充二层头 */ __skb_pull(skb, skb_network_offset(skb)); seq = read_seqbegin(&neigh->ha_lock); err = dev_hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, NULL, skb->len); } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) /* 数据包发送 */ rc = dev_queue_xmit(skb); else goto out_kfree_skb; } out: return rc; discard: NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p ", dst, neigh); out_kfree_skb: rc = -EINVAL; kfree_skb(skb); goto out; }