http://blog.csdn.net/one_clouder/article/details/52889921
邻居子系统使IP层发包时无需感知MAC地址,MAC头的封装由邻居子系统完成。MAC头包括源MAC、目的MAC和协议类型:协议类型由上层指定(例如IPv4);源MAC是出口设备的MAC地址(出口设备由路由表确定);目的MAC由邻居子系统提供——当下一跳的MAC地址未知时,邻居子系统会主动发起ARP请求获取该地址,然后完成MAC头封装。IP层发包最后会调用ip_finish_output2函数,下面从该函数入手分析邻居子系统。
ip_finish_output2函数
- static inline int ip_finish_output2(struct sock *sk, struct sk_buff *skb) // Final IPv4 output step: find or create the neighbour entry for the next hop and hand skb to dst_neigh_output(). The sk parameter is not referenced in this body.
- { // Returns dst_neigh_output()'s result, -ENOMEM when headroom reallocation fails, or -EINVAL when no neighbour entry could be obtained; skb is freed on both error paths.
- struct dst_entry *dst = skb_dst(skb);
- struct rtable *rt = (struct rtable *)dst;
- struct net_device *dev = dst->dev; // egress device
- unsigned int hh_len = LL_RESERVED_SPACE(dev);
- struct neighbour *neigh;
- u32 nexthop;
- if (rt->rt_type == RTN_MULTICAST) { // account multicast / broadcast packets in the output statistics
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
- } else if (rt->rt_type == RTN_BROADCAST)
- IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);
- /* Be paranoid, rather than too clever. */
- if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) { // headroom is smaller than LL_RESERVED_SPACE(dev) and the device has header_ops: reallocate
- struct sk_buff *skb2;
- skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
- if (!skb2) {
- kfree_skb(skb);
- return -ENOMEM;
- }
- if (skb->sk)
- skb_set_owner_w(skb2, skb->sk); // carry the owning socket over to the new buffer
- consume_skb(skb); // the old buffer is no longer needed
- skb = skb2;
- }
- rcu_read_lock_bh(); // the neighbour is looked up without taking a reference, so it is only used inside this RCU-bh section
- nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); // next-hop IPv4 address returned by rt_nexthop() for this route and the packet's destination address
- neigh = __ipv4_neigh_lookup_noref(dev, nexthop); // look up whether a neighbour entry already exists for that address on dev
- if (unlikely(!neigh))
- neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); // not found: create the neigh entry
- if (!IS_ERR(neigh)) {
- int res = dst_neigh_output(dst, neigh, skb); // neighbour subsystem builds the MAC header and calls the layer-2 transmit function to send the packet
- rcu_read_unlock_bh();
- return res;
- }
- rcu_read_unlock_bh();
- net_dbg_ratelimited("%s: No header cache and no neighbour! ",
- __func__); // NOTE(review): the message ends in a space; upstream presumably has a newline escape there (looks like an extraction artefact) - confirm
- kfree_skb(skb);
- return -EINVAL;
- }
__ipv4_neigh_lookup_noref函数
- static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) // IPv4 neighbour lookup for address key on dev; returns the result of ___neigh_lookup_noref() (matching entry or NULL).
- {
- return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); // IPv4 searches arp_tbl, with the 32-bit key comparator and the ARP hash function
- }
- static inline struct neighbour *___neigh_lookup_noref( // Generic lookup: walk one hash bucket of tbl and return the entry whose dev and key match, or NULL. No reference is taken on the returned entry in this function.
- struct neigh_table *tbl,
- bool (*key_eq)(const struct neighbour *n, const void *pkey),
- __u32 (*hash)(const void *pkey,
- const struct net_device *dev,
- __u32 *hash_rnd),
- const void *pkey,
- struct net_device *dev)
- {
- struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); // hash table; speeds up lookup when there are many neighbours
- struct neighbour *n;
- u32 hash_val;
- hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); // compute the bucket index: the top hash_shift bits of the 32-bit hash
- for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]);
- n != NULL;
- n = rcu_dereference_bh(n->next)) {
- if (n->dev == dev && key_eq(n, pkey)) // same dev and same pkey; for IPv4 the pkey is the IPv4 address
- return n;
- }
- return NULL;
- }
__neigh_create函数
- struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, // Allocate a neighbour entry for (pkey, dev), run the protocol and device setup hooks, and insert it into tbl's hash table; if an equal entry is already in the table, that one is returned and the new one is released.
- struct net_device *dev, bool want_ref)
- { // Returns the entry (with an extra reference taken when want_ref is true) or an ERR_PTR() value on failure.
- u32 hash_val;
- int key_len = tbl->key_len;
- int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev); // allocate the neighbour entry object
- struct neigh_hash_table *nht;
- if (!n) {
- rc = ERR_PTR(-ENOBUFS);
- goto out;
- }
- memcpy(n->primary_key, pkey, key_len);
- n->dev = dev;
- dev_hold(dev);
- /* Protocol specific setup. */
- if (tbl->constructor && (error = tbl->constructor(n)) < 0) { // author's note: for IPv4 this is arp_constructor(), which sets the output function (arp_tbl's definition is not shown here)
- rc = ERR_PTR(error);
- goto out_neigh_release;
- }
- if (dev->netdev_ops->ndo_neigh_construct) { // author's note: devices usually do not set this hook
- error = dev->netdev_ops->ndo_neigh_construct(n);
- if (error < 0) {
- rc = ERR_PTR(error);
- goto out_neigh_release;
- }
- }
- /* Device specific setup. */
- if (n->parms->neigh_setup &&
- (error = n->parms->neigh_setup(n)) < 0) { // author's note: IPv4 does not define this hook
- rc = ERR_PTR(error);
- goto out_neigh_release;
- }
- n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); // backdate 'confirmed' by twice BASE_REACHABLE_TIME
- write_lock_bh(&tbl->lock);
- nht = rcu_dereference_protected(tbl->nht,
- lockdep_is_held(&tbl->lock));
- if (atomic_read(&tbl->entries) > (1 << nht->hash_shift)) // more entries than buckets: grow the hash table by one shift step
- nht = neigh_hash_grow(tbl, nht->hash_shift + 1);
- hash_val = tbl->hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); // compute the bucket index; the hash function is supplied by the neighbour table
- if (n->parms->dead) { // the parms are marked dead: give up with -EINVAL
- rc = ERR_PTR(-EINVAL);
- goto out_tbl_unlock;
- }
- for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], // walk the chain of neighbours that share this hash bucket
- lockdep_is_held(&tbl->lock));
- n1 != NULL;
- n1 = rcu_dereference_protected(n1->next,
- lockdep_is_held(&tbl->lock))) {
- if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { // an equal entry is already present: return it; n is released at out_neigh_release
- if (want_ref)
- neigh_hold(n1);
- rc = n1;
- goto out_tbl_unlock;
- }
- }
- n->dead = 0; // clear the dead mark set by neigh_alloc() before publishing the entry
- if (want_ref)
- neigh_hold(n);
- rcu_assign_pointer(n->next,
- rcu_dereference_protected(nht->hash_buckets[hash_val],
- lockdep_is_held(&tbl->lock))); // insert at the head of the chain
- rcu_assign_pointer(nht->hash_buckets[hash_val], n);
- write_unlock_bh(&tbl->lock);
- neigh_dbg(2, "neigh %p is created ", n);
- rc = n;
- out:
- return rc;
- out_tbl_unlock: // error/duplicate paths taken while tbl->lock is held
- write_unlock_bh(&tbl->lock);
- out_neigh_release: // drop the reference on the entry that was not inserted
- neigh_release(n);
- goto out;
- }
- static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev) // Allocate and initialise a neighbour entry for tbl/dev, trying a forced GC first when the table is over its thresholds; returns the entry or NULL on failure.
- {
- struct neighbour *n = NULL;
- unsigned long now = jiffies;
- int entries;
- entries = atomic_inc_return(&tbl->entries) - 1; // count the new entry up front; 'entries' is the count before the increment
- if (entries >= tbl->gc_thresh3 ||
- (entries >= tbl->gc_thresh2 &&
- time_after(now, tbl->last_flush + 5 * HZ))) { // at/above gc_thresh3, or at/above gc_thresh2 with the last flush more than 5 * HZ jiffies ago
- if (!neigh_forced_gc(tbl) &&
- entries >= tbl->gc_thresh3) // forced GC returned 0 and gc_thresh3 is reached: refuse the allocation
- goto out_entries;
- }
- n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC); // zeroed entry plus the device's private area
- if (!n)
- goto out_entries;
- __skb_queue_head_init(&n->arp_queue); // initialise the arp_queue
- rwlock_init(&n->lock);
- seqlock_init(&n->ha_lock);
- n->updated = n->used = now;
- n->nud_state = NUD_NONE; // initial state: not usable yet
- n->output = neigh_blackhole; // author's note: this output simply drops the packet
- seqlock_init(&n->hh.hh_lock);
- n->parms = neigh_parms_clone(&tbl->parms); // take the neigh_table's parms via neigh_parms_clone()
- setup_timer(&n->timer, neigh_timer_handler, (unsigned long)n); // set up the per-entry timer with neigh_timer_handler
- NEIGH_CACHE_STAT_INC(tbl, allocs);
- n->tbl = tbl;
- atomic_set(&n->refcnt, 1);
- n->dead = 1; // stays marked dead until __neigh_create() clears it on insertion
- out:
- return n;
- out_entries: // failure: undo the entries increment done at the top
- atomic_dec(&tbl->entries);
- goto out;
- }
- static int arp_constructor(struct neighbour *neigh) // ARP-specific setup of a new neighbour: take parms from the in_device, decide whether the entry needs resolution, and choose ops/output. Returns 0, or -EINVAL when dev has no in_device.
- {
- __be32 addr = *(__be32 *)neigh->primary_key; // the primary key is read as an IPv4 address
- struct net_device *dev = neigh->dev;
- struct in_device *in_dev;
- struct neigh_parms *parms;
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev); // get the in_device from the net_device
- if (!in_dev) {
- rcu_read_unlock();
- return -EINVAL;
- }
- neigh->type = inet_addr_type(dev_net(dev), addr); // record the address type
- parms = in_dev->arp_parms;
- __neigh_parms_put(neigh->parms); // swap the current parms for the in_device's arp_parms
- neigh->parms = neigh_parms_clone(parms);
- rcu_read_unlock();
- if (!dev->header_ops) { // device without header_ops; author's note: practically every NIC sets header_ops
- neigh->nud_state = NUD_NOARP;
- neigh->ops = &arp_direct_ops;
- neigh->output = neigh_direct_output;
- } else {
- /* Good devices (checked by reading texts, but only Ethernet is
- tested)
- ARPHRD_ETHER: (ethernet, apfddi)
- ARPHRD_FDDI: (fddi)
- ARPHRD_IEEE802: (tr)
- ARPHRD_METRICOM: (strip)
- ARPHRD_ARCNET:
- etc. etc. etc.
- ARPHRD_IPDDP will also work, if author repairs it.
- I did not it, because this driver does not work even
- in old paradigm.
- */
- if (neigh->type == RTN_MULTICAST) { // multicast addresses need no ARP
- neigh->nud_state = NUD_NOARP;
- arp_mc_map(addr, neigh->ha, dev, 1); // presumably fills neigh->ha with the hardware address mapped from the multicast IP - confirm
- } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { // device is flagged no-ARP or is a loopback device: no ARP needed
- neigh->nud_state = NUD_NOARP;
- memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
- } else if (neigh->type == RTN_BROADCAST ||
- (dev->flags & IFF_POINTOPOINT)) { // broadcast or point-to-point: no ARP needed either
- neigh->nud_state = NUD_NOARP;
- memcpy(neigh->ha, dev->broadcast, dev->addr_len);
- }
- if (dev->header_ops->cache) // author's note: eth_header_ops provides cache
- neigh->ops = &arp_hh_ops;
- else
- neigh->ops = &arp_generic_ops;
- if (neigh->nud_state & NUD_VALID)
- neigh->output = neigh->ops->connected_output;
- else
- neigh->output = neigh->ops->output; // the value used initially; author's note: this is arp_hh_ops' neigh_resolve_output()
- }
- return 0;
- }
dst_neigh_output函数
- static inline int dst_neigh_output(struct dst_entry *dst, struct neighbour *n, // Transmit skb through neighbour n: use the cached hardware header when n is connected and the cache is filled, otherwise call n->output(). Returns the result of whichever call is made.
- struct sk_buff *skb)
- {
- const struct hh_cache *hh;
- if (dst->pending_confirm) { // a confirmation is pending on the dst: clear the flag and refresh n->confirmed
- unsigned long now = jiffies;
- dst->pending_confirm = 0;
- /* avoid dirtying neighbour */
- if (n->confirmed != now)
- n->confirmed = now;
- }
- hh = &n->hh;
- if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) // the neighbour is connected and hh (the cached header) has been set
- return neigh_hh_output(hh, skb);
- else
- return n->output(n, skb); // taken initially; author's note: at that point output is neigh_resolve_output()
- }
- int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb) // Output path for a neighbour that may still need resolving: when neigh_event_send() returns 0, build the link-layer header and transmit.
- { // Returns 0 when neigh_event_send() returned nonzero, dev_queue_xmit()'s result after a transmit, or -EINVAL (skb freed) when header construction fails.
- int rc = 0;
- if (!neigh_event_send(neigh, skb)) { // may send the ARP request; the first call returns nonzero (the packet was queued), so this body is skipped
- int err;
- struct net_device *dev = neigh->dev;
- unsigned int seq;
- if (dev->header_ops->cache && !neigh->hh.hh_len)
- neigh_hh_init(neigh); // initialise the cached MAC header; its purpose is to speed up later sends
- do { // retry if neigh->ha_lock's sequence changed while the header was being built
- __skb_pull(skb, skb_network_offset(skb)); // common case: skb then points at the network header
- seq = read_seqbegin(&neigh->ha_lock);
- err = dev_hard_header(skb, dev, ntohs(skb->protocol), // build the MAC header
- neigh->ha, NULL, skb->len);
- } while (read_seqretry(&neigh->ha_lock, seq));
- if (err >= 0)
- rc = dev_queue_xmit(skb); // layer-2 transmit
- else
- goto out_kfree_skb;
- }
- out:
- return rc;
- out_kfree_skb:
- rc = -EINVAL;
- kfree_skb(skb);
- goto out;
- }
- static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) // Stamp the neighbour as used now; unless its state is CONNECTED, DELAY or PROBE, defer to __neigh_event_send() and return its result, otherwise return 0.
- {
- unsigned long now = jiffies;
- if (neigh->used != now) // write only when the value actually changes
- neigh->used = now;
- if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
- return __neigh_event_send(neigh, skb); // start or continue resolution (sends the ARP request)
- return 0;
- }
- int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) // State step for a packet that wants to use a neighbour not in CONNECTED/DELAY/PROBE: may start probing, arm the timer, and queue skb.
- { // Returns 0 when the caller may go on and transmit; returns 1 when skb was queued on arp_queue (INCOMPLETE state) or freed here (FAILED / dead entry).
- int rc;
- bool immediate_probe = false;
- write_lock_bh(&neigh->lock);
- rc = 0;
- if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) // re-check under the lock
- goto out_unlock_bh;
- if (neigh->dead)
- goto out_dead;
- if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { // taken initially (state is neither STALE nor INCOMPLETE)
- if (NEIGH_VAR(neigh->parms, MCAST_PROBES) +
- NEIGH_VAR(neigh->parms, APP_PROBES)) {
- unsigned long next, now = jiffies;
- atomic_set(&neigh->probes,
- NEIGH_VAR(neigh->parms, UCAST_PROBES));
- neigh->nud_state = NUD_INCOMPLETE; // set the entry's state to INCOMPLETE
- neigh->updated = now;
- next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
- neigh_add_timer(neigh, next); // arm the timer, which is expected to refresh the entry's state and output function; it expires max(RETRANS_TIME, HZ/2) jiffies from now, i.e. no sooner than half a second
- immediate_probe = true;
- } else { // the MCAST_PROBES + APP_PROBES sum is zero: mark the entry FAILED and drop the packet
- neigh->nud_state = NUD_FAILED;
- neigh->updated = jiffies;
- write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
- return 1;
- }
- } else if (neigh->nud_state & NUD_STALE) { // STALE entry: move to DELAY and arm the timer for DELAY_PROBE_TIME; rc stays 0
- neigh_dbg(2, "neigh %p is delayed ", neigh);
- neigh->nud_state = NUD_DELAY;
- neigh->updated = jiffies;
- neigh_add_timer(neigh, jiffies +
- NEIGH_VAR(neigh->parms, DELAY_PROBE_TIME));
- }
- if (neigh->nud_state == NUD_INCOMPLETE) {
- if (skb) {
- while (neigh->arp_queue_len_bytes + skb->truesize >
- NEIGH_VAR(neigh->parms, QUEUE_LEN_BYTES)) { // while the queued bytes plus this skb exceed QUEUE_LEN_BYTES, dequeue and drop already-queued packets
- struct sk_buff *buff;
- buff = __skb_dequeue(&neigh->arp_queue);
- if (!buff)
- break;
- neigh->arp_queue_len_bytes -= buff->truesize;
- kfree_skb(buff);
- NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
- }
- skb_dst_force(skb);
- __skb_queue_tail(&neigh->arp_queue, skb); // put the packet on the arp_queue
- neigh->arp_queue_len_bytes += skb->truesize;
- }
- rc = 1;
- }
- out_unlock_bh:
- if (immediate_probe) // set in the initial case above, together with the INCOMPLETE state
- neigh_probe(neigh); // probe the neighbour entry; neigh_probe() releases neigh->lock itself
- else
- write_unlock(&neigh->lock);
- local_bh_enable(); // pairs with the write_lock_bh() at the top
- return rc;
- out_dead: // dead entry: a STALE one still returns rc through the common exit, anything else drops the packet
- if (neigh->nud_state & NUD_STALE)
- goto out_unlock_bh;
- write_unlock_bh(&neigh->lock);
- kfree_skb(skb);
- return 1;
- }
- static void neigh_probe(struct neighbour *neigh) // Send one solicitation for neigh through neigh->ops->solicit() and bump the probe counter; releases neigh->lock (see the __releases annotation), so the caller must hold it.
- __releases(neigh->lock)
- {
- struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue); // look at the packet at the tail of arp_queue
- /* keep skb alive even if arp_queue overflows */
- if (skb)
- skb = skb_copy(skb, GFP_ATOMIC); // work on a copy of the skb
- write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb); // author's note: in practice this calls arp_solicit(), which sends the ARP request
- atomic_inc(&neigh->probes);
- kfree_skb(skb); // release the copy made above
- }
neigh_update函数
- /* Generic update routine.
- -- lladdr is new lladdr or NULL, if it is not supplied.
- -- new is new state.
- -- flags
- NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr,
- if it is different.
- NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected"
- lladdr instead of overriding it
- if it is different.
- It also allows to retain current state
- if lladdr is unchanged.
- NEIGH_UPDATE_F_ADMIN means that the change is administrative.
- NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing
- NTF_ROUTER flag.
- NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as
- a router.
- Caller MUST hold reference count on the entry.
- */
- int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
- u32 flags)
- { // Returns 0 on success; -EPERM for a NOARP/PERMANENT entry without NEIGH_UPDATE_F_ADMIN or for a dead entry; -EINVAL when no lladdr is supplied and the old state is not VALID.
- u8 old;
- int err;
- int notify = 0;
- struct net_device *dev;
- int update_isrouter = 0;
- write_lock_bh(&neigh->lock);
- dev = neigh->dev;
- old = neigh->nud_state;
- err = -EPERM;
- if (!(flags & NEIGH_UPDATE_F_ADMIN) &&
- (old & (NUD_NOARP | NUD_PERMANENT))) // only administrative updates may touch NOARP / PERMANENT entries
- goto out;
- if (neigh->dead)
- goto out;
- if (!(new & NUD_VALID)) { // the new state is not a VALID one: stop the timer, record the state and leave
- neigh_del_timer(neigh);
- if (old & NUD_CONNECTED)
- neigh_suspect(neigh);
- neigh->nud_state = new;
- err = 0;
- notify = old & NUD_VALID;
- if ((old & (NUD_INCOMPLETE | NUD_PROBE)) &&
- (new & NUD_FAILED)) {
- neigh_invalidate(neigh);
- notify = 1;
- }
- goto out;
- }
- /* Compare new lladdr with cached one */
- if (!dev->addr_len) {
- /* First case: device needs no address. */
- lladdr = neigh->ha;
- } else if (lladdr) {
- /* The second case: if something is already cached
- and a new address is proposed:
- - compare new & old
- - if they are different, check override flag
- */
- if ((old & NUD_VALID) &&
- !memcmp(lladdr, neigh->ha, dev->addr_len))
- lladdr = neigh->ha; // unchanged address: from here on "lladdr == neigh->ha" means "no address change"
- } else {
- /* No address is supplied; if we know something,
- use it, otherwise discard the request.
- */
- err = -EINVAL;
- if (!(old & NUD_VALID))
- goto out;
- lladdr = neigh->ha;
- }
- if (new & NUD_CONNECTED)
- neigh->confirmed = jiffies;
- neigh->updated = jiffies;
- /* If entry was valid and address is not changed,
- do not change entry state, if new one is STALE.
- */
- err = 0;
- update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER;
- if (old & NUD_VALID) {
- if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { // the address differs but overriding is not allowed
- update_isrouter = 0;
- if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) &&
- (old & NUD_CONNECTED)) {
- lladdr = neigh->ha; // keep the cached address and only demote the entry to STALE
- new = NUD_STALE;
- } else
- goto out;
- } else {
- if (lladdr == neigh->ha && new == NUD_STALE &&
- ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) ||
- (old & NUD_CONNECTED))
- )
- new = old; // same address and the new state is STALE: retain the current state
- }
- }
- if (new != old) {
- neigh_del_timer(neigh);
- if (new & NUD_IN_TIMER)
- neigh_add_timer(neigh, (jiffies +
- ((new & NUD_REACHABLE) ?
- neigh->parms->reachable_time :
- 0)));
- neigh->nud_state = new;
- notify = 1;
- }
- if (lladdr != neigh->ha) { // the address really changed: copy it under ha_lock and update the cached headers
- write_seqlock(&neigh->ha_lock);
- memcpy(&neigh->ha, lladdr, dev->addr_len);
- write_sequnlock(&neigh->ha_lock);
- neigh_update_hhs(neigh);
- if (!(new & NUD_CONNECTED))
- neigh->confirmed = jiffies -
- (NEIGH_VAR(neigh->parms, BASE_REACHABLE_TIME) << 1);
- notify = 1;
- }
- if (new == old)
- goto out;
- if (new & NUD_CONNECTED)
- neigh_connect(neigh); // author's note: switches the output function to neigh_connected_output
- else
- neigh_suspect(neigh);
- if (!(old & NUD_VALID)) { // the old state was not VALID: send the skbs that were queued meanwhile
- struct sk_buff *skb;
- /* Again: avoid dead loop if something went wrong */
- while (neigh->nud_state & NUD_VALID &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { // take the next queued packet
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n2, *n1 = neigh;
- write_unlock_bh(&neigh->lock); // the lock is dropped while the packet is output and retaken at the bottom of the loop
- rcu_read_lock();
- /* Why not just use 'neigh' as-is? The problem is that
- * things such as shaper, eql, and sch_teql can end up
- * using alternative, different, neigh objects to output
- * the packet in the output path. So what we need to do
- * here is re-lookup the top-level neigh in the path so
- * we can reinject the packet there.
- */
- n2 = NULL;
- if (dst) {
- n2 = dst_neigh_lookup_skb(dst, skb);
- if (n2)
- n1 = n2;
- }
- n1->output(n1, skb); // call the neigh's output function, which by now has been switched to the connected variant
- if (n2)
- neigh_release(n2);
- rcu_read_unlock();
- write_lock_bh(&neigh->lock);
- }
- __skb_queue_purge(&neigh->arp_queue); // empty whatever is still queued
- neigh->arp_queue_len_bytes = 0;
- }
- out:
- if (update_isrouter) { // set or clear NTF_ROUTER according to NEIGH_UPDATE_F_ISROUTER
- neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ?
- (neigh->flags | NTF_ROUTER) :
- (neigh->flags & ~NTF_ROUTER);
- }
- write_unlock_bh(&neigh->lock);
- if (notify)
- neigh_update_notify(neigh);
- return err;
- }