/*
 * Main IP Receive routine.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
           struct net_device *orig_dev)
{
    const struct iphdr *iph;
    struct net *net;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;

    net = dev_net(dev);
    IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_IN, skb->len);

    skb = skb_share_check(skb, GFP_ATOMIC);
    if (!skb) {
        IP_INC_STATS_BH(net, IPSTATS_MIB_INDISCARDS);
        goto out;
    }
    /*
     * pskb_may_pull() makes sure the memory pointed to by skb->data holds at
     * least a full IP header: every IP datagram, including each IP fragment,
     * must carry a complete IP header. If the linear area is shorter than
     * that, the missing bytes are copied in from the data fragments, which
     * live in skb_shinfo(skb)->frags[].
     */
    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    /* ... remainder of ip_rcv() omitted ... */
}
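For context, here is a condensed, hedged sketch (the function name is made up; this is not the verbatim kernel source) of the validation that follows this first pskb_may_pull() in ip_rcv(). The key point is that the header pointer must be re-read with ip_hdr() after every pull, because the pull may reallocate the skb head:

static int ip_rcv_sanity_sketch(struct sk_buff *skb)
{
    const struct iphdr *iph;
    u32 len;

    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        return -1;

    iph = ip_hdr(skb);          /* reload: the head may have moved */
    if (iph->ihl < 5 || iph->version != 4)
        return -1;

    /* make sure the full header, including IP options, is linear too */
    if (!pskb_may_pull(skb, iph->ihl * 4))
        return -1;

    iph = ip_hdr(skb);          /* reload again after the second pull */
    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        return -1;

    len = ntohs(iph->tot_len);
    if (skb->len < len || len < (u32)(iph->ihl * 4))
        return -1;

    return 0;
}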
/**
 * skb_share_check - check if buffer is shared and if so clone it
 * @skb: buffer to check
 * @pri: priority for memory allocation
 *
 * If the buffer is shared the buffer is cloned and the old copy
 * drops a reference. A new clone with a single reference is returned.
 * If the buffer is not shared the original buffer is returned. When
 * being called from interrupt status or with spinlocks held pri must
 * be GFP_ATOMIC.
 *
 * NULL is returned on a memory allocation failure.
 */
static inline struct sk_buff *skb_share_check(struct sk_buff *skb, gfp_t pri)
{
    might_sleep_if(gfpflags_allow_blocking(pri));
    if (skb_shared(skb)) {
        struct sk_buff *nskb = skb_clone(skb, pri);

        if (likely(nskb))
            consume_skb(skb);
        else
            kfree_skb(skb);
        skb = nskb;
    }
    return skb;
}
Why is this done? After ip_rcv() calls skb_share_check(), it may end up holding its own clone of the sk_buff while the one that was passed in is released — what is that for?
The original skb may be shared. Modifying a shared skb would affect every other function that still holds a reference to it, so if it is shared a clone is made and kfree_skb() is called on the original — which in that case merely does skb->users--, i.e. drops one reference.
__netif_receive_skb_core() scans the ptype_all list and the ptype_base hash table. ptype_base hashes all registered protocol handlers, and each packet is dispatched to the matching handler according to its protocol type; for each such delivery the skb's reference count (skb->users) is incremented, which is why a handler may be handed a shared skb.
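As an illustration of the answer above, a hypothetical receive handler (not kernel code; the name example_rcv is made up) sketching the clone-if-shared pattern. Note that skb_clone() only duplicates the struct sk_buff; the packet data itself stays shared through dataref:

static int example_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
    /* If skb->users > 1 this returns a private clone and releases our
     * reference on the shared original; otherwise the skb is returned
     * unchanged.
     */
    skb = skb_share_check(skb, GFP_ATOMIC);
    if (!skb)
        return NET_RX_DROP;

    /* The struct sk_buff itself (data/len/cb[]/...) is now private to us.
     * The packet payload may still be shared through dataref, so before
     * writing into it the usual skb_cloned()/skb_cow() handling is still
     * needed.
     */
    if (skb_cow(skb, 0))
        goto drop;

    /* ... modify headers / payload here ... */

    consume_skb(skb);
    return NET_RX_SUCCESS;

drop:
    kfree_skb(skb);         /* drops only our reference */
    return NET_RX_DROP;
}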
sk_buff->len: the amount of data at the current protocol layer, covering both the linear buffer and the fragments. The linear part is counted from skb->data, and note that skb->data moves as the packet passes from layer to layer.
sk_buff->data_len: only the length of the fragmented (non-linear) data.
sk_buff->truesize: the total memory charged to the buffer, i.e. the allocated data area(s) plus sizeof(struct sk_buff).
(The relationship between these fields is spelled out in the sketch below.)
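A small illustrative helper (not from the kernel; the name skb_len_report is made up) that prints those fields, assuming normal kernel context:

static void skb_len_report(const struct sk_buff *skb)
{
    /* linear bytes: from skb->data up to skb->tail */
    unsigned int linear = skb_headlen(skb); /* == skb->len - skb->data_len */
    /* bytes held in frags[] pages plus in the frag_list chain */
    unsigned int paged = skb->data_len;

    pr_info("len=%u (linear=%u + paged=%u), truesize=%u, nonlinear=%d\n",
            skb->len, linear, paged, skb->truesize,
            skb_is_nonlinear(skb));
}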
static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
    /* Recall the length fields described above: skb->len covers both the
     * linear area (starting at skb->data, which shifts per protocol layer)
     * and the paged/chained fragments, while skb->data_len covers only the
     * fragmented part.
     */
    if (likely(len <= skb_headlen(skb)))    /* data between skb->data and skb->tail already covers len */
        return 1;
    if (unlikely(len > skb->len))           /* more than the whole skb holds */
        return 0;
    /* Otherwise copy data forward from the fragments into skb->data. */
    return __pskb_pull_tail(skb, len - skb_headlen(skb)) != NULL;
}

/**
 *  __pskb_pull_tail - advance tail of skb header
 *  @skb: buffer to reallocate
 *  @delta: number of bytes to advance tail
 *
 *  The function makes a sense only on a fragmented &sk_buff,
 *  it expands header moving its tail forward and copying necessary
 *  data from fragmented part.
 *
 *  &sk_buff MUST have reference count of 1.
 *
 *  Returns %NULL (and &sk_buff does not change) if pull failed
 *  or value of new tail of skb in the case of success.
 *
 *  All the pointers pointing into skb header may change and must be
 *  reloaded after call to this function.
 */

/* Moves tail of skb head forward, copying data from fragmented part,
 * when it is necessary.
 * 1. It may fail due to malloc failure.
 * 2. It may change skb pointers.
 *
 * It is pretty complicated. Luckily, it is called only in exceptional cases.
 */
/* delta is the amount of data that has to be moved forward out of frags[]
 * or frag_list into the linear area.
 */
unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
{
    /* If skb has not enough free space at tail, get new one
     * plus 128 bytes for future expansions. If we have enough
     * room at tail, reallocate without expansion only if skb is cloned.
     */
    /* eat: how much memory is still missing beyond the free room left in
     * the current skb.
     */
    int i, k, eat = (skb->tail + delta) - skb->end;

    if (eat > 0 || skb_cloned(skb)) {   /* need more room, or the skb is cloned */
        if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
                             GFP_ATOMIC))   /* reallocate the skb head */
            return NULL;
    }

    /* Copy delta bytes, starting at offset skb_headlen(skb), into the area
     * behind skb->tail.
     */
    if (skb_copy_bits(skb, skb_headlen(skb), skb_tail_pointer(skb), delta))
        BUG();

    /* Optimization: no fragments, no reasons to preestimate
     * size of pulled pages. Superb.
     */
    if (!skb_has_frag_list(skb))    /* no frag_list chain */
        goto pull_pages;

    /* The data has been copied into the linear area, so the copied portions
     * of frags[] / frag_list must now be released. First work out how much
     * was taken from the frags[] array.
     */
    /* Estimate size of pulled pages. */
    eat = delta;
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
        /* find the last page needed to cover eat bytes */
        int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

        if (size >= eat)
            goto pull_pages;
        eat -= size;
    }

    /* If we need update frag list, we are in troubles.
     * Certainly, it possible to add an offset to skb data,
     * but taking into account that pulling is expected to
     * be very rare operation, it is worth to fight against
     * further bloating skb head and crucify ourselves here instead.
     * Pure masohism, indeed. 8)8)
     */
    /* eat is still non-zero, so data was also copied out of frag_list;
     * release the consumed part of that chain.
     */
    if (eat) {
        struct sk_buff *list = skb_shinfo(skb)->frag_list;
        struct sk_buff *clone = NULL;
        struct sk_buff *insp = NULL;

        do {
            BUG_ON(!list);  /* a NULL list here would mean there was not enough data */

            if (list->len <= eat) {
                /* Eaten as whole. */
                eat -= list->len;   /* this skb is fully consumed */
                list = list->next;  /* move on to the next skb */
                insp = list;        /* insp: first skb to keep so far */
            } else {
                /* Eaten partially. */
                /* This skb covers the remaining eat bytes. */
                if (skb_shared(list)) {
                    /* Sucks! We need to fork list. :-( */
                    /* The skb is shared: clone the last, partially consumed
                     * skb instead of modifying it in place.
                     */
                    clone = skb_clone(list, GFP_ATOMIC);
                    if (!clone)
                        return NULL;
                    insp = list->next;  /* keep from the next skb onwards */
                    list = clone;       /* and pull from the clone */
                } else {
                    /* This may be pulled without
                     * problems.
                     */
                    insp = list;    /* keep (the trimmed) skb itself */
                }
                /* Trim the last skb: advance its data pointer past the bytes
                 * that were copied out.
                 */
                if (!pskb_pull(list, eat)) {
                    kfree_skb(clone);   /* drop the clone's reference */
                    return NULL;
                }
                break;
            }
        } while (eat);

        /* Free pulled out fragments: walk frag_list from its head and free
         * every skb that was fully consumed (everything up to insp).
         */
        while ((list = skb_shinfo(skb)->frag_list) != insp) {
            skb_shinfo(skb)->frag_list = list->next;
            kfree_skb(list);
        }
        /* And insert new clone at head. */
        if (clone) {
            /* The last skb was only partially consumed: hook the trimmed
             * clone back onto the head of frag_list.
             */
            clone->next = list;
            skb_shinfo(skb)->frag_list = clone;
        }
    }
    /* Success! Now we may commit changes to skb data. */

pull_pages:
    eat = delta;
    k = 0;
    /* Release the pages in frags[] that were fully copied out. */
    for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
        int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);

        if (size <= eat) {
            skb_frag_unref(skb, i);
            eat -= size;
        } else {
            skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
            if (eat) {
                skb_shinfo(skb)->frags[k].page_offset += eat;
                skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
                eat = 0;
            }
            k++;
        }
    }
    skb_shinfo(skb)->nr_frags = k;

    skb->tail     += delta;
    skb->data_len -= delta;

    return skb_tail_pointer(skb);
}
EXPORT_SYMBOL(__pskb_pull_tail);
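As a usage note, a hedged sketch (the helper name linearize_at_least is hypothetical) of the contract the code above provides to callers such as ip_rcv():

static bool linearize_at_least(struct sk_buff *skb, unsigned int need)
{
    unsigned int old_len = skb->len;

    if (!pskb_may_pull(skb, need))
        return false;   /* not enough data in total, or allocation failed */

    /* Post-conditions of a successful pull: */
    WARN_ON(skb->len != old_len);           /* the total length never changes   */
    WARN_ON(skb_headlen(skb) < need);       /* the first 'need' bytes are linear */

    /* Any pointer cached into the old head (e.g. a struct iphdr *) is now
     * stale and must be recomputed, because pskb_expand_head() may have
     * reallocated the buffer.
     */
    return true;
}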
/* This data is invariant across clones and lives at
 * the end of the header data, ie. at skb->end.
 */
struct skb_shared_info {
    unsigned char   nr_frags;
    __u8            tx_flags;
    unsigned short  gso_size;
    /* Warning: this field is not always filled in (UFO)! */
    unsigned short  gso_segs;
    unsigned short  gso_type;
    struct sk_buff  *frag_list;
    struct skb_shared_hwtstamps hwtstamps;
    u32             tskey;
    __be32          ip6_frag_id;

    /*
     * Warning : all fields before dataref are cleared in __alloc_skb()
     */
    atomic_t        dataref;

    /* Intermediate layers must ensure that destructor_arg
     * remains valid until skb destructor */
    void            *destructor_arg;

    /* must be last field, see pskb_expand_head() */
    skb_frag_t      frags[MAX_SKB_FRAGS];
};
typedef struct skb_frag_struct skb_frag_t;

struct skb_frag_struct {
    struct {
        struct page *p;
    } page;
#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536)
    __u32 page_offset;
    __u32 size;
#else
    __u16 page_offset;
    __u16 size;
#endif
};
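skb_shinfo(skb) is simply a cast of the end of the linear data area, which is what the "lives at the end of the header data, ie. at skb->end" comment above means. A small hedged sketch (hypothetical function name) that peeks at it:

static void shinfo_peek(struct sk_buff *skb)
{
    struct skb_shared_info *shinfo = skb_shinfo(skb);

    /* skb_shinfo(skb) expands to (struct skb_shared_info *)skb_end_pointer(skb) */
    WARN_ON((unsigned char *)shinfo != skb_end_pointer(skb));

    pr_info("nr_frags=%d frag_list=%s dataref=%d\n",
            shinfo->nr_frags,
            shinfo->frag_list ? "chained" : "none",
            atomic_read(&shinfo->dataref));
}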
skb_shinfo still needs further analysis to be sorted out properly;
Note in particular these two members:
struct sk_buff *frag_list;
/* must be last field, see pskb_expand_head() */
skb_frag_t frags[MAX_SKB_FRAGS];
frag_list, on the other hand, points to the next skb in a chain. When the skb's data is assembled, the frags[] array is merged first and only then is frag_list walked, skb by skb. frag_list is typically used for IP fragmentation (reassembly), whereas frags[] is just plain scatter-gather I/O.
frags[]
    Scatter-gather I/O buffers. frags[] can only be used to hold the remaining
    data when the device's DMA engine supports Scatter/Gather (SG) operations
    on physically scattered pages; otherwise the data has to be kept by
    extending the linear data area instead.
frag_list
    Used for IP fragments (a walking example for both follows below).
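To make the frags[] / frag_list split concrete, a hedged sketch (the helper name count_nonlinear_bytes is made up) that walks both non-linear areas using helpers that exist in <linux/skbuff.h>:

static unsigned int count_nonlinear_bytes(const struct sk_buff *skb)
{
    const struct skb_shared_info *shinfo = skb_shinfo(skb);
    struct sk_buff *iter;
    unsigned int bytes = 0;
    int i;

    /* 1) scatter-gather pages attached directly to this skb */
    for (i = 0; i < shinfo->nr_frags; i++)
        bytes += skb_frag_size(&shinfo->frags[i]);

    /* 2) whole skbs chained on frag_list, e.g. from IP reassembly;
     *    each of them may again have its own frags[].
     */
    skb_walk_frags(skb, iter)
        bytes += iter->len;

    /* For a consistent skb this should equal skb->data_len. */
    return bytes;
}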
So what are skb->next and skb->prev used for, then?