zoukankan      html  css  js  c++  java
  • Openvswitch原理与代码分析(5): 内核中的流表flow table操作

     

     

    当一个数据包到达网卡的时候,首先要经过内核模块openvswitch.ko。流表(Flow Table)在内核中有一份:通过从数据包中提取出的key查找内核中的flow table,即可得到action,执行action之后直接发送这个包;只有在内核中查找不到匹配的流表项时,才会到用户态查找用户态的流表。仅在内核flow table中查找并完成处理的情况被称为fast path。

    第一步:从数据包中提取出key

    实现函数为int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key)

    在这个函数中,首先提取的是物理层的信息,主要是从哪个网口进入的。

    1. key->phy.priority = skb->priority;
    2. key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
    3. key->phy.skb_mark = skb->mark;
    4. ovs_ct_fill_key(skb, key);
    5. key->ovs_flow_hash = 0;
    6. key->recirc_id = 0;

    然后调用函数static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)提取其他的key

    提取MAC层

    1. /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
    2.  * header in the linear data area.
    3.  */
    4. eth = eth_hdr(skb);
    5. ether_addr_copy(key->eth.src, eth->h_source);
    6. ether_addr_copy(key->eth.dst, eth->h_dest);
    7. __skb_pull(skb, 2 * ETH_ALEN);
    8. /* We are going to push all headers that we pull, so no need to
    9.  * update skb->csum here.
    10.  */
    11. key->eth.tci = 0;
    12. if (skb_vlan_tag_present(skb))
    13.    key->eth.tci = htons(vlan_get_tci(skb));
    14. else if (eth->h_proto == htons(ETH_P_8021Q))
    15.    if (unlikely(parse_vlan(skb, key)))
    16.       return -ENOMEM;
    17. key->eth.type = parse_ethertype(skb);

    提取网络层

    1. struct iphdr *nh;
    2. __be16 offset;
    3. error = check_iphdr(skb);
    4. if (unlikely(error)) {
    5.    memset(&key->ip, 0, sizeof(key->ip));
    6.    memset(&key->ipv4, 0, sizeof(key->ipv4));
    7.    if (error == -EINVAL) {
    8.       skb->transport_header = skb->network_header;
    9.       error = 0;
    10.    }
    11.    return error;
    12. }
    13. nh = ip_hdr(skb);
    14. key->ipv4.addr.src = nh->saddr;
    15. key->ipv4.addr.dst = nh->daddr;
    16. key->ip.proto = nh->protocol;
    17. key->ip.tos = nh->tos;
    18. key->ip.ttl = nh->ttl;
    19. offset = nh->frag_off & htons(IP_OFFSET);
    20. if (offset) {
    21.    key->ip.frag = OVS_FRAG_TYPE_LATER;
    22.    return 0;
    23. }
    24. if (nh->frag_off & htons(IP_MF) ||
    25.    skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
    26.    key->ip.frag = OVS_FRAG_TYPE_FIRST;
    27. else
    28.    key->ip.frag = OVS_FRAG_TYPE_NONE;

    提取传输层

    1. /* Transport layer. */
    2. if (key->ip.proto == IPPROTO_TCP) {
    3.    if (tcphdr_ok(skb)) {
    4.       struct tcphdr *tcp = tcp_hdr(skb);
    5.       key->tp.src = tcp->source;
    6.       key->tp.dst = tcp->dest;
    7.       key->tp.flags = TCP_FLAGS_BE16(tcp);
    8.    } else {
    9.       memset(&key->tp, 0, sizeof(key->tp));
    10.    }
    11. } else if (key->ip.proto == IPPROTO_UDP) {
    12.    if (udphdr_ok(skb)) {
    13.       struct udphdr *udp = udp_hdr(skb);
    14.       key->tp.src = udp->source;
    15.       key->tp.dst = udp->dest;
    16.    } else {
    17.       memset(&key->tp, 0, sizeof(key->tp));
    18.    }
    19. } else if (key->ip.proto == IPPROTO_SCTP) {
    20.    if (sctphdr_ok(skb)) {
    21.       struct sctphdr *sctp = sctp_hdr(skb);
    22.       key->tp.src = sctp->source;
    23.       key->tp.dst = sctp->dest;
    24.    } else {
    25.       memset(&key->tp, 0, sizeof(key->tp));
    26.    }
    27. } else if (key->ip.proto == IPPROTO_ICMP) {
    28.    if (icmphdr_ok(skb)) {
    29.       struct icmphdr *icmp = icmp_hdr(skb);
    30.       /* The ICMP type and code fields use the 16-bit
    31.        * transport port fields, so we need to store
    32.        * them in 16-bit network byte order.
    33.        */
    34.       key->tp.src = htons(icmp->type);
    35.       key->tp.dst = htons(icmp->code);
    36.    } else {
    37.       memset(&key->tp, 0, sizeof(key->tp));
    38.    }
    39. }

    第二步:根据key查找flow table

    调用struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit)进行查找。

    在内核中,flow table的数据结构如上图所示。

    每个虚拟交换机对应一个datapath,每个datapath有一个flow table,每个flow table分成N个桶,根据key进行哈希,不同的key分布在不同的桶里面。

    每个桶的大小是一个内存页的大小,在内存页的头部记录了该页中保存了多少个元素,以及每个元素的大小。每个元素都是一个sw_flow,里面既有key,也有action。

    ovs_flow_tbl_lookup_stats会调用static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index)

    flow_lookup会进一步调用masked_flow_lookup,其实现如下:

    1. static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
    2.                  const struct sw_flow_key *unmasked,
    3.                  const struct sw_flow_mask *mask,
    4.                  u32 *n_mask_hit)
    5. {
    6.    struct sw_flow *flow;
    7.    struct hlist_head *head;
    8.    u32 hash;
    9.    struct sw_flow_key masked_key;
    10.  
    11.    ovs_flow_mask_key(&masked_key, unmasked, false, mask);
    12.    hash = flow_hash(&masked_key, &mask->range);
    13.    head = find_bucket(ti, hash);
    14.    (*n_mask_hit)++;
    15.    hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
    16.       if (flow->mask == mask && flow->flow_table.hash == hash &&
    17.           flow_cmp_masked_key(flow, &masked_key, &mask->range))
    18.          return flow;
    19.    }
    20.    return NULL;
    21. }

    其中flow_hash计算哈希值,find_bucket根据哈希值查找桶,然后就是一个循环,逐个比较key是否相等,相等则返回flow。

    第三步:执行action

    查找到flow之后,调用int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts, struct sw_flow_key *key)

    ovs_execute_actions进而调用static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len)

    在这个函数中,通过switch-case语句,对不同类型的action执行不同的操作。

    1. static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
    2.                struct sw_flow_key *key,
    3.                const struct nlattr *attr, int len)
    4. {
    5.    /* Every output action needs a separate clone of 'skb', but the common
    6.     * case is just a single output action, so that doing a clone and
    7.     * then freeing the original skbuff is wasteful. So the following code
    8.     * is slightly obscure just to avoid that.
    9.     */
    10.    int prev_port = -1;
    11.    const struct nlattr *a;
    12.    int rem;
    13.  
    14.    for (a = attr, rem = len; rem > 0;
    15.         a = nla_next(a, &rem)) {
    16.       int err = 0;
    17.  
    18.       if (unlikely(prev_port != -1)) {
    19.          struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
    20.  
    21.          if (out_skb)
    22.             do_output(dp, out_skb, prev_port, key);
    23.  
    24.          prev_port = -1;
    25.       }
    26.  
    27.       switch (nla_type(a)) {
    28.       case OVS_ACTION_ATTR_OUTPUT:
    29.          prev_port = nla_get_u32(a);
    30.          break;
    31.  
    32.       case OVS_ACTION_ATTR_USERSPACE:
    33.          output_userspace(dp, skb, key, a, attr, len);
    34.          break;
    35.  
    36.       case OVS_ACTION_ATTR_HASH:
    37.          execute_hash(skb, key, a);
    38.          break;
    39.  
    40.       case OVS_ACTION_ATTR_PUSH_MPLS:
    41.          err = push_mpls(skb, key, nla_data(a));
    42.          break;
    43.  
    44.       case OVS_ACTION_ATTR_POP_MPLS:
    45.          err = pop_mpls(skb, key, nla_get_be16(a));
    46.          break;
    47.  
    48.       case OVS_ACTION_ATTR_PUSH_VLAN:
    49.          err = push_vlan(skb, key, nla_data(a));
    50.          break;
    51.  
    52.       case OVS_ACTION_ATTR_POP_VLAN:
    53.          err = pop_vlan(skb, key);
    54.          break;
    55.  
    56.       case OVS_ACTION_ATTR_RECIRC:
    57.          err = execute_recirc(dp, skb, key, a, rem);
    58.          if (nla_is_last(a, rem)) {
    59.             /* If this is the last action, the skb has
    60.              * been consumed or freed.
    61.              * Return immediately.
    62.              */
    63.             return err;
    64.          }
    65.          break;
    66.  
    67.       case OVS_ACTION_ATTR_SET:
    68.          err = execute_set_action(skb, key, nla_data(a));
    69.          break;
    70.  
    71.       case OVS_ACTION_ATTR_SET_MASKED:
    72.       case OVS_ACTION_ATTR_SET_TO_MASKED:
    73.          err = execute_masked_set_action(skb, key, nla_data(a));
    74.          break;
    75.  
    76.       case OVS_ACTION_ATTR_SAMPLE:
    77.          err = sample(dp, skb, key, a, attr, len);
    78.          break;
    79.  
    80.       case OVS_ACTION_ATTR_CT:
    81.          if (!is_flow_key_valid(key)) {
    82.             err = ovs_flow_key_update(skb, key);
    83.             if (err)
    84.                return err;
    85.          }
    86.  
    87.          err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
    88.                     nla_data(a));
    89.  
    90.          /* Hide stolen IP fragments from user space. */
    91.          if (err)
    92.             return err == -EINPROGRESS ? 0 : err;
    93.          break;
    94.       }
    95.  
    96.       if (unlikely(err)) {
    97.          kfree_skb(skb);
    98.          return err;
    99.       }
    100.    }
    101.  
    102.    if (prev_port != -1)
    103.       do_output(dp, skb, prev_port, key);
    104.    else
    105.       consume_skb(skb);
    106.  
    107.    return 0;
    108. }

    如果可以直接输出,则调用static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, struct sw_flow_key *key),它再调用void ovs_vport_send(struct vport *vport, struct sk_buff *skb)进行发送。

  • 相关阅读:
    多测师讲解html _伪类选择器17_高级讲师肖sir
    多测师讲解html _后代选择器16_高级讲师肖sir
    多测师讲解html _组合选择器_高级讲师肖sir
    多测师讲解html _标签选择器14_高级讲师肖sir
    前端 HTML form表单标签 input标签 type属性 重置按钮 reset
    前端 HTML form表单标签 textarea标签 多行文本
    前端 HTML form表单标签 input标签 type属性 file 上传文件
    前端 HTML form表单标签 input标签 type属性 radio 单选框
    前端 HTML form表单标签 input标签 type属性 checkbox 多选框
    前端 HTML form表单目录
  • 原文地址:https://www.cnblogs.com/liuhongru/p/11398591.html
Copyright © 2011-2022 走看看