zoukankan      html  css  js  c++  java
  • Openvswitch原理与代码分析(5): 内核中的流表flow table操作

     

     

    当一个数据包到达网卡的时候,首先要经过内核Openvswitch.ko,流表Flow Table在内核中有一份,通过key查找内核中的flow table,即可以得到action,然后执行action之后,直接发送这个包,只有在内核无法查找到流表项的时候,才会到用户态查找用户态的流表。仅仅查找内核中flow table的情况被称为fast path.

    第一步:从数据包中提取出key

    实现函数为int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, struct sk_buff *skb, struct sw_flow_key *key)

    在这个函数中,首先提取的是物理层的信息,主要是从哪个网口进入的。

    1. key->phy.priority = skb->priority;
    2. key->phy.in_port = OVS_CB(skb)->input_vport->port_no;
    3. key->phy.skb_mark = skb->mark;
    4. ovs_ct_fill_key(skb, key);
    5. key->ovs_flow_hash = 0;
    6. key->recirc_id = 0;

    然后调用函数static int key_extract(struct sk_buff *skb, struct sw_flow_key *key)提取其他的key

    提取MAC层

    1. /* Link layer. We are guaranteed to have at least the 14 byte Ethernet
    2.  * header in the linear data area.
    3.  */
    4. eth = eth_hdr(skb);
    5. ether_addr_copy(key->eth.src, eth->h_source);
    6. ether_addr_copy(key->eth.dst, eth->h_dest);
    7. __skb_pull(skb, 2 * ETH_ALEN);
    8. /* We are going to push all headers that we pull, so no need to
    9.  * update skb->csum here.
    10.  */
    11. key->eth.tci = 0;
    12. if (skb_vlan_tag_present(skb))
    13.    key->eth.tci = htons(vlan_get_tci(skb));
    14. else if (eth->h_proto == htons(ETH_P_8021Q))
    15.    if (unlikely(parse_vlan(skb, key)))
    16.       return -ENOMEM;
    17. key->eth.type = parse_ethertype(skb);

    提取网络层

    1. struct iphdr *nh;
    2. __be16 offset;
    3. error = check_iphdr(skb);
    4. if (unlikely(error)) {
    5.    memset(&key->ip, 0, sizeof(key->ip));
    6.    memset(&key->ipv4, 0, sizeof(key->ipv4));
    7.    if (error == -EINVAL) {
    8.       skb->transport_header = skb->network_header;
    9.       error = 0;
    10.    }
    11.    return error;
    12. }
    13. nh = ip_hdr(skb);
    14. key->ipv4.addr.src = nh->saddr;
    15. key->ipv4.addr.dst = nh->daddr;
    16. key->ip.proto = nh->protocol;
    17. key->ip.tos = nh->tos;
    18. key->ip.ttl = nh->ttl;
    19. offset = nh->frag_off & htons(IP_OFFSET);
    20. if (offset) {
    21.    key->ip.frag = OVS_FRAG_TYPE_LATER;
    22.    return 0;
    23. }
    24. if (nh->frag_off & htons(IP_MF) ||
    25.    skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
    26.    key->ip.frag = OVS_FRAG_TYPE_FIRST;
    27. else
    28.    key->ip.frag = OVS_FRAG_TYPE_NONE;

    提取传输层

    1. /* Transport layer. */
    2. if (key->ip.proto == IPPROTO_TCP) {
    3.    if (tcphdr_ok(skb)) {
    4.       struct tcphdr *tcp = tcp_hdr(skb);
    5.       key->tp.src = tcp->source;
    6.       key->tp.dst = tcp->dest;
    7.       key->tp.flags = TCP_FLAGS_BE16(tcp);
    8.    } else {
    9.       memset(&key->tp, 0, sizeof(key->tp));
    10.    }
    11. } else if (key->ip.proto == IPPROTO_UDP) {
    12.    if (udphdr_ok(skb)) {
    13.       struct udphdr *udp = udp_hdr(skb);
    14.       key->tp.src = udp->source;
    15.       key->tp.dst = udp->dest;
    16.    } else {
    17.       memset(&key->tp, 0, sizeof(key->tp));
    18.    }
    19. } else if (key->ip.proto == IPPROTO_SCTP) {
    20.    if (sctphdr_ok(skb)) {
    21.       struct sctphdr *sctp = sctp_hdr(skb);
    22.       key->tp.src = sctp->source;
    23.       key->tp.dst = sctp->dest;
    24.    } else {
    25.       memset(&key->tp, 0, sizeof(key->tp));
    26.    }
    27. } else if (key->ip.proto == IPPROTO_ICMP) {
    28.    if (icmphdr_ok(skb)) {
    29.       struct icmphdr *icmp = icmp_hdr(skb);
    30.       /* The ICMP type and code fields use the 16-bit
    31.        * transport port fields, so we need to store
    32.        * them in 16-bit network byte order.
    33.        */
    34.       key->tp.src = htons(icmp->type);
    35.       key->tp.dst = htons(icmp->code);
    36.    } else {
    37.       memset(&key->tp, 0, sizeof(key->tp));
    38.    }
    39. }

    第二步:根据key查找flow table

    调用struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit)进行查找。

    在内核中,flow table的数据结构如上图所示。

    每个虚拟交换机对应一个datapath,每个datapath有一个flow table,每个flow table分成N个桶,根据key进行哈希,不同的key分布在不同的桶里面。

    每个桶的大小是一个内存页的大小,内存页的头部记录了该页中保存了多少个元素,以及每个元素的大小。每个元素都是一个sw_flow,里面既有key,也有action。

    ovs_flow_tbl_lookup_stats会调用static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, const struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *index)

    flow_lookup又会调用masked_flow_lookup,其实现如下:

    1. static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
    2.                  const struct sw_flow_key *unmasked,
    3.                  const struct sw_flow_mask *mask,
    4.                  u32 *n_mask_hit)
    5. {
    6.    struct sw_flow *flow;
    7.    struct hlist_head *head;
    8.    u32 hash;
    9.    struct sw_flow_key masked_key;
    10.  
    11.    ovs_flow_mask_key(&masked_key, unmasked, false, mask);
    12.    hash = flow_hash(&masked_key, &mask->range);
    13.    head = find_bucket(ti, hash);
    14.    (*n_mask_hit)++;
    15.    hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
    16.       if (flow->mask == mask && flow->flow_table.hash == hash &&
    17.           flow_cmp_masked_key(flow, &masked_key, &mask->range))
    18.          return flow;
    19.    }
    20.    return NULL;
    21. }

    其中ovs_flow_mask_key先用mask对原始key做掩码得到masked_key,flow_hash对masked_key计算哈希值,find_bucket根据哈希值找到对应的桶,然后就是一个循环,逐个比较桶中flow的mask、哈希值以及掩码后的key是否都相等,相等则返回该flow,否则返回NULL。

    第三步:执行action

    调用int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *acts,struct sw_flow_key *key)

    调用static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, int len)

    在这个函数中,通过switch-case语句,对不同类型的action执行不同的操作。

    1. static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
    2.                struct sw_flow_key *key,
    3.                const struct nlattr *attr, int len)
    4. {
    5.    /* Every output action needs a separate clone of 'skb', but the common
    6.     * case is just a single output action, so that doing a clone and
    7.     * then freeing the original skbuff is wasteful. So the following code
    8.     * is slightly obscure just to avoid that.
    9.     */
    10.    int prev_port = -1;
    11.    const struct nlattr *a;
    12.    int rem;
    13.  
    14.    for (a = attr, rem = len; rem > 0;
    15.         a = nla_next(a, &rem)) {
    16.       int err = 0;
    17.  
    18.       if (unlikely(prev_port != -1)) {
    19.          struct sk_buff *out_skb = skb_clone(skb, GFP_ATOMIC);
    20.  
    21.          if (out_skb)
    22.             do_output(dp, out_skb, prev_port, key);
    23.  
    24.          prev_port = -1;
    25.       }
    26.  
    27.       switch (nla_type(a)) {
    28.       case OVS_ACTION_ATTR_OUTPUT:
    29.          prev_port = nla_get_u32(a);
    30.          break;
    31.  
    32.       case OVS_ACTION_ATTR_USERSPACE:
    33.          output_userspace(dp, skb, key, a, attr, len);
    34.          break;
    35.  
    36.       case OVS_ACTION_ATTR_HASH:
    37.          execute_hash(skb, key, a);
    38.          break;
    39.  
    40.       case OVS_ACTION_ATTR_PUSH_MPLS:
    41.          err = push_mpls(skb, key, nla_data(a));
    42.          break;
    43.  
    44.       case OVS_ACTION_ATTR_POP_MPLS:
    45.          err = pop_mpls(skb, key, nla_get_be16(a));
    46.          break;
    47.  
    48.       case OVS_ACTION_ATTR_PUSH_VLAN:
    49.          err = push_vlan(skb, key, nla_data(a));
    50.          break;
    51.  
    52.       case OVS_ACTION_ATTR_POP_VLAN:
    53.          err = pop_vlan(skb, key);
    54.          break;
    55.  
    56.       case OVS_ACTION_ATTR_RECIRC:
    57.          err = execute_recirc(dp, skb, key, a, rem);
    58.          if (nla_is_last(a, rem)) {
    59.             /* If this is the last action, the skb has
    60.              * been consumed or freed.
    61.              * Return immediately.
    62.              */
    63.             return err;
    64.          }
    65.          break;
    66.  
    67.       case OVS_ACTION_ATTR_SET:
    68.          err = execute_set_action(skb, key, nla_data(a));
    69.          break;
    70.  
    71.       case OVS_ACTION_ATTR_SET_MASKED:
    72.       case OVS_ACTION_ATTR_SET_TO_MASKED:
    73.          err = execute_masked_set_action(skb, key, nla_data(a));
    74.          break;
    75.  
    76.       case OVS_ACTION_ATTR_SAMPLE:
    77.          err = sample(dp, skb, key, a, attr, len);
    78.          break;
    79.  
    80.       case OVS_ACTION_ATTR_CT:
    81.          if (!is_flow_key_valid(key)) {
    82.             err = ovs_flow_key_update(skb, key);
    83.             if (err)
    84.                return err;
    85.          }
    86.  
    87.          err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
    88.                     nla_data(a));
    89.  
    90.          /* Hide stolen IP fragments from user space. */
    91.          if (err)
    92.             return err == -EINPROGRESS ? 0 : err;
    93.          break;
    94.       }
    95.  
    96.       if (unlikely(err)) {
    97.          kfree_skb(skb);
    98.          return err;
    99.       }
    100.    }
    101.  
    102.    if (prev_port != -1)
    103.       do_output(dp, skb, prev_port, key);
    104.    else
    105.       consume_skb(skb);
    106.  
    107.    return 0;
    108. }

    如果可以直接输出,则调用static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, struct sw_flow_key *key),它再调用void ovs_vport_send(struct vport *vport, struct sk_buff *skb)进行发送。

  • 相关阅读:
    (转)五大常用算法之二:动态规划算法
    (转)五大常用算法之一:分治算法
    dict
    usaco2
    usaco3
    usaco4
    usaco1
    并查集
    洛谷P1428小鱼比可爱
    洛谷P1967货车运输
  • 原文地址:https://www.cnblogs.com/liuhongru/p/11398591.html
Copyright © 2011-2022 走看看