隧道创建
对于隧道L2TP、FOU/GUE、GENEVE和VXLAN,隧道创建时,都需要在内核中新建一个UDP套接口,框架中的函数udp_sock_create4提供此功能。不仅是套接口的创建,还有本机接口的绑定bind,以及如果特定隧道提供了对端地址信息,进行连接connect。
int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr)); if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr), 0); } sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; }
函数setup_udp_tunnel_sock建立套接口的隧道绑定。此函数将第二个参数socket套接口关联tunnel隧道属性,内核由此套接口接收到的数据包交由配置的encap_rcv回调函数处理(cfg->encap_rcv)。目前基于UDP的隧道协议主要有L2TP、VxLAN和GENEVE,分别注册了接收处理函数l2tp_udp_encap_recv、vxlan_rcv和geneve_udp_encap_recv。通用的UDP隧道协议FOU和GUE,处理函数分别为fou_udp_recv和gue_udp_recv。
void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sock); }
UDP隧道接收
在UDP数据包处理路径中,函数udp_queue_rcv_skb判断当前套接口的udp_encap_needed是否使能,并且encap_type不为0。随即调用绑定在此套接口上的封装数据包回调处理函数encap_rcv进行处理。
static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { struct udp_sock *up = udp_sk(sk); if (static_key_false(&udp_encap_needed) && up->encap_type) { encap_rcv = READ_ONCE(up->encap_rcv); if (encap_rcv) { ret = encap_rcv(sk, skb); }
void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, ...) { uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); }
UDP隧道发送
当数据包到达UDP隧道设备的发送函数(ndo_start_xmit)时,例如GENEVE隧道发送函数(geneve_xmit),进行特定隧道相关处理,之后由通用UDP隧道发送函数udp_tunnel_xmit_skb进行发送。
const struct net_device_ops mlx5e_netdev_ops = { .ndo_open = mlx5e_open, .ndo_stop = mlx5e_close, .ndo_start_xmit = mlx5e_xmit, .ndo_setup_tc = mlx5e_setup_tc, .ndo_select_queue = mlx5e_select_queue, .ndo_get_stats64 = mlx5e_get_stats, .ndo_set_rx_mode = mlx5e_set_rx_mode, .ndo_set_mac_address = mlx5e_set_mac, .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = mlx5e_vlan_rx_kill_vid, .ndo_set_features = mlx5e_set_features, .ndo_fix_features = mlx5e_fix_features, .ndo_change_mtu = mlx5e_change_nic_mtu, .ndo_do_ioctl = mlx5e_ioctl, .ndo_set_tx_maxrate = mlx5e_set_tx_maxrate, .ndo_udp_tunnel_add = mlx5e_add_vxlan_port, .ndo_udp_tunnel_del = mlx5e_del_vxlan_port, .ndo_features_check = mlx5e_features_check, .ndo_tx_timeout = mlx5e_tx_timeout, .ndo_bpf = mlx5e_xdp, .ndo_xdp_xmit = mlx5e_xdp_xmit, .ndo_xsk_wakeup = mlx5e_xsk_wakeup, #ifdef CONFIG_MLX5_EN_ARFS .ndo_rx_flow_steer = mlx5e_rx_flow_steer, #endif #ifdef CONFIG_MLX5_ESWITCH .ndo_bridge_setlink = mlx5e_bridge_setlink, .ndo_bridge_getlink = mlx5e_bridge_getlink, /* SRIOV E-Switch NDOs */ .ndo_set_vf_mac = mlx5e_set_vf_mac, .ndo_set_vf_vlan = mlx5e_set_vf_vlan, .ndo_set_vf_spoofchk = mlx5e_set_vf_spoofchk, .ndo_set_vf_trust = mlx5e_set_vf_trust, .ndo_set_vf_rate = mlx5e_set_vf_rate, .ndo_get_vf_config = mlx5e_get_vf_config, .ndo_set_vf_link_state = mlx5e_set_vf_link_state, .ndo_get_vf_stats = mlx5e_get_vf_stats, #endif };
UDP隧道Offload
对于支持UDP隧道(VXLAN/GENEVE)Offloading功能的物理网卡,其通过标志位NETDEV_UDP_TUNNEL_PUSH_INFO/NETDEV_UDP_TUNNEL_DROP_INFO进行表示。函数udp_tunnel_push_rx_port与udp_tunnel_drop_rx_port用于设置和取消网卡的Offloading功能。
void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; if (!dev->netdev_ops->ndo_udp_tunnel_add || !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; dev->netdev_ops->ndo_udp_tunnel_add(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; if (!dev->netdev_ops->ndo_udp_tunnel_del || !(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) return; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; dev->netdev_ops->ndo_udp_tunnel_del(dev, &ti); }
static int vxlan_netdevice_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id); if (event == NETDEV_UNREGISTER) { vxlan_offload_rx_ports(dev, false); vxlan_handle_lowerdev_unregister(vn, dev); } else if (event == NETDEV_REGISTER) { vxlan_offload_rx_ports(dev, true); } else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO || event == NETDEV_UDP_TUNNEL_DROP_INFO) { vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO); } return NOTIFY_DONE; } static void vxlan_offload_rx_ports(struct net_device *dev, bool push) { struct vxlan_sock *vs; struct net *net = dev_net(dev); struct vxlan_net *vn = net_generic(net, vxlan_net_id); unsigned int i; spin_lock(&vn->sock_lock); for (i = 0; i < PORT_HASH_SIZE; ++i) { hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) { unsigned short type; if (vs->flags & VXLAN_F_GPE) type = UDP_TUNNEL_TYPE_VXLAN_GPE; else type = UDP_TUNNEL_TYPE_VXLAN; if (push) udp_tunnel_push_rx_port(dev, vs->sock, type); else udp_tunnel_drop_rx_port(dev, vs->sock, type); } } spin_unlock(&vn->sock_lock); }
static void mlx5e_vxlan_queue_work(struct mlx5e_priv *priv, u16 port, int add) { struct mlx5e_vxlan_work *vxlan_work; vxlan_work = kmalloc(sizeof(*vxlan_work), GFP_ATOMIC); if (!vxlan_work) return; if (add) INIT_WORK(&vxlan_work->work, mlx5e_vxlan_add_work); else INIT_WORK(&vxlan_work->work, mlx5e_vxlan_del_work); vxlan_work->priv = priv; vxlan_work->port = port; queue_work(priv->wq, &vxlan_work->work); } void mlx5e_add_vxlan_port(struct net_device *netdev, struct udp_tunnel_info *ti) { struct mlx5e_priv *priv = netdev_priv(netdev); if (ti->type != UDP_TUNNEL_TYPE_VXLAN) return; if (!mlx5_vxlan_allowed(priv->mdev->vxlan)) return; mlx5e_vxlan_queue_work(priv, be16_to_cpu(ti->port), 1); } static void mlx5e_vxlan_add_work(struct work_struct *work) { struct mlx5e_vxlan_work *vxlan_work = container_of(work, struct mlx5e_vxlan_work, work); struct mlx5e_priv *priv = vxlan_work->priv; u16 port = vxlan_work->port; mutex_lock(&priv->state_lock); mlx5_vxlan_add_port(priv->mdev->vxlan, port); mutex_unlock(&priv->state_lock); kfree(vxlan_work); } int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { ret = mlx5_vxlan_core_add_port_cmd(vxlan->mdev, port); } static int mlx5_vxlan_core_add_port_cmd(struct mlx5_core_dev *mdev, u16 port) { u32 in[MLX5_ST_SZ_DW(add_vxlan_udp_dport_in)] = {0}; u32 out[MLX5_ST_SZ_DW(add_vxlan_udp_dport_out)] = {0}; MLX5_SET(add_vxlan_udp_dport_in, in, opcode, MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT); MLX5_SET(add_vxlan_udp_dport_in, in, vxlan_udp_port, port); return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out)); }
外部UDP隧道
对于非使用ip link系统生成的UDP隧道,即控制通道在外部系统的隧道,如路由系统,其通过ip route encap指定隧道参数,就需要将这些有路由相关的隧道信息保存在路由缓存中。参见UDP框架函数udp_tun_rx_dst,使用metadata_dst结构体保存通用路由信息和隧道信息。
struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, __be16 flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); info = &tun_dst->u.tun_info; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) info->key.tun_flags |= TUNNEL_CSUM; }