zoukankan      html  css  js  c++  java
  • ip_vs实现分析(6)

    本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝,转载,转载时请保持文档的完整性,严禁用于任何商业用途。
    msn: yfydz_no1@hotmail.com
    来源:http://yfydz.cublog.cn

     
    8. IPVS的数据包发送
    IPVS连接中的数据包的发送方法是由ip_vs_bind_xmit()函数定义的,具体的发送数据包处理函数定义在net/ipv4/ipvs/ip_vs_xmit.c中。
    8.1 NAT发送

    NAT发送只发送请求方向的数据,因此是进行目的NAT
    /*
     *      NAT transmitter (only for outside-to-inside nat forwarding)
     *      Not used for related ICMP
     *
     *      Looks up the route for the connection, rewrites the packet's
     *      destination address to cp->daddr (after the protocol's
     *      dnat_handler, when one is set, has mangled the upper-layer
     *      header) and hands the packet to IP_VS_XMIT.
     *
     *      skb: packet to transmit
     *      cp:  connection the packet belongs to
     *      pp:  protocol descriptor supplying dnat_handler and debug output
     *
     *      Every path in this function returns NF_STOLEN: the skb has
     *      either been passed to IP_VS_XMIT or released with kfree_skb().
     */
    int
    ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
            struct ip_vs_protocol *pp)
    {
     struct rtable *rt;  /* Route to the other host */
     int mtu;
     struct iphdr *iph = skb->nh.iph;
     EnterFunction(10);
     /* check if it is a connection of no-client-port */
     if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
    // The connection was created without a client port: take it from
    // this packet and store it in the connection.
      __u16 _pt, *p;
      p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
      if (p == NULL)
       goto tx_error;
    // *p is the first 16 bits after the IP header, i.e. the source port
    // for TCP/UDP.
      ip_vs_conn_fill_cport(cp, *p);
      IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
     }
    // Look up the output route for the connection; on failure report a
    // link failure (tx_error_icmp) and drop the packet.
     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
      goto tx_error_icmp;
     /* MTU checking */
    // If the packet is longer than the route MTU and has DF set, send an
    // ICMP "fragmentation needed" error instead of fragmenting.
     mtu = dst_mtu(&rt->u.dst);
     if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
      ip_rt_put(rt);
      icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
      IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
      goto tx_error;
     }
     /* copy-on-write the packet before mangling it */
    // Make the IP header part of the skb writable. skb is passed by
    // address, so it may be replaced; iph is not used after this point.
     if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
      goto tx_error_put;
    // Make sure there is enough headroom for the output device's
    // link-layer header (hard_header_len).
     if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
      goto tx_error_put;
     /* drop old route */
    // Release the skb's current route and attach the new one. From here
    // on the route reference belongs to the skb, so the error path below
    // uses tx_error (no ip_rt_put).
     dst_release(skb->dst);
     skb->dst = &rt->u.dst;
     /* mangle the packet */
    // Destination NAT of the upper-layer protocol (TCP/UDP...), since
    // the packet is being sent on to the real server.
     if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
      goto tx_error;
    // Rewrite the destination address to the real server's address.
     skb->nh.iph->daddr = cp->daddr;
    // Recompute the IP header checksum.
     ip_send_check(skb->nh.iph);
     IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
     /* FIXME: when application helper enlarges the packet and the length
        is larger than the MTU of outgoing device, there will be still
        MTU problem. */
     /* Another hack: avoid icmp_send in ip_fragment */
    // NOTE(review): local_df presumably lets the output path fragment the
    // packet locally instead of answering with ICMP -- confirm.
     skb->local_df = 1;
    // Transmit. IP_VS_XMIT is defined outside this excerpt; the original
    // annotation says it still traverses the netfilter OUTPUT hook and is
    // therefore subject to OUTPUT rules -- confirm against the macro.
     IP_VS_XMIT(skb, rt);
     LeaveFunction(10);
    // NF_STOLEN tells the calling hook that the packet has been consumed.
     return NF_STOLEN;
      tx_error_icmp:
     dst_link_failure(skb);
      tx_error:
     LeaveFunction(10);
     kfree_skb(skb);
     return NF_STOLEN;
    // Error after the route was obtained but before it was attached to
    // the skb: drop the route reference explicitly, then free the skb.
      tx_error_put:
     ip_rt_put(rt);
     goto tx_error;
    }
     
    8.2 TUNNEL发送

    TUNNEL发送是把原来的IP包整体封装在一个新的IP头之后发出去,新IP头的协议号是IPIP(4),目的IP是真实目的服务器(rt->rt_dst),源IP是路由选出的源地址(rt->rt_src),而不是客户端IP(客户端IP仍保留在内层IP头中);该包是可以路由的,服务器的回应包将直接路由回去而不经过IPVS.
    /*
     *   IP Tunneling transmitter
     *
     *   This function encapsulates the packet in a new IP packet, its
     *   destination will be set to cp->daddr. Most code of this function
     *   is taken from ipip.c.
     *
     *   It is used in VS/TUN cluster. The load balancer selects a real
     *   server from a cluster based on a scheduling algorithm,
     *   encapsulates the request packet and forwards it to the selected
     *   server. For example, all real servers are configured with
     *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
     *   the encapsulated packet, it will decapsulate the packet, processe
     *   the request and return the response packets directly to the client
     *   without passing the load balancer. This can greatly increase the
     *   scalability of virtual server.
     *
     *   Used for ANY protocol
     */
    int
    ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct ip_vs_protocol *pp)
    {
     struct rtable *rt;   /* Route to the other host */
     struct net_device *tdev;  /* Device to other host */
     struct iphdr  *old_iph = skb->nh.iph;
     u8     tos = old_iph->tos;
     __be16 df = old_iph->frag_off;
     struct iphdr  *iph;   /* Our new IP header */
     int    max_headroom;   /* The extra header space needed */
     int    mtu;
     EnterFunction(10);
    // 只包装IP包,其他协议如ARP,IPX等不管
     if (skb->protocol != __constant_htons(ETH_P_IP)) {
      IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
            "ETH_P_IP: %d, skb protocol: %d\n",
            __constant_htons(ETH_P_IP), skb->protocol);
      goto tx_error;
     }
    // 根据连接信息找外出的路由cache
     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
      goto tx_error_icmp;
    // 数据包发出网卡
     tdev = rt->u.dst.dev;
    // 检查路径的MTU
     mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
     if (mtu < 68) {
      ip_rt_put(rt);
      IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
      goto tx_error;
     }
    // 更新路由的MTU
     if (skb->dst)
      skb->dst->ops->update_pmtu(skb->dst, mtu);
    // 检查don't fragement标志
     df |= (old_iph->frag_off&__constant_htons(IP_DF));
     if ((old_iph->frag_off&__constant_htons(IP_DF))
         && mtu < ntohs(old_iph->tot_len)) {
    // 如果skb包长超过MTU又有DF标志,发送ICMP错误信息,而不进行分片操作
      icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
      ip_rt_put(rt);
      IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
      goto tx_error;
     }
     /*
      * Okay, now see if we can stuff it in the buffer as-is.
      */
    // 计算需要添加的IP头的最大长度
     max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);
     if (skb_headroom(skb) < max_headroom
         || skb_cloned(skb) || skb_shared(skb)) {
    // 重新分配一个skb包,该skb头部足够大可容纳外部IP头空间
    // 分配失败则不发送该包了
      struct sk_buff *new_skb =
       skb_realloc_headroom(skb, max_headroom);
      if (!new_skb) {
       ip_rt_put(rt);
       kfree_skb(skb);
       IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
       return NF_STOLEN;
      }
    // 将原来的skb释放掉
      kfree_skb(skb);
    // 将skb指向新包,更新ip头指针
      skb = new_skb;
      old_iph = skb->nh.iph;
     }
    // skb->h是传输层头,现在要新加个IP头,原来的IP头就升级为传输层头
     skb->h.raw = (void *) old_iph;
     /* fix old IP header checksum */
    // 计算老IP头的校验和
     ip_send_check(old_iph);
    // skb的data指针前移出IP头长度作为新IP头的起点
     skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
     memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
     /* drop old route */
    // 更新路由cache
     dst_release(skb->dst);
     skb->dst = &rt->u.dst;
    // 填写新IP头部信息
     /*
      * Push down and install the IPIP header.
      */
     iph   = skb->nh.iph;
     iph->version  = 4;
     iph->ihl  = sizeof(struct iphdr)>>2;
     iph->frag_off  = df;
    // 协议设置为IPIP, 值为4
     iph->protocol  = IPPROTO_IPIP;
     iph->tos  = tos;
     iph->daddr  = rt->rt_dst;
     iph->saddr  = rt->rt_src;
     iph->ttl  = old_iph->ttl;
     iph->tot_len  = htons(skb->len);
    // 设置IP头中的ID值
     ip_select_ident(iph, &rt->u.dst, NULL);
    // 计算IP头校验和
     ip_send_check(iph);
     /* Another hack: avoid icmp_send in ip_fragment */
    // don't fragmemt
     skb->local_df = 1;
    // 发送新的skb包
     IP_VS_XMIT(skb, rt);
     LeaveFunction(10);
     return NF_STOLEN;
      tx_error_icmp:
     dst_link_failure(skb);
      tx_error:
     kfree_skb(skb);
     LeaveFunction(10);
     return NF_STOLEN;
    }
     
    8.3 DR发送

    DR发送是将原来的skb包中的目的MAC地址修改为目的服务器的MAC地址后直接发出,因此是不能路由的,IPVS均衡设备和目的服务器物理上必须在同一个二层子网。在DR模式下,IPVS和服务器都配置了相同的对外服务的VIP,服务器也配了自己的真实IP,不过服务器上配VIP的网卡属性中的NOARP信息是打开的,就是在该网卡上不响应ARP信息,但可以接收到达该VIP的数据包,这样外面请求包先是到IPVS均衡器,因为IPVS的VIP是响应ARP的,然后根据调度找一台服务器,用服务器的真实IP来确定路由,然后直接把包发出来,这时包中所有数据都没修改,因为目的服务器上VIP地址符合包中的目的地址,因此是可以接收该包的。
    /*
     *      Direct Routing transmitter
     *      Used for ANY protocol
     */
    int
    ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
           struct ip_vs_protocol *pp)
    {
     struct rtable *rt;   /* Route to the other host */
     struct iphdr  *iph = skb->nh.iph;
     int    mtu;
     EnterFunction(10);
    // 根据连接指定的目的服务器找路由
     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
      goto tx_error_icmp;
     /* MTU checking */
    // 检查MTU
     mtu = dst_mtu(&rt->u.dst);
     if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) {
      icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
      ip_rt_put(rt);
      IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
      goto tx_error;
     }
     /*
      * Call ip_send_check because we are not sure it is called
      * after ip_defrag. Is copy-on-write needed?
      */
    // 防止skb包是共用的,还被其他地方使用
     if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
      ip_rt_put(rt);
      return NF_STOLEN;
     }
    // 重新计算IP头校验和
     ip_send_check(skb->nh.iph);
     /* drop old route */
    // 释放原来的路由
     dst_release(skb->dst);
    // 指定新路由
     skb->dst = &rt->u.dst;
     /* Another hack: avoid icmp_send in ip_fragment */
     skb->local_df = 1;
    // 直接发出了
     IP_VS_XMIT(skb, rt);
     LeaveFunction(10);
     return NF_STOLEN;
      tx_error_icmp:
     dst_link_failure(skb);
      tx_error:
     kfree_skb(skb);
     LeaveFunction(10);
     return NF_STOLEN;
    }
     
    8.4 NULL发送
    不对数据包做任何处理,直接返回NF_ACCEPT
    /*
     *      NULL transmitter
     *
     *      Performs no transmission at all: the packet, connection and
     *      protocol arguments are ignored and the verdict NF_ACCEPT is
     *      returned unconditionally.
     */
    int
    ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                    struct ip_vs_protocol *pp)
    {
        /* None of the arguments are needed; the skb is left untouched. */
        (void) skb;
        (void) cp;
        (void) pp;

        return NF_ACCEPT;
    }

    8.5 旁路发送
    旁路模式,实际数据包不是给IPVS均衡器自己的,由IPVS进行转发
    /*
     *      Bypass transmitter
     *      Let packets bypass the destination when the destination is not
     *      available, it may be only used in transparent cache cluster.
     */
    int
    ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
        struct ip_vs_protocol *pp)
    {
     struct rtable *rt;   /* Route to the other host */
     struct iphdr  *iph = skb->nh.iph;
     u8     tos = iph->tos;
     int    mtu;
    // 用当前IP包的目的地址作为查路由的key
     struct flowi fl = {
      .oif = 0,
      .nl_u = {
       .ip4_u = {
        .daddr = iph->daddr,
        .saddr = 0,
        .tos = RT_TOS(tos), } },
     };
     EnterFunction(10);
    // 查找当前数据包的目的IP地址对应的路由,而不是是IPVS连接的信息找路由
     if (ip_route_output_key(&rt, &fl)) {
      IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
            "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
      goto tx_error_icmp;
     }
    // MTU检查
     /* MTU checking */
     mtu = dst_mtu(&rt->u.dst);
     if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
      ip_rt_put(rt);
      icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
      IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
      goto tx_error;
     }
     /*
      * Call ip_send_check because we are not sure it is called
      * after ip_defrag. Is copy-on-write needed?
      */
    // 防止skb包是共用的,还被其他地方使用
     if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
      ip_rt_put(rt);
      return NF_STOLEN;
     }
    // 计算IP头校验和
     ip_send_check(skb->nh.iph);
     /* drop old route */
    // 释放老路由,更新路由
     dst_release(skb->dst);
     skb->dst = &rt->u.dst;
     /* Another hack: avoid icmp_send in ip_fragment */
     skb->local_df = 1;
    // 发送
     IP_VS_XMIT(skb, rt);
     LeaveFunction(10);
     return NF_STOLEN;
     tx_error_icmp:
     dst_link_failure(skb);
     tx_error:
     kfree_skb(skb);
     LeaveFunction(10);
     return NF_STOLEN;
    }
     
    8.6 ICMP发送

    转发与IPVS连接相关的ICMP包(由ip_vs_in_icmp()调用):非NAT模式直接调用连接的发送函数,NAT模式下先用ip_vs_nat_icmp()做地址转换再发送
    /*
     * ICMP packet transmitter
     * called by the ip_vs_in_icmp
     *
     * skb:    ICMP packet to forward
     * cp:     connection the ICMP packet relates to
     * pp:     protocol descriptor passed on to the transmit/NAT helpers
     * offset: number of leading bytes of the skb that must be writable
     *         before ip_vs_nat_icmp() mangles the packet
     *
     * Returns the verdict of cp->packet_xmit (or NF_ACCEPT when the
     * connection has no transmit function) for non-masquerading
     * connections, and NF_STOLEN on every masquerading (VS/NAT) path.
     */
    int
    ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
      struct ip_vs_protocol *pp, int offset)
    {
     struct rtable *rt; /* Route to the other host */
     int mtu;
     int rc;
     EnterFunction(10);
     /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
        forwarded directly here, because there is no need to
        translate address/port back */
     if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
    // Any forwarding method other than masquerading: simply use the
    // connection's own transmit function.
      if (cp->packet_xmit)
       rc = cp->packet_xmit(skb, cp, pp);
      else
       rc = NF_ACCEPT;
      /* do not touch skb anymore */
      atomic_inc(&cp->in_pkts);
      goto out;
     }
     /*
      * mangle and send the packet here (only for VS/NAT)
      */
    // Look up the output route from the connection information.
     if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
      goto tx_error_icmp;
     /* MTU checking */
     mtu = dst_mtu(&rt->u.dst);
     if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
      ip_rt_put(rt);
      icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
      IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
      goto tx_error;
     }
     /* copy-on-write the packet before mangling it */
    // Make the first `offset` bytes of the skb writable; skb is passed by
    // address and may be replaced.
     if (!ip_vs_make_skb_writable(&skb, offset))
      goto tx_error_put;
    // Make sure there is enough headroom for the output device's
    // link-layer header (hard_header_len).
     if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
      goto tx_error_put;
    // Replace the skb's route; from here on the route reference belongs
    // to the skb.
     /* drop the old route when skb is not shared */
     dst_release(skb->dst);
     skb->dst = &rt->u.dst;
    // Apply address translation to the ICMP packet.
     ip_vs_nat_icmp(skb, pp, cp, 0);
     /* Another hack: avoid icmp_send in ip_fragment */
     skb->local_df = 1;
     IP_VS_XMIT(skb, rt);
     rc = NF_STOLEN;
     goto out;
      tx_error_icmp:
     dst_link_failure(skb);
      tx_error:
     dev_kfree_skb(skb);
     rc = NF_STOLEN;
      out:
     LeaveFunction(10);
     return rc;
    // Placed after the return so it is only reachable by goto: drop the
    // route reference that was never attached to the skb, then take the
    // common error path.
      tx_error_put:
     ip_rt_put(rt);
     goto tx_error;
    }
  • 相关阅读:
    结巴分词 0.14 版发布,Python 中文分词库
    Lazarus 1.0.2 发布,Pascal 集成开发环境
    Android全屏 去除标题栏和状态栏
    服务器日志现 Android 4.2 传将添多项新特性
    Percona XtraBackup 2.0.3 发布
    长平狐 Android 强制设置横屏或竖屏 设置全屏
    NetBeans 7.3 Beta 发布,全新的 HTML5 支持
    CppDepend现在已经支持Linux
    GromJS 1.7.18 发布,服务器端的 JavaScript
    Apache OpenWebBeans 1.1.6 发布
  • 原文地址:https://www.cnblogs.com/qq78292959/p/2587834.html
Copyright © 2011-2022 走看看