DPVS borrows its design from LVS, whose core is built on the kernel's netfilter framework; the author has previously implemented a similar FULLNAT scheme to support portal-based layer-3 authentication.
The IPVS source lives under net/netfilter/ipvs in the kernel tree. Since LVS is built on top of the netfilter framework, the first thing to look at is which hook points it attaches its handlers to. IPVS registers its handlers at three hook points: NF_INET_LOCAL_IN, NF_INET_FORWARD and NF_INET_POST_ROUTING, and it supports both IPv4 and IPv6.
static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
    /* After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT(change destination), so that filtering rules can be
     * applied to IPVS. */
    {
        .hook     = ip_vs_in,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = 100,
    },
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook     = ip_vs_out,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_FORWARD,
        .priority = 100,
    },
    /* After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections */
    {
        .hook     = ip_vs_forward_icmp,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_FORWARD,
        .priority = 99,
    },
    /* Before the netfilter connection tracking, exit from POST_ROUTING */
    {
        .hook     = ip_vs_post_routing,
        .owner    = THIS_MODULE,
        .pf       = PF_INET,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP_PRI_NAT_SRC-1,
    },
#ifdef CONFIG_IP_VS_IPV6
    /* After packet filtering, forward packet through VS/DR, VS/TUN,
     * or VS/NAT(change destination), so that filtering rules can be
     * applied to IPVS. */
    {
        .hook     = ip_vs_in,
        .owner    = THIS_MODULE,
        .pf       = PF_INET6,
        .hooknum  = NF_INET_LOCAL_IN,
        .priority = 100,
    },
    /* After packet filtering, change source only for VS/NAT */
    {
        .hook     = ip_vs_out,
        .owner    = THIS_MODULE,
        .pf       = PF_INET6,
        .hooknum  = NF_INET_FORWARD,
        .priority = 100,
    },
    /* After packet filtering (but before ip_vs_out_icmp), catch icmp
     * destined for 0.0.0.0/0, which is for incoming IPVS connections */
    {
        .hook     = ip_vs_forward_icmp_v6,
        .owner    = THIS_MODULE,
        .pf       = PF_INET6,
        .hooknum  = NF_INET_FORWARD,
        .priority = 99,
    },
    /* Before the netfilter connection tracking, exit from POST_ROUTING */
    {
        .hook     = ip_vs_post_routing,
        .owner    = THIS_MODULE,
        .pf       = PF_INET6,
        .hooknum  = NF_INET_POST_ROUTING,
        .priority = NF_IP6_PRI_NAT_SRC-1,
    },
#endif
};
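For completeness, here is a minimal sketch of how an ops array like this is registered from a module's init path on these older (pre-4.13) kernels, where nf_register_hooks() registers the whole array in one call. It is only an illustration; the real ip_vs_init()/ip_vs_cleanup() also set up the connection table, protocols, sysctl and the control interface.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netfilter.h>

/* Sketch only: register/unregister the ip_vs_ops array defined above. */
static int __init ipvs_hooks_demo_init(void)
{
    /* the hooks become active as soon as this returns 0 */
    return nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}

static void __exit ipvs_hooks_demo_exit(void)
{
    nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
}

module_init(ipvs_hooks_demo_init);
module_exit(ipvs_hooks_demo_exit);
MODULE_LICENSE("GPL");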
4.2 The ip_vs_in function
When a remote client sends traffic to the LVS server, the packet's destination IP is a local address, so it first reaches the NF_INET_LOCAL_IN hook point and is processed locally by ip_vs_in.

static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
         const struct net_device *in, const struct net_device *out,
         int (*okfn)(struct sk_buff *))
{
    struct ip_vs_iphdr iph;
    struct ip_vs_protocol *pp;
    struct ip_vs_conn *cp;
    int ret, restart, af, pkts;

    /* IPv4 or IPv6? */
    af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;

    /* grab the IP header */
    ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);

    /*
     * Big tappo: only PACKET_HOST, including loopback for local client
     * Don't handle local packets on IPv6 for now
     * (i.e. only handle packets that are addressed to this host)
     */
    if (unlikely(skb->pkt_type != PACKET_HOST)) {
        IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
                      skb->pkt_type, iph.protocol,
                      IP_VS_DBG_ADDR(af, &iph.daddr));
        return NF_ACCEPT;
    }

    /* the IPv6 branch is skipped in this analysis */
#ifdef CONFIG_IP_VS_IPV6
    if (af == AF_INET6) {
        if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
            int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);

            if (related)
                return verdict;
            ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
        }
    } else
#endif
    /* ICMP packets are handled separately */
    if (unlikely(iph.protocol == IPPROTO_ICMP)) {
        int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);

        if (related)
            return verdict;
        ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
    }

    /* Protocol supported?  IPVS currently supports TCP, UDP, SCTP, AH and ESP */
    pp = ip_vs_proto_get(iph.protocol);
    if (unlikely(!pp))
        return NF_ACCEPT;

    /*
     * Check if the packet belongs to an existing connection entry.
     * For the first packet of a flow this returns NULL.  For TCP the
     * callback is ip_vs_conn_in_get_proto, which looks the connection
     * up by source/destination address, source/destination port and
     * protocol number.
     */
    cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);

    /* a new flow: cp is NULL, take the scheduling path below */
    if (unlikely(!cp)) {
        int v;

        /* For local client packets, it could be a response.
         * For TCP the callback here is ip_vs_conn_out_get_proto;
         * newer kernels have dropped this response handling, so it is
         * not analysed further. */
        cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
        if (cp)
            return handle_response(af, skb, pp, cp, iph.len);
        /* this is the check for packets in the outgoing direction:
         * in trees that carry a res_dir flag,
         * if (res_dir == IP_VS_CIDX_F_IN2OUT) handle_response() is
         * called to perform the SNAT */

        /* no connection found: create a new one; for TCP the callback
         * is tcp_conn_schedule */
        if (!pp->conn_schedule(af, skb, pp, &v, &cp))
            return v;
    }

    if (unlikely(!cp)) {
        /* sorry, all this trouble for a no-hit :)
         * creating the connection failed as well, accept the packet */
        IP_VS_DBG_PKT(12, pp, skb, 0,
                      "packet continues traversal as normal");
        return NF_ACCEPT;
    }

    IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");

    /* Check the server status: if cp->dest is set but the real server
     * is no longer available, drop the packet */
    if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
        /* the destination server is not available */
        if (sysctl_ip_vs_expire_nodest_conn) {
            /* try to expire the connection immediately */
            ip_vs_conn_expire_now(cp);
        }
        /* don't restart its timer, and silently drop the packet;
         * release the reference (refcount minus one) */
        __ip_vs_conn_put(cp);
        return NF_DROP;
    }

    /* update statistics and connection state */
    ip_vs_in_stats(cp, skb);
    /* state transition for the IP_VS_DIR_INPUT direction; for TCP this
     * ends up in tcp_state_transition */
    restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
    if (cp->packet_xmit)
        /* transmit the packet; in NAT mode this is ip_vs_nat_xmit */
        ret = cp->packet_xmit(skb, cp, pp);
        /* do not touch skb anymore */
    else {
        IP_VS_DBG_RL("warning: packet_xmit is null");
        ret = NF_ACCEPT;
    }

    /* Increase its packet counter and check if it is needed
     * to be synchronized
     *
     * Sync connection if it is about to close to
     * encorage the standby servers to update the connections timeout
     * (bump the input packet counter and sync to the backup LVS server) */
    pkts = atomic_add_return(1, &cp->in_pkts);
    if (af == AF_INET &&
        (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
        cp->protocol == IPPROTO_SCTP) {
        if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
             (atomic_read(&cp->in_pkts) %
              sysctl_ip_vs_sync_threshold[1] ==
              sysctl_ip_vs_sync_threshold[0])) ||
            (cp->old_state != cp->state &&
             ((cp->state == IP_VS_SCTP_S_CLOSED) ||
              (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
              (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
            /* sync the connection to the backup LVS server (SCTP case) */
            ip_vs_sync_conn(cp);
            goto out;
        }
    }

    if (af == AF_INET &&
        (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
        (((cp->protocol != IPPROTO_TCP ||
           cp->state == IP_VS_TCP_S_ESTABLISHED) &&
          (pkts % sysctl_ip_vs_sync_threshold[1] ==
           sysctl_ip_vs_sync_threshold[0])) ||
         ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
          ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
           (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
           (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
        /* sync the connection to the backup LVS server (TCP case) */
        ip_vs_sync_conn(cp);

out:
    cp->old_state = cp->state;
    ip_vs_conn_put(cp);
    return ret;
}
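A note on the conn_in_get lookup at the top of ip_vs_in: for TCP it resolves to ip_vs_conn_in_get_proto, which is essentially a hash-table probe keyed on the client side of the tuple. The following is a simplified illustration of that idea only; the demo_* names are placeholders and the real code in ip_vs_conn.c also handles RCU/locking and per-netns details.

/* Simplified illustration of the connection lookup done by conn_in_get:
 * hash the client address/port (plus protocol) and walk the bucket,
 * comparing the full tuple. */
static struct ip_vs_conn *demo_conn_in_get(int af, __u16 protocol,
                                           const union nf_inet_addr *caddr,
                                           __be16 cport,
                                           const union nf_inet_addr *vaddr,
                                           __be16 vport)
{
    unsigned int hash = demo_conn_hashkey(af, protocol, caddr, cport);
    struct ip_vs_conn *cp;

    hlist_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
        if (cp->af == af && cp->protocol == protocol &&
            ip_vs_addr_equal(af, caddr, &cp->caddr) &&
            cport == cp->cport &&
            ip_vs_addr_equal(af, vaddr, &cp->vaddr) &&
            vport == cp->vport)
            return cp;      /* existing flow: reuse its real-server binding */
    }
    return NULL;            /* miss: conn_schedule will create an entry */
}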
4.2.1 tcp_conn_schedule
This function invokes the configured scheduler, which selects the actual real server for the new connection according to the scheduling policy.
static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
                  struct ip_vs_proto_data *pd,
                  int *verdict, struct ip_vs_conn **cpp,
                  struct ip_vs_iphdr *iph)
{
    struct ip_vs_service *svc;
    struct tcphdr _tcph, *th;
    __be16 _ports[2], *ports = NULL;

    /* In the event of icmp, we're only guaranteed to have the first 8
     * bytes of the transport header, so we only check the rest of the
     * TCP packet for non-ICMP packets
     */
    if (likely(!ip_vs_iph_icmp(iph))) {
        th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
        if (th) {
            if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
                return 1;
            ports = &th->source;
        }
    } else {
        ports = skb_header_pointer(
            skb, iph->len, sizeof(_ports), &_ports);
    }

    if (!ports) {
        *verdict = NF_DROP;
        return 0;
    }

    /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */

    rcu_read_lock();

    /* look up the ip_vs_service instance by protocol, destination
     * address and destination port */
    if (likely(!ip_vs_iph_inverse(iph)))
        svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
                                 &iph->daddr, ports[1]);
    else
        svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
                                 &iph->saddr, ports[0]);

    if (svc) {
        int ignored;

        if (ip_vs_todrop(ipvs)) {
            /*
             * It seems that we are very loaded.
             * We have to drop this packet :(
             */
            rcu_read_unlock();
            *verdict = NF_DROP;
            return 0;
        }

        /*
         * Let the virtual server select a real server for the
         * incoming connection, and create a connection entry.
         */
        *cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
        if (!*cpp && ignored <= 0) {
            if (!ignored)
                *verdict = ip_vs_leave(svc, skb, pd, iph);
            else
                *verdict = NF_DROP;
            rcu_read_unlock();
            return 0;
        }
    }
    rcu_read_unlock();
    /* NF_ACCEPT */
    return 1;
}
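ip_vs_service_find resolves the virtual service for the packet: conceptually it matches by firewall mark first, and otherwise by the (protocol, VIP, port) triple. The sketch below only illustrates that order with placeholder demo_* helpers; the real lookup in ip_vs_ctl.c has further fallbacks (e.g. for FTP data connections and catch-all port-0 services). Once the service is found, ip_vs_schedule (next) picks a real server and creates the connection entry.

/* Illustration of the virtual-service resolution order used when a new
 * connection is scheduled.  demo_* helpers are placeholders. */
static struct ip_vs_service *demo_service_find(struct netns_ipvs *ipvs,
                                               int af, __u32 fwmark,
                                               __u16 protocol,
                                               const union nf_inet_addr *vaddr,
                                               __be16 vport)
{
    struct ip_vs_service *svc = NULL;

    if (fwmark)     /* fwmark-based virtual service first */
        svc = demo_svc_fwm_lookup(ipvs, af, fwmark);
    if (!svc)       /* then the <protocol, VIP, port> table */
        svc = demo_svc_tuple_lookup(ipvs, af, protocol, vaddr, vport);
    return svc;     /* NULL: packet does not belong to any virtual service */
}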
/*
 *  IPVS main scheduling function
 *  It selects a server according to the virtual service, and
 *  creates a connection entry.
 *  Protocols supported: TCP, UDP
 *
 *  Usage of *ignored
 *
 * 1 :   protocol tried to schedule (eg. on SYN), found svc but the
 *       svc/scheduler decides that this packet should be accepted with
 *       NF_ACCEPT because it must not be scheduled.
 *
 * 0 :   scheduler can not find destination, so try bypass or
 *       return ICMP and then NF_DROP (ip_vs_leave).
 *
 * -1 :  scheduler tried to schedule but fatal error occurred, eg.
 *       ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
 *       failure such as missing Call-ID, ENOMEM on skb_linearize
 *       or pe_data. In this case we should return NF_DROP without
 *       any attempts to send ICMP with ip_vs_leave.
 */
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
               struct ip_vs_proto_data *pd, int *ignored,
               struct ip_vs_iphdr *iph)
{
    struct ip_vs_protocol *pp = pd->pp;
    struct ip_vs_conn *cp = NULL;
    struct ip_vs_scheduler *sched;
    struct ip_vs_dest *dest;
    __be16 _ports[2], *pptr, cport, vport;
    const void *caddr, *vaddr;
    unsigned int flags;

    *ignored = 1;
    /*
     * IPv6 frags, only the first hit here.
     */
    pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports, iph);
    if (pptr == NULL)
        return NULL;

    if (likely(!ip_vs_iph_inverse(iph))) {
        cport = pptr[0];
        caddr = &iph->saddr;
        vport = pptr[1];
        vaddr = &iph->daddr;
    } else {
        cport = pptr[1];
        caddr = &iph->daddr;
        vport = pptr[0];
        vaddr = &iph->saddr;
    }

    /*
     * FTPDATA needs this check when using local real server.
     * Never schedule Active FTPDATA connections from real server.
     * For LVS-NAT they must be already created. For other methods
     * with persistence the connection is created on SYN+ACK.
     */
    if (cport == FTPDATA) {
        IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
                      "Not scheduling FTPDATA");
        return NULL;
    }

    /*
     * Do not schedule replies from local real server.
     */
    if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK)) {
        iph->hdr_flags ^= IP_VS_HDR_INVERSE;
        cp = pp->conn_in_get(svc->ipvs, svc->af, skb, iph);
        iph->hdr_flags ^= IP_VS_HDR_INVERSE;

        if (cp) {
            IP_VS_DBG_PKT(12, svc->af, pp, skb, iph->off,
                          "Not scheduling reply for existing"
                          " connection");
            __ip_vs_conn_put(cp);
            return NULL;
        }
    }

    /*
     * Persistent service
     */
    if (svc->flags & IP_VS_SVC_F_PERSISTENT)
        return ip_vs_sched_persist(svc, skb, cport, vport, ignored, iph);

    *ignored = 0;

    /*
     * Non-persistent service
     */
    if (!svc->fwmark && vport != svc->port) {
        if (!svc->port)
            pr_err("Schedule: port zero only supported "
                   "in persistent services, "
                   "check your ipvs configuration\n");
        return NULL;
    }

    sched = rcu_dereference(svc->scheduler);
    if (sched) {
        /* read svc->sched_data after svc->scheduler */
        smp_rmb();
        dest = sched->schedule(svc, skb, iph);
    } else {
        dest = NULL;
    }
    if (dest == NULL) {
        IP_VS_DBG(1, "Schedule: no dest found.\n");
        return NULL;
    }

    flags = (svc->flags & IP_VS_SVC_F_ONEPACKET
             && iph->protocol == IPPROTO_UDP) ?
            IP_VS_CONN_F_ONE_PACKET : 0;

    /*
     * Create a connection entry.
     */
    {
        struct ip_vs_conn_param p;

        ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
                              caddr, cport, vaddr, vport, &p);
        cp = ip_vs_conn_new(&p, dest->af, &dest->addr,
                            dest->port ? dest->port : vport,
                            flags, dest, skb->mark);
        if (!cp) {
            *ignored = -1;
            return NULL;
        }
    }

    IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
                  "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
                  ip_vs_fwd_tag(cp),
                  IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
                  IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
                  IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
                  cp->flags, atomic_read(&cp->refcnt));

    ip_vs_conn_stats(cp, svc);
    return cp;
}
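dest = sched->schedule(svc, skb, iph) is where the actual load-balancing decision is made. As an illustration of what such a .schedule callback does, here is a simplified round-robin style pick; it is a sketch only, not the real ip_vs_rr.c code (which also remembers its current position in svc->sched_data under svc->sched_lock):

/* Simplified sketch of a .schedule callback: walk the service's
 * destination list and return the next real server that is available
 * and has a nonzero weight. */
static struct ip_vs_dest *
demo_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
                 struct ip_vs_iphdr *iph)
{
    struct ip_vs_dest *dest;

    list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
        if ((dest->flags & IP_VS_DEST_F_AVAILABLE) &&
            atomic_read(&dest->weight) > 0)
            return dest;    /* hand back the chosen real server */
    }
    return NULL;            /* no usable destination */
}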
ip_vs_schedule then creates the connection entry through ip_vs_conn_new, which allocates and hashes the entry and also binds the packet transmitter (ip_vs_bind_xmit) according to the forwarding method:

/*
 * Create a new connection entry and hash it into the ip_vs_conn_tab
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
               const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
               struct ip_vs_dest *dest, __u32 fwmark)
{
    struct ip_vs_conn *cp;
    struct netns_ipvs *ipvs = p->ipvs;
    struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
                                                       p->protocol);

    cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
    if (cp == NULL) {
        IP_VS_ERR_RL("%s(): no memory\n", __func__);
        return NULL;
    }

    INIT_HLIST_NODE(&cp->c_list);
    setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
    cp->ipvs     = ipvs;
    cp->af       = p->af;
    cp->daf      = dest_af;
    cp->protocol = p->protocol;
    ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
    cp->cport    = p->cport;
    /* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
    ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
                   &cp->vaddr, p->vaddr);
    cp->vport    = p->vport;
    ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
    cp->dport    = dport;
    cp->flags    = flags;
    cp->fwmark   = fwmark;
    if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
        ip_vs_pe_get(p->pe);
        cp->pe = p->pe;
        cp->pe_data = p->pe_data;
        cp->pe_data_len = p->pe_data_len;
    } else {
        cp->pe = NULL;
        cp->pe_data = NULL;
        cp->pe_data_len = 0;
    }
    spin_lock_init(&cp->lock);

    /*
     * Set the entry is referenced by the current thread before hashing
     * it in the table, so that other thread run ip_vs_random_dropentry
     * but cannot drop this entry.
     */
    atomic_set(&cp->refcnt, 1);

    cp->control = NULL;
    atomic_set(&cp->n_control, 0);
    atomic_set(&cp->in_pkts, 0);

    cp->packet_xmit = NULL;
    cp->app = NULL;
    cp->app_data = NULL;
    /* reset struct ip_vs_seq */
    cp->in_seq.delta = 0;
    cp->out_seq.delta = 0;

    atomic_inc(&ipvs->conn_count);
    if (flags & IP_VS_CONN_F_NO_CPORT)
        atomic_inc(&ip_vs_conn_no_cport_cnt);

    /* Bind the connection with a destination server */
    cp->dest = NULL;
    ip_vs_bind_dest(cp, dest);

    /* Set its state and timeout */
    cp->state = 0;
    cp->old_state = 0;
    cp->timeout = 3*HZ;
    cp->sync_endtime = jiffies & ~3UL;

    /* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
    if (p->af == AF_INET6)
        ip_vs_bind_xmit_v6(cp);
    else
#endif
        ip_vs_bind_xmit(cp);

    if (unlikely(pd && atomic_read(&pd->appcnt)))
        ip_vs_bind_app(cp, pd->pp);

    /*
     * Allow conntrack to be preserved. By default, conntrack
     * is created and destroyed for every packet.
     * Sometimes keeping conntrack can be useful for
     * IP_VS_CONN_F_ONE_PACKET too.
     */
    if (ip_vs_conntrack_enabled(ipvs))
        cp->flags |= IP_VS_CONN_F_NFCT;

    /* Hash it in the ip_vs_conn_tab finally */
    ip_vs_conn_hash(cp);

    return cp;
}

/*
 * Bind a connection entry with the corresponding packet_xmit.
 * Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
    switch (IP_VS_FWD_METHOD(cp)) {
    case IP_VS_CONN_F_MASQ:
        cp->packet_xmit = ip_vs_nat_xmit;
        break;

    case IP_VS_CONN_F_TUNNEL:
#ifdef CONFIG_IP_VS_IPV6
        if (cp->daf == AF_INET6)
            cp->packet_xmit = ip_vs_tunnel_xmit_v6;
        else
#endif
            cp->packet_xmit = ip_vs_tunnel_xmit;
        break;

    case IP_VS_CONN_F_DROUTE:
        cp->packet_xmit = ip_vs_dr_xmit;
        break;

    case IP_VS_CONN_F_LOCALNODE:
        cp->packet_xmit = ip_vs_null_xmit;
        break;

    case IP_VS_CONN_F_BYPASS:
        cp->packet_xmit = ip_vs_bypass_xmit;
        break;
    }
}
In NAT mode the bound transmit function is ip_vs_nat_xmit, which forwards the packet to the real server (RS):
/*
 * NAT transmitter (only for outside-to-inside nat forwarding)
 * Not used for related ICMP
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
               struct ip_vs_protocol *pp, struct ip_vs_iphdr *ipvsh)
{
    struct rtable *rt;      /* Route to the other host */
    int local, rc, was_input;

    EnterFunction(10);

    rcu_read_lock();
    /* check if it is a connection of no-client-port */
    if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
        __be16 _pt, *p;

        p = skb_header_pointer(skb, ipvsh->len, sizeof(_pt), &_pt);
        if (p == NULL)
            goto tx_error;
        ip_vs_conn_fill_cport(cp, *p);
        IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
    }

    was_input = rt_is_input_route(skb_rtable(skb));
    local = __ip_vs_get_out_rt(cp->ipvs, cp->af, skb, cp->dest, cp->daddr.ip,
                               IP_VS_RT_MODE_LOCAL |
                               IP_VS_RT_MODE_NON_LOCAL |
                               IP_VS_RT_MODE_RDR, NULL, ipvsh);
    if (local < 0)
        goto tx_error;
    rt = skb_rtable(skb);
    /*
     * Avoid duplicate tuple in reply direction for NAT traffic
     * to local address when connection is sync-ed
     */
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
    if (cp->flags & IP_VS_CONN_F_SYNC && local) {
        enum ip_conntrack_info ctinfo;
        struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

        if (ct && !nf_ct_is_untracked(ct)) {
            IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off,
                             "ip_vs_nat_xmit(): "
                             "stopping DNAT to local address");
            goto tx_error;
        }
    }
#endif

    /* From world but DNAT to loopback address? */
    if (local && ipv4_is_loopback(cp->daddr.ip) && was_input) {
        IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, ipvsh->off,
                         "ip_vs_nat_xmit(): stopping DNAT to loopback "
                         "address");
        goto tx_error;
    }

    /* copy-on-write the packet before mangling it */
    if (!skb_make_writable(skb, sizeof(struct iphdr)))
        goto tx_error;

    if (skb_cow(skb, rt->dst.dev->hard_header_len))
        goto tx_error;

    /* mangle the packet */
    if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp, ipvsh))
        goto tx_error;
    ip_hdr(skb)->daddr = cp->daddr.ip;  /* DNAT: rewrite the destination IP */
    ip_send_check(ip_hdr(skb));

    IP_VS_DBG_PKT(10, AF_INET, pp, skb, ipvsh->off, "After DNAT");

    /* FIXME: when application helper enlarges the packet and the length
       is larger than the MTU of outgoing device, there will be still
       MTU problem. */

    /* Another hack: avoid icmp_send in ip_fragment */
    skb->ignore_df = 1;

    rc = ip_vs_nat_send_or_cont(NFPROTO_IPV4, skb, cp, local);
    rcu_read_unlock();

    LeaveFunction(10);
    return rc;

tx_error:
    kfree_skb(skb);
    rcu_read_unlock();
    LeaveFunction(10);
    return NF_STOLEN;
}
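Note that ip_vs_nat_xmit itself only rewrites the IP-level destination; the per-protocol pp->dnat_handler (tcp_dnat_handler for TCP) has already rewritten the destination port and patched the L4 checksum. A hedged, simplified sketch of that idea follows; it is an illustration, not the exact tcp_dnat_handler from ip_vs_proto_tcp.c, which also deals with skb writability, application helpers and partial checksums. After the headers are rewritten, ip_vs_nat_xmit hands the skb to ip_vs_nat_send_or_cont to re-inject it:

/* Simplified idea behind the TCP dnat_handler: point the segment at the
 * real server's port and incrementally fix the TCP checksum. */
static void demo_tcp_dnat(struct sk_buff *skb, struct ip_vs_conn *cp,
                          unsigned int tcphoff)
{
    struct tcphdr *th = (struct tcphdr *)(skb_network_header(skb) + tcphoff);

    /* adjust the checksum for the 16-bit port change, then rewrite it;
     * the port is not part of the pseudo header, hence pseudohdr=false */
    inet_proto_csum_replace2(&th->check, skb, th->dest, cp->dport, false);
    th->dest = cp->dport;
}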
/* return NF_STOLEN (sent) or NF_ACCEPT if local=1 (not sent) */
static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
                                         struct ip_vs_conn *cp, int local)
{
    int ret = NF_STOLEN;

    skb->ipvs_property = 1;
    if (likely(!(cp->flags & IP_VS_CONN_F_NFCT)))
        ip_vs_notrack(skb);
    else
        ip_vs_update_conntrack(skb, cp, 1);

    /* Remove the early_demux association unless it's bound for the
     * exact same port and address on this host after translation.
     */
    if (!local || cp->vport != cp->dport ||
        !ip_vs_addr_equal(cp->af, &cp->vaddr, &cp->daddr))
        ip_vs_drop_early_demux_sk(skb);

    if (!local) {
        skb_forward_csum(skb);
        NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
                NULL, skb_dst(skb)->dev, dst_output);
    } else
        ret = NF_ACCEPT;

    return ret;
}
In other words, the translated packet is re-injected at the NF_INET_LOCAL_OUT hook point and then travels down the normal protocol stack towards the real server; the reverse translation (SNAT of the reply from the real server back to the virtual address) is performed by the ip_vs_out function registered above.
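For context, the whole path above is what a plain NAT-mode (masquerading) service exercises. A typical ipvsadm configuration that would drive it looks roughly like this; the addresses and ports are examples only:

# define a TCP virtual service on the VIP with round-robin scheduling
ipvsadm -A -t 192.168.10.100:80 -s rr
# add real servers in NAT/masquerading mode (-m), which binds ip_vs_nat_xmit
ipvsadm -a -t 192.168.10.100:80 -r 192.168.20.11:8080 -m
ipvsadm -a -t 192.168.10.100:80 -r 192.168.20.12:8080 -m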