Pileus Git - ~andy/linux/blob - net/netfilter/ipvs/ip_vs_core.c
Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/davej/cpufreq
[~andy/linux] / net / netfilter / ipvs / ip_vs_core.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
18  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
19  * and others.
20  *
21  * Changes:
22  *      Paul `Rusty' Russell            properly handle non-linear skbs
23  *      Harald Welte                    don't use nfcache
24  *
25  */
26
/*
 * Log prefix for this file: pr_fmt() prepends "IPVS: " to every format
 * string it is applied to. It is defined ahead of the #include lines so
 * the definition is already in place when the kernel headers are parsed.
 */
27 #define KMSG_COMPONENT "IPVS"
28 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
29
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <linux/sctp.h>
#include <linux/icmp.h>

#include <net/ip.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <net/icmp.h>                   /* for icmp_send */
#include <net/route.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>

#ifdef CONFIG_IP_VS_IPV6
#include <net/ipv6.h>
#include <net/ip6_checksum.h>           /* for csum_ipv6_magic */
#include <linux/netfilter_ipv6.h>
#endif

#include <net/ip_vs.h>
52
53
/*
 * Symbols exported from the IPVS core. The TCP listen helper and the
 * debug-level accessor are only exported when the corresponding config
 * option is enabled.
 * NOTE(review): presumably consumed by the separate IPVS scheduler and
 * protocol/application helper modules -- confirm against their users.
 */
54 EXPORT_SYMBOL(register_ip_vs_scheduler);
55 EXPORT_SYMBOL(unregister_ip_vs_scheduler);
56 EXPORT_SYMBOL(ip_vs_skb_replace);
57 EXPORT_SYMBOL(ip_vs_proto_name);
58 EXPORT_SYMBOL(ip_vs_conn_new);
59 EXPORT_SYMBOL(ip_vs_conn_in_get);
60 EXPORT_SYMBOL(ip_vs_conn_out_get);
61 #ifdef CONFIG_IP_VS_PROTO_TCP
62 EXPORT_SYMBOL(ip_vs_tcp_conn_listen);
63 #endif
64 EXPORT_SYMBOL(ip_vs_conn_put);
65 #ifdef CONFIG_IP_VS_DEBUG
66 EXPORT_SYMBOL(ip_vs_get_debug_level);
67 #endif
68
69
/*
 * ID used in ICMP lookups.
 * Both macros take a pointer to an ICMP (resp. ICMPv6) header and evaluate
 * to its echo identifier field. The argument is parenthesized in both so
 * that any pointer expression (e.g. "hdr + 1") can be passed safely.
 */
#define icmp_id(icmph)          (((icmph)->un).echo.id)
#define icmpv6_id(icmph)        (((icmph)->icmp6_dataun).u_echo.identifier)
73
/*
 * ip_vs_proto_name - map an IP protocol number to a printable name
 * @proto: IP protocol number (IPPROTO_*)
 *
 * Returns a constant string for the protocols listed in the switch below.
 * Any other value is formatted as "IP_<number>" into a function-local
 * static buffer; that buffer is overwritten by the next call that takes
 * the default path, so the result must be consumed before then and is not
 * safe against concurrent callers.
 */
const char *ip_vs_proto_name(unsigned proto)
{
        static char buf[20];

        switch (proto) {
        case IPPROTO_IP:
                return "IP";
        case IPPROTO_UDP:
                return "UDP";
        case IPPROTO_TCP:
                return "TCP";
        case IPPROTO_SCTP:
                return "SCTP";
        case IPPROTO_ICMP:
                return "ICMP";
#ifdef CONFIG_IP_VS_IPV6
        case IPPROTO_ICMPV6:
                return "ICMPv6";
#endif
        default:
                /* proto is unsigned: print it as such and bound the write */
                snprintf(buf, sizeof(buf), "IP_%u", proto);
                return buf;
        }
}
98
99 void ip_vs_init_hash_table(struct list_head *table, int rows)
100 {
101         while (--rows >= 0)
102                 INIT_LIST_HEAD(&table[rows]);
103 }
104
105 static inline void
106 ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
107 {
108         struct ip_vs_dest *dest = cp->dest;
109         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
110                 spin_lock(&dest->stats.lock);
111                 dest->stats.ustats.inpkts++;
112                 dest->stats.ustats.inbytes += skb->len;
113                 spin_unlock(&dest->stats.lock);
114
115                 spin_lock(&dest->svc->stats.lock);
116                 dest->svc->stats.ustats.inpkts++;
117                 dest->svc->stats.ustats.inbytes += skb->len;
118                 spin_unlock(&dest->svc->stats.lock);
119
120                 spin_lock(&ip_vs_stats.lock);
121                 ip_vs_stats.ustats.inpkts++;
122                 ip_vs_stats.ustats.inbytes += skb->len;
123                 spin_unlock(&ip_vs_stats.lock);
124         }
125 }
126
127
128 static inline void
129 ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
130 {
131         struct ip_vs_dest *dest = cp->dest;
132         if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
133                 spin_lock(&dest->stats.lock);
134                 dest->stats.ustats.outpkts++;
135                 dest->stats.ustats.outbytes += skb->len;
136                 spin_unlock(&dest->stats.lock);
137
138                 spin_lock(&dest->svc->stats.lock);
139                 dest->svc->stats.ustats.outpkts++;
140                 dest->svc->stats.ustats.outbytes += skb->len;
141                 spin_unlock(&dest->svc->stats.lock);
142
143                 spin_lock(&ip_vs_stats.lock);
144                 ip_vs_stats.ustats.outpkts++;
145                 ip_vs_stats.ustats.outbytes += skb->len;
146                 spin_unlock(&ip_vs_stats.lock);
147         }
148 }
149
150
151 static inline void
152 ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
153 {
154         spin_lock(&cp->dest->stats.lock);
155         cp->dest->stats.ustats.conns++;
156         spin_unlock(&cp->dest->stats.lock);
157
158         spin_lock(&svc->stats.lock);
159         svc->stats.ustats.conns++;
160         spin_unlock(&svc->stats.lock);
161
162         spin_lock(&ip_vs_stats.lock);
163         ip_vs_stats.ustats.conns++;
164         spin_unlock(&ip_vs_stats.lock);
165 }
166
167
168 static inline int
169 ip_vs_set_state(struct ip_vs_conn *cp, int direction,
170                 const struct sk_buff *skb,
171                 struct ip_vs_protocol *pp)
172 {
173         if (unlikely(!pp->state_transition))
174                 return 0;
175         return pp->state_transition(cp, direction, skb, pp);
176 }
177
178
179 /*
180  *  IPVS persistent scheduling function
181  *  It creates a connection entry according to its template if exists,
182  *  or selects a server and creates a connection entry plus a template.
183  *  Locking: we are svc user (svc->refcnt), so we hold all dests too
184  *  Protocols supported: TCP, UDP
     *
     *  @svc:   virtual service the packet matched
     *  @skb:   packet being scheduled
     *  @ports: ports[0] is the source port and ports[1] the destination
     *          port of the packet
     *
     *  Returns the new connection entry, or NULL when no destination can
     *  be scheduled, when a template or connection entry cannot be
     *  created, or when no valid template exists for a packet that does
     *  not target svc->port on a service with a non-zero port.
185  */
186 static struct ip_vs_conn *
187 ip_vs_sched_persist(struct ip_vs_service *svc,
188                     const struct sk_buff *skb,
189                     __be16 ports[2])
190 {
191         struct ip_vs_conn *cp = NULL;
192         struct ip_vs_iphdr iph;
193         struct ip_vs_dest *dest;
194         struct ip_vs_conn *ct;
195         __be16  dport;                  /* destination port to forward */
196         union nf_inet_addr snet;        /* source network of the client,
197                                            after masking */
198
199         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
200
201         /* Mask saddr with the netmask to adjust template granularity */
202 #ifdef CONFIG_IP_VS_IPV6
203         if (svc->af == AF_INET6)
204                 ipv6_addr_prefix(&snet.in6, &iph.saddr.in6, svc->netmask);
205         else
206 #endif
207                 snet.ip = iph.saddr.ip & svc->netmask;
208
209         IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
210                       "mnet %s\n",
211                       IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]),
212                       IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]),
213                       IP_VS_DBG_ADDR(svc->af, &snet));
214
215         /*
216          * As far as we know, FTP is a very complicated network protocol, and
217          * it uses control connection and data connections. For active FTP,
218          * FTP server initialize data connection to the client, its source port
219          * is often 20. For passive FTP, FTP server tells the clients the port
220          * that it passively listens to,  and the client issues the data
221          * connection. In the tunneling or direct routing mode, the load
222          * balancer is on the client-to-server half of connection, the port
223          * number is unknown to the load balancer. So, a conn template like
224          * <caddr, 0, vaddr, 0, daddr, 0> is created for persistent FTP
225          * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
226          * is created for other persistent services.
227          */
228         if (ports[1] == svc->port) {
229                 /* Check if a template already exists */
230                 if (svc->port != FTPPORT)
231                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
232                                              &iph.daddr, ports[1]);
233                 else
234                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
235                                              &iph.daddr, 0);
236
237                 if (!ct || !ip_vs_check_template(ct)) {
238                         /*
239                          * No template found or the dest of the connection
240                          * template is not available.
241                          */
242                         dest = svc->scheduler->schedule(svc, skb);
243                         if (dest == NULL) {
244                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
245                                 return NULL;
246                         }
247
248                         /*
249                          * Create a template like <protocol,caddr,0,
250                          * vaddr,vport,daddr,dport> for non-ftp service,
251                          * and <protocol,caddr,0,vaddr,0,daddr,0>
252                          * for ftp service.
253                          */
254                         if (svc->port != FTPPORT)
255                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
256                                                     &snet, 0,
257                                                     &iph.daddr,
258                                                     ports[1],
259                                                     &dest->addr, dest->port,
260                                                     IP_VS_CONN_F_TEMPLATE,
261                                                     dest);
262                         else
263                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
264                                                     &snet, 0,
265                                                     &iph.daddr, 0,
266                                                     &dest->addr, 0,
267                                                     IP_VS_CONN_F_TEMPLATE,
268                                                     dest);
269                         if (ct == NULL)
270                                 return NULL;
271
                            /* the new template lives for the service's persistence timeout */
272                         ct->timeout = svc->timeout;
273                 } else {
274                         /* set destination with the found template */
275                         dest = ct->dest;
276                 }
                    /* packet hit the service port: forward to the destination's own port */
277                 dport = dest->port;
278         } else {
279                 /*
280                  * Note: persistent fwmark-based services and persistent
281                  * port zero service are handled here.
282                  * fwmark template: <IPPROTO_IP,caddr,0,fwmark,0,daddr,0>
283                  * port zero template: <protocol,caddr,0,vaddr,0,daddr,0>
284                  */
285                 if (svc->fwmark) {
                            /* the fwmark stands in for the virtual address in the template key */
286                         union nf_inet_addr fwmark = {
287                                 .ip = htonl(svc->fwmark)
288                         };
289
290                         ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0,
291                                              &fwmark, 0);
292                 } else
293                         ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0,
294                                              &iph.daddr, 0);
295
296                 if (!ct || !ip_vs_check_template(ct)) {
297                         /*
298                          * If it is not persistent port zero, return NULL,
299                          * otherwise create a connection template.
300                          */
301                         if (svc->port)
302                                 return NULL;
303
304                         dest = svc->scheduler->schedule(svc, skb);
305                         if (dest == NULL) {
306                                 IP_VS_DBG(1, "p-schedule: no dest found.\n");
307                                 return NULL;
308                         }
309
310                         /*
311                          * Create a template according to the service
312                          */
313                         if (svc->fwmark) {
314                                 union nf_inet_addr fwmark = {
315                                         .ip = htonl(svc->fwmark)
316                                 };
317
318                                 ct = ip_vs_conn_new(svc->af, IPPROTO_IP,
319                                                     &snet, 0,
320                                                     &fwmark, 0,
321                                                     &dest->addr, 0,
322                                                     IP_VS_CONN_F_TEMPLATE,
323                                                     dest);
324                         } else
325                                 ct = ip_vs_conn_new(svc->af, iph.protocol,
326                                                     &snet, 0,
327                                                     &iph.daddr, 0,
328                                                     &dest->addr, 0,
329                                                     IP_VS_CONN_F_TEMPLATE,
330                                                     dest);
331                         if (ct == NULL)
332                                 return NULL;
333
334                         ct->timeout = svc->timeout;
335                 } else {
336                         /* set destination with the found template */
337                         dest = ct->dest;
338                 }
                    /* keep the destination port the client addressed */
339                 dport = ports[1];
340         }
341
342         /*
343          *    Create a new connection according to the template
344          */
345         cp = ip_vs_conn_new(svc->af, iph.protocol,
346                             &iph.saddr, ports[0],
347                             &iph.daddr, ports[1],
348                             &dest->addr, dport,
349                             0,
350                             dest);
351         if (cp == NULL) {
                    /* release the template before bailing out */
352                 ip_vs_conn_put(ct);
353                 return NULL;
354         }
355
356         /*
357          *    Add its control
358          */
359         ip_vs_control_add(cp, ct);
            /* the template is attached to cp now; release it here as well */
360         ip_vs_conn_put(ct);
361
362         ip_vs_conn_stats(cp, svc);
363         return cp;
364 }
365
366
367 /*
368  *  IPVS main scheduling function
369  *  It selects a server according to the virtual service, and
370  *  creates a connection entry.
371  *  Protocols supported: TCP, UDP
372  */
373 struct ip_vs_conn *
374 ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
375 {
376         struct ip_vs_conn *cp = NULL;
377         struct ip_vs_iphdr iph;
378         struct ip_vs_dest *dest;
379         __be16 _ports[2], *pptr;
380
381         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
382         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
383         if (pptr == NULL)
384                 return NULL;
385
386         /*
387          *    Persistent service
388          */
389         if (svc->flags & IP_VS_SVC_F_PERSISTENT)
390                 return ip_vs_sched_persist(svc, skb, pptr);
391
392         /*
393          *    Non-persistent service
394          */
395         if (!svc->fwmark && pptr[1] != svc->port) {
396                 if (!svc->port)
397                         pr_err("Schedule: port zero only supported "
398                                "in persistent services, "
399                                "check your ipvs configuration\n");
400                 return NULL;
401         }
402
403         dest = svc->scheduler->schedule(svc, skb);
404         if (dest == NULL) {
405                 IP_VS_DBG(1, "Schedule: no dest found.\n");
406                 return NULL;
407         }
408
409         /*
410          *    Create a connection entry.
411          */
412         cp = ip_vs_conn_new(svc->af, iph.protocol,
413                             &iph.saddr, pptr[0],
414                             &iph.daddr, pptr[1],
415                             &dest->addr, dest->port ? dest->port : pptr[1],
416                             0,
417                             dest);
418         if (cp == NULL)
419                 return NULL;
420
421         IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
422                       "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
423                       ip_vs_fwd_tag(cp),
424                       IP_VS_DBG_ADDR(svc->af, &cp->caddr), ntohs(cp->cport),
425                       IP_VS_DBG_ADDR(svc->af, &cp->vaddr), ntohs(cp->vport),
426                       IP_VS_DBG_ADDR(svc->af, &cp->daddr), ntohs(cp->dport),
427                       cp->flags, atomic_read(&cp->refcnt));
428
429         ip_vs_conn_stats(cp, svc);
430         return cp;
431 }
432
433
434 /*
435  *  Pass or drop the packet.
436  *  Called by ip_vs_in, when the virtual service is available but
437  *  no destination is available for a new connection.
438  */
439 int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
440                 struct ip_vs_protocol *pp)
441 {
442         __be16 _ports[2], *pptr;
443         struct ip_vs_iphdr iph;
444         int unicast;
445         ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
446
447         pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
448         if (pptr == NULL) {
449                 ip_vs_service_put(svc);
450                 return NF_DROP;
451         }
452
453 #ifdef CONFIG_IP_VS_IPV6
454         if (svc->af == AF_INET6)
455                 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
456         else
457 #endif
458                 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST);
459
460         /* if it is fwmark-based service, the cache_bypass sysctl is up
461            and the destination is a non-local unicast, then create
462            a cache_bypass connection entry */
463         if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) {
464                 int ret, cs;
465                 struct ip_vs_conn *cp;
466                 union nf_inet_addr daddr =  { .all = { 0, 0, 0, 0 } };
467
468                 ip_vs_service_put(svc);
469
470                 /* create a new connection entry */
471                 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
472                 cp = ip_vs_conn_new(svc->af, iph.protocol,
473                                     &iph.saddr, pptr[0],
474                                     &iph.daddr, pptr[1],
475                                     &daddr, 0,
476                                     IP_VS_CONN_F_BYPASS,
477                                     NULL);
478                 if (cp == NULL)
479                         return NF_DROP;
480
481                 /* statistics */
482                 ip_vs_in_stats(cp, skb);
483
484                 /* set state */
485                 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
486
487                 /* transmit the first SYN packet */
488                 ret = cp->packet_xmit(skb, cp, pp);
489                 /* do not touch skb anymore */
490
491                 atomic_inc(&cp->in_pkts);
492                 ip_vs_conn_put(cp);
493                 return ret;
494         }
495
496         /*
497          * When the virtual ftp service is presented, packets destined
498          * for other services on the VIP may get here (except services
499          * listed in the ipvs table), pass the packets, because it is
500          * not ipvs job to decide to drop the packets.
501          */
502         if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) {
503                 ip_vs_service_put(svc);
504                 return NF_ACCEPT;
505         }
506
507         ip_vs_service_put(svc);
508
509         /*
510          * Notify the client that the destination is unreachable, and
511          * release the socket buffer.
512          * Since it is in IP layer, the TCP socket is not actually
513          * created, the TCP RST packet cannot be sent, instead that
514          * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
515          */
516 #ifdef CONFIG_IP_VS_IPV6
517         if (svc->af == AF_INET6)
518                 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0);
519         else
520 #endif
521                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
522
523         return NF_DROP;
524 }
525
526
527 /*
528  *      It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
529  *      chain, and is used for VS/NAT.
530  *      It detects packets for VS/NAT connections and sends the packets
531  *      immediately. This can avoid that iptable_nat mangles the packets
532  *      for VS/NAT.
533  */
534 static unsigned int ip_vs_post_routing(unsigned int hooknum,
535                                        struct sk_buff *skb,
536                                        const struct net_device *in,
537                                        const struct net_device *out,
538                                        int (*okfn)(struct sk_buff *))
539 {
540         if (!skb->ipvs_property)
541                 return NF_ACCEPT;
542         /* The packet was sent from IPVS, exit this chain */
543         return NF_STOP;
544 }
545
546 __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
547 {
548         return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
549 }
550
551 static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
552 {
553         int err = ip_defrag(skb, user);
554
555         if (!err)
556                 ip_send_check(ip_hdr(skb));
557
558         return err;
559 }
560
#ifdef CONFIG_IP_VS_IPV6
/*
 * IPv6 placeholder for fragment gathering: no reassembly is performed,
 * @skb is left untouched and 0 is always returned.
 */
static inline int ip_vs_gather_frags_v6(struct sk_buff *skb, u_int32_t user)
{
        /* TODO IPv6: Find out what to do here for IPv6 */
        const int err = 0;

        return err;
}
#endif
568
569 /*
570  * Packet has been made sufficiently writable in caller
571  * - inout: 1=in->out, 0=out->in
572  */
573 void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
574                     struct ip_vs_conn *cp, int inout)
575 {
576         struct iphdr *iph        = ip_hdr(skb);
577         unsigned int icmp_offset = iph->ihl*4;
578         struct icmphdr *icmph    = (struct icmphdr *)(skb_network_header(skb) +
579                                                       icmp_offset);
580         struct iphdr *ciph       = (struct iphdr *)(icmph + 1);
581
582         if (inout) {
583                 iph->saddr = cp->vaddr.ip;
584                 ip_send_check(iph);
585                 ciph->daddr = cp->vaddr.ip;
586                 ip_send_check(ciph);
587         } else {
588                 iph->daddr = cp->daddr.ip;
589                 ip_send_check(iph);
590                 ciph->saddr = cp->daddr.ip;
591                 ip_send_check(ciph);
592         }
593
594         /* the TCP/UDP/SCTP port */
595         if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol ||
596             IPPROTO_SCTP == ciph->protocol) {
597                 __be16 *ports = (void *)ciph + ciph->ihl*4;
598
599                 if (inout)
600                         ports[1] = cp->vport;
601                 else
602                         ports[0] = cp->dport;
603         }
604
605         /* And finally the ICMP checksum */
606         icmph->checksum = 0;
607         icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
608         skb->ip_summed = CHECKSUM_UNNECESSARY;
609
610         if (inout)
611                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
612                         "Forwarding altered outgoing ICMP");
613         else
614                 IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
615                         "Forwarding altered incoming ICMP");
616 }
617
#ifdef CONFIG_IP_VS_IPV6
/*
 * Rewrite an ICMPv6 message for VS/NAT: translate the address in the
 * outer IPv6 header, the mirrored address (and, for TCP/UDP/SCTP, port)
 * in the IPv6 header embedded in the ICMPv6 payload, then recompute the
 * ICMPv6 checksum.
 *
 * The ICMPv6 header is taken to start right after a fixed-size IPv6
 * header, and the embedded transport header right after the embedded
 * fixed-size IPv6 header; extension headers are not handled here.
 *
 * Packet has been made sufficiently writable in caller
 * - inout: 1=in->out, 0=out->in
 */
void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp,
                    struct ip_vs_conn *cp, int inout)
{
        struct ipv6hdr *iph      = ipv6_hdr(skb);
        unsigned int icmp_offset = sizeof(struct ipv6hdr);
        unsigned int icmp_len    = skb->len - icmp_offset;
        struct icmp6hdr *icmph   = (struct icmp6hdr *)(skb_network_header(skb) +
                                                      icmp_offset);
        struct ipv6hdr *ciph     = (struct ipv6hdr *)(icmph + 1);

        if (inout) {
                iph->saddr = cp->vaddr.in6;
                ciph->daddr = cp->vaddr.in6;
        } else {
                iph->daddr = cp->daddr.in6;
                ciph->saddr = cp->daddr.in6;
        }

        /* the TCP/UDP/SCTP port */
        if (IPPROTO_TCP == ciph->nexthdr || IPPROTO_UDP == ciph->nexthdr ||
            IPPROTO_SCTP == ciph->nexthdr) {
                __be16 *ports = (void *)ciph + sizeof(struct ipv6hdr);

                if (inout)
                        ports[1] = cp->vport;
                else
                        ports[0] = cp->dport;
        }

        /*
         * And finally the ICMP checksum. Unlike ICMPv4, the ICMPv6
         * checksum also covers the IPv6 pseudo-header, so the sum over
         * the ICMPv6 message is combined with the (already rewritten)
         * outer addresses by csum_ipv6_magic() and stored in the header.
         */
        icmph->icmp6_cksum = 0;
        icmph->icmp6_cksum = csum_ipv6_magic(&iph->saddr, &iph->daddr,
                                             icmp_len, IPPROTO_ICMPV6,
                                             skb_checksum(skb, icmp_offset,
                                                          icmp_len, 0));
        skb->ip_summed = CHECKSUM_UNNECESSARY;

        if (inout)
                IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered outgoing ICMPv6");
        else
                IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
                        "Forwarding altered incoming ICMPv6");
}
#endif
661
662 /* Handle relevant response ICMP messages - forward to the right
663  * destination host. Used for NAT and local client.
664  */
665 static int handle_response_icmp(int af, struct sk_buff *skb,
666                                 union nf_inet_addr *snet,
667                                 __u8 protocol, struct ip_vs_conn *cp,
668                                 struct ip_vs_protocol *pp,
669                                 unsigned int offset, unsigned int ihl)
670 {
671         unsigned int verdict = NF_DROP;
672
673         if (IP_VS_FWD_METHOD(cp) != 0) {
674                 pr_err("shouldn't reach here, because the box is on the "
675                        "half connection in the tun/dr module.\n");
676         }
677
678         /* Ensure the checksum is correct */
679         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
680                 /* Failed checksum! */
681                 IP_VS_DBG_BUF(1, "Forward ICMP: failed checksum from %s!\n",
682                               IP_VS_DBG_ADDR(af, snet));
683                 goto out;
684         }
685
686         if (IPPROTO_TCP == protocol || IPPROTO_UDP == protocol ||
687             IPPROTO_SCTP == protocol)
688                 offset += 2 * sizeof(__u16);
689         if (!skb_make_writable(skb, offset))
690                 goto out;
691
692 #ifdef CONFIG_IP_VS_IPV6
693         if (af == AF_INET6)
694                 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
695         else
696 #endif
697                 ip_vs_nat_icmp(skb, pp, cp, 1);
698
699         /* do the statistics and put it back */
700         ip_vs_out_stats(cp, skb);
701
702         skb->ipvs_property = 1;
703         verdict = NF_ACCEPT;
704
705 out:
706         __ip_vs_conn_put(cp);
707
708         return verdict;
709 }
710
711 /*
712  *      Handle ICMP messages in the inside-to-outside direction (outgoing).
713  *      Find any that might be relevant, check against existing connections.
714  *      Currently handles error types - unreachable, quench, ttl exceeded.
     *
     *      @skb:     outgoing IPv4 packet carrying the ICMP message
     *      @related: set to 1 on entry and cleared to 0 when the ICMP type
     *                is not one of the handled error types
     *
     *      Returns NF_STOLEN when fragment gathering returns non-zero,
     *      NF_DROP when the ICMP header cannot be read, NF_ACCEPT when the
     *      message is not handled here (unhandled type, unreadable or
     *      unsupported embedded header, no matching connection), and
     *      otherwise the verdict of handle_response_icmp().
715  */
716 static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
717 {
718         struct iphdr *iph;
719         struct icmphdr  _icmph, *ic;
720         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
721         struct ip_vs_iphdr ciph;
722         struct ip_vs_conn *cp;
723         struct ip_vs_protocol *pp;
724         unsigned int offset, ihl;
725         union nf_inet_addr snet;
726
727         *related = 1;
728
729         /* reassemble IP fragments */
730         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
731                 if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
732                         return NF_STOLEN;
733         }
734
            /* fetch the header only now, after possible reassembly above */
735         iph = ip_hdr(skb);
            /* offset walks through the packet; ihl keeps the outer header length */
736         offset = ihl = iph->ihl * 4;
737         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
738         if (ic == NULL)
739                 return NF_DROP;
740
741         IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %pI4->%pI4\n",
742                   ic->type, ntohs(icmp_id(ic)),
743                   &iph->saddr, &iph->daddr);
744
745         /*
746          * Work through seeing if this is for us.
747          * These checks are supposed to be in an order that means easy
748          * things are checked first to speed up processing.... however
749          * this means that some packets will manage to get a long way
750          * down this stack and then be rejected, but that's life.
751          */
752         if ((ic->type != ICMP_DEST_UNREACH) &&
753             (ic->type != ICMP_SOURCE_QUENCH) &&
754             (ic->type != ICMP_TIME_EXCEEDED)) {
755                 *related = 0;
756                 return NF_ACCEPT;
757         }
758
759         /* Now find the contained IP header */
760         offset += sizeof(_icmph);
761         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
762         if (cih == NULL)
763                 return NF_ACCEPT; /* The packet looks wrong, ignore */
764
            /* embedded protocol without an IPVS handler: not handled here */
765         pp = ip_vs_proto_get(cih->protocol);
766         if (!pp)
767                 return NF_ACCEPT;
768
769         /* Is the embedded protocol header present? */
770         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
771                      pp->dont_defrag))
772                 return NF_ACCEPT;
773
774         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
775
            /* step over the embedded IP header to the embedded transport header */
776         offset += cih->ihl * 4;
777
778         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
779         /* The embedded headers contain source and dest in reverse order */
780         cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
781         if (!cp)
782                 return NF_ACCEPT;
783
            /* source of the ICMP message, used when reporting a bad checksum */
784         snet.ip = iph->saddr;
785         return handle_response_icmp(AF_INET, skb, &snet, cih->protocol, cp,
786                                     pp, offset, ihl);
787 }
788
789 #ifdef CONFIG_IP_VS_IPV6
790 static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related)
791 {
792         struct ipv6hdr *iph;
793         struct icmp6hdr _icmph, *ic;
794         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
795                                            within the ICMP */
796         struct ip_vs_iphdr ciph;
797         struct ip_vs_conn *cp;
798         struct ip_vs_protocol *pp;
799         unsigned int offset;
800         union nf_inet_addr snet;
801
802         *related = 1;
803
804         /* reassemble IP fragments */
805         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
806                 if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT))
807                         return NF_STOLEN;
808         }
809
810         iph = ipv6_hdr(skb);
811         offset = sizeof(struct ipv6hdr);
812         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
813         if (ic == NULL)
814                 return NF_DROP;
815
816         IP_VS_DBG(12, "Outgoing ICMPv6 (%d,%d) %pI6->%pI6\n",
817                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
818                   &iph->saddr, &iph->daddr);
819
820         /*
821          * Work through seeing if this is for us.
822          * These checks are supposed to be in an order that means easy
823          * things are checked first to speed up processing.... however
824          * this means that some packets will manage to get a long way
825          * down this stack and then be rejected, but that's life.
826          */
827         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
828             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
829             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
830                 *related = 0;
831                 return NF_ACCEPT;
832         }
833
834         /* Now find the contained IP header */
835         offset += sizeof(_icmph);
836         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
837         if (cih == NULL)
838                 return NF_ACCEPT; /* The packet looks wrong, ignore */
839
840         pp = ip_vs_proto_get(cih->nexthdr);
841         if (!pp)
842                 return NF_ACCEPT;
843
844         /* Is the embedded protocol header present? */
845         /* TODO: we don't support fragmentation at the moment anyways */
846         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
847                 return NF_ACCEPT;
848
849         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for");
850
851         offset += sizeof(struct ipv6hdr);
852
853         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
854         /* The embedded headers contain source and dest in reverse order */
855         cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
856         if (!cp)
857                 return NF_ACCEPT;
858
859         ipv6_addr_copy(&snet.in6, &iph->saddr);
860         return handle_response_icmp(AF_INET6, skb, &snet, cih->nexthdr, cp,
861                                     pp, offset, sizeof(struct ipv6hdr));
862 }
863 #endif
864
865 /*
866  * Check if sctp chunc is ABORT chunk
867  */
868 static inline int is_sctp_abort(const struct sk_buff *skb, int nh_len)
869 {
870         sctp_chunkhdr_t *sch, schunk;
871         sch = skb_header_pointer(skb, nh_len + sizeof(sctp_sctphdr_t),
872                         sizeof(schunk), &schunk);
873         if (sch == NULL)
874                 return 0;
875         if (sch->type == SCTP_CID_ABORT)
876                 return 1;
877         return 0;
878 }
879
880 static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
881 {
882         struct tcphdr _tcph, *th;
883
884         th = skb_header_pointer(skb, nh_len, sizeof(_tcph), &_tcph);
885         if (th == NULL)
886                 return 0;
887         return th->rst;
888 }
889
890 /* Handle response packets: rewrite addresses and send away...
891  * Used for NAT and local client.
892  */
893 static unsigned int
894 handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
895                 struct ip_vs_conn *cp, int ihl)
896 {
897         IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
898
899         if (!skb_make_writable(skb, ihl))
900                 goto drop;
901
902         /* mangle the packet */
903         if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
904                 goto drop;
905
906 #ifdef CONFIG_IP_VS_IPV6
907         if (af == AF_INET6)
908                 ipv6_hdr(skb)->saddr = cp->vaddr.in6;
909         else
910 #endif
911         {
912                 ip_hdr(skb)->saddr = cp->vaddr.ip;
913                 ip_send_check(ip_hdr(skb));
914         }
915
916         /* For policy routing, packets originating from this
917          * machine itself may be routed differently to packets
918          * passing through.  We want this packet to be routed as
919          * if it came from this machine itself.  So re-compute
920          * the routing information.
921          */
922 #ifdef CONFIG_IP_VS_IPV6
923         if (af == AF_INET6) {
924                 if (ip6_route_me_harder(skb) != 0)
925                         goto drop;
926         } else
927 #endif
928                 if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
929                         goto drop;
930
931         IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
932
933         ip_vs_out_stats(cp, skb);
934         ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
935         ip_vs_conn_put(cp);
936
937         skb->ipvs_property = 1;
938
939         LeaveFunction(11);
940         return NF_ACCEPT;
941
942 drop:
943         ip_vs_conn_put(cp);
944         kfree_skb(skb);
945         return NF_STOLEN;
946 }
947
948 /*
949  *      It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
950  *      Check if outgoing packet belongs to the established ip_vs_conn.
951  */
952 static unsigned int
953 ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
954           const struct net_device *in, const struct net_device *out,
955           int (*okfn)(struct sk_buff *))
956 {
957         struct ip_vs_iphdr iph;
958         struct ip_vs_protocol *pp;
959         struct ip_vs_conn *cp;
960         int af;
961
962         EnterFunction(11);
963
964         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
965
966         if (skb->ipvs_property)
967                 return NF_ACCEPT;
968
969         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
970 #ifdef CONFIG_IP_VS_IPV6
971         if (af == AF_INET6) {
972                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
973                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
974
975                         if (related)
976                                 return verdict;
977                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
978                 }
979         } else
980 #endif
981                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
982                         int related, verdict = ip_vs_out_icmp(skb, &related);
983
984                         if (related)
985                                 return verdict;
986                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
987                 }
988
989         pp = ip_vs_proto_get(iph.protocol);
990         if (unlikely(!pp))
991                 return NF_ACCEPT;
992
993         /* reassemble IP fragments */
994 #ifdef CONFIG_IP_VS_IPV6
995         if (af == AF_INET6) {
996                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
997                         int related, verdict = ip_vs_out_icmp_v6(skb, &related);
998
999                         if (related)
1000                                 return verdict;
1001
1002                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1003                 }
1004         } else
1005 #endif
1006                 if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) &&
1007                              !pp->dont_defrag)) {
1008                         if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
1009                                 return NF_STOLEN;
1010
1011                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1012                 }
1013
1014         /*
1015          * Check if the packet belongs to an existing entry
1016          */
1017         cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1018
1019         if (unlikely(!cp)) {
1020                 if (sysctl_ip_vs_nat_icmp_send &&
1021                     (pp->protocol == IPPROTO_TCP ||
1022                      pp->protocol == IPPROTO_UDP ||
1023                      pp->protocol == IPPROTO_SCTP)) {
1024                         __be16 _ports[2], *pptr;
1025
1026                         pptr = skb_header_pointer(skb, iph.len,
1027                                                   sizeof(_ports), _ports);
1028                         if (pptr == NULL)
1029                                 return NF_ACCEPT;       /* Not for me */
1030                         if (ip_vs_lookup_real_service(af, iph.protocol,
1031                                                       &iph.saddr,
1032                                                       pptr[0])) {
1033                                 /*
1034                                  * Notify the real server: there is no
1035                                  * existing entry if it is not RST
1036                                  * packet or not TCP packet.
1037                                  */
1038                                 if ((iph.protocol != IPPROTO_TCP &&
1039                                      iph.protocol != IPPROTO_SCTP)
1040                                      || ((iph.protocol == IPPROTO_TCP
1041                                           && !is_tcp_reset(skb, iph.len))
1042                                          || (iph.protocol == IPPROTO_SCTP
1043                                                 && !is_sctp_abort(skb,
1044                                                         iph.len)))) {
1045 #ifdef CONFIG_IP_VS_IPV6
1046                                         if (af == AF_INET6)
1047                                                 icmpv6_send(skb,
1048                                                             ICMPV6_DEST_UNREACH,
1049                                                             ICMPV6_PORT_UNREACH,
1050                                                             0);
1051                                         else
1052 #endif
1053                                                 icmp_send(skb,
1054                                                           ICMP_DEST_UNREACH,
1055                                                           ICMP_PORT_UNREACH, 0);
1056                                         return NF_DROP;
1057                                 }
1058                         }
1059                 }
1060                 IP_VS_DBG_PKT(12, pp, skb, 0,
1061                               "packet continues traversal as normal");
1062                 return NF_ACCEPT;
1063         }
1064
1065         return handle_response(af, skb, pp, cp, iph.len);
1066 }
1067
1068
1069 /*
1070  *      Handle ICMP messages in the outside-to-inside direction (incoming).
1071  *      Find any that might be relevant, check against existing connections,
1072  *      forward to the right destination host if relevant.
1073  *      Currently handles error types - unreachable, quench, ttl exceeded.
1074  */
1075 static int
1076 ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1077 {
1078         struct iphdr *iph;
1079         struct icmphdr  _icmph, *ic;
1080         struct iphdr    _ciph, *cih;    /* The ip header contained within the ICMP */
1081         struct ip_vs_iphdr ciph;
1082         struct ip_vs_conn *cp;
1083         struct ip_vs_protocol *pp;
1084         unsigned int offset, ihl, verdict;
1085         union nf_inet_addr snet;
1086
1087         *related = 1;
1088
1089         /* reassemble IP fragments */
1090         if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
1091                 if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
1092                                             IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
1093                         return NF_STOLEN;
1094         }
1095
1096         iph = ip_hdr(skb);
1097         offset = ihl = iph->ihl * 4;
1098         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1099         if (ic == NULL)
1100                 return NF_DROP;
1101
1102         IP_VS_DBG(12, "Incoming ICMP (%d,%d) %pI4->%pI4\n",
1103                   ic->type, ntohs(icmp_id(ic)),
1104                   &iph->saddr, &iph->daddr);
1105
1106         /*
1107          * Work through seeing if this is for us.
1108          * These checks are supposed to be in an order that means easy
1109          * things are checked first to speed up processing.... however
1110          * this means that some packets will manage to get a long way
1111          * down this stack and then be rejected, but that's life.
1112          */
1113         if ((ic->type != ICMP_DEST_UNREACH) &&
1114             (ic->type != ICMP_SOURCE_QUENCH) &&
1115             (ic->type != ICMP_TIME_EXCEEDED)) {
1116                 *related = 0;
1117                 return NF_ACCEPT;
1118         }
1119
1120         /* Now find the contained IP header */
1121         offset += sizeof(_icmph);
1122         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1123         if (cih == NULL)
1124                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1125
1126         pp = ip_vs_proto_get(cih->protocol);
1127         if (!pp)
1128                 return NF_ACCEPT;
1129
1130         /* Is the embedded protocol header present? */
1131         if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
1132                      pp->dont_defrag))
1133                 return NF_ACCEPT;
1134
1135         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
1136
1137         offset += cih->ihl * 4;
1138
1139         ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1140         /* The embedded headers contain source and dest in reverse order */
1141         cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1);
1142         if (!cp) {
1143                 /* The packet could also belong to a local client */
1144                 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1);
1145                 if (cp) {
1146                         snet.ip = iph->saddr;
1147                         return handle_response_icmp(AF_INET, skb, &snet,
1148                                                     cih->protocol, cp, pp,
1149                                                     offset, ihl);
1150                 }
1151                 return NF_ACCEPT;
1152         }
1153
1154         verdict = NF_DROP;
1155
1156         /* Ensure the checksum is correct */
1157         if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
1158                 /* Failed checksum! */
1159                 IP_VS_DBG(1, "Incoming ICMP: failed checksum from %pI4!\n",
1160                           &iph->saddr);
1161                 goto out;
1162         }
1163
1164         /* do the statistics and put it back */
1165         ip_vs_in_stats(cp, skb);
1166         if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
1167                 offset += 2 * sizeof(__u16);
1168         verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
1169         /* do not touch skb anymore */
1170
1171   out:
1172         __ip_vs_conn_put(cp);
1173
1174         return verdict;
1175 }
1176
1177 #ifdef CONFIG_IP_VS_IPV6
1178 static int
1179 ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1180 {
1181         struct ipv6hdr *iph;
1182         struct icmp6hdr _icmph, *ic;
1183         struct ipv6hdr  _ciph, *cih;    /* The ip header contained
1184                                            within the ICMP */
1185         struct ip_vs_iphdr ciph;
1186         struct ip_vs_conn *cp;
1187         struct ip_vs_protocol *pp;
1188         unsigned int offset, verdict;
1189         union nf_inet_addr snet;
1190
1191         *related = 1;
1192
1193         /* reassemble IP fragments */
1194         if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) {
1195                 if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ?
1196                                                IP_DEFRAG_VS_IN :
1197                                                IP_DEFRAG_VS_FWD))
1198                         return NF_STOLEN;
1199         }
1200
1201         iph = ipv6_hdr(skb);
1202         offset = sizeof(struct ipv6hdr);
1203         ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
1204         if (ic == NULL)
1205                 return NF_DROP;
1206
1207         IP_VS_DBG(12, "Incoming ICMPv6 (%d,%d) %pI6->%pI6\n",
1208                   ic->icmp6_type, ntohs(icmpv6_id(ic)),
1209                   &iph->saddr, &iph->daddr);
1210
1211         /*
1212          * Work through seeing if this is for us.
1213          * These checks are supposed to be in an order that means easy
1214          * things are checked first to speed up processing.... however
1215          * this means that some packets will manage to get a long way
1216          * down this stack and then be rejected, but that's life.
1217          */
1218         if ((ic->icmp6_type != ICMPV6_DEST_UNREACH) &&
1219             (ic->icmp6_type != ICMPV6_PKT_TOOBIG) &&
1220             (ic->icmp6_type != ICMPV6_TIME_EXCEED)) {
1221                 *related = 0;
1222                 return NF_ACCEPT;
1223         }
1224
1225         /* Now find the contained IP header */
1226         offset += sizeof(_icmph);
1227         cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph);
1228         if (cih == NULL)
1229                 return NF_ACCEPT; /* The packet looks wrong, ignore */
1230
1231         pp = ip_vs_proto_get(cih->nexthdr);
1232         if (!pp)
1233                 return NF_ACCEPT;
1234
1235         /* Is the embedded protocol header present? */
1236         /* TODO: we don't support fragmentation at the moment anyways */
1237         if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag))
1238                 return NF_ACCEPT;
1239
1240         IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for");
1241
1242         offset += sizeof(struct ipv6hdr);
1243
1244         ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1245         /* The embedded headers contain source and dest in reverse order */
1246         cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1);
1247         if (!cp) {
1248                 /* The packet could also belong to a local client */
1249                 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1);
1250                 if (cp) {
1251                         ipv6_addr_copy(&snet.in6, &iph->saddr);
1252                         return handle_response_icmp(AF_INET6, skb, &snet,
1253                                                     cih->nexthdr,
1254                                                     cp, pp, offset,
1255                                                     sizeof(struct ipv6hdr));
1256                 }
1257                 return NF_ACCEPT;
1258         }
1259
1260         verdict = NF_DROP;
1261
1262         /* do the statistics and put it back */
1263         ip_vs_in_stats(cp, skb);
1264         if (IPPROTO_TCP == cih->nexthdr || IPPROTO_UDP == cih->nexthdr ||
1265             IPPROTO_SCTP == cih->nexthdr)
1266                 offset += 2 * sizeof(__u16);
1267         verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset);
1268         /* do not touch skb anymore */
1269
1270         __ip_vs_conn_put(cp);
1271
1272         return verdict;
1273 }
1274 #endif
1275
1276
1277 /*
1278  *      Check if it's for virtual services, look it up,
1279  *      and send it on its way...
1280  */
1281 static unsigned int
1282 ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
1283          const struct net_device *in, const struct net_device *out,
1284          int (*okfn)(struct sk_buff *))
1285 {
1286         struct ip_vs_iphdr iph;
1287         struct ip_vs_protocol *pp;
1288         struct ip_vs_conn *cp;
1289         int ret, restart, af, pkts;
1290
1291         af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6;
1292
1293         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1294
1295         /*
1296          *      Big tappo: only PACKET_HOST, including loopback for local client
1297          *      Don't handle local packets on IPv6 for now
1298          */
1299         if (unlikely(skb->pkt_type != PACKET_HOST)) {
1300                 IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n",
1301                               skb->pkt_type,
1302                               iph.protocol,
1303                               IP_VS_DBG_ADDR(af, &iph.daddr));
1304                 return NF_ACCEPT;
1305         }
1306
1307 #ifdef CONFIG_IP_VS_IPV6
1308         if (af == AF_INET6) {
1309                 if (unlikely(iph.protocol == IPPROTO_ICMPV6)) {
1310                         int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum);
1311
1312                         if (related)
1313                                 return verdict;
1314                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1315                 }
1316         } else
1317 #endif
1318                 if (unlikely(iph.protocol == IPPROTO_ICMP)) {
1319                         int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
1320
1321                         if (related)
1322                                 return verdict;
1323                         ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1324                 }
1325
1326         /* Protocol supported? */
1327         pp = ip_vs_proto_get(iph.protocol);
1328         if (unlikely(!pp))
1329                 return NF_ACCEPT;
1330
1331         /*
1332          * Check if the packet belongs to an existing connection entry
1333          */
1334         cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0);
1335
1336         if (unlikely(!cp)) {
1337                 int v;
1338
1339                 /* For local client packets, it could be a response */
1340                 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0);
1341                 if (cp)
1342                         return handle_response(af, skb, pp, cp, iph.len);
1343
1344                 if (!pp->conn_schedule(af, skb, pp, &v, &cp))
1345                         return v;
1346         }
1347
1348         if (unlikely(!cp)) {
1349                 /* sorry, all this trouble for a no-hit :) */
1350                 IP_VS_DBG_PKT(12, pp, skb, 0,
1351                               "packet continues traversal as normal");
1352                 return NF_ACCEPT;
1353         }
1354
1355         IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
1356
1357         /* Check the server status */
1358         if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1359                 /* the destination server is not available */
1360
1361                 if (sysctl_ip_vs_expire_nodest_conn) {
1362                         /* try to expire the connection immediately */
1363                         ip_vs_conn_expire_now(cp);
1364                 }
1365                 /* don't restart its timer, and silently
1366                    drop the packet. */
1367                 __ip_vs_conn_put(cp);
1368                 return NF_DROP;
1369         }
1370
1371         ip_vs_in_stats(cp, skb);
1372         restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
1373         if (cp->packet_xmit)
1374                 ret = cp->packet_xmit(skb, cp, pp);
1375                 /* do not touch skb anymore */
1376         else {
1377                 IP_VS_DBG_RL("warning: packet_xmit is null");
1378                 ret = NF_ACCEPT;
1379         }
1380
1381         /* Increase its packet counter and check if it is needed
1382          * to be synchronized
1383          *
1384          * Sync connection if it is about to close to
1385          * encorage the standby servers to update the connections timeout
1386          */
1387         pkts = atomic_add_return(1, &cp->in_pkts);
1388         if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1389             cp->protocol == IPPROTO_SCTP) {
1390                 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1391                         (atomic_read(&cp->in_pkts) %
1392                          sysctl_ip_vs_sync_threshold[1]
1393                          == sysctl_ip_vs_sync_threshold[0])) ||
1394                                 (cp->old_state != cp->state &&
1395                                  ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1396                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1397                                   (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1398                         ip_vs_sync_conn(cp);
1399                         goto out;
1400                 }
1401         }
1402
1403         if (af == AF_INET &&
1404             (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1405             (((cp->protocol != IPPROTO_TCP ||
1406                cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1407               (pkts % sysctl_ip_vs_sync_threshold[1]
1408                == sysctl_ip_vs_sync_threshold[0])) ||
1409              ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1410               ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1411                (cp->state == IP_VS_TCP_S_CLOSE) ||
1412                (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1413                (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1414                 ip_vs_sync_conn(cp);
1415 out:
1416         cp->old_state = cp->state;
1417
1418         ip_vs_conn_put(cp);
1419         return ret;
1420 }
1421
1422
1423 /*
1424  *      It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
1425  *      related packets destined for 0.0.0.0/0.
1426  *      When fwmark-based virtual service is used, such as transparent
1427  *      cache cluster, TCP packets can be marked and routed to ip_vs_in,
1428  *      but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
1429  *      sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
1430  *      and send them to ip_vs_in_icmp.
1431  */
1432 static unsigned int
1433 ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
1434                    const struct net_device *in, const struct net_device *out,
1435                    int (*okfn)(struct sk_buff *))
1436 {
1437         int r;
1438
1439         if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
1440                 return NF_ACCEPT;
1441
1442         return ip_vs_in_icmp(skb, &r, hooknum);
1443 }
1444
1445 #ifdef CONFIG_IP_VS_IPV6
1446 static unsigned int
1447 ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb,
1448                       const struct net_device *in, const struct net_device *out,
1449                       int (*okfn)(struct sk_buff *))
1450 {
1451         int r;
1452
1453         if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6)
1454                 return NF_ACCEPT;
1455
1456         return ip_vs_in_icmp_v6(skb, &r, hooknum);
1457 }
1458 #endif
1459
1460
1461 static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1462         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1463          * or VS/NAT(change destination), so that filtering rules can be
1464          * applied to IPVS. */
1465         {
1466                 .hook           = ip_vs_in,
1467                 .owner          = THIS_MODULE,
1468                 .pf             = PF_INET,
1469                 .hooknum        = NF_INET_LOCAL_IN,
1470                 .priority       = 100,
1471         },
1472         /* After packet filtering, change source only for VS/NAT */
1473         {
1474                 .hook           = ip_vs_out,
1475                 .owner          = THIS_MODULE,
1476                 .pf             = PF_INET,
1477                 .hooknum        = NF_INET_FORWARD,
1478                 .priority       = 100,
1479         },
1480         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1481          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1482         {
1483                 .hook           = ip_vs_forward_icmp,
1484                 .owner          = THIS_MODULE,
1485                 .pf             = PF_INET,
1486                 .hooknum        = NF_INET_FORWARD,
1487                 .priority       = 99,
1488         },
1489         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1490         {
1491                 .hook           = ip_vs_post_routing,
1492                 .owner          = THIS_MODULE,
1493                 .pf             = PF_INET,
1494                 .hooknum        = NF_INET_POST_ROUTING,
1495                 .priority       = NF_IP_PRI_NAT_SRC-1,
1496         },
1497 #ifdef CONFIG_IP_VS_IPV6
1498         /* After packet filtering, forward packet through VS/DR, VS/TUN,
1499          * or VS/NAT(change destination), so that filtering rules can be
1500          * applied to IPVS. */
1501         {
1502                 .hook           = ip_vs_in,
1503                 .owner          = THIS_MODULE,
1504                 .pf             = PF_INET6,
1505                 .hooknum        = NF_INET_LOCAL_IN,
1506                 .priority       = 100,
1507         },
1508         /* After packet filtering, change source only for VS/NAT */
1509         {
1510                 .hook           = ip_vs_out,
1511                 .owner          = THIS_MODULE,
1512                 .pf             = PF_INET6,
1513                 .hooknum        = NF_INET_FORWARD,
1514                 .priority       = 100,
1515         },
1516         /* After packet filtering (but before ip_vs_out_icmp), catch icmp
1517          * destined for 0.0.0.0/0, which is for incoming IPVS connections */
1518         {
1519                 .hook           = ip_vs_forward_icmp_v6,
1520                 .owner          = THIS_MODULE,
1521                 .pf             = PF_INET6,
1522                 .hooknum        = NF_INET_FORWARD,
1523                 .priority       = 99,
1524         },
1525         /* Before the netfilter connection tracking, exit from POST_ROUTING */
1526         {
1527                 .hook           = ip_vs_post_routing,
1528                 .owner          = THIS_MODULE,
1529                 .pf             = PF_INET6,
1530                 .hooknum        = NF_INET_POST_ROUTING,
1531                 .priority       = NF_IP6_PRI_NAT_SRC-1,
1532         },
1533 #endif
1534 };
1535
1536
1537 /*
1538  *      Initialize IP Virtual Server
1539  */
1540 static int __init ip_vs_init(void)
1541 {
1542         int ret;
1543
1544         ip_vs_estimator_init();
1545
1546         ret = ip_vs_control_init();
1547         if (ret < 0) {
1548                 pr_err("can't setup control.\n");
1549                 goto cleanup_estimator;
1550         }
1551
1552         ip_vs_protocol_init();
1553
1554         ret = ip_vs_app_init();
1555         if (ret < 0) {
1556                 pr_err("can't setup application helper.\n");
1557                 goto cleanup_protocol;
1558         }
1559
1560         ret = ip_vs_conn_init();
1561         if (ret < 0) {
1562                 pr_err("can't setup connection table.\n");
1563                 goto cleanup_app;
1564         }
1565
1566         ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1567         if (ret < 0) {
1568                 pr_err("can't register hooks.\n");
1569                 goto cleanup_conn;
1570         }
1571
1572         pr_info("ipvs loaded.\n");
1573         return ret;
1574
1575   cleanup_conn:
1576         ip_vs_conn_cleanup();
1577   cleanup_app:
1578         ip_vs_app_cleanup();
1579   cleanup_protocol:
1580         ip_vs_protocol_cleanup();
1581         ip_vs_control_cleanup();
1582   cleanup_estimator:
1583         ip_vs_estimator_cleanup();
1584         return ret;
1585 }
1586
1587 static void __exit ip_vs_cleanup(void)
1588 {
1589         nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1590         ip_vs_conn_cleanup();
1591         ip_vs_app_cleanup();
1592         ip_vs_protocol_cleanup();
1593         ip_vs_control_cleanup();
1594         ip_vs_estimator_cleanup();
1595         pr_info("ipvs unloaded.\n");
1596 }
1597
1598 module_init(ip_vs_init);
1599 module_exit(ip_vs_cleanup);
1600 MODULE_LICENSE("GPL");