]> Pileus Git - ~andy/linux/blob - net/netfilter/ipvs/ip_vs_ctl.c
Merge branch 'for-linus' of git://git.linaro.org/people/rmk/linux-arm
[~andy/linux] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
/*
 * Log prefix for this file: pr_fmt() prepends KMSG_COMPONENT and ": " to
 * every format string it is given, i.e. messages start with "IPVS: ".
 * Must stay ahead of the #include lines so the headers see this pr_fmt.
 */
#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/*
 * Mutex for the IPVS sockopts. A sleeping lock is used because
 * [gs]etsockopt may sleep.
 */
static DEFINE_MUTEX(__ip_vs_mutex);

/*
 * Reader/writer lock for the service table: taken for reading by
 * ip_vs_service_get() and for writing (with BHs disabled) by
 * __ip_vs_update_dest().
 */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/* IPVS debug verbosity; starts out at zero. */
static int sysctl_ip_vs_debug_level;

/*
 *      Returns the current value of sysctl_ip_vs_debug_level.
 */
int ip_vs_get_debug_level(void)
{
        const int level = sysctl_ip_vs_debug_level;

        return level;
}
70 #endif
71
72
73 /*  Protos */
74 static void __ip_vs_del_service(struct ip_vs_service *svc);
75
76
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80                                      const struct in6_addr *addr)
81 {
82         struct flowi6 fl6 = {
83                 .daddr = *addr,
84         };
85         struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
86         bool is_local;
87
88         is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
89
90         dst_release(dst);
91         return is_local;
92 }
93 #endif
94
95 #ifdef CONFIG_SYSCTL
96 /*
97  *      update_defense_level is called from keventd and from sysctl,
98  *      so it needs to protect itself from softirqs
99  */
100 static void update_defense_level(struct netns_ipvs *ipvs)
101 {
102         struct sysinfo i;
103         static int old_secure_tcp = 0;
104         int availmem;
105         int nomem;
106         int to_change = -1;
107
108         /* we only count free and buffered memory (in pages) */
109         si_meminfo(&i);
110         availmem = i.freeram + i.bufferram;
111         /* however in linux 2.5 the i.bufferram is total page cache size,
112            we need adjust it */
113         /* si_swapinfo(&i); */
114         /* availmem = availmem - (i.totalswap - i.freeswap); */
115
116         nomem = (availmem < ipvs->sysctl_amemthresh);
117
118         local_bh_disable();
119
120         /* drop_entry */
121         spin_lock(&ipvs->dropentry_lock);
122         switch (ipvs->sysctl_drop_entry) {
123         case 0:
124                 atomic_set(&ipvs->dropentry, 0);
125                 break;
126         case 1:
127                 if (nomem) {
128                         atomic_set(&ipvs->dropentry, 1);
129                         ipvs->sysctl_drop_entry = 2;
130                 } else {
131                         atomic_set(&ipvs->dropentry, 0);
132                 }
133                 break;
134         case 2:
135                 if (nomem) {
136                         atomic_set(&ipvs->dropentry, 1);
137                 } else {
138                         atomic_set(&ipvs->dropentry, 0);
139                         ipvs->sysctl_drop_entry = 1;
140                 };
141                 break;
142         case 3:
143                 atomic_set(&ipvs->dropentry, 1);
144                 break;
145         }
146         spin_unlock(&ipvs->dropentry_lock);
147
148         /* drop_packet */
149         spin_lock(&ipvs->droppacket_lock);
150         switch (ipvs->sysctl_drop_packet) {
151         case 0:
152                 ipvs->drop_rate = 0;
153                 break;
154         case 1:
155                 if (nomem) {
156                         ipvs->drop_rate = ipvs->drop_counter
157                                 = ipvs->sysctl_amemthresh /
158                                 (ipvs->sysctl_amemthresh-availmem);
159                         ipvs->sysctl_drop_packet = 2;
160                 } else {
161                         ipvs->drop_rate = 0;
162                 }
163                 break;
164         case 2:
165                 if (nomem) {
166                         ipvs->drop_rate = ipvs->drop_counter
167                                 = ipvs->sysctl_amemthresh /
168                                 (ipvs->sysctl_amemthresh-availmem);
169                 } else {
170                         ipvs->drop_rate = 0;
171                         ipvs->sysctl_drop_packet = 1;
172                 }
173                 break;
174         case 3:
175                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
176                 break;
177         }
178         spin_unlock(&ipvs->droppacket_lock);
179
180         /* secure_tcp */
181         spin_lock(&ipvs->securetcp_lock);
182         switch (ipvs->sysctl_secure_tcp) {
183         case 0:
184                 if (old_secure_tcp >= 2)
185                         to_change = 0;
186                 break;
187         case 1:
188                 if (nomem) {
189                         if (old_secure_tcp < 2)
190                                 to_change = 1;
191                         ipvs->sysctl_secure_tcp = 2;
192                 } else {
193                         if (old_secure_tcp >= 2)
194                                 to_change = 0;
195                 }
196                 break;
197         case 2:
198                 if (nomem) {
199                         if (old_secure_tcp < 2)
200                                 to_change = 1;
201                 } else {
202                         if (old_secure_tcp >= 2)
203                                 to_change = 0;
204                         ipvs->sysctl_secure_tcp = 1;
205                 }
206                 break;
207         case 3:
208                 if (old_secure_tcp < 2)
209                         to_change = 1;
210                 break;
211         }
212         old_secure_tcp = ipvs->sysctl_secure_tcp;
213         if (to_change >= 0)
214                 ip_vs_protocol_timeout_change(ipvs,
215                                               ipvs->sysctl_secure_tcp > 1);
216         spin_unlock(&ipvs->securetcp_lock);
217
218         local_bh_enable();
219 }
220
221
222 /*
223  *      Timer for checking the defense
224  */
225 #define DEFENSE_TIMER_PERIOD    1*HZ
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         struct netns_ipvs *ipvs =
230                 container_of(work, struct netns_ipvs, defense_work.work);
231
232         update_defense_level(ipvs);
233         if (atomic_read(&ipvs->dropentry))
234                 ip_vs_random_dropentry(ipvs->net);
235         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
236 }
237 #endif
238
239 int
240 ip_vs_use_count_inc(void)
241 {
242         return try_module_get(THIS_MODULE);
243 }
244
245 void
246 ip_vs_use_count_dec(void)
247 {
248         module_put(THIS_MODULE);
249 }
250
251
/*
 *      Hash table: for virtual service lookups
 *
 *      Both tables below have IP_VS_SVC_TAB_SIZE (1 << 8 = 256) buckets;
 *      IP_VS_SVC_TAB_MASK reduces a hash value to a bucket index.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port>; entries link via s_list */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark; entries link via f_list */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
263
264
265 /*
266  *      Returns hash value for virtual service
267  */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270                   const union nf_inet_addr *addr, __be16 port)
271 {
272         register unsigned int porth = ntohs(port);
273         __be32 addr_fold = addr->ip;
274
275 #ifdef CONFIG_IP_VS_IPV6
276         if (af == AF_INET6)
277                 addr_fold = addr->ip6[0]^addr->ip6[1]^
278                             addr->ip6[2]^addr->ip6[3];
279 #endif
280         addr_fold ^= ((size_t)net>>8);
281
282         return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
283                 & IP_VS_SVC_TAB_MASK;
284 }
285
286 /*
287  *      Returns hash value of fwmark for virtual service lookup
288  */
289 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
290 {
291         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
292 }
293
294 /*
295  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
296  *      or in the ip_vs_svc_fwm_table by fwmark.
297  *      Should be called with locked tables.
298  */
299 static int ip_vs_svc_hash(struct ip_vs_service *svc)
300 {
301         unsigned int hash;
302
303         if (svc->flags & IP_VS_SVC_F_HASHED) {
304                 pr_err("%s(): request for already hashed, called from %pF\n",
305                        __func__, __builtin_return_address(0));
306                 return 0;
307         }
308
309         if (svc->fwmark == 0) {
310                 /*
311                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
312                  */
313                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
314                                          &svc->addr, svc->port);
315                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
316         } else {
317                 /*
318                  *  Hash it by fwmark in svc_fwm_table
319                  */
320                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
321                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
322         }
323
324         svc->flags |= IP_VS_SVC_F_HASHED;
325         /* increase its refcnt because it is referenced by the svc table */
326         atomic_inc(&svc->refcnt);
327         return 1;
328 }
329
330
331 /*
332  *      Unhashes a service from svc_table / svc_fwm_table.
333  *      Should be called with locked tables.
334  */
335 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
336 {
337         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
338                 pr_err("%s(): request for unhash flagged, called from %pF\n",
339                        __func__, __builtin_return_address(0));
340                 return 0;
341         }
342
343         if (svc->fwmark == 0) {
344                 /* Remove it from the svc_table table */
345                 list_del(&svc->s_list);
346         } else {
347                 /* Remove it from the svc_fwm_table table */
348                 list_del(&svc->f_list);
349         }
350
351         svc->flags &= ~IP_VS_SVC_F_HASHED;
352         atomic_dec(&svc->refcnt);
353         return 1;
354 }
355
356
357 /*
358  *      Get service by {netns, proto,addr,port} in the service table.
359  */
360 static inline struct ip_vs_service *
361 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
362                      const union nf_inet_addr *vaddr, __be16 vport)
363 {
364         unsigned int hash;
365         struct ip_vs_service *svc;
366
367         /* Check for "full" addressed entries */
368         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
369
370         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
371                 if ((svc->af == af)
372                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
373                     && (svc->port == vport)
374                     && (svc->protocol == protocol)
375                     && net_eq(svc->net, net)) {
376                         /* HIT */
377                         return svc;
378                 }
379         }
380
381         return NULL;
382 }
383
384
385 /*
386  *      Get service by {fwmark} in the service table.
387  */
388 static inline struct ip_vs_service *
389 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
390 {
391         unsigned int hash;
392         struct ip_vs_service *svc;
393
394         /* Check for fwmark addressed entries */
395         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
396
397         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
398                 if (svc->fwmark == fwmark && svc->af == af
399                     && net_eq(svc->net, net)) {
400                         /* HIT */
401                         return svc;
402                 }
403         }
404
405         return NULL;
406 }
407
408 struct ip_vs_service *
409 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
410                   const union nf_inet_addr *vaddr, __be16 vport)
411 {
412         struct ip_vs_service *svc;
413         struct netns_ipvs *ipvs = net_ipvs(net);
414
415         read_lock(&__ip_vs_svc_lock);
416
417         /*
418          *      Check the table hashed by fwmark first
419          */
420         if (fwmark) {
421                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
422                 if (svc)
423                         goto out;
424         }
425
426         /*
427          *      Check the table hashed by <protocol,addr,port>
428          *      for "full" addressed entries
429          */
430         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
431
432         if (svc == NULL
433             && protocol == IPPROTO_TCP
434             && atomic_read(&ipvs->ftpsvc_counter)
435             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
436                 /*
437                  * Check if ftp service entry exists, the packet
438                  * might belong to FTP data connections.
439                  */
440                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
441         }
442
443         if (svc == NULL
444             && atomic_read(&ipvs->nullsvc_counter)) {
445                 /*
446                  * Check if the catch-all port (port zero) exists
447                  */
448                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
449         }
450
451   out:
452         if (svc)
453                 atomic_inc(&svc->usecnt);
454         read_unlock(&__ip_vs_svc_lock);
455
456         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
457                       fwmark, ip_vs_proto_name(protocol),
458                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
459                       svc ? "hit" : "not hit");
460
461         return svc;
462 }
463
464
465 static inline void
466 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
467 {
468         atomic_inc(&svc->refcnt);
469         dest->svc = svc;
470 }
471
472 static void
473 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
474 {
475         struct ip_vs_service *svc = dest->svc;
476
477         dest->svc = NULL;
478         if (atomic_dec_and_test(&svc->refcnt)) {
479                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
480                               svc->fwmark,
481                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
482                               ntohs(svc->port), atomic_read(&svc->usecnt));
483                 free_percpu(svc->stats.cpustats);
484                 kfree(svc);
485         }
486 }
487
488
489 /*
490  *      Returns hash value for real service
491  */
492 static inline unsigned int ip_vs_rs_hashkey(int af,
493                                             const union nf_inet_addr *addr,
494                                             __be16 port)
495 {
496         register unsigned int porth = ntohs(port);
497         __be32 addr_fold = addr->ip;
498
499 #ifdef CONFIG_IP_VS_IPV6
500         if (af == AF_INET6)
501                 addr_fold = addr->ip6[0]^addr->ip6[1]^
502                             addr->ip6[2]^addr->ip6[3];
503 #endif
504
505         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
506                 & IP_VS_RTAB_MASK;
507 }
508
509 /*
510  *      Hashes ip_vs_dest in rs_table by <proto,addr,port>.
511  *      should be called with locked tables.
512  */
513 static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
514 {
515         unsigned int hash;
516
517         if (!list_empty(&dest->d_list)) {
518                 return 0;
519         }
520
521         /*
522          *      Hash by proto,addr,port,
523          *      which are the parameters of the real service.
524          */
525         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
526
527         list_add(&dest->d_list, &ipvs->rs_table[hash]);
528
529         return 1;
530 }
531
532 /*
533  *      UNhashes ip_vs_dest from rs_table.
534  *      should be called with locked tables.
535  */
536 static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
537 {
538         /*
539          * Remove it from the rs_table table.
540          */
541         if (!list_empty(&dest->d_list)) {
542                 list_del_init(&dest->d_list);
543         }
544
545         return 1;
546 }
547
548 /*
549  *      Lookup real service by <proto,addr,port> in the real service table.
550  */
551 struct ip_vs_dest *
552 ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
553                           const union nf_inet_addr *daddr,
554                           __be16 dport)
555 {
556         struct netns_ipvs *ipvs = net_ipvs(net);
557         unsigned int hash;
558         struct ip_vs_dest *dest;
559
560         /*
561          *      Check for "full" addressed entries
562          *      Return the first found entry
563          */
564         hash = ip_vs_rs_hashkey(af, daddr, dport);
565
566         read_lock(&ipvs->rs_lock);
567         list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
568                 if ((dest->af == af)
569                     && ip_vs_addr_equal(af, &dest->addr, daddr)
570                     && (dest->port == dport)
571                     && ((dest->protocol == protocol) ||
572                         dest->vfwmark)) {
573                         /* HIT */
574                         read_unlock(&ipvs->rs_lock);
575                         return dest;
576                 }
577         }
578         read_unlock(&ipvs->rs_lock);
579
580         return NULL;
581 }
582
583 /*
584  *      Lookup destination by {addr,port} in the given service
585  */
586 static struct ip_vs_dest *
587 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
588                   __be16 dport)
589 {
590         struct ip_vs_dest *dest;
591
592         /*
593          * Find the destination for the given service
594          */
595         list_for_each_entry(dest, &svc->destinations, n_list) {
596                 if ((dest->af == svc->af)
597                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
598                     && (dest->port == dport)) {
599                         /* HIT */
600                         return dest;
601                 }
602         }
603
604         return NULL;
605 }
606
607 /*
608  * Find destination by {daddr,dport,vaddr,protocol}
609  * Cretaed to be used in ip_vs_process_message() in
610  * the backup synchronization daemon. It finds the
611  * destination to be bound to the received connection
612  * on the backup.
613  *
614  * ip_vs_lookup_real_service() looked promissing, but
615  * seems not working as expected.
616  */
617 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
618                                    const union nf_inet_addr *daddr,
619                                    __be16 dport,
620                                    const union nf_inet_addr *vaddr,
621                                    __be16 vport, __u16 protocol, __u32 fwmark,
622                                    __u32 flags)
623 {
624         struct ip_vs_dest *dest;
625         struct ip_vs_service *svc;
626         __be16 port = dport;
627
628         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
629         if (!svc)
630                 return NULL;
631         if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
632                 port = 0;
633         dest = ip_vs_lookup_dest(svc, daddr, port);
634         if (!dest)
635                 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
636         if (dest)
637                 atomic_inc(&dest->refcnt);
638         ip_vs_service_put(svc);
639         return dest;
640 }
641
642 /*
643  *  Lookup dest by {svc,addr,port} in the destination trash.
644  *  The destination trash is used to hold the destinations that are removed
645  *  from the service table but are still referenced by some conn entries.
646  *  The reason to add the destination trash is when the dest is temporary
647  *  down (either by administrator or by monitor program), the dest can be
648  *  picked back from the trash, the remaining connections to the dest can
649  *  continue, and the counting information of the dest is also useful for
650  *  scheduling.
651  */
652 static struct ip_vs_dest *
653 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
654                      __be16 dport)
655 {
656         struct ip_vs_dest *dest, *nxt;
657         struct netns_ipvs *ipvs = net_ipvs(svc->net);
658
659         /*
660          * Find the destination in trash
661          */
662         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
663                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
664                               "dest->refcnt=%d\n",
665                               dest->vfwmark,
666                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
667                               ntohs(dest->port),
668                               atomic_read(&dest->refcnt));
669                 if (dest->af == svc->af &&
670                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
671                     dest->port == dport &&
672                     dest->vfwmark == svc->fwmark &&
673                     dest->protocol == svc->protocol &&
674                     (svc->fwmark ||
675                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
676                       dest->vport == svc->port))) {
677                         /* HIT */
678                         return dest;
679                 }
680
681                 /*
682                  * Try to purge the destination from trash if not referenced
683                  */
684                 if (atomic_read(&dest->refcnt) == 1) {
685                         IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u "
686                                       "from trash\n",
687                                       dest->vfwmark,
688                                       IP_VS_DBG_ADDR(svc->af, &dest->addr),
689                                       ntohs(dest->port));
690                         list_del(&dest->n_list);
691                         ip_vs_dst_reset(dest);
692                         __ip_vs_unbind_svc(dest);
693                         free_percpu(dest->stats.cpustats);
694                         kfree(dest);
695                 }
696         }
697
698         return NULL;
699 }
700
701
702 /*
703  *  Clean up all the destinations in the trash
704  *  Called by the ip_vs_control_cleanup()
705  *
706  *  When the ip_vs_control_clearup is activated by ipvs module exit,
707  *  the service tables must have been flushed and all the connections
708  *  are expired, and the refcnt of each destination in the trash must
709  *  be 1, so we simply release them here.
710  */
711 static void ip_vs_trash_cleanup(struct net *net)
712 {
713         struct ip_vs_dest *dest, *nxt;
714         struct netns_ipvs *ipvs = net_ipvs(net);
715
716         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
717                 list_del(&dest->n_list);
718                 ip_vs_dst_reset(dest);
719                 __ip_vs_unbind_svc(dest);
720                 free_percpu(dest->stats.cpustats);
721                 kfree(dest);
722         }
723 }
724
/*
 *      Export the statistics in @src to the user-visible layout @dst.
 *
 *      Each counter is reported relative to its zero point
 *      (ustats minus ustats0).  The whole copy, including the
 *      ip_vs_read_estimator() call, runs under src->lock with BHs
 *      disabled.
 */
static void
ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
{
/* dst->c = counter value accumulated since the zero point */
#define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c

        spin_lock_bh(&src->lock);

        IP_VS_SHOW_STATS_COUNTER(conns);
        IP_VS_SHOW_STATS_COUNTER(inpkts);
        IP_VS_SHOW_STATS_COUNTER(outpkts);
        IP_VS_SHOW_STATS_COUNTER(inbytes);
        IP_VS_SHOW_STATS_COUNTER(outbytes);

        /* presumably fills in the rate fields of dst - see ip_vs_read_estimator() */
        ip_vs_read_estimator(dst, src);

        spin_unlock_bh(&src->lock);
}
742
/*
 *      "Zero" the statistics in @stats without touching the running
 *      counters: the current counter values are saved as the new zero
 *      point (ustats0) and the estimator is reset through
 *      ip_vs_zero_estimator().  Runs under stats->lock with BHs disabled.
 */
static void
ip_vs_zero_stats(struct ip_vs_stats *stats)
{
        spin_lock_bh(&stats->lock);

        /* get current counters as zero point, rates are zeroed */

/* remember the current value of counter c as its zero point */
#define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c

        IP_VS_ZERO_STATS_COUNTER(conns);
        IP_VS_ZERO_STATS_COUNTER(inpkts);
        IP_VS_ZERO_STATS_COUNTER(outpkts);
        IP_VS_ZERO_STATS_COUNTER(inbytes);
        IP_VS_ZERO_STATS_COUNTER(outbytes);

        ip_vs_zero_estimator(stats);

        spin_unlock_bh(&stats->lock);
}
762
763 /*
764  *      Update a destination in the given service
765  */
766 static void
767 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
768                     struct ip_vs_dest_user_kern *udest, int add)
769 {
770         struct netns_ipvs *ipvs = net_ipvs(svc->net);
771         int conn_flags;
772
773         /* set the weight and the flags */
774         atomic_set(&dest->weight, udest->weight);
775         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
776         conn_flags |= IP_VS_CONN_F_INACTIVE;
777
778         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
779         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
780                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
781         } else {
782                 /*
783                  *    Put the real service in rs_table if not present.
784                  *    For now only for NAT!
785                  */
786                 write_lock_bh(&ipvs->rs_lock);
787                 ip_vs_rs_hash(ipvs, dest);
788                 write_unlock_bh(&ipvs->rs_lock);
789         }
790         atomic_set(&dest->conn_flags, conn_flags);
791
792         /* bind the service */
793         if (!dest->svc) {
794                 __ip_vs_bind_svc(dest, svc);
795         } else {
796                 if (dest->svc != svc) {
797                         __ip_vs_unbind_svc(dest);
798                         ip_vs_zero_stats(&dest->stats);
799                         __ip_vs_bind_svc(dest, svc);
800                 }
801         }
802
803         /* set the dest status flags */
804         dest->flags |= IP_VS_DEST_F_AVAILABLE;
805
806         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
807                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
808         dest->u_threshold = udest->u_threshold;
809         dest->l_threshold = udest->l_threshold;
810
811         spin_lock_bh(&dest->dst_lock);
812         ip_vs_dst_reset(dest);
813         spin_unlock_bh(&dest->dst_lock);
814
815         if (add)
816                 ip_vs_start_estimator(svc->net, &dest->stats);
817
818         write_lock_bh(&__ip_vs_svc_lock);
819
820         /* Wait until all other svc users go away */
821         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
822
823         if (add) {
824                 list_add(&dest->n_list, &svc->destinations);
825                 svc->num_dests++;
826         }
827
828         /* call the update_service, because server weight may be changed */
829         if (svc->scheduler->update_service)
830                 svc->scheduler->update_service(svc);
831
832         write_unlock_bh(&__ip_vs_svc_lock);
833 }
834
835
836 /*
837  *      Create a destination for the given service
838  */
839 static int
840 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
841                struct ip_vs_dest **dest_p)
842 {
843         struct ip_vs_dest *dest;
844         unsigned int atype;
845
846         EnterFunction(2);
847
848 #ifdef CONFIG_IP_VS_IPV6
849         if (svc->af == AF_INET6) {
850                 atype = ipv6_addr_type(&udest->addr.in6);
851                 if ((!(atype & IPV6_ADDR_UNICAST) ||
852                         atype & IPV6_ADDR_LINKLOCAL) &&
853                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
854                         return -EINVAL;
855         } else
856 #endif
857         {
858                 atype = inet_addr_type(svc->net, udest->addr.ip);
859                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
860                         return -EINVAL;
861         }
862
863         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
864         if (dest == NULL)
865                 return -ENOMEM;
866
867         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
868         if (!dest->stats.cpustats)
869                 goto err_alloc;
870
871         dest->af = svc->af;
872         dest->protocol = svc->protocol;
873         dest->vaddr = svc->addr;
874         dest->vport = svc->port;
875         dest->vfwmark = svc->fwmark;
876         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
877         dest->port = udest->port;
878
879         atomic_set(&dest->activeconns, 0);
880         atomic_set(&dest->inactconns, 0);
881         atomic_set(&dest->persistconns, 0);
882         atomic_set(&dest->refcnt, 1);
883
884         INIT_LIST_HEAD(&dest->d_list);
885         spin_lock_init(&dest->dst_lock);
886         spin_lock_init(&dest->stats.lock);
887         __ip_vs_update_dest(svc, dest, udest, 1);
888
889         *dest_p = dest;
890
891         LeaveFunction(2);
892         return 0;
893
894 err_alloc:
895         kfree(dest);
896         return -ENOMEM;
897 }
898
899
900 /*
901  *      Add a destination into an existing service
902  */
903 static int
904 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
905 {
906         struct ip_vs_dest *dest;
907         union nf_inet_addr daddr;
908         __be16 dport = udest->port;
909         int ret;
910
911         EnterFunction(2);
912
913         if (udest->weight < 0) {
914                 pr_err("%s(): server weight less than zero\n", __func__);
915                 return -ERANGE;
916         }
917
918         if (udest->l_threshold > udest->u_threshold) {
919                 pr_err("%s(): lower threshold is higher than upper threshold\n",
920                         __func__);
921                 return -ERANGE;
922         }
923
924         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
925
926         /*
927          * Check if the dest already exists in the list
928          */
929         dest = ip_vs_lookup_dest(svc, &daddr, dport);
930
931         if (dest != NULL) {
932                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
933                 return -EEXIST;
934         }
935
936         /*
937          * Check if the dest already exists in the trash and
938          * is from the same service
939          */
940         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
941
942         if (dest != NULL) {
943                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
944                               "dest->refcnt=%d, service %u/%s:%u\n",
945                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
946                               atomic_read(&dest->refcnt),
947                               dest->vfwmark,
948                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
949                               ntohs(dest->vport));
950
951                 /*
952                  * Get the destination from the trash
953                  */
954                 list_del(&dest->n_list);
955
956                 __ip_vs_update_dest(svc, dest, udest, 1);
957                 ret = 0;
958         } else {
959                 /*
960                  * Allocate and initialize the dest structure
961                  */
962                 ret = ip_vs_new_dest(svc, udest, &dest);
963         }
964         LeaveFunction(2);
965
966         return ret;
967 }
968
969
970 /*
971  *      Edit a destination in the given service
972  */
973 static int
974 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
975 {
976         struct ip_vs_dest *dest;
977         union nf_inet_addr daddr;
978         __be16 dport = udest->port;
979
980         EnterFunction(2);
981
982         if (udest->weight < 0) {
983                 pr_err("%s(): server weight less than zero\n", __func__);
984                 return -ERANGE;
985         }
986
987         if (udest->l_threshold > udest->u_threshold) {
988                 pr_err("%s(): lower threshold is higher than upper threshold\n",
989                         __func__);
990                 return -ERANGE;
991         }
992
993         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
994
995         /*
996          *  Lookup the destination list
997          */
998         dest = ip_vs_lookup_dest(svc, &daddr, dport);
999
1000         if (dest == NULL) {
1001                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1002                 return -ENOENT;
1003         }
1004
1005         __ip_vs_update_dest(svc, dest, udest, 0);
1006         LeaveFunction(2);
1007
1008         return 0;
1009 }
1010
1011
1012 /*
1013  *      Delete a destination (must be already unlinked from the service)
1014  */
1015 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1016 {
1017         struct netns_ipvs *ipvs = net_ipvs(net);
1018
1019         ip_vs_stop_estimator(net, &dest->stats);
1020
1021         /*
1022          *  Remove it from the d-linked list with the real services.
1023          */
1024         write_lock_bh(&ipvs->rs_lock);
1025         ip_vs_rs_unhash(dest);
1026         write_unlock_bh(&ipvs->rs_lock);
1027
1028         /*
1029          *  Decrease the refcnt of the dest, and free the dest
1030          *  if nobody refers to it (refcnt=0). Otherwise, throw
1031          *  the destination into the trash.
1032          */
1033         if (atomic_dec_and_test(&dest->refcnt)) {
1034                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n",
1035                               dest->vfwmark,
1036                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1037                               ntohs(dest->port));
1038                 ip_vs_dst_reset(dest);
1039                 /* simply decrease svc->refcnt here, let the caller check
1040                    and release the service if nobody refers to it.
1041                    Only user context can release destination and service,
1042                    and only one user context can update virtual service at a
1043                    time, so the operation here is OK */
1044                 atomic_dec(&dest->svc->refcnt);
1045                 free_percpu(dest->stats.cpustats);
1046                 kfree(dest);
1047         } else {
1048                 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
1049                               "dest->refcnt=%d\n",
1050                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1051                               ntohs(dest->port),
1052                               atomic_read(&dest->refcnt));
1053                 list_add(&dest->n_list, &ipvs->dest_trash);
1054                 atomic_inc(&dest->refcnt);
1055         }
1056 }
1057
1058
1059 /*
1060  *      Unlink a destination from the given service
1061  */
1062 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1063                                 struct ip_vs_dest *dest,
1064                                 int svcupd)
1065 {
1066         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1067
1068         /*
1069          *  Remove it from the d-linked destination list.
1070          */
1071         list_del(&dest->n_list);
1072         svc->num_dests--;
1073
1074         /*
1075          *  Call the update_service function of its scheduler
1076          */
1077         if (svcupd && svc->scheduler->update_service)
1078                         svc->scheduler->update_service(svc);
1079 }
1080
1081
1082 /*
1083  *      Delete a destination server in the given service
1084  */
1085 static int
1086 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1087 {
1088         struct ip_vs_dest *dest;
1089         __be16 dport = udest->port;
1090
1091         EnterFunction(2);
1092
1093         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1094
1095         if (dest == NULL) {
1096                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1097                 return -ENOENT;
1098         }
1099
1100         write_lock_bh(&__ip_vs_svc_lock);
1101
1102         /*
1103          *      Wait until all other svc users go away.
1104          */
1105         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1106
1107         /*
1108          *      Unlink dest from the service
1109          */
1110         __ip_vs_unlink_dest(svc, dest, 1);
1111
1112         write_unlock_bh(&__ip_vs_svc_lock);
1113
1114         /*
1115          *      Delete the destination
1116          */
1117         __ip_vs_del_dest(svc->net, dest);
1118
1119         LeaveFunction(2);
1120
1121         return 0;
1122 }
1123
1124
1125 /*
1126  *      Add a service into the service hash table
1127  */
1128 static int
1129 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1130                   struct ip_vs_service **svc_p)
1131 {
1132         int ret = 0;
1133         struct ip_vs_scheduler *sched = NULL;
1134         struct ip_vs_pe *pe = NULL;
1135         struct ip_vs_service *svc = NULL;
1136         struct netns_ipvs *ipvs = net_ipvs(net);
1137
1138         /* increase the module use count */
1139         ip_vs_use_count_inc();
1140
1141         /* Lookup the scheduler by 'u->sched_name' */
1142         sched = ip_vs_scheduler_get(u->sched_name);
1143         if (sched == NULL) {
1144                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1145                 ret = -ENOENT;
1146                 goto out_err;
1147         }
1148
1149         if (u->pe_name && *u->pe_name) {
1150                 pe = ip_vs_pe_getbyname(u->pe_name);
1151                 if (pe == NULL) {
1152                         pr_info("persistence engine module ip_vs_pe_%s "
1153                                 "not found\n", u->pe_name);
1154                         ret = -ENOENT;
1155                         goto out_err;
1156                 }
1157         }
1158
1159 #ifdef CONFIG_IP_VS_IPV6
1160         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1161                 ret = -EINVAL;
1162                 goto out_err;
1163         }
1164 #endif
1165
1166         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1167         if (svc == NULL) {
1168                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1169                 ret = -ENOMEM;
1170                 goto out_err;
1171         }
1172         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1173         if (!svc->stats.cpustats) {
1174                 ret = -ENOMEM;
1175                 goto out_err;
1176         }
1177
1178         /* I'm the first user of the service */
1179         atomic_set(&svc->usecnt, 0);
1180         atomic_set(&svc->refcnt, 0);
1181
1182         svc->af = u->af;
1183         svc->protocol = u->protocol;
1184         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1185         svc->port = u->port;
1186         svc->fwmark = u->fwmark;
1187         svc->flags = u->flags;
1188         svc->timeout = u->timeout * HZ;
1189         svc->netmask = u->netmask;
1190         svc->net = net;
1191
1192         INIT_LIST_HEAD(&svc->destinations);
1193         rwlock_init(&svc->sched_lock);
1194         spin_lock_init(&svc->stats.lock);
1195
1196         /* Bind the scheduler */
1197         ret = ip_vs_bind_scheduler(svc, sched);
1198         if (ret)
1199                 goto out_err;
1200         sched = NULL;
1201
1202         /* Bind the ct retriever */
1203         ip_vs_bind_pe(svc, pe);
1204         pe = NULL;
1205
1206         /* Update the virtual service counters */
1207         if (svc->port == FTPPORT)
1208                 atomic_inc(&ipvs->ftpsvc_counter);
1209         else if (svc->port == 0)
1210                 atomic_inc(&ipvs->nullsvc_counter);
1211
1212         ip_vs_start_estimator(net, &svc->stats);
1213
1214         /* Count only IPv4 services for old get/setsockopt interface */
1215         if (svc->af == AF_INET)
1216                 ipvs->num_services++;
1217
1218         /* Hash the service into the service table */
1219         write_lock_bh(&__ip_vs_svc_lock);
1220         ip_vs_svc_hash(svc);
1221         write_unlock_bh(&__ip_vs_svc_lock);
1222
1223         *svc_p = svc;
1224         /* Now there is a service - full throttle */
1225         ipvs->enable = 1;
1226         return 0;
1227
1228
1229  out_err:
1230         if (svc != NULL) {
1231                 ip_vs_unbind_scheduler(svc);
1232                 if (svc->inc) {
1233                         local_bh_disable();
1234                         ip_vs_app_inc_put(svc->inc);
1235                         local_bh_enable();
1236                 }
1237                 if (svc->stats.cpustats)
1238                         free_percpu(svc->stats.cpustats);
1239                 kfree(svc);
1240         }
1241         ip_vs_scheduler_put(sched);
1242         ip_vs_pe_put(pe);
1243
1244         /* decrease the module use count */
1245         ip_vs_use_count_dec();
1246
1247         return ret;
1248 }
1249
1250
1251 /*
1252  *      Edit a service and bind it with a new scheduler
1253  */
1254 static int
1255 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1256 {
1257         struct ip_vs_scheduler *sched, *old_sched;
1258         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1259         int ret = 0;
1260
1261         /*
1262          * Lookup the scheduler, by 'u->sched_name'
1263          */
1264         sched = ip_vs_scheduler_get(u->sched_name);
1265         if (sched == NULL) {
1266                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1267                 return -ENOENT;
1268         }
1269         old_sched = sched;
1270
1271         if (u->pe_name && *u->pe_name) {
1272                 pe = ip_vs_pe_getbyname(u->pe_name);
1273                 if (pe == NULL) {
1274                         pr_info("persistence engine module ip_vs_pe_%s "
1275                                 "not found\n", u->pe_name);
1276                         ret = -ENOENT;
1277                         goto out;
1278                 }
1279                 old_pe = pe;
1280         }
1281
1282 #ifdef CONFIG_IP_VS_IPV6
1283         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1284                 ret = -EINVAL;
1285                 goto out;
1286         }
1287 #endif
1288
1289         write_lock_bh(&__ip_vs_svc_lock);
1290
1291         /*
1292          * Wait until all other svc users go away.
1293          */
1294         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1295
1296         /*
1297          * Set the flags and timeout value
1298          */
1299         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1300         svc->timeout = u->timeout * HZ;
1301         svc->netmask = u->netmask;
1302
1303         old_sched = svc->scheduler;
1304         if (sched != old_sched) {
1305                 /*
1306                  * Unbind the old scheduler
1307                  */
1308                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1309                         old_sched = sched;
1310                         goto out_unlock;
1311                 }
1312
1313                 /*
1314                  * Bind the new scheduler
1315                  */
1316                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1317                         /*
1318                          * If ip_vs_bind_scheduler fails, restore the old
1319                          * scheduler.
1320                          * The main reason of failure is out of memory.
1321                          *
1322                          * The question is if the old scheduler can be
1323                          * restored all the time. TODO: if it cannot be
1324                          * restored some time, we must delete the service,
1325                          * otherwise the system may crash.
1326                          */
1327                         ip_vs_bind_scheduler(svc, old_sched);
1328                         old_sched = sched;
1329                         goto out_unlock;
1330                 }
1331         }
1332
1333         old_pe = svc->pe;
1334         if (pe != old_pe) {
1335                 ip_vs_unbind_pe(svc);
1336                 ip_vs_bind_pe(svc, pe);
1337         }
1338
1339 out_unlock:
1340         write_unlock_bh(&__ip_vs_svc_lock);
1341 out:
1342         ip_vs_scheduler_put(old_sched);
1343         ip_vs_pe_put(old_pe);
1344         return ret;
1345 }
1346
1347
1348 /*
1349  *      Delete a service from the service list
1350  *      - The service must be unlinked, unlocked and not referenced!
1351  *      - We are called under _bh lock
1352  */
1353 static void __ip_vs_del_service(struct ip_vs_service *svc)
1354 {
1355         struct ip_vs_dest *dest, *nxt;
1356         struct ip_vs_scheduler *old_sched;
1357         struct ip_vs_pe *old_pe;
1358         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1359
1360         pr_info("%s: enter\n", __func__);
1361
1362         /* Count only IPv4 services for old get/setsockopt interface */
1363         if (svc->af == AF_INET)
1364                 ipvs->num_services--;
1365
1366         ip_vs_stop_estimator(svc->net, &svc->stats);
1367
1368         /* Unbind scheduler */
1369         old_sched = svc->scheduler;
1370         ip_vs_unbind_scheduler(svc);
1371         ip_vs_scheduler_put(old_sched);
1372
1373         /* Unbind persistence engine */
1374         old_pe = svc->pe;
1375         ip_vs_unbind_pe(svc);
1376         ip_vs_pe_put(old_pe);
1377
1378         /* Unbind app inc */
1379         if (svc->inc) {
1380                 ip_vs_app_inc_put(svc->inc);
1381                 svc->inc = NULL;
1382         }
1383
1384         /*
1385          *    Unlink the whole destination list
1386          */
1387         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1388                 __ip_vs_unlink_dest(svc, dest, 0);
1389                 __ip_vs_del_dest(svc->net, dest);
1390         }
1391
1392         /*
1393          *    Update the virtual service counters
1394          */
1395         if (svc->port == FTPPORT)
1396                 atomic_dec(&ipvs->ftpsvc_counter);
1397         else if (svc->port == 0)
1398                 atomic_dec(&ipvs->nullsvc_counter);
1399
1400         /*
1401          *    Free the service if nobody refers to it
1402          */
1403         if (atomic_read(&svc->refcnt) == 0) {
1404                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1405                               svc->fwmark,
1406                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1407                               ntohs(svc->port), atomic_read(&svc->usecnt));
1408                 free_percpu(svc->stats.cpustats);
1409                 kfree(svc);
1410         }
1411
1412         /* decrease the module use count */
1413         ip_vs_use_count_dec();
1414 }
1415
1416 /*
1417  * Unlink a service from list and try to delete it if its refcnt reached 0
1418  */
1419 static void ip_vs_unlink_service(struct ip_vs_service *svc)
1420 {
1421         /*
1422          * Unhash it from the service table
1423          */
1424         write_lock_bh(&__ip_vs_svc_lock);
1425
1426         ip_vs_svc_unhash(svc);
1427
1428         /*
1429          * Wait until all the svc users go away.
1430          */
1431         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1432
1433         __ip_vs_del_service(svc);
1434
1435         write_unlock_bh(&__ip_vs_svc_lock);
1436 }
1437
1438 /*
1439  *      Delete a service from the service list
1440  */
1441 static int ip_vs_del_service(struct ip_vs_service *svc)
1442 {
1443         if (svc == NULL)
1444                 return -EEXIST;
1445         ip_vs_unlink_service(svc);
1446
1447         return 0;
1448 }
1449
1450
1451 /*
1452  *      Flush all the virtual services
1453  */
1454 static int ip_vs_flush(struct net *net)
1455 {
1456         int idx;
1457         struct ip_vs_service *svc, *nxt;
1458
1459         /*
1460          * Flush the service table hashed by <netns,protocol,addr,port>
1461          */
1462         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1463                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1464                                          s_list) {
1465                         if (net_eq(svc->net, net))
1466                                 ip_vs_unlink_service(svc);
1467                 }
1468         }
1469
1470         /*
1471          * Flush the service table hashed by fwmark
1472          */
1473         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1474                 list_for_each_entry_safe(svc, nxt,
1475                                          &ip_vs_svc_fwm_table[idx], f_list) {
1476                         if (net_eq(svc->net, net))
1477                                 ip_vs_unlink_service(svc);
1478                 }
1479         }
1480
1481         return 0;
1482 }
1483
1484 /*
1485  *      Delete service by {netns} in the service table.
1486  *      Called by __ip_vs_cleanup()
1487  */
1488 void ip_vs_service_net_cleanup(struct net *net)
1489 {
1490         EnterFunction(2);
1491         /* Check for "full" addressed entries */
1492         mutex_lock(&__ip_vs_mutex);
1493         ip_vs_flush(net);
1494         mutex_unlock(&__ip_vs_mutex);
1495         LeaveFunction(2);
1496 }
1497 /*
1498  * Release dst hold by dst_cache
1499  */
1500 static inline void
1501 __ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev)
1502 {
1503         spin_lock_bh(&dest->dst_lock);
1504         if (dest->dst_cache && dest->dst_cache->dev == dev) {
1505                 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1506                               dev->name,
1507                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1508                               ntohs(dest->port),
1509                               atomic_read(&dest->refcnt));
1510                 ip_vs_dst_reset(dest);
1511         }
1512         spin_unlock_bh(&dest->dst_lock);
1513
1514 }
1515 /*
1516  * Netdev event receiver
1517  * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to
1518  * a device that is "unregister" it must be released.
1519  */
1520 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1521                             void *ptr)
1522 {
1523         struct net_device *dev = ptr;
1524         struct net *net = dev_net(dev);
1525         struct netns_ipvs *ipvs = net_ipvs(net);
1526         struct ip_vs_service *svc;
1527         struct ip_vs_dest *dest;
1528         unsigned int idx;
1529
1530         if (event != NETDEV_UNREGISTER || !ipvs)
1531                 return NOTIFY_DONE;
1532         IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1533         EnterFunction(2);
1534         mutex_lock(&__ip_vs_mutex);
1535         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1536                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1537                         if (net_eq(svc->net, net)) {
1538                                 list_for_each_entry(dest, &svc->destinations,
1539                                                     n_list) {
1540                                         __ip_vs_dev_reset(dest, dev);
1541                                 }
1542                         }
1543                 }
1544
1545                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1546                         if (net_eq(svc->net, net)) {
1547                                 list_for_each_entry(dest, &svc->destinations,
1548                                                     n_list) {
1549                                         __ip_vs_dev_reset(dest, dev);
1550                                 }
1551                         }
1552
1553                 }
1554         }
1555
1556         list_for_each_entry(dest, &ipvs->dest_trash, n_list) {
1557                 __ip_vs_dev_reset(dest, dev);
1558         }
1559         mutex_unlock(&__ip_vs_mutex);
1560         LeaveFunction(2);
1561         return NOTIFY_DONE;
1562 }
1563
1564 /*
1565  *      Zero counters in a service or all services
1566  */
1567 static int ip_vs_zero_service(struct ip_vs_service *svc)
1568 {
1569         struct ip_vs_dest *dest;
1570
1571         write_lock_bh(&__ip_vs_svc_lock);
1572         list_for_each_entry(dest, &svc->destinations, n_list) {
1573                 ip_vs_zero_stats(&dest->stats);
1574         }
1575         ip_vs_zero_stats(&svc->stats);
1576         write_unlock_bh(&__ip_vs_svc_lock);
1577         return 0;
1578 }
1579
1580 static int ip_vs_zero_all(struct net *net)
1581 {
1582         int idx;
1583         struct ip_vs_service *svc;
1584
1585         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1586                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1587                         if (net_eq(svc->net, net))
1588                                 ip_vs_zero_service(svc);
1589                 }
1590         }
1591
1592         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1593                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1594                         if (net_eq(svc->net, net))
1595                                 ip_vs_zero_service(svc);
1596                 }
1597         }
1598
1599         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1600         return 0;
1601 }
1602
1603 #ifdef CONFIG_SYSCTL
1604
/*
 * Range limits referenced through .extra1/.extra2 by the "sync_retries"
 * entry of vs_vars[] (handled by proc_dointvec_minmax).
 */
static int three = 3;
static int zero;	/* static storage: implicitly 0 */
1607
1608 static int
1609 proc_do_defense_mode(ctl_table *table, int write,
1610                      void __user *buffer, size_t *lenp, loff_t *ppos)
1611 {
1612         struct net *net = current->nsproxy->net_ns;
1613         int *valp = table->data;
1614         int val = *valp;
1615         int rc;
1616
1617         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1618         if (write && (*valp != val)) {
1619                 if ((*valp < 0) || (*valp > 3)) {
1620                         /* Restore the correct value */
1621                         *valp = val;
1622                 } else {
1623                         update_defense_level(net_ipvs(net));
1624                 }
1625         }
1626         return rc;
1627 }
1628
1629 static int
1630 proc_do_sync_threshold(ctl_table *table, int write,
1631                        void __user *buffer, size_t *lenp, loff_t *ppos)
1632 {
1633         int *valp = table->data;
1634         int val[2];
1635         int rc;
1636
1637         /* backup the value first */
1638         memcpy(val, valp, sizeof(val));
1639
1640         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1641         if (write && (valp[0] < 0 || valp[1] < 0 ||
1642             (valp[0] >= valp[1] && valp[1]))) {
1643                 /* Restore the correct value */
1644                 memcpy(valp, val, sizeof(val));
1645         }
1646         return rc;
1647 }
1648
1649 static int
1650 proc_do_sync_mode(ctl_table *table, int write,
1651                      void __user *buffer, size_t *lenp, loff_t *ppos)
1652 {
1653         int *valp = table->data;
1654         int val = *valp;
1655         int rc;
1656
1657         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1658         if (write && (*valp != val)) {
1659                 if ((*valp < 0) || (*valp > 1)) {
1660                         /* Restore the correct value */
1661                         *valp = val;
1662                 }
1663         }
1664         return rc;
1665 }
1666
1667 static int
1668 proc_do_sync_ports(ctl_table *table, int write,
1669                    void __user *buffer, size_t *lenp, loff_t *ppos)
1670 {
1671         int *valp = table->data;
1672         int val = *valp;
1673         int rc;
1674
1675         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1676         if (write && (*valp != val)) {
1677                 if (*valp < 1 || !is_power_of_2(*valp)) {
1678                         /* Restore the correct value */
1679                         *valp = val;
1680                 }
1681         }
1682         return rc;
1683 }
1684
/*
 *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
 *      Do not change the order of the entries or insert new ones
 *      without making the matching change to the netns init code
 *      in ip_vs_control_net_init().
 */
1690
1691 static struct ctl_table vs_vars[] = {
1692         {
1693                 .procname       = "amemthresh",
1694                 .maxlen         = sizeof(int),
1695                 .mode           = 0644,
1696                 .proc_handler   = proc_dointvec,
1697         },
1698         {
1699                 .procname       = "am_droprate",
1700                 .maxlen         = sizeof(int),
1701                 .mode           = 0644,
1702                 .proc_handler   = proc_dointvec,
1703         },
1704         {
1705                 .procname       = "drop_entry",
1706                 .maxlen         = sizeof(int),
1707                 .mode           = 0644,
1708                 .proc_handler   = proc_do_defense_mode,
1709         },
1710         {
1711                 .procname       = "drop_packet",
1712                 .maxlen         = sizeof(int),
1713                 .mode           = 0644,
1714                 .proc_handler   = proc_do_defense_mode,
1715         },
1716 #ifdef CONFIG_IP_VS_NFCT
1717         {
1718                 .procname       = "conntrack",
1719                 .maxlen         = sizeof(int),
1720                 .mode           = 0644,
1721                 .proc_handler   = &proc_dointvec,
1722         },
1723 #endif
1724         {
1725                 .procname       = "secure_tcp",
1726                 .maxlen         = sizeof(int),
1727                 .mode           = 0644,
1728                 .proc_handler   = proc_do_defense_mode,
1729         },
1730         {
1731                 .procname       = "snat_reroute",
1732                 .maxlen         = sizeof(int),
1733                 .mode           = 0644,
1734                 .proc_handler   = &proc_dointvec,
1735         },
1736         {
1737                 .procname       = "sync_version",
1738                 .maxlen         = sizeof(int),
1739                 .mode           = 0644,
1740                 .proc_handler   = &proc_do_sync_mode,
1741         },
1742         {
1743                 .procname       = "sync_ports",
1744                 .maxlen         = sizeof(int),
1745                 .mode           = 0644,
1746                 .proc_handler   = &proc_do_sync_ports,
1747         },
1748         {
1749                 .procname       = "sync_qlen_max",
1750                 .maxlen         = sizeof(int),
1751                 .mode           = 0644,
1752                 .proc_handler   = proc_dointvec,
1753         },
1754         {
1755                 .procname       = "sync_sock_size",
1756                 .maxlen         = sizeof(int),
1757                 .mode           = 0644,
1758                 .proc_handler   = proc_dointvec,
1759         },
1760         {
1761                 .procname       = "cache_bypass",
1762                 .maxlen         = sizeof(int),
1763                 .mode           = 0644,
1764                 .proc_handler   = proc_dointvec,
1765         },
1766         {
1767                 .procname       = "expire_nodest_conn",
1768                 .maxlen         = sizeof(int),
1769                 .mode           = 0644,
1770                 .proc_handler   = proc_dointvec,
1771         },
1772         {
1773                 .procname       = "expire_quiescent_template",
1774                 .maxlen         = sizeof(int),
1775                 .mode           = 0644,
1776                 .proc_handler   = proc_dointvec,
1777         },
1778         {
1779                 .procname       = "sync_threshold",
1780                 .maxlen         =
1781                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1782                 .mode           = 0644,
1783                 .proc_handler   = proc_do_sync_threshold,
1784         },
1785         {
1786                 .procname       = "sync_refresh_period",
1787                 .maxlen         = sizeof(int),
1788                 .mode           = 0644,
1789                 .proc_handler   = proc_dointvec_jiffies,
1790         },
1791         {
1792                 .procname       = "sync_retries",
1793                 .maxlen         = sizeof(int),
1794                 .mode           = 0644,
1795                 .proc_handler   = proc_dointvec_minmax,
1796                 .extra1         = &zero,
1797                 .extra2         = &three,
1798         },
1799         {
1800                 .procname       = "nat_icmp_send",
1801                 .maxlen         = sizeof(int),
1802                 .mode           = 0644,
1803                 .proc_handler   = proc_dointvec,
1804         },
1805         {
1806                 .procname       = "pmtu_disc",
1807                 .maxlen         = sizeof(int),
1808                 .mode           = 0644,
1809                 .proc_handler   = proc_dointvec,
1810         },
1811 #ifdef CONFIG_IP_VS_DEBUG
1812         {
1813                 .procname       = "debug_level",
1814                 .data           = &sysctl_ip_vs_debug_level,
1815                 .maxlen         = sizeof(int),
1816                 .mode           = 0644,
1817                 .proc_handler   = proc_dointvec,
1818         },
1819 #endif
1820 #if 0
1821         {
1822                 .procname       = "timeout_established",
1823                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1824                 .maxlen         = sizeof(int),
1825                 .mode           = 0644,
1826                 .proc_handler   = proc_dointvec_jiffies,
1827         },
1828         {
1829                 .procname       = "timeout_synsent",
1830                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1831                 .maxlen         = sizeof(int),
1832                 .mode           = 0644,
1833                 .proc_handler   = proc_dointvec_jiffies,
1834         },
1835         {
1836                 .procname       = "timeout_synrecv",
1837                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1838                 .maxlen         = sizeof(int),
1839                 .mode           = 0644,
1840                 .proc_handler   = proc_dointvec_jiffies,
1841         },
1842         {
1843                 .procname       = "timeout_finwait",
1844                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1845                 .maxlen         = sizeof(int),
1846                 .mode           = 0644,
1847                 .proc_handler   = proc_dointvec_jiffies,
1848         },
1849         {
1850                 .procname       = "timeout_timewait",
1851                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1852                 .maxlen         = sizeof(int),
1853                 .mode           = 0644,
1854                 .proc_handler   = proc_dointvec_jiffies,
1855         },
1856         {
1857                 .procname       = "timeout_close",
1858                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1859                 .maxlen         = sizeof(int),
1860                 .mode           = 0644,
1861                 .proc_handler   = proc_dointvec_jiffies,
1862         },
1863         {
1864                 .procname       = "timeout_closewait",
1865                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1866                 .maxlen         = sizeof(int),
1867                 .mode           = 0644,
1868                 .proc_handler   = proc_dointvec_jiffies,
1869         },
1870         {
1871                 .procname       = "timeout_lastack",
1872                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1873                 .maxlen         = sizeof(int),
1874                 .mode           = 0644,
1875                 .proc_handler   = proc_dointvec_jiffies,
1876         },
1877         {
1878                 .procname       = "timeout_listen",
1879                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1880                 .maxlen         = sizeof(int),
1881                 .mode           = 0644,
1882                 .proc_handler   = proc_dointvec_jiffies,
1883         },
1884         {
1885                 .procname       = "timeout_synack",
1886                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1887                 .maxlen         = sizeof(int),
1888                 .mode           = 0644,
1889                 .proc_handler   = proc_dointvec_jiffies,
1890         },
1891         {
1892                 .procname       = "timeout_udp",
1893                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1894                 .maxlen         = sizeof(int),
1895                 .mode           = 0644,
1896                 .proc_handler   = proc_dointvec_jiffies,
1897         },
1898         {
1899                 .procname       = "timeout_icmp",
1900                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1901                 .maxlen         = sizeof(int),
1902                 .mode           = 0644,
1903                 .proc_handler   = proc_dointvec_jiffies,
1904         },
1905 #endif
1906         { }
1907 };
1908
1909 #endif
1910
1911 #ifdef CONFIG_PROC_FS
1912
/*
 * Cursor state kept in seq_file->private while walking the service tables.
 */
struct ip_vs_iter {
	struct seq_net_private p;  /* Do not move this, netns depends upon it */
	struct list_head *table;   /* hash table the current entry came from */
	int bucket;                /* bucket index inside that table */
};
1918
1919 /*
1920  *      Write the contents of the VS rule table to a PROCfs file.
1921  *      (It is kept just for backward compatibility)
1922  */
1923 static inline const char *ip_vs_fwd_name(unsigned int flags)
1924 {
1925         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1926         case IP_VS_CONN_F_LOCALNODE:
1927                 return "Local";
1928         case IP_VS_CONN_F_TUNNEL:
1929                 return "Tunnel";
1930         case IP_VS_CONN_F_DROUTE:
1931                 return "Route";
1932         default:
1933                 return "Masq";
1934         }
1935 }
1936
1937
1938 /* Get the Nth entry in the two lists */
1939 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1940 {
1941         struct net *net = seq_file_net(seq);
1942         struct ip_vs_iter *iter = seq->private;
1943         int idx;
1944         struct ip_vs_service *svc;
1945
1946         /* look in hash by protocol */
1947         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1948                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1949                         if (net_eq(svc->net, net) && pos-- == 0) {
1950                                 iter->table = ip_vs_svc_table;
1951                                 iter->bucket = idx;
1952                                 return svc;
1953                         }
1954                 }
1955         }
1956
1957         /* keep looking in fwmark */
1958         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1959                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1960                         if (net_eq(svc->net, net) && pos-- == 0) {
1961                                 iter->table = ip_vs_svc_fwm_table;
1962                                 iter->bucket = idx;
1963                                 return svc;
1964                         }
1965                 }
1966         }
1967
1968         return NULL;
1969 }
1970
1971 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
1972 __acquires(__ip_vs_svc_lock)
1973 {
1974
1975         read_lock_bh(&__ip_vs_svc_lock);
1976         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
1977 }
1978
1979
1980 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1981 {
1982         struct list_head *e;
1983         struct ip_vs_iter *iter;
1984         struct ip_vs_service *svc;
1985
1986         ++*pos;
1987         if (v == SEQ_START_TOKEN)
1988                 return ip_vs_info_array(seq,0);
1989
1990         svc = v;
1991         iter = seq->private;
1992
1993         if (iter->table == ip_vs_svc_table) {
1994                 /* next service in table hashed by protocol */
1995                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
1996                         return list_entry(e, struct ip_vs_service, s_list);
1997
1998
1999                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2000                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
2001                                             s_list) {
2002                                 return svc;
2003                         }
2004                 }
2005
2006                 iter->table = ip_vs_svc_fwm_table;
2007                 iter->bucket = -1;
2008                 goto scan_fwmark;
2009         }
2010
2011         /* next service in hashed by fwmark */
2012         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2013                 return list_entry(e, struct ip_vs_service, f_list);
2014
2015  scan_fwmark:
2016         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2017                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
2018                                     f_list)
2019                         return svc;
2020         }
2021
2022         return NULL;
2023 }
2024
/* seq_file ->stop: release the service table read lock. */
static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
__releases(__ip_vs_svc_lock)
{
	read_unlock_bh(&__ip_vs_svc_lock);
}
2030
2031
2032 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2033 {
2034         if (v == SEQ_START_TOKEN) {
2035                 seq_printf(seq,
2036                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
2037                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2038                 seq_puts(seq,
2039                          "Prot LocalAddress:Port Scheduler Flags\n");
2040                 seq_puts(seq,
2041                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2042         } else {
2043                 const struct ip_vs_service *svc = v;
2044                 const struct ip_vs_iter *iter = seq->private;
2045                 const struct ip_vs_dest *dest;
2046
2047                 if (iter->table == ip_vs_svc_table) {
2048 #ifdef CONFIG_IP_VS_IPV6
2049                         if (svc->af == AF_INET6)
2050                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
2051                                            ip_vs_proto_name(svc->protocol),
2052                                            &svc->addr.in6,
2053                                            ntohs(svc->port),
2054                                            svc->scheduler->name);
2055                         else
2056 #endif
2057                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
2058                                            ip_vs_proto_name(svc->protocol),
2059                                            ntohl(svc->addr.ip),
2060                                            ntohs(svc->port),
2061                                            svc->scheduler->name,
2062                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2063                 } else {
2064                         seq_printf(seq, "FWM  %08X %s %s",
2065                                    svc->fwmark, svc->scheduler->name,
2066                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2067                 }
2068
2069                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2070                         seq_printf(seq, "persistent %d %08X\n",
2071                                 svc->timeout,
2072                                 ntohl(svc->netmask));
2073                 else
2074                         seq_putc(seq, '\n');
2075
2076                 list_for_each_entry(dest, &svc->destinations, n_list) {
2077 #ifdef CONFIG_IP_VS_IPV6
2078                         if (dest->af == AF_INET6)
2079                                 seq_printf(seq,
2080                                            "  -> [%pI6]:%04X"
2081                                            "      %-7s %-6d %-10d %-10d\n",
2082                                            &dest->addr.in6,
2083                                            ntohs(dest->port),
2084                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2085                                            atomic_read(&dest->weight),
2086                                            atomic_read(&dest->activeconns),
2087                                            atomic_read(&dest->inactconns));
2088                         else
2089 #endif
2090                                 seq_printf(seq,
2091                                            "  -> %08X:%04X      "
2092                                            "%-7s %-6d %-10d %-10d\n",
2093                                            ntohl(dest->addr.ip),
2094                                            ntohs(dest->port),
2095                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2096                                            atomic_read(&dest->weight),
2097                                            atomic_read(&dest->activeconns),
2098                                            atomic_read(&dest->inactconns));
2099
2100                 }
2101         }
2102         return 0;
2103 }
2104
2105 static const struct seq_operations ip_vs_info_seq_ops = {
2106         .start = ip_vs_info_seq_start,
2107         .next  = ip_vs_info_seq_next,
2108         .stop  = ip_vs_info_seq_stop,
2109         .show  = ip_vs_info_seq_show,
2110 };
2111
2112 static int ip_vs_info_open(struct inode *inode, struct file *file)
2113 {
2114         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2115                         sizeof(struct ip_vs_iter));
2116 }
2117
2118 static const struct file_operations ip_vs_info_fops = {
2119         .owner   = THIS_MODULE,
2120         .open    = ip_vs_info_open,
2121         .read    = seq_read,
2122         .llseek  = seq_lseek,
2123         .release = seq_release_net,
2124 };
2125
2126 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2127 {
2128         struct net *net = seq_file_single_net(seq);
2129         struct ip_vs_stats_user show;
2130
2131 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2132         seq_puts(seq,
2133                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
2134         seq_printf(seq,
2135                    "   Conns  Packets  Packets            Bytes            Bytes\n");
2136
2137         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2138         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2139                    show.inpkts, show.outpkts,
2140                    (unsigned long long) show.inbytes,
2141                    (unsigned long long) show.outbytes);
2142
2143 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2144         seq_puts(seq,
2145                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2146         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2147                         show.cps, show.inpps, show.outpps,
2148                         show.inbps, show.outbps);
2149
2150         return 0;
2151 }
2152
2153 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2154 {
2155         return single_open_net(inode, file, ip_vs_stats_show);
2156 }
2157
2158 static const struct file_operations ip_vs_stats_fops = {
2159         .owner = THIS_MODULE,
2160         .open = ip_vs_stats_seq_open,
2161         .read = seq_read,
2162         .llseek = seq_lseek,
2163         .release = single_release_net,
2164 };
2165
2166 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2167 {
2168         struct net *net = seq_file_single_net(seq);
2169         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2170         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2171         struct ip_vs_stats_user rates;
2172         int i;
2173
2174 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2175         seq_puts(seq,
2176                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2177         seq_printf(seq,
2178                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2179
2180         for_each_possible_cpu(i) {
2181                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2182                 unsigned int start;
2183                 __u64 inbytes, outbytes;
2184
2185                 do {
2186                         start = u64_stats_fetch_begin_bh(&u->syncp);
2187                         inbytes = u->ustats.inbytes;
2188                         outbytes = u->ustats.outbytes;
2189                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2190
2191                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2192                            i, u->ustats.conns, u->ustats.inpkts,
2193                            u->ustats.outpkts, (__u64)inbytes,
2194                            (__u64)outbytes);
2195         }
2196
2197         spin_lock_bh(&tot_stats->lock);
2198
2199         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2200                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2201                    tot_stats->ustats.outpkts,
2202                    (unsigned long long) tot_stats->ustats.inbytes,
2203                    (unsigned long long) tot_stats->ustats.outbytes);
2204
2205         ip_vs_read_estimator(&rates, tot_stats);
2206
2207         spin_unlock_bh(&tot_stats->lock);
2208
2209 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2210         seq_puts(seq,
2211                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2212         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2213                         rates.cps,
2214                         rates.inpps,
2215                         rates.outpps,
2216                         rates.inbps,
2217                         rates.outbps);
2218
2219         return 0;
2220 }
2221
2222 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2223 {
2224         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2225 }
2226
2227 static const struct file_operations ip_vs_stats_percpu_fops = {
2228         .owner = THIS_MODULE,
2229         .open = ip_vs_stats_percpu_seq_open,
2230         .read = seq_read,
2231         .llseek = seq_lseek,
2232         .release = single_release_net,
2233 };
2234 #endif
2235
2236 /*
2237  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2238  */
2239 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2240 {
2241 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2242         struct ip_vs_proto_data *pd;
2243 #endif
2244
2245         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2246                   u->tcp_timeout,
2247                   u->tcp_fin_timeout,
2248                   u->udp_timeout);
2249
2250 #ifdef CONFIG_IP_VS_PROTO_TCP
2251         if (u->tcp_timeout) {
2252                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2253                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2254                         = u->tcp_timeout * HZ;
2255         }
2256
2257         if (u->tcp_fin_timeout) {
2258                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2259                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2260                         = u->tcp_fin_timeout * HZ;
2261         }
2262 #endif
2263
2264 #ifdef CONFIG_IP_VS_PROTO_UDP
2265         if (u->udp_timeout) {
2266                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2267                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2268                         = u->udp_timeout * HZ;
2269         }
2270 #endif
2271         return 0;
2272 }
2273
2274
2275 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2276 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2277 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2278                                  sizeof(struct ip_vs_dest_user))
2279 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2280 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2281 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2282
2283 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2284         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2285         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2286         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2287         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2288         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2289         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2290         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2291         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2292         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2293         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2294         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2295 };
2296
2297 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2298                                   struct ip_vs_service_user *usvc_compat)
2299 {
2300         memset(usvc, 0, sizeof(*usvc));
2301
2302         usvc->af                = AF_INET;
2303         usvc->protocol          = usvc_compat->protocol;
2304         usvc->addr.ip           = usvc_compat->addr;
2305         usvc->port              = usvc_compat->port;
2306         usvc->fwmark            = usvc_compat->fwmark;
2307
2308         /* Deep copy of sched_name is not needed here */
2309         usvc->sched_name        = usvc_compat->sched_name;
2310
2311         usvc->flags             = usvc_compat->flags;
2312         usvc->timeout           = usvc_compat->timeout;
2313         usvc->netmask           = usvc_compat->netmask;
2314 }
2315
2316 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2317                                    struct ip_vs_dest_user *udest_compat)
2318 {
2319         memset(udest, 0, sizeof(*udest));
2320
2321         udest->addr.ip          = udest_compat->addr;
2322         udest->port             = udest_compat->port;
2323         udest->conn_flags       = udest_compat->conn_flags;
2324         udest->weight           = udest_compat->weight;
2325         udest->u_threshold      = udest_compat->u_threshold;
2326         udest->l_threshold      = udest_compat->l_threshold;
2327 }
2328
2329 static int
2330 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2331 {
2332         struct net *net = sock_net(sk);
2333         int ret;
2334         unsigned char arg[MAX_ARG_LEN];
2335         struct ip_vs_service_user *usvc_compat;
2336         struct ip_vs_service_user_kern usvc;
2337         struct ip_vs_service *svc;
2338         struct ip_vs_dest_user *udest_compat;
2339         struct ip_vs_dest_user_kern udest;
2340         struct netns_ipvs *ipvs = net_ipvs(net);
2341
2342         if (!capable(CAP_NET_ADMIN))
2343                 return -EPERM;
2344
2345         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2346                 return -EINVAL;
2347         if (len < 0 || len >  MAX_ARG_LEN)
2348                 return -EINVAL;
2349         if (len != set_arglen[SET_CMDID(cmd)]) {
2350                 pr_err("set_ctl: len %u != %u\n",
2351                        len, set_arglen[SET_CMDID(cmd)]);
2352                 return -EINVAL;
2353         }
2354
2355         if (copy_from_user(arg, user, len) != 0)
2356                 return -EFAULT;
2357
2358         /* increase the module use count */
2359         ip_vs_use_count_inc();
2360
2361         /* Handle daemons since they have another lock */
2362         if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2363             cmd == IP_VS_SO_SET_STOPDAEMON) {
2364                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2365
2366                 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2367                         ret = -ERESTARTSYS;
2368                         goto out_dec;
2369                 }
2370                 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2371                         ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2372                                                 dm->syncid);
2373                 else
2374                         ret = stop_sync_thread(net, dm->state);
2375                 mutex_unlock(&ipvs->sync_mutex);
2376                 goto out_dec;
2377         }
2378
2379         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2380                 ret = -ERESTARTSYS;
2381                 goto out_dec;
2382         }
2383
2384         if (cmd == IP_VS_SO_SET_FLUSH) {
2385                 /* Flush the virtual service */
2386                 ret = ip_vs_flush(net);
2387                 goto out_unlock;
2388         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2389                 /* Set timeout values for (tcp tcpfin udp) */
2390                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2391                 goto out_unlock;
2392         }
2393
2394         usvc_compat = (struct ip_vs_service_user *)arg;
2395         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2396
2397         /* We only use the new structs internally, so copy userspace compat
2398          * structs to extended internal versions */
2399         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2400         ip_vs_copy_udest_compat(&udest, udest_compat);
2401
2402         if (cmd == IP_VS_SO_SET_ZERO) {
2403                 /* if no service address is set, zero counters in all */
2404                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2405                         ret = ip_vs_zero_all(net);
2406                         goto out_unlock;
2407                 }
2408         }
2409
2410         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2411         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2412             usvc.protocol != IPPROTO_SCTP) {
2413                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2414                        usvc.protocol, &usvc.addr.ip,
2415                        ntohs(usvc.port), usvc.sched_name);
2416                 ret = -EFAULT;
2417                 goto out_unlock;
2418         }
2419
2420         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2421         if (usvc.fwmark == 0)
2422                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2423                                            &usvc.addr, usvc.port);
2424         else
2425                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2426
2427         if (cmd != IP_VS_SO_SET_ADD
2428             && (svc == NULL || svc->protocol != usvc.protocol)) {
2429                 ret = -ESRCH;
2430                 goto out_unlock;
2431         }
2432
2433         switch (cmd) {
2434         case IP_VS_SO_SET_ADD:
2435                 if (svc != NULL)
2436                         ret = -EEXIST;
2437                 else
2438                         ret = ip_vs_add_service(net, &usvc, &svc);
2439                 break;
2440         case IP_VS_SO_SET_EDIT:
2441                 ret = ip_vs_edit_service(svc, &usvc);
2442                 break;
2443         case IP_VS_SO_SET_DEL:
2444                 ret = ip_vs_del_service(svc);
2445                 if (!ret)
2446                         goto out_unlock;
2447                 break;
2448         case IP_VS_SO_SET_ZERO:
2449                 ret = ip_vs_zero_service(svc);
2450                 break;
2451         case IP_VS_SO_SET_ADDDEST:
2452                 ret = ip_vs_add_dest(svc, &udest);
2453                 break;
2454         case IP_VS_SO_SET_EDITDEST:
2455                 ret = ip_vs_edit_dest(svc, &udest);
2456                 break;
2457         case IP_VS_SO_SET_DELDEST:
2458                 ret = ip_vs_del_dest(svc, &udest);
2459                 break;
2460         default:
2461                 ret = -EINVAL;
2462         }
2463
2464   out_unlock:
2465         mutex_unlock(&__ip_vs_mutex);
2466   out_dec:
2467         /* decrease the module use count */
2468         ip_vs_use_count_dec();
2469
2470         return ret;
2471 }
2472
2473
2474 static void
2475 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2476 {
2477         dst->protocol = src->protocol;
2478         dst->addr = src->addr.ip;
2479         dst->port = src->port;
2480         dst->fwmark = src->fwmark;
2481         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2482         dst->flags = src->flags;
2483         dst->timeout = src->timeout / HZ;
2484         dst->netmask = src->netmask;
2485         dst->num_dests = src->num_dests;
2486         ip_vs_copy_stats(&dst->stats, &src->stats);
2487 }
2488
2489 static inline int
2490 __ip_vs_get_service_entries(struct net *net,
2491                             const struct ip_vs_get_services *get,
2492                             struct ip_vs_get_services __user *uptr)
2493 {
2494         int idx, count=0;
2495         struct ip_vs_service *svc;
2496         struct ip_vs_service_entry entry;
2497         int ret = 0;
2498
2499         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2500                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2501                         /* Only expose IPv4 entries to old interface */
2502                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2503                                 continue;
2504
2505                         if (count >= get->num_services)
2506                                 goto out;
2507                         memset(&entry, 0, sizeof(entry));
2508                         ip_vs_copy_service(&entry, svc);
2509                         if (copy_to_user(&uptr->entrytable[count],
2510                                          &entry, sizeof(entry))) {
2511                                 ret = -EFAULT;
2512                                 goto out;
2513                         }
2514                         count++;
2515                 }
2516         }
2517
2518         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2519                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2520                         /* Only expose IPv4 entries to old interface */
2521                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2522                                 continue;
2523
2524                         if (count >= get->num_services)
2525                                 goto out;
2526                         memset(&entry, 0, sizeof(entry));
2527                         ip_vs_copy_service(&entry, svc);
2528                         if (copy_to_user(&uptr->entrytable[count],
2529                                          &entry, sizeof(entry))) {
2530                                 ret = -EFAULT;
2531                                 goto out;
2532                         }
2533                         count++;
2534                 }
2535         }
2536 out:
2537         return ret;
2538 }
2539
2540 static inline int
2541 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2542                          struct ip_vs_get_dests __user *uptr)
2543 {
2544         struct ip_vs_service *svc;
2545         union nf_inet_addr addr = { .ip = get->addr };
2546         int ret = 0;
2547
2548         if (get->fwmark)
2549                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2550         else
2551                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2552                                            get->port);
2553
2554         if (svc) {
2555                 int count = 0;
2556                 struct ip_vs_dest *dest;
2557                 struct ip_vs_dest_entry entry;
2558
2559                 list_for_each_entry(dest, &svc->destinations, n_list) {
2560                         if (count >= get->num_dests)
2561                                 break;
2562
2563                         entry.addr = dest->addr.ip;
2564                         entry.port = dest->port;
2565                         entry.conn_flags = atomic_read(&dest->conn_flags);
2566                         entry.weight = atomic_read(&dest->weight);
2567                         entry.u_threshold = dest->u_threshold;
2568                         entry.l_threshold = dest->l_threshold;
2569                         entry.activeconns = atomic_read(&dest->activeconns);
2570                         entry.inactconns = atomic_read(&dest->inactconns);
2571                         entry.persistconns = atomic_read(&dest->persistconns);
2572                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2573                         if (copy_to_user(&uptr->entrytable[count],
2574                                          &entry, sizeof(entry))) {
2575                                 ret = -EFAULT;
2576                                 break;
2577                         }
2578                         count++;
2579                 }
2580         } else
2581                 ret = -ESRCH;
2582         return ret;
2583 }
2584
/*
 * Fill @u with the current TCP/UDP timeouts of @net, expressed in seconds
 * (the per-protocol tables are divided by HZ).
 *
 * Only the fields of protocols that are compiled in are written; the other
 * fields of @u are left untouched.
 */
static inline void
__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
{
#ifdef CONFIG_IP_VS_PROTO_TCP
        {
                struct ip_vs_proto_data *tcp_pd;

                tcp_pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
                u->tcp_timeout =
                        tcp_pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
                u->tcp_fin_timeout =
                        tcp_pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
        }
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
        {
                struct ip_vs_proto_data *udp_pd;

                udp_pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
                u->udp_timeout =
                        udp_pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
        }
#endif
}
2603
2604
2605 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2606 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2607 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2608 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2609 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2610 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2611 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2612
2613 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2614         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2615         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2616         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2617         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2618         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2619         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2620         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2621 };
2622
2623 static int
2624 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2625 {
2626         unsigned char arg[128];
2627         int ret = 0;
2628         unsigned int copylen;
2629         struct net *net = sock_net(sk);
2630         struct netns_ipvs *ipvs = net_ipvs(net);
2631
2632         BUG_ON(!net);
2633         if (!capable(CAP_NET_ADMIN))
2634                 return -EPERM;
2635
2636         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2637                 return -EINVAL;
2638
2639         if (*len < get_arglen[GET_CMDID(cmd)]) {
2640                 pr_err("get_ctl: len %u < %u\n",
2641                        *len, get_arglen[GET_CMDID(cmd)]);
2642                 return -EINVAL;
2643         }
2644
2645         copylen = get_arglen[GET_CMDID(cmd)];
2646         if (copylen > 128)
2647                 return -EINVAL;
2648
2649         if (copy_from_user(arg, user, copylen) != 0)
2650                 return -EFAULT;
2651         /*
2652          * Handle daemons first since it has its own locking
2653          */
2654         if (cmd == IP_VS_SO_GET_DAEMON) {
2655                 struct ip_vs_daemon_user d[2];
2656
2657                 memset(&d, 0, sizeof(d));
2658                 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2659                         return -ERESTARTSYS;
2660
2661                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2662                         d[0].state = IP_VS_STATE_MASTER;
2663                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2664                                 sizeof(d[0].mcast_ifn));
2665                         d[0].syncid = ipvs->master_syncid;
2666                 }
2667                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2668                         d[1].state = IP_VS_STATE_BACKUP;
2669                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2670                                 sizeof(d[1].mcast_ifn));
2671                         d[1].syncid = ipvs->backup_syncid;
2672                 }
2673                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2674                         ret = -EFAULT;
2675                 mutex_unlock(&ipvs->sync_mutex);
2676                 return ret;
2677         }
2678
2679         if (mutex_lock_interruptible(&__ip_vs_mutex))
2680                 return -ERESTARTSYS;
2681
2682         switch (cmd) {
2683         case IP_VS_SO_GET_VERSION:
2684         {
2685                 char buf[64];
2686
2687                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2688                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2689                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2690                         ret = -EFAULT;
2691                         goto out;
2692                 }
2693                 *len = strlen(buf)+1;
2694         }
2695         break;
2696
2697         case IP_VS_SO_GET_INFO:
2698         {
2699                 struct ip_vs_getinfo info;
2700                 info.version = IP_VS_VERSION_CODE;
2701                 info.size = ip_vs_conn_tab_size;
2702                 info.num_services = ipvs->num_services;
2703                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2704                         ret = -EFAULT;
2705         }
2706         break;
2707
2708         case IP_VS_SO_GET_SERVICES:
2709         {
2710                 struct ip_vs_get_services *get;
2711                 int size;
2712
2713                 get = (struct ip_vs_get_services *)arg;
2714                 size = sizeof(*get) +
2715                         sizeof(struct ip_vs_service_entry) * get->num_services;
2716                 if (*len != size) {
2717                         pr_err("length: %u != %u\n", *len, size);
2718                         ret = -EINVAL;
2719                         goto out;
2720                 }
2721                 ret = __ip_vs_get_service_entries(net, get, user);
2722         }
2723         break;
2724
2725         case IP_VS_SO_GET_SERVICE:
2726         {
2727                 struct ip_vs_service_entry *entry;
2728                 struct ip_vs_service *svc;
2729                 union nf_inet_addr addr;
2730
2731                 entry = (struct ip_vs_service_entry *)arg;
2732                 addr.ip = entry->addr;
2733                 if (entry->fwmark)
2734                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2735                 else
2736                         svc = __ip_vs_service_find(net, AF_INET,
2737                                                    entry->protocol, &addr,
2738                                                    entry->port);
2739                 if (svc) {
2740                         ip_vs_copy_service(entry, svc);
2741                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2742                                 ret = -EFAULT;
2743                 } else
2744                         ret = -ESRCH;
2745         }
2746         break;
2747
2748         case IP_VS_SO_GET_DESTS:
2749         {
2750                 struct ip_vs_get_dests *get;
2751                 int size;
2752
2753                 get = (struct ip_vs_get_dests *)arg;
2754                 size = sizeof(*get) +
2755                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2756                 if (*len != size) {
2757                         pr_err("length: %u != %u\n", *len, size);
2758                         ret = -EINVAL;
2759                         goto out;
2760                 }
2761                 ret = __ip_vs_get_dest_entries(net, get, user);
2762         }
2763         break;
2764
2765         case IP_VS_SO_GET_TIMEOUT:
2766         {
2767                 struct ip_vs_timeout_user t;
2768
2769                 memset(&t, 0, sizeof(t));
2770                 __ip_vs_get_timeouts(net, &t);
2771                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2772                         ret = -EFAULT;
2773         }
2774         break;
2775
2776         default:
2777                 ret = -EINVAL;
2778         }
2779
2780 out:
2781         mutex_unlock(&__ip_vs_mutex);
2782         return ret;
2783 }
2784
2785
2786 static struct nf_sockopt_ops ip_vs_sockopts = {
2787         .pf             = PF_INET,
2788         .set_optmin     = IP_VS_BASE_CTL,
2789         .set_optmax     = IP_VS_SO_SET_MAX+1,
2790         .set            = do_ip_vs_set_ctl,
2791         .get_optmin     = IP_VS_BASE_CTL,
2792         .get_optmax     = IP_VS_SO_GET_MAX+1,
2793         .get            = do_ip_vs_get_ctl,
2794         .owner          = THIS_MODULE,
2795 };
2796
2797 /*
2798  * Generic Netlink interface
2799  */
2800
2801 /* IPVS genetlink family */
2802 static struct genl_family ip_vs_genl_family = {
2803         .id             = GENL_ID_GENERATE,
2804         .hdrsize        = 0,
2805         .name           = IPVS_GENL_NAME,
2806         .version        = IPVS_GENL_VERSION,
2807         .maxattr        = IPVS_CMD_MAX,
2808         .netnsok        = true,         /* Make ipvsadm to work on netns */
2809 };
2810
2811 /* Policy used for first-level command attributes */
2812 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2813         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2814         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2815         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2816         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2817         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2818         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2819 };
2820
2821 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2822 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2823         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2824         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2825                                             .len = IP_VS_IFNAME_MAXLEN },
2826         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2827 };
2828
2829 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2830 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2831         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2832         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2833         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2834                                             .len = sizeof(union nf_inet_addr) },
2835         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2836         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2837         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2838                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2839         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2840                                             .len = IP_VS_PENAME_MAXLEN },
2841         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2842                                             .len = sizeof(struct ip_vs_flags) },
2843         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2844         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2845         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2846 };
2847
2848 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2849 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2850         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2851                                             .len = sizeof(union nf_inet_addr) },
2852         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2853         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2854         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2855         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2856         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2857         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2858         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2859         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2860         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2861 };
2862
2863 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2864                                  struct ip_vs_stats *stats)
2865 {
2866         struct ip_vs_stats_user ustats;
2867         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2868         if (!nl_stats)
2869                 return -EMSGSIZE;
2870
2871         ip_vs_copy_stats(&ustats, stats);
2872
2873         if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2874             nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2875             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2876             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2877             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2878             nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2879             nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2880             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2881             nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2882             nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2883                 goto nla_put_failure;
2884         nla_nest_end(skb, nl_stats);
2885
2886         return 0;
2887
2888 nla_put_failure:
2889         nla_nest_cancel(skb, nl_stats);
2890         return -EMSGSIZE;
2891 }
2892
2893 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2894                                    struct ip_vs_service *svc)
2895 {
2896         struct nlattr *nl_service;
2897         struct ip_vs_flags flags = { .flags = svc->flags,
2898                                      .mask = ~0 };
2899
2900         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2901         if (!nl_service)
2902                 return -EMSGSIZE;
2903
2904         if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2905                 goto nla_put_failure;
2906         if (svc->fwmark) {
2907                 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2908                         goto nla_put_failure;
2909         } else {
2910                 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2911                     nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2912                     nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2913                         goto nla_put_failure;
2914         }
2915
2916         if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2917             (svc->pe &&
2918              nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2919             nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2920             nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2921             nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2922                 goto nla_put_failure;
2923         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2924                 goto nla_put_failure;
2925
2926         nla_nest_end(skb, nl_service);
2927
2928         return 0;
2929
2930 nla_put_failure:
2931         nla_nest_cancel(skb, nl_service);
2932         return -EMSGSIZE;
2933 }
2934
2935 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2936                                    struct ip_vs_service *svc,
2937                                    struct netlink_callback *cb)
2938 {
2939         void *hdr;
2940
2941         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2942                           &ip_vs_genl_family, NLM_F_MULTI,
2943                           IPVS_CMD_NEW_SERVICE);
2944         if (!hdr)
2945                 return -EMSGSIZE;
2946
2947         if (ip_vs_genl_fill_service(skb, svc) < 0)
2948                 goto nla_put_failure;
2949
2950         return genlmsg_end(skb, hdr);
2951
2952 nla_put_failure:
2953         genlmsg_cancel(skb, hdr);
2954         return -EMSGSIZE;
2955 }
2956
2957 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2958                                     struct netlink_callback *cb)
2959 {
2960         int idx = 0, i;
2961         int start = cb->args[0];
2962         struct ip_vs_service *svc;
2963         struct net *net = skb_sknet(skb);
2964
2965         mutex_lock(&__ip_vs_mutex);
2966         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2967                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2968                         if (++idx <= start || !net_eq(svc->net, net))
2969                                 continue;
2970                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2971                                 idx--;
2972                                 goto nla_put_failure;
2973                         }
2974                 }
2975         }
2976
2977         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2978                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2979                         if (++idx <= start || !net_eq(svc->net, net))
2980                                 continue;
2981                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2982                                 idx--;
2983                                 goto nla_put_failure;
2984                         }
2985                 }
2986         }
2987
2988 nla_put_failure:
2989         mutex_unlock(&__ip_vs_mutex);
2990         cb->args[0] = idx;
2991
2992         return skb->len;
2993 }
2994
2995 static int ip_vs_genl_parse_service(struct net *net,
2996                                     struct ip_vs_service_user_kern *usvc,
2997                                     struct nlattr *nla, int full_entry,
2998                                     struct ip_vs_service **ret_svc)
2999 {
3000         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3001         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3002         struct ip_vs_service *svc;
3003
3004         /* Parse mandatory identifying service fields first */
3005         if (nla == NULL ||
3006             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3007                 return -EINVAL;
3008
3009         nla_af          = attrs[IPVS_SVC_ATTR_AF];
3010         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
3011         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
3012         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
3013         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
3014
3015         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3016                 return -EINVAL;
3017
3018         memset(usvc, 0, sizeof(*usvc));
3019
3020         usvc->af = nla_get_u16(nla_af);
3021 #ifdef CONFIG_IP_VS_IPV6
3022         if (usvc->af != AF_INET && usvc->af != AF_INET6)
3023 #else
3024         if (usvc->af != AF_INET)
3025 #endif
3026                 return -EAFNOSUPPORT;
3027
3028         if (nla_fwmark) {
3029                 usvc->protocol = IPPROTO_TCP;
3030                 usvc->fwmark = nla_get_u32(nla_fwmark);
3031         } else {
3032                 usvc->protocol = nla_get_u16(nla_protocol);
3033                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3034                 usvc->port = nla_get_u16(nla_port);
3035                 usvc->fwmark = 0;
3036         }
3037
3038         if (usvc->fwmark)
3039                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3040         else
3041                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3042                                            &usvc->addr, usvc->port);
3043         *ret_svc = svc;
3044
3045         /* If a full entry was requested, check for the additional fields */
3046         if (full_entry) {
3047                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3048                               *nla_netmask;
3049                 struct ip_vs_flags flags;
3050
3051                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3052                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3053                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3054                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3055                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3056
3057                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3058                         return -EINVAL;
3059
3060                 nla_memcpy(&flags, nla_flags, sizeof(flags));
3061
3062                 /* prefill flags from service if it already exists */
3063                 if (svc)
3064                         usvc->flags = svc->flags;
3065
3066                 /* set new flags from userland */
3067                 usvc->flags = (usvc->flags & ~flags.mask) |
3068                               (flags.flags & flags.mask);
3069                 usvc->sched_name = nla_data(nla_sched);
3070                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3071                 usvc->timeout = nla_get_u32(nla_timeout);
3072                 usvc->netmask = nla_get_u32(nla_netmask);
3073         }
3074
3075         return 0;
3076 }
3077
3078 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3079                                                      struct nlattr *nla)
3080 {
3081         struct ip_vs_service_user_kern usvc;
3082         struct ip_vs_service *svc;
3083         int ret;
3084
3085         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3086         return ret ? ERR_PTR(ret) : svc;
3087 }
3088
3089 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3090 {
3091         struct nlattr *nl_dest;
3092
3093         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3094         if (!nl_dest)
3095                 return -EMSGSIZE;
3096
3097         if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3098             nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3099             nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3100                         (atomic_read(&dest->conn_flags) &
3101                          IP_VS_CONN_F_FWD_MASK)) ||
3102             nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3103                         atomic_read(&dest->weight)) ||
3104             nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3105             nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3106             nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3107                         atomic_read(&dest->activeconns)) ||
3108             nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3109                         atomic_read(&dest->inactconns)) ||
3110             nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3111                         atomic_read(&dest->persistconns)))
3112                 goto nla_put_failure;
3113         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3114                 goto nla_put_failure;
3115
3116         nla_nest_end(skb, nl_dest);
3117
3118         return 0;
3119
3120 nla_put_failure:
3121         nla_nest_cancel(skb, nl_dest);
3122         return -EMSGSIZE;
3123 }
3124
3125 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3126                                 struct netlink_callback *cb)
3127 {
3128         void *hdr;
3129
3130         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3131                           &ip_vs_genl_family, NLM_F_MULTI,
3132                           IPVS_CMD_NEW_DEST);
3133         if (!hdr)
3134                 return -EMSGSIZE;
3135
3136         if (ip_vs_genl_fill_dest(skb, dest) < 0)
3137                 goto nla_put_failure;
3138
3139         return genlmsg_end(skb, hdr);
3140
3141 nla_put_failure:
3142         genlmsg_cancel(skb, hdr);
3143         return -EMSGSIZE;
3144 }
3145
3146 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3147                                  struct netlink_callback *cb)
3148 {
3149         int idx = 0;
3150         int start = cb->args[0];
3151         struct ip_vs_service *svc;
3152         struct ip_vs_dest *dest;
3153         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3154         struct net *net = skb_sknet(skb);
3155
3156         mutex_lock(&__ip_vs_mutex);
3157
3158         /* Try to find the service for which to dump destinations */
3159         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3160                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3161                 goto out_err;
3162
3163
3164         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3165         if (IS_ERR(svc) || svc == NULL)
3166                 goto out_err;
3167
3168         /* Dump the destinations */
3169         list_for_each_entry(dest, &svc->destinations, n_list) {
3170                 if (++idx <= start)
3171                         continue;
3172                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3173                         idx--;
3174                         goto nla_put_failure;
3175                 }
3176         }
3177
3178 nla_put_failure:
3179         cb->args[0] = idx;
3180
3181 out_err:
3182         mutex_unlock(&__ip_vs_mutex);
3183
3184         return skb->len;
3185 }
3186
3187 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3188                                  struct nlattr *nla, int full_entry)
3189 {
3190         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3191         struct nlattr *nla_addr, *nla_port;
3192
3193         /* Parse mandatory identifying destination fields first */
3194         if (nla == NULL ||
3195             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3196                 return -EINVAL;
3197
3198         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3199         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3200
3201         if (!(nla_addr && nla_port))
3202                 return -EINVAL;
3203
3204         memset(udest, 0, sizeof(*udest));
3205
3206         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3207         udest->port = nla_get_u16(nla_port);
3208
3209         /* If a full entry was requested, check for the additional fields */
3210         if (full_entry) {
3211                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3212                               *nla_l_thresh;
3213
3214                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3215                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3216                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3217                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3218
3219                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3220                         return -EINVAL;
3221
3222                 udest->conn_flags = nla_get_u32(nla_fwd)
3223                                     & IP_VS_CONN_F_FWD_MASK;
3224                 udest->weight = nla_get_u32(nla_weight);
3225                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3226                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3227         }
3228
3229         return 0;
3230 }
3231
3232 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3233                                   const char *mcast_ifn, __be32 syncid)
3234 {
3235         struct nlattr *nl_daemon;
3236
3237         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3238         if (!nl_daemon)
3239                 return -EMSGSIZE;
3240
3241         if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3242             nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
3243             nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
3244                 goto nla_put_failure;
3245         nla_nest_end(skb, nl_daemon);
3246
3247         return 0;
3248
3249 nla_put_failure:
3250         nla_nest_cancel(skb, nl_daemon);
3251         return -EMSGSIZE;
3252 }
3253
3254 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3255                                   const char *mcast_ifn, __be32 syncid,
3256                                   struct netlink_callback *cb)
3257 {
3258         void *hdr;
3259         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3260                           &ip_vs_genl_family, NLM_F_MULTI,
3261                           IPVS_CMD_NEW_DAEMON);
3262         if (!hdr)
3263                 return -EMSGSIZE;
3264
3265         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3266                 goto nla_put_failure;
3267
3268         return genlmsg_end(skb, hdr);
3269
3270 nla_put_failure:
3271         genlmsg_cancel(skb, hdr);
3272         return -EMSGSIZE;
3273 }
3274
3275 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3276                                    struct netlink_callback *cb)
3277 {
3278         struct net *net = skb_sknet(skb);
3279         struct netns_ipvs *ipvs = net_ipvs(net);
3280
3281         mutex_lock(&ipvs->sync_mutex);
3282         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3283                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3284                                            ipvs->master_mcast_ifn,
3285                                            ipvs->master_syncid, cb) < 0)
3286                         goto nla_put_failure;
3287
3288                 cb->args[0] = 1;
3289         }
3290
3291         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3292                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3293                                            ipvs->backup_mcast_ifn,
3294                                            ipvs->backup_syncid, cb) < 0)
3295                         goto nla_put_failure;
3296
3297                 cb->args[1] = 1;
3298         }
3299
3300 nla_put_failure:
3301         mutex_unlock(&ipvs->sync_mutex);
3302
3303         return skb->len;
3304 }
3305
3306 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3307 {
3308         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3309               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3310               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3311                 return -EINVAL;
3312
3313         return start_sync_thread(net,
3314                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3315                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3316                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3317 }
3318
3319 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3320 {
3321         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3322                 return -EINVAL;
3323
3324         return stop_sync_thread(net,
3325                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3326 }
3327
3328 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3329 {
3330         struct ip_vs_timeout_user t;
3331
3332         __ip_vs_get_timeouts(net, &t);
3333
3334         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3335                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3336
3337         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3338                 t.tcp_fin_timeout =
3339                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3340
3341         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3342                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3343
3344         return ip_vs_set_timeout(net, &t);
3345 }
3346
3347 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3348 {
3349         int ret = 0, cmd;
3350         struct net *net;
3351         struct netns_ipvs *ipvs;
3352
3353         net = skb_sknet(skb);
3354         ipvs = net_ipvs(net);
3355         cmd = info->genlhdr->cmd;
3356
3357         if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3358                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3359
3360                 mutex_lock(&ipvs->sync_mutex);
3361                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3362                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3363                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3364                                      ip_vs_daemon_policy)) {
3365                         ret = -EINVAL;
3366                         goto out;
3367                 }
3368
3369                 if (cmd == IPVS_CMD_NEW_DAEMON)
3370                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3371                 else
3372                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3373 out:
3374                 mutex_unlock(&ipvs->sync_mutex);
3375         }
3376         return ret;
3377 }
3378
3379 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3380 {
3381         struct ip_vs_service *svc = NULL;
3382         struct ip_vs_service_user_kern usvc;
3383         struct ip_vs_dest_user_kern udest;
3384         int ret = 0, cmd;
3385         int need_full_svc = 0, need_full_dest = 0;
3386         struct net *net;
3387
3388         net = skb_sknet(skb);
3389         cmd = info->genlhdr->cmd;
3390
3391         mutex_lock(&__ip_vs_mutex);
3392
3393         if (cmd == IPVS_CMD_FLUSH) {
3394                 ret = ip_vs_flush(net);
3395                 goto out;
3396         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3397                 ret = ip_vs_genl_set_config(net, info->attrs);
3398                 goto out;
3399         } else if (cmd == IPVS_CMD_ZERO &&
3400                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3401                 ret = ip_vs_zero_all(net);
3402                 goto out;
3403         }
3404
3405         /* All following commands require a service argument, so check if we
3406          * received a valid one. We need a full service specification when
3407          * adding / editing a service. Only identifying members otherwise. */
3408         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3409                 need_full_svc = 1;
3410
3411         ret = ip_vs_genl_parse_service(net, &usvc,
3412                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3413                                        need_full_svc, &svc);
3414         if (ret)
3415                 goto out;
3416
3417         /* Unless we're adding a new service, the service must already exist */
3418         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3419                 ret = -ESRCH;
3420                 goto out;
3421         }
3422
3423         /* Destination commands require a valid destination argument. For
3424          * adding / editing a destination, we need a full destination
3425          * specification. */
3426         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3427             cmd == IPVS_CMD_DEL_DEST) {
3428                 if (cmd != IPVS_CMD_DEL_DEST)
3429                         need_full_dest = 1;
3430
3431                 ret = ip_vs_genl_parse_dest(&udest,
3432                                             info->attrs[IPVS_CMD_ATTR_DEST],
3433                                             need_full_dest);
3434                 if (ret)
3435                         goto out;
3436         }
3437
3438         switch (cmd) {
3439         case IPVS_CMD_NEW_SERVICE:
3440                 if (svc == NULL)
3441                         ret = ip_vs_add_service(net, &usvc, &svc);
3442                 else
3443                         ret = -EEXIST;
3444                 break;
3445         case IPVS_CMD_SET_SERVICE:
3446                 ret = ip_vs_edit_service(svc, &usvc);
3447                 break;
3448         case IPVS_CMD_DEL_SERVICE:
3449                 ret = ip_vs_del_service(svc);
3450                 /* do not use svc, it can be freed */
3451                 break;
3452         case IPVS_CMD_NEW_DEST:
3453                 ret = ip_vs_add_dest(svc, &udest);
3454                 break;
3455         case IPVS_CMD_SET_DEST:
3456                 ret = ip_vs_edit_dest(svc, &udest);
3457                 break;
3458         case IPVS_CMD_DEL_DEST:
3459                 ret = ip_vs_del_dest(svc, &udest);
3460                 break;
3461         case IPVS_CMD_ZERO:
3462                 ret = ip_vs_zero_service(svc);
3463                 break;
3464         default:
3465                 ret = -EINVAL;
3466         }
3467
3468 out:
3469         mutex_unlock(&__ip_vs_mutex);
3470
3471         return ret;
3472 }
3473
3474 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3475 {
3476         struct sk_buff *msg;
3477         void *reply;
3478         int ret, cmd, reply_cmd;
3479         struct net *net;
3480
3481         net = skb_sknet(skb);
3482         cmd = info->genlhdr->cmd;
3483
3484         if (cmd == IPVS_CMD_GET_SERVICE)
3485                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3486         else if (cmd == IPVS_CMD_GET_INFO)
3487                 reply_cmd = IPVS_CMD_SET_INFO;
3488         else if (cmd == IPVS_CMD_GET_CONFIG)
3489                 reply_cmd = IPVS_CMD_SET_CONFIG;
3490         else {
3491                 pr_err("unknown Generic Netlink command\n");
3492                 return -EINVAL;
3493         }
3494
3495         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3496         if (!msg)
3497                 return -ENOMEM;
3498
3499         mutex_lock(&__ip_vs_mutex);
3500
3501         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3502         if (reply == NULL)
3503                 goto nla_put_failure;
3504
3505         switch (cmd) {
3506         case IPVS_CMD_GET_SERVICE:
3507         {
3508                 struct ip_vs_service *svc;
3509
3510                 svc = ip_vs_genl_find_service(net,
3511                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3512                 if (IS_ERR(svc)) {
3513                         ret = PTR_ERR(svc);
3514                         goto out_err;
3515                 } else if (svc) {
3516                         ret = ip_vs_genl_fill_service(msg, svc);
3517                         if (ret)
3518                                 goto nla_put_failure;
3519                 } else {
3520                         ret = -ESRCH;
3521                         goto out_err;
3522                 }
3523
3524                 break;
3525         }
3526
3527         case IPVS_CMD_GET_CONFIG:
3528         {
3529                 struct ip_vs_timeout_user t;
3530
3531                 __ip_vs_get_timeouts(net, &t);
3532 #ifdef CONFIG_IP_VS_PROTO_TCP
3533                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3534                                 t.tcp_timeout) ||
3535                     nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3536                                 t.tcp_fin_timeout))
3537                         goto nla_put_failure;
3538 #endif
3539 #ifdef CONFIG_IP_VS_PROTO_UDP
3540                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3541                         goto nla_put_failure;
3542 #endif
3543
3544                 break;
3545         }
3546
3547         case IPVS_CMD_GET_INFO:
3548                 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3549                                 IP_VS_VERSION_CODE) ||
3550                     nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3551                                 ip_vs_conn_tab_size))
3552                         goto nla_put_failure;
3553                 break;
3554         }
3555
3556         genlmsg_end(msg, reply);
3557         ret = genlmsg_reply(msg, info);
3558         goto out;
3559
3560 nla_put_failure:
3561         pr_err("not enough space in Netlink message\n");
3562         ret = -EMSGSIZE;
3563
3564 out_err:
3565         nlmsg_free(msg);
3566 out:
3567         mutex_unlock(&__ip_vs_mutex);
3568
3569         return ret;
3570 }
3571
3572
3573 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3574         {
3575                 .cmd    = IPVS_CMD_NEW_SERVICE,
3576                 .flags  = GENL_ADMIN_PERM,
3577                 .policy = ip_vs_cmd_policy,
3578                 .doit   = ip_vs_genl_set_cmd,
3579         },
3580         {
3581                 .cmd    = IPVS_CMD_SET_SERVICE,
3582                 .flags  = GENL_ADMIN_PERM,
3583                 .policy = ip_vs_cmd_policy,
3584                 .doit   = ip_vs_genl_set_cmd,
3585         },
3586         {
3587                 .cmd    = IPVS_CMD_DEL_SERVICE,
3588                 .flags  = GENL_ADMIN_PERM,
3589                 .policy = ip_vs_cmd_policy,
3590                 .doit   = ip_vs_genl_set_cmd,
3591         },
3592         {
3593                 .cmd    = IPVS_CMD_GET_SERVICE,
3594                 .flags  = GENL_ADMIN_PERM,
3595                 .doit   = ip_vs_genl_get_cmd,
3596                 .dumpit = ip_vs_genl_dump_services,
3597                 .policy = ip_vs_cmd_policy,
3598         },
3599         {
3600                 .cmd    = IPVS_CMD_NEW_DEST,
3601                 .flags  = GENL_ADMIN_PERM,
3602                 .policy = ip_vs_cmd_policy,
3603                 .doit   = ip_vs_genl_set_cmd,
3604         },
3605         {
3606                 .cmd    = IPVS_CMD_SET_DEST,
3607                 .flags  = GENL_ADMIN_PERM,
3608                 .policy = ip_vs_cmd_policy,
3609                 .doit   = ip_vs_genl_set_cmd,
3610         },
3611         {
3612                 .cmd    = IPVS_CMD_DEL_DEST,
3613                 .flags  = GENL_ADMIN_PERM,
3614                 .policy = ip_vs_cmd_policy,
3615                 .doit   = ip_vs_genl_set_cmd,
3616         },
3617         {
3618                 .cmd    = IPVS_CMD_GET_DEST,
3619                 .flags  = GENL_ADMIN_PERM,
3620                 .policy = ip_vs_cmd_policy,
3621                 .dumpit = ip_vs_genl_dump_dests,
3622         },
3623         {
3624                 .cmd    = IPVS_CMD_NEW_DAEMON,
3625                 .flags  = GENL_ADMIN_PERM,
3626                 .policy = ip_vs_cmd_policy,
3627                 .doit   = ip_vs_genl_set_daemon,
3628         },
3629         {
3630                 .cmd    = IPVS_CMD_DEL_DAEMON,
3631                 .flags  = GENL_ADMIN_PERM,
3632                 .policy = ip_vs_cmd_policy,
3633                 .doit   = ip_vs_genl_set_daemon,
3634         },
3635         {
3636                 .cmd    = IPVS_CMD_GET_DAEMON,
3637                 .flags  = GENL_ADMIN_PERM,
3638                 .dumpit = ip_vs_genl_dump_daemons,
3639         },
3640         {
3641                 .cmd    = IPVS_CMD_SET_CONFIG,
3642                 .flags  = GENL_ADMIN_PERM,
3643                 .policy = ip_vs_cmd_policy,
3644                 .doit   = ip_vs_genl_set_cmd,
3645         },
3646         {
3647                 .cmd    = IPVS_CMD_GET_CONFIG,
3648                 .flags  = GENL_ADMIN_PERM,
3649                 .doit   = ip_vs_genl_get_cmd,
3650         },
3651         {
3652                 .cmd    = IPVS_CMD_GET_INFO,
3653                 .flags  = GENL_ADMIN_PERM,
3654                 .doit   = ip_vs_genl_get_cmd,
3655         },
3656         {
3657                 .cmd    = IPVS_CMD_ZERO,
3658                 .flags  = GENL_ADMIN_PERM,
3659                 .policy = ip_vs_cmd_policy,
3660                 .doit   = ip_vs_genl_set_cmd,
3661         },
3662         {
3663                 .cmd    = IPVS_CMD_FLUSH,
3664                 .flags  = GENL_ADMIN_PERM,
3665                 .doit   = ip_vs_genl_set_cmd,
3666         },
3667 };
3668
3669 static int __init ip_vs_genl_register(void)
3670 {
3671         return genl_register_family_with_ops(&ip_vs_genl_family,
3672                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3673 }
3674
3675 static void ip_vs_genl_unregister(void)
3676 {
3677         genl_unregister_family(&ip_vs_genl_family);
3678 }
3679
3680 /* End of Generic Netlink interface definitions */
3681
3682 /*
3683  * per netns init/exit func.
3684  */
3685 #ifdef CONFIG_SYSCTL
3686 static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3687 {
3688         int idx;
3689         struct netns_ipvs *ipvs = net_ipvs(net);
3690         struct ctl_table *tbl;
3691
3692         atomic_set(&ipvs->dropentry, 0);
3693         spin_lock_init(&ipvs->dropentry_lock);
3694         spin_lock_init(&ipvs->droppacket_lock);
3695         spin_lock_init(&ipvs->securetcp_lock);
3696
3697         if (!net_eq(net, &init_net)) {
3698                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3699                 if (tbl == NULL)
3700                         return -ENOMEM;
3701         } else
3702                 tbl = vs_vars;
3703         /* Initialize sysctl defaults */
3704         idx = 0;
3705         ipvs->sysctl_amemthresh = 1024;
3706         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3707         ipvs->sysctl_am_droprate = 10;
3708         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3709         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3710         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3711 #ifdef CONFIG_IP_VS_NFCT
3712         tbl[idx++].data = &ipvs->sysctl_conntrack;
3713 #endif
3714         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3715         ipvs->sysctl_snat_reroute = 1;
3716         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3717         ipvs->sysctl_sync_ver = 1;
3718         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3719         ipvs->sysctl_sync_ports = 1;
3720         tbl[idx++].data = &ipvs->sysctl_sync_ports;
3721         ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3722         tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3723         ipvs->sysctl_sync_sock_size = 0;
3724         tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3725         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3726         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3727         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3728         ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3729         ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3730         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3731         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3732         ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3733         tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3734         ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3735         tbl[idx++].data = &ipvs->sysctl_sync_retries;
3736         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3737         ipvs->sysctl_pmtu_disc = 1;
3738         tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3739
3740
3741         ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3742         if (ipvs->sysctl_hdr == NULL) {
3743                 if (!net_eq(net, &init_net))
3744                         kfree(tbl);
3745                 return -ENOMEM;
3746         }
3747         ip_vs_start_estimator(net, &ipvs->tot_stats);
3748         ipvs->sysctl_tbl = tbl;
3749         /* Schedule defense work */
3750         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3751         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3752
3753         return 0;
3754 }
3755
3756 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
3757 {
3758         struct netns_ipvs *ipvs = net_ipvs(net);
3759
3760         cancel_delayed_work_sync(&ipvs->defense_work);
3761         cancel_work_sync(&ipvs->defense_work.work);
3762         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3763 }
3764
3765 #else
3766
3767 static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
3768 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
3769
3770 #endif
3771
3772 static struct notifier_block ip_vs_dst_notifier = {
3773         .notifier_call = ip_vs_dst_event,
3774 };
3775
3776 int __net_init ip_vs_control_net_init(struct net *net)
3777 {
3778         int idx;
3779         struct netns_ipvs *ipvs = net_ipvs(net);
3780
3781         rwlock_init(&ipvs->rs_lock);
3782
3783         /* Initialize rs_table */
3784         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3785                 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3786
3787         INIT_LIST_HEAD(&ipvs->dest_trash);
3788         atomic_set(&ipvs->ftpsvc_counter, 0);
3789         atomic_set(&ipvs->nullsvc_counter, 0);
3790
3791         /* procfs stats */
3792         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3793         if (!ipvs->tot_stats.cpustats)
3794                 return -ENOMEM;
3795
3796         spin_lock_init(&ipvs->tot_stats.lock);
3797
3798         proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3799         proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3800         proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3801                              &ip_vs_stats_percpu_fops);
3802
3803         if (ip_vs_control_net_init_sysctl(net))
3804                 goto err;
3805
3806         return 0;
3807
3808 err:
3809         free_percpu(ipvs->tot_stats.cpustats);
3810         return -ENOMEM;
3811 }
3812
3813 void __net_exit ip_vs_control_net_cleanup(struct net *net)
3814 {
3815         struct netns_ipvs *ipvs = net_ipvs(net);
3816
3817         ip_vs_trash_cleanup(net);
3818         ip_vs_stop_estimator(net, &ipvs->tot_stats);
3819         ip_vs_control_net_cleanup_sysctl(net);
3820         proc_net_remove(net, "ip_vs_stats_percpu");
3821         proc_net_remove(net, "ip_vs_stats");
3822         proc_net_remove(net, "ip_vs");
3823         free_percpu(ipvs->tot_stats.cpustats);
3824 }
3825
3826 int __init ip_vs_register_nl_ioctl(void)
3827 {
3828         int ret;
3829
3830         ret = nf_register_sockopt(&ip_vs_sockopts);
3831         if (ret) {
3832                 pr_err("cannot register sockopt.\n");
3833                 goto err_sock;
3834         }
3835
3836         ret = ip_vs_genl_register();
3837         if (ret) {
3838                 pr_err("cannot register Generic Netlink interface.\n");
3839                 goto err_genl;
3840         }
3841         return 0;
3842
3843 err_genl:
3844         nf_unregister_sockopt(&ip_vs_sockopts);
3845 err_sock:
3846         return ret;
3847 }
3848
3849 void ip_vs_unregister_nl_ioctl(void)
3850 {
3851         ip_vs_genl_unregister();
3852         nf_unregister_sockopt(&ip_vs_sockopts);
3853 }
3854
3855 int __init ip_vs_control_init(void)
3856 {
3857         int idx;
3858         int ret;
3859
3860         EnterFunction(2);
3861
3862         /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3863         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3864                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3865                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3866         }
3867
3868         smp_wmb();      /* Do we really need it now ? */
3869
3870         ret = register_netdevice_notifier(&ip_vs_dst_notifier);
3871         if (ret < 0)
3872                 return ret;
3873
3874         LeaveFunction(2);
3875         return 0;
3876 }
3877
3878
3879 void ip_vs_control_cleanup(void)
3880 {
3881         EnterFunction(2);
3882         unregister_netdevice_notifier(&ip_vs_dst_notifier);
3883         LeaveFunction(2);
3884 }