Pileus Git - ~andy/linux/blob - net/netfilter/ipvs/ip_vs_ctl.c
ipvs: reorganize dest trash
[~andy/linux] / net / netfilter / ipvs / ip_vs_ctl.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the NetFilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
9  *              Peter Kese <peter.kese@ijs.si>
10  *              Julian Anastasov <ja@ssi.bg>
11  *
12  *              This program is free software; you can redistribute it and/or
13  *              modify it under the terms of the GNU General Public License
14  *              as published by the Free Software Foundation; either version
15  *              2 of the License, or (at your option) any later version.
16  *
17  * Changes:
18  *
19  */
20
21 #define KMSG_COMPONENT "IPVS"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/init.h>
26 #include <linux/types.h>
27 #include <linux/capability.h>
28 #include <linux/fs.h>
29 #include <linux/sysctl.h>
30 #include <linux/proc_fs.h>
31 #include <linux/workqueue.h>
32 #include <linux/swap.h>
33 #include <linux/seq_file.h>
34 #include <linux/slab.h>
35
36 #include <linux/netfilter.h>
37 #include <linux/netfilter_ipv4.h>
38 #include <linux/mutex.h>
39
40 #include <net/net_namespace.h>
41 #include <linux/nsproxy.h>
42 #include <net/ip.h>
43 #ifdef CONFIG_IP_VS_IPV6
44 #include <net/ipv6.h>
45 #include <net/ip6_route.h>
46 #endif
47 #include <net/route.h>
48 #include <net/sock.h>
49 #include <net/genetlink.h>
50
51 #include <asm/uaccess.h>
52
53 #include <net/ip_vs.h>
54
/*
 * Mutex for IPVS sockopts (the original note calls it a semaphore).
 * It is a sleeping lock because [gs]etsockopt may sleep.
 */
static DEFINE_MUTEX(__ip_vs_mutex);

/*
 * Lock for the service table.  In this file it is read-locked by
 * ip_vs_service_get() and write-locked (with BHs disabled) by
 * __ip_vs_update_dest().
 */
static DEFINE_RWLOCK(__ip_vs_svc_lock);
60
61 /* sysctl variables */
62
63 #ifdef CONFIG_IP_VS_DEBUG
/*
 * Debug verbosity level; zero-initialised as a file-scope static.
 * The code that updates it is not part of this section of the file.
 */
static int sysctl_ip_vs_debug_level;

/* Return the current value of sysctl_ip_vs_debug_level. */
int ip_vs_get_debug_level(void)
{
	return sysctl_ip_vs_debug_level;
}
70 #endif
71
72
/*  Protos */
/* Forward declaration; the definition lies outside this part of the file. */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
75
76
77 #ifdef CONFIG_IP_VS_IPV6
78 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
79 static bool __ip_vs_addr_is_local_v6(struct net *net,
80                                      const struct in6_addr *addr)
81 {
82         struct flowi6 fl6 = {
83                 .daddr = *addr,
84         };
85         struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
86         bool is_local;
87
88         is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
89
90         dst_release(dst);
91         return is_local;
92 }
93 #endif
94
95 #ifdef CONFIG_SYSCTL
96 /*
97  *      update_defense_level is called from keventd and from sysctl,
98  *      so it needs to protect itself from softirqs
99  */
100 static void update_defense_level(struct netns_ipvs *ipvs)
101 {
102         struct sysinfo i;
103         static int old_secure_tcp = 0;
104         int availmem;
105         int nomem;
106         int to_change = -1;
107
108         /* we only count free and buffered memory (in pages) */
109         si_meminfo(&i);
110         availmem = i.freeram + i.bufferram;
111         /* however in linux 2.5 the i.bufferram is total page cache size,
112            we need adjust it */
113         /* si_swapinfo(&i); */
114         /* availmem = availmem - (i.totalswap - i.freeswap); */
115
116         nomem = (availmem < ipvs->sysctl_amemthresh);
117
118         local_bh_disable();
119
120         /* drop_entry */
121         spin_lock(&ipvs->dropentry_lock);
122         switch (ipvs->sysctl_drop_entry) {
123         case 0:
124                 atomic_set(&ipvs->dropentry, 0);
125                 break;
126         case 1:
127                 if (nomem) {
128                         atomic_set(&ipvs->dropentry, 1);
129                         ipvs->sysctl_drop_entry = 2;
130                 } else {
131                         atomic_set(&ipvs->dropentry, 0);
132                 }
133                 break;
134         case 2:
135                 if (nomem) {
136                         atomic_set(&ipvs->dropentry, 1);
137                 } else {
138                         atomic_set(&ipvs->dropentry, 0);
139                         ipvs->sysctl_drop_entry = 1;
140                 };
141                 break;
142         case 3:
143                 atomic_set(&ipvs->dropentry, 1);
144                 break;
145         }
146         spin_unlock(&ipvs->dropentry_lock);
147
148         /* drop_packet */
149         spin_lock(&ipvs->droppacket_lock);
150         switch (ipvs->sysctl_drop_packet) {
151         case 0:
152                 ipvs->drop_rate = 0;
153                 break;
154         case 1:
155                 if (nomem) {
156                         ipvs->drop_rate = ipvs->drop_counter
157                                 = ipvs->sysctl_amemthresh /
158                                 (ipvs->sysctl_amemthresh-availmem);
159                         ipvs->sysctl_drop_packet = 2;
160                 } else {
161                         ipvs->drop_rate = 0;
162                 }
163                 break;
164         case 2:
165                 if (nomem) {
166                         ipvs->drop_rate = ipvs->drop_counter
167                                 = ipvs->sysctl_amemthresh /
168                                 (ipvs->sysctl_amemthresh-availmem);
169                 } else {
170                         ipvs->drop_rate = 0;
171                         ipvs->sysctl_drop_packet = 1;
172                 }
173                 break;
174         case 3:
175                 ipvs->drop_rate = ipvs->sysctl_am_droprate;
176                 break;
177         }
178         spin_unlock(&ipvs->droppacket_lock);
179
180         /* secure_tcp */
181         spin_lock(&ipvs->securetcp_lock);
182         switch (ipvs->sysctl_secure_tcp) {
183         case 0:
184                 if (old_secure_tcp >= 2)
185                         to_change = 0;
186                 break;
187         case 1:
188                 if (nomem) {
189                         if (old_secure_tcp < 2)
190                                 to_change = 1;
191                         ipvs->sysctl_secure_tcp = 2;
192                 } else {
193                         if (old_secure_tcp >= 2)
194                                 to_change = 0;
195                 }
196                 break;
197         case 2:
198                 if (nomem) {
199                         if (old_secure_tcp < 2)
200                                 to_change = 1;
201                 } else {
202                         if (old_secure_tcp >= 2)
203                                 to_change = 0;
204                         ipvs->sysctl_secure_tcp = 1;
205                 }
206                 break;
207         case 3:
208                 if (old_secure_tcp < 2)
209                         to_change = 1;
210                 break;
211         }
212         old_secure_tcp = ipvs->sysctl_secure_tcp;
213         if (to_change >= 0)
214                 ip_vs_protocol_timeout_change(ipvs,
215                                               ipvs->sysctl_secure_tcp > 1);
216         spin_unlock(&ipvs->securetcp_lock);
217
218         local_bh_enable();
219 }
220
221
222 /*
223  *      Timer for checking the defense
224  */
225 #define DEFENSE_TIMER_PERIOD    1*HZ
226
227 static void defense_work_handler(struct work_struct *work)
228 {
229         struct netns_ipvs *ipvs =
230                 container_of(work, struct netns_ipvs, defense_work.work);
231
232         update_defense_level(ipvs);
233         if (atomic_read(&ipvs->dropentry))
234                 ip_vs_random_dropentry(ipvs->net);
235         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
236 }
237 #endif
238
239 int
240 ip_vs_use_count_inc(void)
241 {
242         return try_module_get(THIS_MODULE);
243 }
244
245 void
246 ip_vs_use_count_dec(void)
247 {
248         module_put(THIS_MODULE);
249 }
250
251
/*
 *      Hash table: for virtual service lookups
 *
 *      Both tables below have IP_VS_SVC_TAB_SIZE (2^IP_VS_SVC_TAB_BITS)
 *      buckets; IP_VS_SVC_TAB_MASK reduces a hash value to a bucket index.
 */
#define IP_VS_SVC_TAB_BITS 8
#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS)
#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1)

/* the service table hashed by <protocol, addr, port>; linked via s_list */
static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
/* the service table hashed by fwmark; linked via f_list */
static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
263
264
265 /*
266  *      Returns hash value for virtual service
267  */
268 static inline unsigned int
269 ip_vs_svc_hashkey(struct net *net, int af, unsigned int proto,
270                   const union nf_inet_addr *addr, __be16 port)
271 {
272         register unsigned int porth = ntohs(port);
273         __be32 addr_fold = addr->ip;
274         __u32 ahash;
275
276 #ifdef CONFIG_IP_VS_IPV6
277         if (af == AF_INET6)
278                 addr_fold = addr->ip6[0]^addr->ip6[1]^
279                             addr->ip6[2]^addr->ip6[3];
280 #endif
281         ahash = ntohl(addr_fold);
282         ahash ^= ((size_t) net >> 8);
283
284         return (proto ^ ahash ^ (porth >> IP_VS_SVC_TAB_BITS) ^ porth) &
285                IP_VS_SVC_TAB_MASK;
286 }
287
288 /*
289  *      Returns hash value of fwmark for virtual service lookup
290  */
291 static inline unsigned int ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
292 {
293         return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
294 }
295
296 /*
297  *      Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
298  *      or in the ip_vs_svc_fwm_table by fwmark.
299  *      Should be called with locked tables.
300  */
301 static int ip_vs_svc_hash(struct ip_vs_service *svc)
302 {
303         unsigned int hash;
304
305         if (svc->flags & IP_VS_SVC_F_HASHED) {
306                 pr_err("%s(): request for already hashed, called from %pF\n",
307                        __func__, __builtin_return_address(0));
308                 return 0;
309         }
310
311         if (svc->fwmark == 0) {
312                 /*
313                  *  Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
314                  */
315                 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
316                                          &svc->addr, svc->port);
317                 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
318         } else {
319                 /*
320                  *  Hash it by fwmark in svc_fwm_table
321                  */
322                 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
323                 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
324         }
325
326         svc->flags |= IP_VS_SVC_F_HASHED;
327         /* increase its refcnt because it is referenced by the svc table */
328         atomic_inc(&svc->refcnt);
329         return 1;
330 }
331
332
333 /*
334  *      Unhashes a service from svc_table / svc_fwm_table.
335  *      Should be called with locked tables.
336  */
337 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
338 {
339         if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
340                 pr_err("%s(): request for unhash flagged, called from %pF\n",
341                        __func__, __builtin_return_address(0));
342                 return 0;
343         }
344
345         if (svc->fwmark == 0) {
346                 /* Remove it from the svc_table table */
347                 list_del(&svc->s_list);
348         } else {
349                 /* Remove it from the svc_fwm_table table */
350                 list_del(&svc->f_list);
351         }
352
353         svc->flags &= ~IP_VS_SVC_F_HASHED;
354         atomic_dec(&svc->refcnt);
355         return 1;
356 }
357
358
359 /*
360  *      Get service by {netns, proto,addr,port} in the service table.
361  */
362 static inline struct ip_vs_service *
363 __ip_vs_service_find(struct net *net, int af, __u16 protocol,
364                      const union nf_inet_addr *vaddr, __be16 vport)
365 {
366         unsigned int hash;
367         struct ip_vs_service *svc;
368
369         /* Check for "full" addressed entries */
370         hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
371
372         list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
373                 if ((svc->af == af)
374                     && ip_vs_addr_equal(af, &svc->addr, vaddr)
375                     && (svc->port == vport)
376                     && (svc->protocol == protocol)
377                     && net_eq(svc->net, net)) {
378                         /* HIT */
379                         return svc;
380                 }
381         }
382
383         return NULL;
384 }
385
386
387 /*
388  *      Get service by {fwmark} in the service table.
389  */
390 static inline struct ip_vs_service *
391 __ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
392 {
393         unsigned int hash;
394         struct ip_vs_service *svc;
395
396         /* Check for fwmark addressed entries */
397         hash = ip_vs_svc_fwm_hashkey(net, fwmark);
398
399         list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
400                 if (svc->fwmark == fwmark && svc->af == af
401                     && net_eq(svc->net, net)) {
402                         /* HIT */
403                         return svc;
404                 }
405         }
406
407         return NULL;
408 }
409
410 struct ip_vs_service *
411 ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
412                   const union nf_inet_addr *vaddr, __be16 vport)
413 {
414         struct ip_vs_service *svc;
415         struct netns_ipvs *ipvs = net_ipvs(net);
416
417         read_lock(&__ip_vs_svc_lock);
418
419         /*
420          *      Check the table hashed by fwmark first
421          */
422         if (fwmark) {
423                 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
424                 if (svc)
425                         goto out;
426         }
427
428         /*
429          *      Check the table hashed by <protocol,addr,port>
430          *      for "full" addressed entries
431          */
432         svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
433
434         if (svc == NULL
435             && protocol == IPPROTO_TCP
436             && atomic_read(&ipvs->ftpsvc_counter)
437             && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
438                 /*
439                  * Check if ftp service entry exists, the packet
440                  * might belong to FTP data connections.
441                  */
442                 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
443         }
444
445         if (svc == NULL
446             && atomic_read(&ipvs->nullsvc_counter)) {
447                 /*
448                  * Check if the catch-all port (port zero) exists
449                  */
450                 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
451         }
452
453   out:
454         if (svc)
455                 atomic_inc(&svc->usecnt);
456         read_unlock(&__ip_vs_svc_lock);
457
458         IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
459                       fwmark, ip_vs_proto_name(protocol),
460                       IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
461                       svc ? "hit" : "not hit");
462
463         return svc;
464 }
465
466
467 static inline void
468 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
469 {
470         atomic_inc(&svc->refcnt);
471         dest->svc = svc;
472 }
473
474 static void
475 __ip_vs_unbind_svc(struct ip_vs_dest *dest)
476 {
477         struct ip_vs_service *svc = dest->svc;
478
479         dest->svc = NULL;
480         if (atomic_dec_and_test(&svc->refcnt)) {
481                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
482                               svc->fwmark,
483                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
484                               ntohs(svc->port), atomic_read(&svc->usecnt));
485                 free_percpu(svc->stats.cpustats);
486                 kfree(svc);
487         }
488 }
489
490
491 /*
492  *      Returns hash value for real service
493  */
494 static inline unsigned int ip_vs_rs_hashkey(int af,
495                                             const union nf_inet_addr *addr,
496                                             __be16 port)
497 {
498         register unsigned int porth = ntohs(port);
499         __be32 addr_fold = addr->ip;
500
501 #ifdef CONFIG_IP_VS_IPV6
502         if (af == AF_INET6)
503                 addr_fold = addr->ip6[0]^addr->ip6[1]^
504                             addr->ip6[2]^addr->ip6[3];
505 #endif
506
507         return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
508                 & IP_VS_RTAB_MASK;
509 }
510
511 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
512 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
513 {
514         unsigned int hash;
515
516         if (dest->in_rs_table)
517                 return;
518
519         /*
520          *      Hash by proto,addr,port,
521          *      which are the parameters of the real service.
522          */
523         hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
524
525         hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
526         dest->in_rs_table = 1;
527 }
528
529 /* Unhash ip_vs_dest from rs_table. */
530 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
531 {
532         /*
533          * Remove it from the rs_table table.
534          */
535         if (dest->in_rs_table) {
536                 hlist_del_rcu(&dest->d_list);
537                 dest->in_rs_table = 0;
538         }
539 }
540
541 /* Check if real service by <proto,addr,port> is present */
542 bool ip_vs_has_real_service(struct net *net, int af, __u16 protocol,
543                             const union nf_inet_addr *daddr, __be16 dport)
544 {
545         struct netns_ipvs *ipvs = net_ipvs(net);
546         unsigned int hash;
547         struct ip_vs_dest *dest;
548
549         /* Check for "full" addressed entries */
550         hash = ip_vs_rs_hashkey(af, daddr, dport);
551
552         rcu_read_lock();
553         hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
554                 if (dest->port == dport &&
555                     dest->af == af &&
556                     ip_vs_addr_equal(af, &dest->addr, daddr) &&
557                     (dest->protocol == protocol || dest->vfwmark)) {
558                         /* HIT */
559                         rcu_read_unlock();
560                         return true;
561                 }
562         }
563         rcu_read_unlock();
564
565         return false;
566 }
567
568 /*
569  *      Lookup destination by {addr,port} in the given service
570  */
571 static struct ip_vs_dest *
572 ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
573                   __be16 dport)
574 {
575         struct ip_vs_dest *dest;
576
577         /*
578          * Find the destination for the given service
579          */
580         list_for_each_entry(dest, &svc->destinations, n_list) {
581                 if ((dest->af == svc->af)
582                     && ip_vs_addr_equal(svc->af, &dest->addr, daddr)
583                     && (dest->port == dport)) {
584                         /* HIT */
585                         return dest;
586                 }
587         }
588
589         return NULL;
590 }
591
592 /*
593  * Find destination by {daddr,dport,vaddr,protocol}
594  * Cretaed to be used in ip_vs_process_message() in
595  * the backup synchronization daemon. It finds the
596  * destination to be bound to the received connection
597  * on the backup.
598  */
599 struct ip_vs_dest *ip_vs_find_dest(struct net  *net, int af,
600                                    const union nf_inet_addr *daddr,
601                                    __be16 dport,
602                                    const union nf_inet_addr *vaddr,
603                                    __be16 vport, __u16 protocol, __u32 fwmark,
604                                    __u32 flags)
605 {
606         struct ip_vs_dest *dest;
607         struct ip_vs_service *svc;
608         __be16 port = dport;
609
610         svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
611         if (!svc)
612                 return NULL;
613         if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
614                 port = 0;
615         dest = ip_vs_lookup_dest(svc, daddr, port);
616         if (!dest)
617                 dest = ip_vs_lookup_dest(svc, daddr, port ^ dport);
618         if (dest)
619                 ip_vs_dest_hold(dest);
620         ip_vs_service_put(svc);
621         return dest;
622 }
623
624 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
625 {
626         struct ip_vs_dest_dst *dest_dst = container_of(head,
627                                                        struct ip_vs_dest_dst,
628                                                        rcu_head);
629
630         dst_release(dest_dst->dst_cache);
631         kfree(dest_dst);
632 }
633
634 /* Release dest_dst and dst_cache for dest in user context */
635 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
636 {
637         struct ip_vs_dest_dst *old;
638
639         old = rcu_dereference_protected(dest->dest_dst, 1);
640         if (old) {
641                 RCU_INIT_POINTER(dest->dest_dst, NULL);
642                 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
643         }
644 }
645
646 /*
647  *  Lookup dest by {svc,addr,port} in the destination trash.
648  *  The destination trash is used to hold the destinations that are removed
649  *  from the service table but are still referenced by some conn entries.
650  *  The reason to add the destination trash is when the dest is temporary
651  *  down (either by administrator or by monitor program), the dest can be
652  *  picked back from the trash, the remaining connections to the dest can
653  *  continue, and the counting information of the dest is also useful for
654  *  scheduling.
655  */
656 static struct ip_vs_dest *
657 ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
658                      __be16 dport)
659 {
660         struct ip_vs_dest *dest;
661         struct netns_ipvs *ipvs = net_ipvs(svc->net);
662
663         /*
664          * Find the destination in trash
665          */
666         spin_lock_bh(&ipvs->dest_trash_lock);
667         list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
668                 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
669                               "dest->refcnt=%d\n",
670                               dest->vfwmark,
671                               IP_VS_DBG_ADDR(svc->af, &dest->addr),
672                               ntohs(dest->port),
673                               atomic_read(&dest->refcnt));
674                 /* We can not reuse dest while in grace period
675                  * because conns still can use dest->svc
676                  */
677                 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
678                         continue;
679                 if (dest->af == svc->af &&
680                     ip_vs_addr_equal(svc->af, &dest->addr, daddr) &&
681                     dest->port == dport &&
682                     dest->vfwmark == svc->fwmark &&
683                     dest->protocol == svc->protocol &&
684                     (svc->fwmark ||
685                      (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
686                       dest->vport == svc->port))) {
687                         /* HIT */
688                         list_del(&dest->t_list);
689                         ip_vs_dest_hold(dest);
690                         goto out;
691                 }
692         }
693
694         dest = NULL;
695
696 out:
697         spin_unlock_bh(&ipvs->dest_trash_lock);
698
699         return dest;
700 }
701
702 static void ip_vs_dest_free(struct ip_vs_dest *dest)
703 {
704         __ip_vs_dst_cache_reset(dest);
705         __ip_vs_unbind_svc(dest);
706         free_percpu(dest->stats.cpustats);
707         kfree(dest);
708 }
709
710 /*
711  *  Clean up all the destinations in the trash
712  *  Called by the ip_vs_control_cleanup()
713  *
714  *  When the ip_vs_control_clearup is activated by ipvs module exit,
715  *  the service tables must have been flushed and all the connections
716  *  are expired, and the refcnt of each destination in the trash must
717  *  be 0, so we simply release them here.
718  */
719 static void ip_vs_trash_cleanup(struct net *net)
720 {
721         struct ip_vs_dest *dest, *nxt;
722         struct netns_ipvs *ipvs = net_ipvs(net);
723
724         del_timer_sync(&ipvs->dest_trash_timer);
725         /* No need to use dest_trash_lock */
726         list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
727                 list_del(&dest->t_list);
728                 ip_vs_dest_free(dest);
729         }
730 }
731
732 static void
733 ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src)
734 {
735 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->ustats.c - src->ustats0.c
736
737         spin_lock_bh(&src->lock);
738
739         IP_VS_SHOW_STATS_COUNTER(conns);
740         IP_VS_SHOW_STATS_COUNTER(inpkts);
741         IP_VS_SHOW_STATS_COUNTER(outpkts);
742         IP_VS_SHOW_STATS_COUNTER(inbytes);
743         IP_VS_SHOW_STATS_COUNTER(outbytes);
744
745         ip_vs_read_estimator(dst, src);
746
747         spin_unlock_bh(&src->lock);
748 }
749
750 static void
751 ip_vs_zero_stats(struct ip_vs_stats *stats)
752 {
753         spin_lock_bh(&stats->lock);
754
755         /* get current counters as zero point, rates are zeroed */
756
757 #define IP_VS_ZERO_STATS_COUNTER(c) stats->ustats0.c = stats->ustats.c
758
759         IP_VS_ZERO_STATS_COUNTER(conns);
760         IP_VS_ZERO_STATS_COUNTER(inpkts);
761         IP_VS_ZERO_STATS_COUNTER(outpkts);
762         IP_VS_ZERO_STATS_COUNTER(inbytes);
763         IP_VS_ZERO_STATS_COUNTER(outbytes);
764
765         ip_vs_zero_estimator(stats);
766
767         spin_unlock_bh(&stats->lock);
768 }
769
770 /*
771  *      Update a destination in the given service
772  */
773 static void
774 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
775                     struct ip_vs_dest_user_kern *udest, int add)
776 {
777         struct netns_ipvs *ipvs = net_ipvs(svc->net);
778         int conn_flags;
779
780         /* set the weight and the flags */
781         atomic_set(&dest->weight, udest->weight);
782         conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
783         conn_flags |= IP_VS_CONN_F_INACTIVE;
784
785         /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
786         if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
787                 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
788         } else {
789                 /*
790                  *    Put the real service in rs_table if not present.
791                  *    For now only for NAT!
792                  */
793                 ip_vs_rs_hash(ipvs, dest);
794         }
795         atomic_set(&dest->conn_flags, conn_flags);
796
797         /* bind the service */
798         if (!dest->svc) {
799                 __ip_vs_bind_svc(dest, svc);
800         } else {
801                 if (dest->svc != svc) {
802                         __ip_vs_unbind_svc(dest);
803                         ip_vs_zero_stats(&dest->stats);
804                         __ip_vs_bind_svc(dest, svc);
805                 }
806         }
807
808         /* set the dest status flags */
809         dest->flags |= IP_VS_DEST_F_AVAILABLE;
810
811         if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
812                 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
813         dest->u_threshold = udest->u_threshold;
814         dest->l_threshold = udest->l_threshold;
815
816         spin_lock_bh(&dest->dst_lock);
817         __ip_vs_dst_cache_reset(dest);
818         spin_unlock_bh(&dest->dst_lock);
819
820         if (add)
821                 ip_vs_start_estimator(svc->net, &dest->stats);
822
823         write_lock_bh(&__ip_vs_svc_lock);
824
825         /* Wait until all other svc users go away */
826         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
827
828         if (add) {
829                 list_add(&dest->n_list, &svc->destinations);
830                 svc->num_dests++;
831                 if (svc->scheduler->add_dest)
832                         svc->scheduler->add_dest(svc, dest);
833         } else {
834                 if (svc->scheduler->upd_dest)
835                         svc->scheduler->upd_dest(svc, dest);
836         }
837
838         /* call the update_service, because server weight may be changed */
839         if (svc->scheduler->update_service)
840                 svc->scheduler->update_service(svc);
841
842         write_unlock_bh(&__ip_vs_svc_lock);
843 }
844
845
846 /*
847  *      Create a destination for the given service
848  */
849 static int
850 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
851                struct ip_vs_dest **dest_p)
852 {
853         struct ip_vs_dest *dest;
854         unsigned int atype;
855
856         EnterFunction(2);
857
858 #ifdef CONFIG_IP_VS_IPV6
859         if (svc->af == AF_INET6) {
860                 atype = ipv6_addr_type(&udest->addr.in6);
861                 if ((!(atype & IPV6_ADDR_UNICAST) ||
862                         atype & IPV6_ADDR_LINKLOCAL) &&
863                         !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
864                         return -EINVAL;
865         } else
866 #endif
867         {
868                 atype = inet_addr_type(svc->net, udest->addr.ip);
869                 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
870                         return -EINVAL;
871         }
872
873         dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL);
874         if (dest == NULL)
875                 return -ENOMEM;
876
877         dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
878         if (!dest->stats.cpustats)
879                 goto err_alloc;
880
881         dest->af = svc->af;
882         dest->protocol = svc->protocol;
883         dest->vaddr = svc->addr;
884         dest->vport = svc->port;
885         dest->vfwmark = svc->fwmark;
886         ip_vs_addr_copy(svc->af, &dest->addr, &udest->addr);
887         dest->port = udest->port;
888
889         atomic_set(&dest->activeconns, 0);
890         atomic_set(&dest->inactconns, 0);
891         atomic_set(&dest->persistconns, 0);
892         atomic_set(&dest->refcnt, 1);
893
894         INIT_HLIST_NODE(&dest->d_list);
895         spin_lock_init(&dest->dst_lock);
896         spin_lock_init(&dest->stats.lock);
897         __ip_vs_update_dest(svc, dest, udest, 1);
898
899         *dest_p = dest;
900
901         LeaveFunction(2);
902         return 0;
903
904 err_alloc:
905         kfree(dest);
906         return -ENOMEM;
907 }
908
909
910 /*
911  *      Add a destination into an existing service
912  */
913 static int
914 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
915 {
916         struct ip_vs_dest *dest;
917         union nf_inet_addr daddr;
918         __be16 dport = udest->port;
919         int ret;
920
921         EnterFunction(2);
922
923         if (udest->weight < 0) {
924                 pr_err("%s(): server weight less than zero\n", __func__);
925                 return -ERANGE;
926         }
927
928         if (udest->l_threshold > udest->u_threshold) {
929                 pr_err("%s(): lower threshold is higher than upper threshold\n",
930                         __func__);
931                 return -ERANGE;
932         }
933
934         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
935
936         /*
937          * Check if the dest already exists in the list
938          */
939         dest = ip_vs_lookup_dest(svc, &daddr, dport);
940
941         if (dest != NULL) {
942                 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
943                 return -EEXIST;
944         }
945
946         /*
947          * Check if the dest already exists in the trash and
948          * is from the same service
949          */
950         dest = ip_vs_trash_get_dest(svc, &daddr, dport);
951
952         if (dest != NULL) {
953                 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
954                               "dest->refcnt=%d, service %u/%s:%u\n",
955                               IP_VS_DBG_ADDR(svc->af, &daddr), ntohs(dport),
956                               atomic_read(&dest->refcnt),
957                               dest->vfwmark,
958                               IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
959                               ntohs(dest->vport));
960
961                 __ip_vs_update_dest(svc, dest, udest, 1);
962                 ret = 0;
963         } else {
964                 /*
965                  * Allocate and initialize the dest structure
966                  */
967                 ret = ip_vs_new_dest(svc, udest, &dest);
968         }
969         LeaveFunction(2);
970
971         return ret;
972 }
973
974
975 /*
976  *      Edit a destination in the given service
977  */
978 static int
979 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
980 {
981         struct ip_vs_dest *dest;
982         union nf_inet_addr daddr;
983         __be16 dport = udest->port;
984
985         EnterFunction(2);
986
987         if (udest->weight < 0) {
988                 pr_err("%s(): server weight less than zero\n", __func__);
989                 return -ERANGE;
990         }
991
992         if (udest->l_threshold > udest->u_threshold) {
993                 pr_err("%s(): lower threshold is higher than upper threshold\n",
994                         __func__);
995                 return -ERANGE;
996         }
997
998         ip_vs_addr_copy(svc->af, &daddr, &udest->addr);
999
1000         /*
1001          *  Lookup the destination list
1002          */
1003         dest = ip_vs_lookup_dest(svc, &daddr, dport);
1004
1005         if (dest == NULL) {
1006                 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1007                 return -ENOENT;
1008         }
1009
1010         __ip_vs_update_dest(svc, dest, udest, 0);
1011         LeaveFunction(2);
1012
1013         return 0;
1014 }
1015
1016 static void ip_vs_dest_wait_readers(struct rcu_head *head)
1017 {
1018         struct ip_vs_dest *dest = container_of(head, struct ip_vs_dest,
1019                                                rcu_head);
1020
1021         /* End of grace period after unlinking */
1022         clear_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1023 }
1024
1025
1026 /*
1027  *      Delete a destination (must be already unlinked from the service)
1028  */
1029 static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest,
1030                              bool cleanup)
1031 {
1032         struct netns_ipvs *ipvs = net_ipvs(net);
1033
1034         ip_vs_stop_estimator(net, &dest->stats);
1035
1036         /*
1037          *  Remove it from the d-linked list with the real services.
1038          */
1039         ip_vs_rs_unhash(dest);
1040
1041         if (!cleanup) {
1042                 set_bit(IP_VS_DEST_STATE_REMOVING, &dest->state);
1043                 call_rcu(&dest->rcu_head, ip_vs_dest_wait_readers);
1044         }
1045
1046         spin_lock_bh(&ipvs->dest_trash_lock);
1047         IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1048                       IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1049                       atomic_read(&dest->refcnt));
1050         if (list_empty(&ipvs->dest_trash) && !cleanup)
1051                 mod_timer(&ipvs->dest_trash_timer,
1052                           jiffies + IP_VS_DEST_TRASH_PERIOD);
1053         /* dest lives in trash without reference */
1054         list_add(&dest->t_list, &ipvs->dest_trash);
1055         spin_unlock_bh(&ipvs->dest_trash_lock);
1056         ip_vs_dest_put(dest);
1057 }
1058
1059
1060 /*
1061  *      Unlink a destination from the given service
1062  */
1063 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1064                                 struct ip_vs_dest *dest,
1065                                 int svcupd)
1066 {
1067         dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1068
1069         /*
1070          *  Remove it from the d-linked destination list.
1071          */
1072         list_del(&dest->n_list);
1073         svc->num_dests--;
1074
1075         if (svcupd && svc->scheduler->del_dest)
1076                 svc->scheduler->del_dest(svc, dest);
1077
1078         /*
1079          *  Call the update_service function of its scheduler
1080          */
1081         if (svcupd && svc->scheduler->update_service)
1082                         svc->scheduler->update_service(svc);
1083 }
1084
1085
1086 /*
1087  *      Delete a destination server in the given service
1088  */
1089 static int
1090 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1091 {
1092         struct ip_vs_dest *dest;
1093         __be16 dport = udest->port;
1094
1095         EnterFunction(2);
1096
1097         dest = ip_vs_lookup_dest(svc, &udest->addr, dport);
1098
1099         if (dest == NULL) {
1100                 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1101                 return -ENOENT;
1102         }
1103
1104         write_lock_bh(&__ip_vs_svc_lock);
1105
1106         /*
1107          *      Wait until all other svc users go away.
1108          */
1109         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1110
1111         /*
1112          *      Unlink dest from the service
1113          */
1114         __ip_vs_unlink_dest(svc, dest, 1);
1115
1116         write_unlock_bh(&__ip_vs_svc_lock);
1117
1118         /*
1119          *      Delete the destination
1120          */
1121         __ip_vs_del_dest(svc->net, dest, false);
1122
1123         LeaveFunction(2);
1124
1125         return 0;
1126 }
1127
1128 static void ip_vs_dest_trash_expire(unsigned long data)
1129 {
1130         struct net *net = (struct net *) data;
1131         struct netns_ipvs *ipvs = net_ipvs(net);
1132         struct ip_vs_dest *dest, *next;
1133
1134         spin_lock(&ipvs->dest_trash_lock);
1135         list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1136                 /* Skip if dest is in grace period */
1137                 if (test_bit(IP_VS_DEST_STATE_REMOVING, &dest->state))
1138                         continue;
1139                 if (atomic_read(&dest->refcnt) > 0)
1140                         continue;
1141                 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1142                               dest->vfwmark,
1143                               IP_VS_DBG_ADDR(dest->svc->af, &dest->addr),
1144                               ntohs(dest->port));
1145                 list_del(&dest->t_list);
1146                 ip_vs_dest_free(dest);
1147         }
1148         if (!list_empty(&ipvs->dest_trash))
1149                 mod_timer(&ipvs->dest_trash_timer,
1150                           jiffies + IP_VS_DEST_TRASH_PERIOD);
1151         spin_unlock(&ipvs->dest_trash_lock);
1152 }
1153
1154 /*
1155  *      Add a service into the service hash table
1156  */
1157 static int
1158 ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1159                   struct ip_vs_service **svc_p)
1160 {
1161         int ret = 0;
1162         struct ip_vs_scheduler *sched = NULL;
1163         struct ip_vs_pe *pe = NULL;
1164         struct ip_vs_service *svc = NULL;
1165         struct netns_ipvs *ipvs = net_ipvs(net);
1166
1167         /* increase the module use count */
1168         ip_vs_use_count_inc();
1169
1170         /* Lookup the scheduler by 'u->sched_name' */
1171         sched = ip_vs_scheduler_get(u->sched_name);
1172         if (sched == NULL) {
1173                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1174                 ret = -ENOENT;
1175                 goto out_err;
1176         }
1177
1178         if (u->pe_name && *u->pe_name) {
1179                 pe = ip_vs_pe_getbyname(u->pe_name);
1180                 if (pe == NULL) {
1181                         pr_info("persistence engine module ip_vs_pe_%s "
1182                                 "not found\n", u->pe_name);
1183                         ret = -ENOENT;
1184                         goto out_err;
1185                 }
1186         }
1187
1188 #ifdef CONFIG_IP_VS_IPV6
1189         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1190                 ret = -EINVAL;
1191                 goto out_err;
1192         }
1193 #endif
1194
1195         svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL);
1196         if (svc == NULL) {
1197                 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1198                 ret = -ENOMEM;
1199                 goto out_err;
1200         }
1201         svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1202         if (!svc->stats.cpustats) {
1203                 ret = -ENOMEM;
1204                 goto out_err;
1205         }
1206
1207         /* I'm the first user of the service */
1208         atomic_set(&svc->usecnt, 0);
1209         atomic_set(&svc->refcnt, 0);
1210
1211         svc->af = u->af;
1212         svc->protocol = u->protocol;
1213         ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1214         svc->port = u->port;
1215         svc->fwmark = u->fwmark;
1216         svc->flags = u->flags;
1217         svc->timeout = u->timeout * HZ;
1218         svc->netmask = u->netmask;
1219         svc->net = net;
1220
1221         INIT_LIST_HEAD(&svc->destinations);
1222         rwlock_init(&svc->sched_lock);
1223         spin_lock_init(&svc->stats.lock);
1224
1225         /* Bind the scheduler */
1226         ret = ip_vs_bind_scheduler(svc, sched);
1227         if (ret)
1228                 goto out_err;
1229         sched = NULL;
1230
1231         /* Bind the ct retriever */
1232         ip_vs_bind_pe(svc, pe);
1233         pe = NULL;
1234
1235         /* Update the virtual service counters */
1236         if (svc->port == FTPPORT)
1237                 atomic_inc(&ipvs->ftpsvc_counter);
1238         else if (svc->port == 0)
1239                 atomic_inc(&ipvs->nullsvc_counter);
1240
1241         ip_vs_start_estimator(net, &svc->stats);
1242
1243         /* Count only IPv4 services for old get/setsockopt interface */
1244         if (svc->af == AF_INET)
1245                 ipvs->num_services++;
1246
1247         /* Hash the service into the service table */
1248         write_lock_bh(&__ip_vs_svc_lock);
1249         ip_vs_svc_hash(svc);
1250         write_unlock_bh(&__ip_vs_svc_lock);
1251
1252         *svc_p = svc;
1253         /* Now there is a service - full throttle */
1254         ipvs->enable = 1;
1255         return 0;
1256
1257
1258  out_err:
1259         if (svc != NULL) {
1260                 ip_vs_unbind_scheduler(svc);
1261                 if (svc->inc) {
1262                         local_bh_disable();
1263                         ip_vs_app_inc_put(svc->inc);
1264                         local_bh_enable();
1265                 }
1266                 if (svc->stats.cpustats)
1267                         free_percpu(svc->stats.cpustats);
1268                 kfree(svc);
1269         }
1270         ip_vs_scheduler_put(sched);
1271         ip_vs_pe_put(pe);
1272
1273         /* decrease the module use count */
1274         ip_vs_use_count_dec();
1275
1276         return ret;
1277 }
1278
1279
1280 /*
1281  *      Edit a service and bind it with a new scheduler
1282  */
1283 static int
1284 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1285 {
1286         struct ip_vs_scheduler *sched, *old_sched;
1287         struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1288         int ret = 0;
1289
1290         /*
1291          * Lookup the scheduler, by 'u->sched_name'
1292          */
1293         sched = ip_vs_scheduler_get(u->sched_name);
1294         if (sched == NULL) {
1295                 pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name);
1296                 return -ENOENT;
1297         }
1298         old_sched = sched;
1299
1300         if (u->pe_name && *u->pe_name) {
1301                 pe = ip_vs_pe_getbyname(u->pe_name);
1302                 if (pe == NULL) {
1303                         pr_info("persistence engine module ip_vs_pe_%s "
1304                                 "not found\n", u->pe_name);
1305                         ret = -ENOENT;
1306                         goto out;
1307                 }
1308                 old_pe = pe;
1309         }
1310
1311 #ifdef CONFIG_IP_VS_IPV6
1312         if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) {
1313                 ret = -EINVAL;
1314                 goto out;
1315         }
1316 #endif
1317
1318         write_lock_bh(&__ip_vs_svc_lock);
1319
1320         /*
1321          * Wait until all other svc users go away.
1322          */
1323         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1324
1325         /*
1326          * Set the flags and timeout value
1327          */
1328         svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1329         svc->timeout = u->timeout * HZ;
1330         svc->netmask = u->netmask;
1331
1332         old_sched = svc->scheduler;
1333         if (sched != old_sched) {
1334                 /*
1335                  * Unbind the old scheduler
1336                  */
1337                 if ((ret = ip_vs_unbind_scheduler(svc))) {
1338                         old_sched = sched;
1339                         goto out_unlock;
1340                 }
1341
1342                 /*
1343                  * Bind the new scheduler
1344                  */
1345                 if ((ret = ip_vs_bind_scheduler(svc, sched))) {
1346                         /*
1347                          * If ip_vs_bind_scheduler fails, restore the old
1348                          * scheduler.
1349                          * The main reason of failure is out of memory.
1350                          *
1351                          * The question is if the old scheduler can be
1352                          * restored all the time. TODO: if it cannot be
1353                          * restored some time, we must delete the service,
1354                          * otherwise the system may crash.
1355                          */
1356                         ip_vs_bind_scheduler(svc, old_sched);
1357                         old_sched = sched;
1358                         goto out_unlock;
1359                 }
1360         }
1361
1362         old_pe = svc->pe;
1363         if (pe != old_pe) {
1364                 ip_vs_unbind_pe(svc);
1365                 ip_vs_bind_pe(svc, pe);
1366         }
1367
1368 out_unlock:
1369         write_unlock_bh(&__ip_vs_svc_lock);
1370 out:
1371         ip_vs_scheduler_put(old_sched);
1372         ip_vs_pe_put(old_pe);
1373         return ret;
1374 }
1375
1376
1377 /*
1378  *      Delete a service from the service list
1379  *      - The service must be unlinked, unlocked and not referenced!
1380  *      - We are called under _bh lock
1381  */
1382 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1383 {
1384         struct ip_vs_dest *dest, *nxt;
1385         struct ip_vs_scheduler *old_sched;
1386         struct ip_vs_pe *old_pe;
1387         struct netns_ipvs *ipvs = net_ipvs(svc->net);
1388
1389         pr_info("%s: enter\n", __func__);
1390
1391         /* Count only IPv4 services for old get/setsockopt interface */
1392         if (svc->af == AF_INET)
1393                 ipvs->num_services--;
1394
1395         ip_vs_stop_estimator(svc->net, &svc->stats);
1396
1397         /* Unbind scheduler */
1398         old_sched = svc->scheduler;
1399         ip_vs_unbind_scheduler(svc);
1400         ip_vs_scheduler_put(old_sched);
1401
1402         /* Unbind persistence engine */
1403         old_pe = svc->pe;
1404         ip_vs_unbind_pe(svc);
1405         ip_vs_pe_put(old_pe);
1406
1407         /* Unbind app inc */
1408         if (svc->inc) {
1409                 ip_vs_app_inc_put(svc->inc);
1410                 svc->inc = NULL;
1411         }
1412
1413         /*
1414          *    Unlink the whole destination list
1415          */
1416         list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1417                 __ip_vs_unlink_dest(svc, dest, 0);
1418                 __ip_vs_del_dest(svc->net, dest, cleanup);
1419         }
1420
1421         /*
1422          *    Update the virtual service counters
1423          */
1424         if (svc->port == FTPPORT)
1425                 atomic_dec(&ipvs->ftpsvc_counter);
1426         else if (svc->port == 0)
1427                 atomic_dec(&ipvs->nullsvc_counter);
1428
1429         /*
1430          *    Free the service if nobody refers to it
1431          */
1432         if (atomic_read(&svc->refcnt) == 0) {
1433                 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n",
1434                               svc->fwmark,
1435                               IP_VS_DBG_ADDR(svc->af, &svc->addr),
1436                               ntohs(svc->port), atomic_read(&svc->usecnt));
1437                 free_percpu(svc->stats.cpustats);
1438                 kfree(svc);
1439         }
1440
1441         /* decrease the module use count */
1442         ip_vs_use_count_dec();
1443 }
1444
1445 /*
1446  * Unlink a service from list and try to delete it if its refcnt reached 0
1447  */
1448 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
1449 {
1450         /*
1451          * Unhash it from the service table
1452          */
1453         write_lock_bh(&__ip_vs_svc_lock);
1454
1455         ip_vs_svc_unhash(svc);
1456
1457         /*
1458          * Wait until all the svc users go away.
1459          */
1460         IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0);
1461
1462         __ip_vs_del_service(svc, cleanup);
1463
1464         write_unlock_bh(&__ip_vs_svc_lock);
1465 }
1466
1467 /*
1468  *      Delete a service from the service list
1469  */
1470 static int ip_vs_del_service(struct ip_vs_service *svc)
1471 {
1472         if (svc == NULL)
1473                 return -EEXIST;
1474         ip_vs_unlink_service(svc, false);
1475
1476         return 0;
1477 }
1478
1479
1480 /*
1481  *      Flush all the virtual services
1482  */
1483 static int ip_vs_flush(struct net *net, bool cleanup)
1484 {
1485         int idx;
1486         struct ip_vs_service *svc, *nxt;
1487
1488         /*
1489          * Flush the service table hashed by <netns,protocol,addr,port>
1490          */
1491         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1492                 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1493                                          s_list) {
1494                         if (net_eq(svc->net, net))
1495                                 ip_vs_unlink_service(svc, cleanup);
1496                 }
1497         }
1498
1499         /*
1500          * Flush the service table hashed by fwmark
1501          */
1502         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1503                 list_for_each_entry_safe(svc, nxt,
1504                                          &ip_vs_svc_fwm_table[idx], f_list) {
1505                         if (net_eq(svc->net, net))
1506                                 ip_vs_unlink_service(svc, cleanup);
1507                 }
1508         }
1509
1510         return 0;
1511 }
1512
1513 /*
1514  *      Delete service by {netns} in the service table.
1515  *      Called by __ip_vs_cleanup()
1516  */
1517 void ip_vs_service_net_cleanup(struct net *net)
1518 {
1519         EnterFunction(2);
1520         /* Check for "full" addressed entries */
1521         mutex_lock(&__ip_vs_mutex);
1522         ip_vs_flush(net, true);
1523         mutex_unlock(&__ip_vs_mutex);
1524         LeaveFunction(2);
1525 }
1526
1527 /* Put all references for device (dst_cache) */
1528 static inline void
1529 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
1530 {
1531         spin_lock_bh(&dest->dst_lock);
1532         if (dest->dest_dst && dest->dest_dst->dst_cache->dev == dev) {
1533                 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
1534                               dev->name,
1535                               IP_VS_DBG_ADDR(dest->af, &dest->addr),
1536                               ntohs(dest->port),
1537                               atomic_read(&dest->refcnt));
1538                 __ip_vs_dst_cache_reset(dest);
1539         }
1540         spin_unlock_bh(&dest->dst_lock);
1541
1542 }
1543 /* Netdev event receiver
1544  * Currently only NETDEV_DOWN is handled to release refs to cached dsts
1545  */
1546 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
1547                             void *ptr)
1548 {
1549         struct net_device *dev = ptr;
1550         struct net *net = dev_net(dev);
1551         struct netns_ipvs *ipvs = net_ipvs(net);
1552         struct ip_vs_service *svc;
1553         struct ip_vs_dest *dest;
1554         unsigned int idx;
1555
1556         if (event != NETDEV_DOWN || !ipvs)
1557                 return NOTIFY_DONE;
1558         IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
1559         EnterFunction(2);
1560         mutex_lock(&__ip_vs_mutex);
1561         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1562                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1563                         if (net_eq(svc->net, net)) {
1564                                 list_for_each_entry(dest, &svc->destinations,
1565                                                     n_list) {
1566                                         ip_vs_forget_dev(dest, dev);
1567                                 }
1568                         }
1569                 }
1570
1571                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1572                         if (net_eq(svc->net, net)) {
1573                                 list_for_each_entry(dest, &svc->destinations,
1574                                                     n_list) {
1575                                         ip_vs_forget_dev(dest, dev);
1576                                 }
1577                         }
1578
1579                 }
1580         }
1581
1582         spin_lock_bh(&ipvs->dest_trash_lock);
1583         list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1584                 ip_vs_forget_dev(dest, dev);
1585         }
1586         spin_unlock_bh(&ipvs->dest_trash_lock);
1587         mutex_unlock(&__ip_vs_mutex);
1588         LeaveFunction(2);
1589         return NOTIFY_DONE;
1590 }
1591
1592 /*
1593  *      Zero counters in a service or all services
1594  */
1595 static int ip_vs_zero_service(struct ip_vs_service *svc)
1596 {
1597         struct ip_vs_dest *dest;
1598
1599         write_lock_bh(&__ip_vs_svc_lock);
1600         list_for_each_entry(dest, &svc->destinations, n_list) {
1601                 ip_vs_zero_stats(&dest->stats);
1602         }
1603         ip_vs_zero_stats(&svc->stats);
1604         write_unlock_bh(&__ip_vs_svc_lock);
1605         return 0;
1606 }
1607
1608 static int ip_vs_zero_all(struct net *net)
1609 {
1610         int idx;
1611         struct ip_vs_service *svc;
1612
1613         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1614                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1615                         if (net_eq(svc->net, net))
1616                                 ip_vs_zero_service(svc);
1617                 }
1618         }
1619
1620         for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1621                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1622                         if (net_eq(svc->net, net))
1623                                 ip_vs_zero_service(svc);
1624                 }
1625         }
1626
1627         ip_vs_zero_stats(&net_ipvs(net)->tot_stats);
1628         return 0;
1629 }
1630
1631 #ifdef CONFIG_SYSCTL
1632
/* Bounds referenced through .extra1/.extra2 by the sync_retries sysctl entry */
static int three = 3;
static int zero;
1635
1636 static int
1637 proc_do_defense_mode(ctl_table *table, int write,
1638                      void __user *buffer, size_t *lenp, loff_t *ppos)
1639 {
1640         struct net *net = current->nsproxy->net_ns;
1641         int *valp = table->data;
1642         int val = *valp;
1643         int rc;
1644
1645         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1646         if (write && (*valp != val)) {
1647                 if ((*valp < 0) || (*valp > 3)) {
1648                         /* Restore the correct value */
1649                         *valp = val;
1650                 } else {
1651                         update_defense_level(net_ipvs(net));
1652                 }
1653         }
1654         return rc;
1655 }
1656
1657 static int
1658 proc_do_sync_threshold(ctl_table *table, int write,
1659                        void __user *buffer, size_t *lenp, loff_t *ppos)
1660 {
1661         int *valp = table->data;
1662         int val[2];
1663         int rc;
1664
1665         /* backup the value first */
1666         memcpy(val, valp, sizeof(val));
1667
1668         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1669         if (write && (valp[0] < 0 || valp[1] < 0 ||
1670             (valp[0] >= valp[1] && valp[1]))) {
1671                 /* Restore the correct value */
1672                 memcpy(valp, val, sizeof(val));
1673         }
1674         return rc;
1675 }
1676
1677 static int
1678 proc_do_sync_mode(ctl_table *table, int write,
1679                      void __user *buffer, size_t *lenp, loff_t *ppos)
1680 {
1681         int *valp = table->data;
1682         int val = *valp;
1683         int rc;
1684
1685         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1686         if (write && (*valp != val)) {
1687                 if ((*valp < 0) || (*valp > 1)) {
1688                         /* Restore the correct value */
1689                         *valp = val;
1690                 }
1691         }
1692         return rc;
1693 }
1694
1695 static int
1696 proc_do_sync_ports(ctl_table *table, int write,
1697                    void __user *buffer, size_t *lenp, loff_t *ppos)
1698 {
1699         int *valp = table->data;
1700         int val = *valp;
1701         int rc;
1702
1703         rc = proc_dointvec(table, write, buffer, lenp, ppos);
1704         if (write && (*valp != val)) {
1705                 if (*valp < 1 || !is_power_of_2(*valp)) {
1706                         /* Restore the correct value */
1707                         *valp = val;
1708                 }
1709         }
1710         return rc;
1711 }
1712
1713 /*
1714  *      IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1715  *      Do not change order or insert new entries without
1716  *      align with netns init in ip_vs_control_net_init()
1717  */
1718
1719 static struct ctl_table vs_vars[] = {
1720         {
1721                 .procname       = "amemthresh",
1722                 .maxlen         = sizeof(int),
1723                 .mode           = 0644,
1724                 .proc_handler   = proc_dointvec,
1725         },
1726         {
1727                 .procname       = "am_droprate",
1728                 .maxlen         = sizeof(int),
1729                 .mode           = 0644,
1730                 .proc_handler   = proc_dointvec,
1731         },
1732         {
1733                 .procname       = "drop_entry",
1734                 .maxlen         = sizeof(int),
1735                 .mode           = 0644,
1736                 .proc_handler   = proc_do_defense_mode,
1737         },
1738         {
1739                 .procname       = "drop_packet",
1740                 .maxlen         = sizeof(int),
1741                 .mode           = 0644,
1742                 .proc_handler   = proc_do_defense_mode,
1743         },
1744 #ifdef CONFIG_IP_VS_NFCT
1745         {
1746                 .procname       = "conntrack",
1747                 .maxlen         = sizeof(int),
1748                 .mode           = 0644,
1749                 .proc_handler   = &proc_dointvec,
1750         },
1751 #endif
1752         {
1753                 .procname       = "secure_tcp",
1754                 .maxlen         = sizeof(int),
1755                 .mode           = 0644,
1756                 .proc_handler   = proc_do_defense_mode,
1757         },
1758         {
1759                 .procname       = "snat_reroute",
1760                 .maxlen         = sizeof(int),
1761                 .mode           = 0644,
1762                 .proc_handler   = &proc_dointvec,
1763         },
1764         {
1765                 .procname       = "sync_version",
1766                 .maxlen         = sizeof(int),
1767                 .mode           = 0644,
1768                 .proc_handler   = &proc_do_sync_mode,
1769         },
1770         {
1771                 .procname       = "sync_ports",
1772                 .maxlen         = sizeof(int),
1773                 .mode           = 0644,
1774                 .proc_handler   = &proc_do_sync_ports,
1775         },
1776         {
1777                 .procname       = "sync_qlen_max",
1778                 .maxlen         = sizeof(int),
1779                 .mode           = 0644,
1780                 .proc_handler   = proc_dointvec,
1781         },
1782         {
1783                 .procname       = "sync_sock_size",
1784                 .maxlen         = sizeof(int),
1785                 .mode           = 0644,
1786                 .proc_handler   = proc_dointvec,
1787         },
1788         {
1789                 .procname       = "cache_bypass",
1790                 .maxlen         = sizeof(int),
1791                 .mode           = 0644,
1792                 .proc_handler   = proc_dointvec,
1793         },
1794         {
1795                 .procname       = "expire_nodest_conn",
1796                 .maxlen         = sizeof(int),
1797                 .mode           = 0644,
1798                 .proc_handler   = proc_dointvec,
1799         },
1800         {
1801                 .procname       = "expire_quiescent_template",
1802                 .maxlen         = sizeof(int),
1803                 .mode           = 0644,
1804                 .proc_handler   = proc_dointvec,
1805         },
1806         {
1807                 .procname       = "sync_threshold",
1808                 .maxlen         =
1809                         sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1810                 .mode           = 0644,
1811                 .proc_handler   = proc_do_sync_threshold,
1812         },
1813         {
1814                 .procname       = "sync_refresh_period",
1815                 .maxlen         = sizeof(int),
1816                 .mode           = 0644,
1817                 .proc_handler   = proc_dointvec_jiffies,
1818         },
1819         {
1820                 .procname       = "sync_retries",
1821                 .maxlen         = sizeof(int),
1822                 .mode           = 0644,
1823                 .proc_handler   = proc_dointvec_minmax,
1824                 .extra1         = &zero,
1825                 .extra2         = &three,
1826         },
1827         {
1828                 .procname       = "nat_icmp_send",
1829                 .maxlen         = sizeof(int),
1830                 .mode           = 0644,
1831                 .proc_handler   = proc_dointvec,
1832         },
1833         {
1834                 .procname       = "pmtu_disc",
1835                 .maxlen         = sizeof(int),
1836                 .mode           = 0644,
1837                 .proc_handler   = proc_dointvec,
1838         },
1839         {
1840                 .procname       = "backup_only",
1841                 .maxlen         = sizeof(int),
1842                 .mode           = 0644,
1843                 .proc_handler   = proc_dointvec,
1844         },
1845 #ifdef CONFIG_IP_VS_DEBUG
1846         {
1847                 .procname       = "debug_level",
1848                 .data           = &sysctl_ip_vs_debug_level,
1849                 .maxlen         = sizeof(int),
1850                 .mode           = 0644,
1851                 .proc_handler   = proc_dointvec,
1852         },
1853 #endif
1854 #if 0
1855         {
1856                 .procname       = "timeout_established",
1857                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED],
1858                 .maxlen         = sizeof(int),
1859                 .mode           = 0644,
1860                 .proc_handler   = proc_dointvec_jiffies,
1861         },
1862         {
1863                 .procname       = "timeout_synsent",
1864                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT],
1865                 .maxlen         = sizeof(int),
1866                 .mode           = 0644,
1867                 .proc_handler   = proc_dointvec_jiffies,
1868         },
1869         {
1870                 .procname       = "timeout_synrecv",
1871                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV],
1872                 .maxlen         = sizeof(int),
1873                 .mode           = 0644,
1874                 .proc_handler   = proc_dointvec_jiffies,
1875         },
1876         {
1877                 .procname       = "timeout_finwait",
1878                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT],
1879                 .maxlen         = sizeof(int),
1880                 .mode           = 0644,
1881                 .proc_handler   = proc_dointvec_jiffies,
1882         },
1883         {
1884                 .procname       = "timeout_timewait",
1885                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT],
1886                 .maxlen         = sizeof(int),
1887                 .mode           = 0644,
1888                 .proc_handler   = proc_dointvec_jiffies,
1889         },
1890         {
1891                 .procname       = "timeout_close",
1892                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE],
1893                 .maxlen         = sizeof(int),
1894                 .mode           = 0644,
1895                 .proc_handler   = proc_dointvec_jiffies,
1896         },
1897         {
1898                 .procname       = "timeout_closewait",
1899                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT],
1900                 .maxlen         = sizeof(int),
1901                 .mode           = 0644,
1902                 .proc_handler   = proc_dointvec_jiffies,
1903         },
1904         {
1905                 .procname       = "timeout_lastack",
1906                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK],
1907                 .maxlen         = sizeof(int),
1908                 .mode           = 0644,
1909                 .proc_handler   = proc_dointvec_jiffies,
1910         },
1911         {
1912                 .procname       = "timeout_listen",
1913                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN],
1914                 .maxlen         = sizeof(int),
1915                 .mode           = 0644,
1916                 .proc_handler   = proc_dointvec_jiffies,
1917         },
1918         {
1919                 .procname       = "timeout_synack",
1920                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK],
1921                 .maxlen         = sizeof(int),
1922                 .mode           = 0644,
1923                 .proc_handler   = proc_dointvec_jiffies,
1924         },
1925         {
1926                 .procname       = "timeout_udp",
1927                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_UDP],
1928                 .maxlen         = sizeof(int),
1929                 .mode           = 0644,
1930                 .proc_handler   = proc_dointvec_jiffies,
1931         },
1932         {
1933                 .procname       = "timeout_icmp",
1934                 .data   = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP],
1935                 .maxlen         = sizeof(int),
1936                 .mode           = 0644,
1937                 .proc_handler   = proc_dointvec_jiffies,
1938         },
1939 #endif
1940         { }
1941 };
1942
1943 #endif
1944
1945 #ifdef CONFIG_PROC_FS
1946
1947 struct ip_vs_iter {
1948         struct seq_net_private p;  /* Do not move this, netns depends upon it*/
1949         struct list_head *table;
1950         int bucket;
1951 };
1952
1953 /*
1954  *      Write the contents of the VS rule table to a PROCfs file.
1955  *      (It is kept just for backward compatibility)
1956  */
1957 static inline const char *ip_vs_fwd_name(unsigned int flags)
1958 {
1959         switch (flags & IP_VS_CONN_F_FWD_MASK) {
1960         case IP_VS_CONN_F_LOCALNODE:
1961                 return "Local";
1962         case IP_VS_CONN_F_TUNNEL:
1963                 return "Tunnel";
1964         case IP_VS_CONN_F_DROUTE:
1965                 return "Route";
1966         default:
1967                 return "Masq";
1968         }
1969 }
1970
1971
1972 /* Get the Nth entry in the two lists */
1973 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1974 {
1975         struct net *net = seq_file_net(seq);
1976         struct ip_vs_iter *iter = seq->private;
1977         int idx;
1978         struct ip_vs_service *svc;
1979
1980         /* look in hash by protocol */
1981         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1982                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1983                         if (net_eq(svc->net, net) && pos-- == 0) {
1984                                 iter->table = ip_vs_svc_table;
1985                                 iter->bucket = idx;
1986                                 return svc;
1987                         }
1988                 }
1989         }
1990
1991         /* keep looking in fwmark */
1992         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1993                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1994                         if (net_eq(svc->net, net) && pos-- == 0) {
1995                                 iter->table = ip_vs_svc_fwm_table;
1996                                 iter->bucket = idx;
1997                                 return svc;
1998                         }
1999                 }
2000         }
2001
2002         return NULL;
2003 }
2004
2005 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
2006 __acquires(__ip_vs_svc_lock)
2007 {
2008
2009         read_lock_bh(&__ip_vs_svc_lock);
2010         return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
2011 }
2012
2013
2014 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2015 {
2016         struct list_head *e;
2017         struct ip_vs_iter *iter;
2018         struct ip_vs_service *svc;
2019
2020         ++*pos;
2021         if (v == SEQ_START_TOKEN)
2022                 return ip_vs_info_array(seq,0);
2023
2024         svc = v;
2025         iter = seq->private;
2026
2027         if (iter->table == ip_vs_svc_table) {
2028                 /* next service in table hashed by protocol */
2029                 if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket])
2030                         return list_entry(e, struct ip_vs_service, s_list);
2031
2032
2033                 while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2034                         list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket],
2035                                             s_list) {
2036                                 return svc;
2037                         }
2038                 }
2039
2040                 iter->table = ip_vs_svc_fwm_table;
2041                 iter->bucket = -1;
2042                 goto scan_fwmark;
2043         }
2044
2045         /* next service in hashed by fwmark */
2046         if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket])
2047                 return list_entry(e, struct ip_vs_service, f_list);
2048
2049  scan_fwmark:
2050         while (++iter->bucket < IP_VS_SVC_TAB_SIZE) {
2051                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket],
2052                                     f_list)
2053                         return svc;
2054         }
2055
2056         return NULL;
2057 }
2058
2059 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2060 __releases(__ip_vs_svc_lock)
2061 {
2062         read_unlock_bh(&__ip_vs_svc_lock);
2063 }
2064
2065
2066 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2067 {
2068         if (v == SEQ_START_TOKEN) {
2069                 seq_printf(seq,
2070                         "IP Virtual Server version %d.%d.%d (size=%d)\n",
2071                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2072                 seq_puts(seq,
2073                          "Prot LocalAddress:Port Scheduler Flags\n");
2074                 seq_puts(seq,
2075                          "  -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2076         } else {
2077                 const struct ip_vs_service *svc = v;
2078                 const struct ip_vs_iter *iter = seq->private;
2079                 const struct ip_vs_dest *dest;
2080
2081                 if (iter->table == ip_vs_svc_table) {
2082 #ifdef CONFIG_IP_VS_IPV6
2083                         if (svc->af == AF_INET6)
2084                                 seq_printf(seq, "%s  [%pI6]:%04X %s ",
2085                                            ip_vs_proto_name(svc->protocol),
2086                                            &svc->addr.in6,
2087                                            ntohs(svc->port),
2088                                            svc->scheduler->name);
2089                         else
2090 #endif
2091                                 seq_printf(seq, "%s  %08X:%04X %s %s ",
2092                                            ip_vs_proto_name(svc->protocol),
2093                                            ntohl(svc->addr.ip),
2094                                            ntohs(svc->port),
2095                                            svc->scheduler->name,
2096                                            (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2097                 } else {
2098                         seq_printf(seq, "FWM  %08X %s %s",
2099                                    svc->fwmark, svc->scheduler->name,
2100                                    (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2101                 }
2102
2103                 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2104                         seq_printf(seq, "persistent %d %08X\n",
2105                                 svc->timeout,
2106                                 ntohl(svc->netmask));
2107                 else
2108                         seq_putc(seq, '\n');
2109
2110                 list_for_each_entry(dest, &svc->destinations, n_list) {
2111 #ifdef CONFIG_IP_VS_IPV6
2112                         if (dest->af == AF_INET6)
2113                                 seq_printf(seq,
2114                                            "  -> [%pI6]:%04X"
2115                                            "      %-7s %-6d %-10d %-10d\n",
2116                                            &dest->addr.in6,
2117                                            ntohs(dest->port),
2118                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2119                                            atomic_read(&dest->weight),
2120                                            atomic_read(&dest->activeconns),
2121                                            atomic_read(&dest->inactconns));
2122                         else
2123 #endif
2124                                 seq_printf(seq,
2125                                            "  -> %08X:%04X      "
2126                                            "%-7s %-6d %-10d %-10d\n",
2127                                            ntohl(dest->addr.ip),
2128                                            ntohs(dest->port),
2129                                            ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2130                                            atomic_read(&dest->weight),
2131                                            atomic_read(&dest->activeconns),
2132                                            atomic_read(&dest->inactconns));
2133
2134                 }
2135         }
2136         return 0;
2137 }
2138
2139 static const struct seq_operations ip_vs_info_seq_ops = {
2140         .start = ip_vs_info_seq_start,
2141         .next  = ip_vs_info_seq_next,
2142         .stop  = ip_vs_info_seq_stop,
2143         .show  = ip_vs_info_seq_show,
2144 };
2145
2146 static int ip_vs_info_open(struct inode *inode, struct file *file)
2147 {
2148         return seq_open_net(inode, file, &ip_vs_info_seq_ops,
2149                         sizeof(struct ip_vs_iter));
2150 }
2151
2152 static const struct file_operations ip_vs_info_fops = {
2153         .owner   = THIS_MODULE,
2154         .open    = ip_vs_info_open,
2155         .read    = seq_read,
2156         .llseek  = seq_lseek,
2157         .release = seq_release_net,
2158 };
2159
2160 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2161 {
2162         struct net *net = seq_file_single_net(seq);
2163         struct ip_vs_stats_user show;
2164
2165 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2166         seq_puts(seq,
2167                  "   Total Incoming Outgoing         Incoming         Outgoing\n");
2168         seq_printf(seq,
2169                    "   Conns  Packets  Packets            Bytes            Bytes\n");
2170
2171         ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats);
2172         seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", show.conns,
2173                    show.inpkts, show.outpkts,
2174                    (unsigned long long) show.inbytes,
2175                    (unsigned long long) show.outbytes);
2176
2177 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2178         seq_puts(seq,
2179                    " Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2180         seq_printf(seq, "%8X %8X %8X %16X %16X\n",
2181                         show.cps, show.inpps, show.outpps,
2182                         show.inbps, show.outbps);
2183
2184         return 0;
2185 }
2186
2187 static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
2188 {
2189         return single_open_net(inode, file, ip_vs_stats_show);
2190 }
2191
2192 static const struct file_operations ip_vs_stats_fops = {
2193         .owner = THIS_MODULE,
2194         .open = ip_vs_stats_seq_open,
2195         .read = seq_read,
2196         .llseek = seq_lseek,
2197         .release = single_release_net,
2198 };
2199
2200 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2201 {
2202         struct net *net = seq_file_single_net(seq);
2203         struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats;
2204         struct ip_vs_cpu_stats *cpustats = tot_stats->cpustats;
2205         struct ip_vs_stats_user rates;
2206         int i;
2207
2208 /*               01234567 01234567 01234567 0123456701234567 0123456701234567 */
2209         seq_puts(seq,
2210                  "       Total Incoming Outgoing         Incoming         Outgoing\n");
2211         seq_printf(seq,
2212                    "CPU    Conns  Packets  Packets            Bytes            Bytes\n");
2213
2214         for_each_possible_cpu(i) {
2215                 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2216                 unsigned int start;
2217                 __u64 inbytes, outbytes;
2218
2219                 do {
2220                         start = u64_stats_fetch_begin_bh(&u->syncp);
2221                         inbytes = u->ustats.inbytes;
2222                         outbytes = u->ustats.outbytes;
2223                 } while (u64_stats_fetch_retry_bh(&u->syncp, start));
2224
2225                 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2226                            i, u->ustats.conns, u->ustats.inpkts,
2227                            u->ustats.outpkts, (__u64)inbytes,
2228                            (__u64)outbytes);
2229         }
2230
2231         spin_lock_bh(&tot_stats->lock);
2232
2233         seq_printf(seq, "  ~ %8X %8X %8X %16LX %16LX\n\n",
2234                    tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2235                    tot_stats->ustats.outpkts,
2236                    (unsigned long long) tot_stats->ustats.inbytes,
2237                    (unsigned long long) tot_stats->ustats.outbytes);
2238
2239         ip_vs_read_estimator(&rates, tot_stats);
2240
2241         spin_unlock_bh(&tot_stats->lock);
2242
2243 /*                 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2244         seq_puts(seq,
2245                    "     Conns/s   Pkts/s   Pkts/s          Bytes/s          Bytes/s\n");
2246         seq_printf(seq, "    %8X %8X %8X %16X %16X\n",
2247                         rates.cps,
2248                         rates.inpps,
2249                         rates.outpps,
2250                         rates.inbps,
2251                         rates.outbps);
2252
2253         return 0;
2254 }
2255
2256 static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2257 {
2258         return single_open_net(inode, file, ip_vs_stats_percpu_show);
2259 }
2260
2261 static const struct file_operations ip_vs_stats_percpu_fops = {
2262         .owner = THIS_MODULE,
2263         .open = ip_vs_stats_percpu_seq_open,
2264         .read = seq_read,
2265         .llseek = seq_lseek,
2266         .release = single_release_net,
2267 };
2268 #endif
2269
2270 /*
2271  *      Set timeout values for tcp tcpfin udp in the timeout_table.
2272  */
2273 static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2274 {
2275 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2276         struct ip_vs_proto_data *pd;
2277 #endif
2278
2279         IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2280                   u->tcp_timeout,
2281                   u->tcp_fin_timeout,
2282                   u->udp_timeout);
2283
2284 #ifdef CONFIG_IP_VS_PROTO_TCP
2285         if (u->tcp_timeout) {
2286                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2287                 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2288                         = u->tcp_timeout * HZ;
2289         }
2290
2291         if (u->tcp_fin_timeout) {
2292                 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2293                 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2294                         = u->tcp_fin_timeout * HZ;
2295         }
2296 #endif
2297
2298 #ifdef CONFIG_IP_VS_PROTO_UDP
2299         if (u->udp_timeout) {
2300                 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2301                 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2302                         = u->udp_timeout * HZ;
2303         }
2304 #endif
2305         return 0;
2306 }
2307
2308
2309 #define SET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2310 #define SERVICE_ARG_LEN         (sizeof(struct ip_vs_service_user))
2311 #define SVCDEST_ARG_LEN         (sizeof(struct ip_vs_service_user) +    \
2312                                  sizeof(struct ip_vs_dest_user))
2313 #define TIMEOUT_ARG_LEN         (sizeof(struct ip_vs_timeout_user))
2314 #define DAEMON_ARG_LEN          (sizeof(struct ip_vs_daemon_user))
2315 #define MAX_ARG_LEN             SVCDEST_ARG_LEN
2316
2317 static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = {
2318         [SET_CMDID(IP_VS_SO_SET_ADD)]           = SERVICE_ARG_LEN,
2319         [SET_CMDID(IP_VS_SO_SET_EDIT)]          = SERVICE_ARG_LEN,
2320         [SET_CMDID(IP_VS_SO_SET_DEL)]           = SERVICE_ARG_LEN,
2321         [SET_CMDID(IP_VS_SO_SET_FLUSH)]         = 0,
2322         [SET_CMDID(IP_VS_SO_SET_ADDDEST)]       = SVCDEST_ARG_LEN,
2323         [SET_CMDID(IP_VS_SO_SET_DELDEST)]       = SVCDEST_ARG_LEN,
2324         [SET_CMDID(IP_VS_SO_SET_EDITDEST)]      = SVCDEST_ARG_LEN,
2325         [SET_CMDID(IP_VS_SO_SET_TIMEOUT)]       = TIMEOUT_ARG_LEN,
2326         [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)]   = DAEMON_ARG_LEN,
2327         [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)]    = DAEMON_ARG_LEN,
2328         [SET_CMDID(IP_VS_SO_SET_ZERO)]          = SERVICE_ARG_LEN,
2329 };
2330
2331 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
2332                                   struct ip_vs_service_user *usvc_compat)
2333 {
2334         memset(usvc, 0, sizeof(*usvc));
2335
2336         usvc->af                = AF_INET;
2337         usvc->protocol          = usvc_compat->protocol;
2338         usvc->addr.ip           = usvc_compat->addr;
2339         usvc->port              = usvc_compat->port;
2340         usvc->fwmark            = usvc_compat->fwmark;
2341
2342         /* Deep copy of sched_name is not needed here */
2343         usvc->sched_name        = usvc_compat->sched_name;
2344
2345         usvc->flags             = usvc_compat->flags;
2346         usvc->timeout           = usvc_compat->timeout;
2347         usvc->netmask           = usvc_compat->netmask;
2348 }
2349
2350 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2351                                    struct ip_vs_dest_user *udest_compat)
2352 {
2353         memset(udest, 0, sizeof(*udest));
2354
2355         udest->addr.ip          = udest_compat->addr;
2356         udest->port             = udest_compat->port;
2357         udest->conn_flags       = udest_compat->conn_flags;
2358         udest->weight           = udest_compat->weight;
2359         udest->u_threshold      = udest_compat->u_threshold;
2360         udest->l_threshold      = udest_compat->l_threshold;
2361 }
2362
2363 static int
2364 do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2365 {
2366         struct net *net = sock_net(sk);
2367         int ret;
2368         unsigned char arg[MAX_ARG_LEN];
2369         struct ip_vs_service_user *usvc_compat;
2370         struct ip_vs_service_user_kern usvc;
2371         struct ip_vs_service *svc;
2372         struct ip_vs_dest_user *udest_compat;
2373         struct ip_vs_dest_user_kern udest;
2374         struct netns_ipvs *ipvs = net_ipvs(net);
2375
2376         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2377                 return -EPERM;
2378
2379         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
2380                 return -EINVAL;
2381         if (len < 0 || len >  MAX_ARG_LEN)
2382                 return -EINVAL;
2383         if (len != set_arglen[SET_CMDID(cmd)]) {
2384                 pr_err("set_ctl: len %u != %u\n",
2385                        len, set_arglen[SET_CMDID(cmd)]);
2386                 return -EINVAL;
2387         }
2388
2389         if (copy_from_user(arg, user, len) != 0)
2390                 return -EFAULT;
2391
2392         /* increase the module use count */
2393         ip_vs_use_count_inc();
2394
2395         /* Handle daemons since they have another lock */
2396         if (cmd == IP_VS_SO_SET_STARTDAEMON ||
2397             cmd == IP_VS_SO_SET_STOPDAEMON) {
2398                 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2399
2400                 if (mutex_lock_interruptible(&ipvs->sync_mutex)) {
2401                         ret = -ERESTARTSYS;
2402                         goto out_dec;
2403                 }
2404                 if (cmd == IP_VS_SO_SET_STARTDAEMON)
2405                         ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2406                                                 dm->syncid);
2407                 else
2408                         ret = stop_sync_thread(net, dm->state);
2409                 mutex_unlock(&ipvs->sync_mutex);
2410                 goto out_dec;
2411         }
2412
2413         if (mutex_lock_interruptible(&__ip_vs_mutex)) {
2414                 ret = -ERESTARTSYS;
2415                 goto out_dec;
2416         }
2417
2418         if (cmd == IP_VS_SO_SET_FLUSH) {
2419                 /* Flush the virtual service */
2420                 ret = ip_vs_flush(net, false);
2421                 goto out_unlock;
2422         } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2423                 /* Set timeout values for (tcp tcpfin udp) */
2424                 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2425                 goto out_unlock;
2426         }
2427
2428         usvc_compat = (struct ip_vs_service_user *)arg;
2429         udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
2430
2431         /* We only use the new structs internally, so copy userspace compat
2432          * structs to extended internal versions */
2433         ip_vs_copy_usvc_compat(&usvc, usvc_compat);
2434         ip_vs_copy_udest_compat(&udest, udest_compat);
2435
2436         if (cmd == IP_VS_SO_SET_ZERO) {
2437                 /* if no service address is set, zero counters in all */
2438                 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2439                         ret = ip_vs_zero_all(net);
2440                         goto out_unlock;
2441                 }
2442         }
2443
2444         /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
2445         if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
2446             usvc.protocol != IPPROTO_SCTP) {
2447                 pr_err("set_ctl: invalid protocol: %d %pI4:%d %s\n",
2448                        usvc.protocol, &usvc.addr.ip,
2449                        ntohs(usvc.port), usvc.sched_name);
2450                 ret = -EFAULT;
2451                 goto out_unlock;
2452         }
2453
2454         /* Lookup the exact service by <protocol, addr, port> or fwmark */
2455         if (usvc.fwmark == 0)
2456                 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2457                                            &usvc.addr, usvc.port);
2458         else
2459                 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2460
2461         if (cmd != IP_VS_SO_SET_ADD
2462             && (svc == NULL || svc->protocol != usvc.protocol)) {
2463                 ret = -ESRCH;
2464                 goto out_unlock;
2465         }
2466
2467         switch (cmd) {
2468         case IP_VS_SO_SET_ADD:
2469                 if (svc != NULL)
2470                         ret = -EEXIST;
2471                 else
2472                         ret = ip_vs_add_service(net, &usvc, &svc);
2473                 break;
2474         case IP_VS_SO_SET_EDIT:
2475                 ret = ip_vs_edit_service(svc, &usvc);
2476                 break;
2477         case IP_VS_SO_SET_DEL:
2478                 ret = ip_vs_del_service(svc);
2479                 if (!ret)
2480                         goto out_unlock;
2481                 break;
2482         case IP_VS_SO_SET_ZERO:
2483                 ret = ip_vs_zero_service(svc);
2484                 break;
2485         case IP_VS_SO_SET_ADDDEST:
2486                 ret = ip_vs_add_dest(svc, &udest);
2487                 break;
2488         case IP_VS_SO_SET_EDITDEST:
2489                 ret = ip_vs_edit_dest(svc, &udest);
2490                 break;
2491         case IP_VS_SO_SET_DELDEST:
2492                 ret = ip_vs_del_dest(svc, &udest);
2493                 break;
2494         default:
2495                 ret = -EINVAL;
2496         }
2497
2498   out_unlock:
2499         mutex_unlock(&__ip_vs_mutex);
2500   out_dec:
2501         /* decrease the module use count */
2502         ip_vs_use_count_dec();
2503
2504         return ret;
2505 }
2506
2507
2508 static void
2509 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2510 {
2511         dst->protocol = src->protocol;
2512         dst->addr = src->addr.ip;
2513         dst->port = src->port;
2514         dst->fwmark = src->fwmark;
2515         strlcpy(dst->sched_name, src->scheduler->name, sizeof(dst->sched_name));
2516         dst->flags = src->flags;
2517         dst->timeout = src->timeout / HZ;
2518         dst->netmask = src->netmask;
2519         dst->num_dests = src->num_dests;
2520         ip_vs_copy_stats(&dst->stats, &src->stats);
2521 }
2522
2523 static inline int
2524 __ip_vs_get_service_entries(struct net *net,
2525                             const struct ip_vs_get_services *get,
2526                             struct ip_vs_get_services __user *uptr)
2527 {
2528         int idx, count=0;
2529         struct ip_vs_service *svc;
2530         struct ip_vs_service_entry entry;
2531         int ret = 0;
2532
2533         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2534                 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2535                         /* Only expose IPv4 entries to old interface */
2536                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2537                                 continue;
2538
2539                         if (count >= get->num_services)
2540                                 goto out;
2541                         memset(&entry, 0, sizeof(entry));
2542                         ip_vs_copy_service(&entry, svc);
2543                         if (copy_to_user(&uptr->entrytable[count],
2544                                          &entry, sizeof(entry))) {
2545                                 ret = -EFAULT;
2546                                 goto out;
2547                         }
2548                         count++;
2549                 }
2550         }
2551
2552         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2553                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2554                         /* Only expose IPv4 entries to old interface */
2555                         if (svc->af != AF_INET || !net_eq(svc->net, net))
2556                                 continue;
2557
2558                         if (count >= get->num_services)
2559                                 goto out;
2560                         memset(&entry, 0, sizeof(entry));
2561                         ip_vs_copy_service(&entry, svc);
2562                         if (copy_to_user(&uptr->entrytable[count],
2563                                          &entry, sizeof(entry))) {
2564                                 ret = -EFAULT;
2565                                 goto out;
2566                         }
2567                         count++;
2568                 }
2569         }
2570 out:
2571         return ret;
2572 }
2573
2574 static inline int
2575 __ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2576                          struct ip_vs_get_dests __user *uptr)
2577 {
2578         struct ip_vs_service *svc;
2579         union nf_inet_addr addr = { .ip = get->addr };
2580         int ret = 0;
2581
2582         if (get->fwmark)
2583                 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2584         else
2585                 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2586                                            get->port);
2587
2588         if (svc) {
2589                 int count = 0;
2590                 struct ip_vs_dest *dest;
2591                 struct ip_vs_dest_entry entry;
2592
2593                 list_for_each_entry(dest, &svc->destinations, n_list) {
2594                         if (count >= get->num_dests)
2595                                 break;
2596
2597                         entry.addr = dest->addr.ip;
2598                         entry.port = dest->port;
2599                         entry.conn_flags = atomic_read(&dest->conn_flags);
2600                         entry.weight = atomic_read(&dest->weight);
2601                         entry.u_threshold = dest->u_threshold;
2602                         entry.l_threshold = dest->l_threshold;
2603                         entry.activeconns = atomic_read(&dest->activeconns);
2604                         entry.inactconns = atomic_read(&dest->inactconns);
2605                         entry.persistconns = atomic_read(&dest->persistconns);
2606                         ip_vs_copy_stats(&entry.stats, &dest->stats);
2607                         if (copy_to_user(&uptr->entrytable[count],
2608                                          &entry, sizeof(entry))) {
2609                                 ret = -EFAULT;
2610                                 break;
2611                         }
2612                         count++;
2613                 }
2614         } else
2615                 ret = -ESRCH;
2616         return ret;
2617 }
2618
2619 static inline void
2620 __ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2621 {
2622 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
2623         struct ip_vs_proto_data *pd;
2624 #endif
2625
2626         memset(u, 0, sizeof (*u));
2627
2628 #ifdef CONFIG_IP_VS_PROTO_TCP
2629         pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2630         u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2631         u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2632 #endif
2633 #ifdef CONFIG_IP_VS_PROTO_UDP
2634         pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2635         u->udp_timeout =
2636                         pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2637 #endif
2638 }
2639
2640
2641 #define GET_CMDID(cmd)          (cmd - IP_VS_BASE_CTL)
2642 #define GET_INFO_ARG_LEN        (sizeof(struct ip_vs_getinfo))
2643 #define GET_SERVICES_ARG_LEN    (sizeof(struct ip_vs_get_services))
2644 #define GET_SERVICE_ARG_LEN     (sizeof(struct ip_vs_service_entry))
2645 #define GET_DESTS_ARG_LEN       (sizeof(struct ip_vs_get_dests))
2646 #define GET_TIMEOUT_ARG_LEN     (sizeof(struct ip_vs_timeout_user))
2647 #define GET_DAEMON_ARG_LEN      (sizeof(struct ip_vs_daemon_user) * 2)
2648
2649 static const unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = {
2650         [GET_CMDID(IP_VS_SO_GET_VERSION)]       = 64,
2651         [GET_CMDID(IP_VS_SO_GET_INFO)]          = GET_INFO_ARG_LEN,
2652         [GET_CMDID(IP_VS_SO_GET_SERVICES)]      = GET_SERVICES_ARG_LEN,
2653         [GET_CMDID(IP_VS_SO_GET_SERVICE)]       = GET_SERVICE_ARG_LEN,
2654         [GET_CMDID(IP_VS_SO_GET_DESTS)]         = GET_DESTS_ARG_LEN,
2655         [GET_CMDID(IP_VS_SO_GET_TIMEOUT)]       = GET_TIMEOUT_ARG_LEN,
2656         [GET_CMDID(IP_VS_SO_GET_DAEMON)]        = GET_DAEMON_ARG_LEN,
2657 };
2658
2659 static int
2660 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2661 {
2662         unsigned char arg[128];
2663         int ret = 0;
2664         unsigned int copylen;
2665         struct net *net = sock_net(sk);
2666         struct netns_ipvs *ipvs = net_ipvs(net);
2667
2668         BUG_ON(!net);
2669         if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2670                 return -EPERM;
2671
2672         if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
2673                 return -EINVAL;
2674
2675         if (*len < get_arglen[GET_CMDID(cmd)]) {
2676                 pr_err("get_ctl: len %u < %u\n",
2677                        *len, get_arglen[GET_CMDID(cmd)]);
2678                 return -EINVAL;
2679         }
2680
2681         copylen = get_arglen[GET_CMDID(cmd)];
2682         if (copylen > 128)
2683                 return -EINVAL;
2684
2685         if (copy_from_user(arg, user, copylen) != 0)
2686                 return -EFAULT;
2687         /*
2688          * Handle daemons first since it has its own locking
2689          */
2690         if (cmd == IP_VS_SO_GET_DAEMON) {
2691                 struct ip_vs_daemon_user d[2];
2692
2693                 memset(&d, 0, sizeof(d));
2694                 if (mutex_lock_interruptible(&ipvs->sync_mutex))
2695                         return -ERESTARTSYS;
2696
2697                 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2698                         d[0].state = IP_VS_STATE_MASTER;
2699                         strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2700                                 sizeof(d[0].mcast_ifn));
2701                         d[0].syncid = ipvs->master_syncid;
2702                 }
2703                 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2704                         d[1].state = IP_VS_STATE_BACKUP;
2705                         strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2706                                 sizeof(d[1].mcast_ifn));
2707                         d[1].syncid = ipvs->backup_syncid;
2708                 }
2709                 if (copy_to_user(user, &d, sizeof(d)) != 0)
2710                         ret = -EFAULT;
2711                 mutex_unlock(&ipvs->sync_mutex);
2712                 return ret;
2713         }
2714
2715         if (mutex_lock_interruptible(&__ip_vs_mutex))
2716                 return -ERESTARTSYS;
2717
2718         switch (cmd) {
2719         case IP_VS_SO_GET_VERSION:
2720         {
2721                 char buf[64];
2722
2723                 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
2724                         NVERSION(IP_VS_VERSION_CODE), ip_vs_conn_tab_size);
2725                 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
2726                         ret = -EFAULT;
2727                         goto out;
2728                 }
2729                 *len = strlen(buf)+1;
2730         }
2731         break;
2732
2733         case IP_VS_SO_GET_INFO:
2734         {
2735                 struct ip_vs_getinfo info;
2736                 info.version = IP_VS_VERSION_CODE;
2737                 info.size = ip_vs_conn_tab_size;
2738                 info.num_services = ipvs->num_services;
2739                 if (copy_to_user(user, &info, sizeof(info)) != 0)
2740                         ret = -EFAULT;
2741         }
2742         break;
2743
2744         case IP_VS_SO_GET_SERVICES:
2745         {
2746                 struct ip_vs_get_services *get;
2747                 int size;
2748
2749                 get = (struct ip_vs_get_services *)arg;
2750                 size = sizeof(*get) +
2751                         sizeof(struct ip_vs_service_entry) * get->num_services;
2752                 if (*len != size) {
2753                         pr_err("length: %u != %u\n", *len, size);
2754                         ret = -EINVAL;
2755                         goto out;
2756                 }
2757                 ret = __ip_vs_get_service_entries(net, get, user);
2758         }
2759         break;
2760
2761         case IP_VS_SO_GET_SERVICE:
2762         {
2763                 struct ip_vs_service_entry *entry;
2764                 struct ip_vs_service *svc;
2765                 union nf_inet_addr addr;
2766
2767                 entry = (struct ip_vs_service_entry *)arg;
2768                 addr.ip = entry->addr;
2769                 if (entry->fwmark)
2770                         svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2771                 else
2772                         svc = __ip_vs_service_find(net, AF_INET,
2773                                                    entry->protocol, &addr,
2774                                                    entry->port);
2775                 if (svc) {
2776                         ip_vs_copy_service(entry, svc);
2777                         if (copy_to_user(user, entry, sizeof(*entry)) != 0)
2778                                 ret = -EFAULT;
2779                 } else
2780                         ret = -ESRCH;
2781         }
2782         break;
2783
2784         case IP_VS_SO_GET_DESTS:
2785         {
2786                 struct ip_vs_get_dests *get;
2787                 int size;
2788
2789                 get = (struct ip_vs_get_dests *)arg;
2790                 size = sizeof(*get) +
2791                         sizeof(struct ip_vs_dest_entry) * get->num_dests;
2792                 if (*len != size) {
2793                         pr_err("length: %u != %u\n", *len, size);
2794                         ret = -EINVAL;
2795                         goto out;
2796                 }
2797                 ret = __ip_vs_get_dest_entries(net, get, user);
2798         }
2799         break;
2800
2801         case IP_VS_SO_GET_TIMEOUT:
2802         {
2803                 struct ip_vs_timeout_user t;
2804
2805                 __ip_vs_get_timeouts(net, &t);
2806                 if (copy_to_user(user, &t, sizeof(t)) != 0)
2807                         ret = -EFAULT;
2808         }
2809         break;
2810
2811         default:
2812                 ret = -EINVAL;
2813         }
2814
2815 out:
2816         mutex_unlock(&__ip_vs_mutex);
2817         return ret;
2818 }
2819
2820
2821 static struct nf_sockopt_ops ip_vs_sockopts = {
2822         .pf             = PF_INET,
2823         .set_optmin     = IP_VS_BASE_CTL,
2824         .set_optmax     = IP_VS_SO_SET_MAX+1,
2825         .set            = do_ip_vs_set_ctl,
2826         .get_optmin     = IP_VS_BASE_CTL,
2827         .get_optmax     = IP_VS_SO_GET_MAX+1,
2828         .get            = do_ip_vs_get_ctl,
2829         .owner          = THIS_MODULE,
2830 };
2831
2832 /*
2833  * Generic Netlink interface
2834  */
2835
2836 /* IPVS genetlink family */
2837 static struct genl_family ip_vs_genl_family = {
2838         .id             = GENL_ID_GENERATE,
2839         .hdrsize        = 0,
2840         .name           = IPVS_GENL_NAME,
2841         .version        = IPVS_GENL_VERSION,
2842         .maxattr        = IPVS_CMD_MAX,
2843         .netnsok        = true,         /* Make ipvsadm to work on netns */
2844 };
2845
2846 /* Policy used for first-level command attributes */
2847 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
2848         [IPVS_CMD_ATTR_SERVICE]         = { .type = NLA_NESTED },
2849         [IPVS_CMD_ATTR_DEST]            = { .type = NLA_NESTED },
2850         [IPVS_CMD_ATTR_DAEMON]          = { .type = NLA_NESTED },
2851         [IPVS_CMD_ATTR_TIMEOUT_TCP]     = { .type = NLA_U32 },
2852         [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
2853         [IPVS_CMD_ATTR_TIMEOUT_UDP]     = { .type = NLA_U32 },
2854 };
2855
2856 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
2857 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
2858         [IPVS_DAEMON_ATTR_STATE]        = { .type = NLA_U32 },
2859         [IPVS_DAEMON_ATTR_MCAST_IFN]    = { .type = NLA_NUL_STRING,
2860                                             .len = IP_VS_IFNAME_MAXLEN },
2861         [IPVS_DAEMON_ATTR_SYNC_ID]      = { .type = NLA_U32 },
2862 };
2863
2864 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
2865 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
2866         [IPVS_SVC_ATTR_AF]              = { .type = NLA_U16 },
2867         [IPVS_SVC_ATTR_PROTOCOL]        = { .type = NLA_U16 },
2868         [IPVS_SVC_ATTR_ADDR]            = { .type = NLA_BINARY,
2869                                             .len = sizeof(union nf_inet_addr) },
2870         [IPVS_SVC_ATTR_PORT]            = { .type = NLA_U16 },
2871         [IPVS_SVC_ATTR_FWMARK]          = { .type = NLA_U32 },
2872         [IPVS_SVC_ATTR_SCHED_NAME]      = { .type = NLA_NUL_STRING,
2873                                             .len = IP_VS_SCHEDNAME_MAXLEN },
2874         [IPVS_SVC_ATTR_PE_NAME]         = { .type = NLA_NUL_STRING,
2875                                             .len = IP_VS_PENAME_MAXLEN },
2876         [IPVS_SVC_ATTR_FLAGS]           = { .type = NLA_BINARY,
2877                                             .len = sizeof(struct ip_vs_flags) },
2878         [IPVS_SVC_ATTR_TIMEOUT]         = { .type = NLA_U32 },
2879         [IPVS_SVC_ATTR_NETMASK]         = { .type = NLA_U32 },
2880         [IPVS_SVC_ATTR_STATS]           = { .type = NLA_NESTED },
2881 };
2882
2883 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
2884 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
2885         [IPVS_DEST_ATTR_ADDR]           = { .type = NLA_BINARY,
2886                                             .len = sizeof(union nf_inet_addr) },
2887         [IPVS_DEST_ATTR_PORT]           = { .type = NLA_U16 },
2888         [IPVS_DEST_ATTR_FWD_METHOD]     = { .type = NLA_U32 },
2889         [IPVS_DEST_ATTR_WEIGHT]         = { .type = NLA_U32 },
2890         [IPVS_DEST_ATTR_U_THRESH]       = { .type = NLA_U32 },
2891         [IPVS_DEST_ATTR_L_THRESH]       = { .type = NLA_U32 },
2892         [IPVS_DEST_ATTR_ACTIVE_CONNS]   = { .type = NLA_U32 },
2893         [IPVS_DEST_ATTR_INACT_CONNS]    = { .type = NLA_U32 },
2894         [IPVS_DEST_ATTR_PERSIST_CONNS]  = { .type = NLA_U32 },
2895         [IPVS_DEST_ATTR_STATS]          = { .type = NLA_NESTED },
2896 };
2897
2898 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
2899                                  struct ip_vs_stats *stats)
2900 {
2901         struct ip_vs_stats_user ustats;
2902         struct nlattr *nl_stats = nla_nest_start(skb, container_type);
2903         if (!nl_stats)
2904                 return -EMSGSIZE;
2905
2906         ip_vs_copy_stats(&ustats, stats);
2907
2908         if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, ustats.conns) ||
2909             nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, ustats.inpkts) ||
2910             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, ustats.outpkts) ||
2911             nla_put_u64(skb, IPVS_STATS_ATTR_INBYTES, ustats.inbytes) ||
2912             nla_put_u64(skb, IPVS_STATS_ATTR_OUTBYTES, ustats.outbytes) ||
2913             nla_put_u32(skb, IPVS_STATS_ATTR_CPS, ustats.cps) ||
2914             nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, ustats.inpps) ||
2915             nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, ustats.outpps) ||
2916             nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, ustats.inbps) ||
2917             nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, ustats.outbps))
2918                 goto nla_put_failure;
2919         nla_nest_end(skb, nl_stats);
2920
2921         return 0;
2922
2923 nla_put_failure:
2924         nla_nest_cancel(skb, nl_stats);
2925         return -EMSGSIZE;
2926 }
2927
2928 static int ip_vs_genl_fill_service(struct sk_buff *skb,
2929                                    struct ip_vs_service *svc)
2930 {
2931         struct nlattr *nl_service;
2932         struct ip_vs_flags flags = { .flags = svc->flags,
2933                                      .mask = ~0 };
2934
2935         nl_service = nla_nest_start(skb, IPVS_CMD_ATTR_SERVICE);
2936         if (!nl_service)
2937                 return -EMSGSIZE;
2938
2939         if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
2940                 goto nla_put_failure;
2941         if (svc->fwmark) {
2942                 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
2943                         goto nla_put_failure;
2944         } else {
2945                 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
2946                     nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
2947                     nla_put_u16(skb, IPVS_SVC_ATTR_PORT, svc->port))
2948                         goto nla_put_failure;
2949         }
2950
2951         if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name) ||
2952             (svc->pe &&
2953              nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name)) ||
2954             nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
2955             nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
2956             nla_put_u32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
2957                 goto nla_put_failure;
2958         if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &svc->stats))
2959                 goto nla_put_failure;
2960
2961         nla_nest_end(skb, nl_service);
2962
2963         return 0;
2964
2965 nla_put_failure:
2966         nla_nest_cancel(skb, nl_service);
2967         return -EMSGSIZE;
2968 }
2969
2970 static int ip_vs_genl_dump_service(struct sk_buff *skb,
2971                                    struct ip_vs_service *svc,
2972                                    struct netlink_callback *cb)
2973 {
2974         void *hdr;
2975
2976         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
2977                           &ip_vs_genl_family, NLM_F_MULTI,
2978                           IPVS_CMD_NEW_SERVICE);
2979         if (!hdr)
2980                 return -EMSGSIZE;
2981
2982         if (ip_vs_genl_fill_service(skb, svc) < 0)
2983                 goto nla_put_failure;
2984
2985         return genlmsg_end(skb, hdr);
2986
2987 nla_put_failure:
2988         genlmsg_cancel(skb, hdr);
2989         return -EMSGSIZE;
2990 }
2991
2992 static int ip_vs_genl_dump_services(struct sk_buff *skb,
2993                                     struct netlink_callback *cb)
2994 {
2995         int idx = 0, i;
2996         int start = cb->args[0];
2997         struct ip_vs_service *svc;
2998         struct net *net = skb_sknet(skb);
2999
3000         mutex_lock(&__ip_vs_mutex);
3001         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3002                 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
3003                         if (++idx <= start || !net_eq(svc->net, net))
3004                                 continue;
3005                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3006                                 idx--;
3007                                 goto nla_put_failure;
3008                         }
3009                 }
3010         }
3011
3012         for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
3013                 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
3014                         if (++idx <= start || !net_eq(svc->net, net))
3015                                 continue;
3016                         if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3017                                 idx--;
3018                                 goto nla_put_failure;
3019                         }
3020                 }
3021         }
3022
3023 nla_put_failure:
3024         mutex_unlock(&__ip_vs_mutex);
3025         cb->args[0] = idx;
3026
3027         return skb->len;
3028 }
3029
3030 static int ip_vs_genl_parse_service(struct net *net,
3031                                     struct ip_vs_service_user_kern *usvc,
3032                                     struct nlattr *nla, int full_entry,
3033                                     struct ip_vs_service **ret_svc)
3034 {
3035         struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
3036         struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
3037         struct ip_vs_service *svc;
3038
3039         /* Parse mandatory identifying service fields first */
3040         if (nla == NULL ||
3041             nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy))
3042                 return -EINVAL;
3043
3044         nla_af          = attrs[IPVS_SVC_ATTR_AF];
3045         nla_protocol    = attrs[IPVS_SVC_ATTR_PROTOCOL];
3046         nla_addr        = attrs[IPVS_SVC_ATTR_ADDR];
3047         nla_port        = attrs[IPVS_SVC_ATTR_PORT];
3048         nla_fwmark      = attrs[IPVS_SVC_ATTR_FWMARK];
3049
3050         if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
3051                 return -EINVAL;
3052
3053         memset(usvc, 0, sizeof(*usvc));
3054
3055         usvc->af = nla_get_u16(nla_af);
3056 #ifdef CONFIG_IP_VS_IPV6
3057         if (usvc->af != AF_INET && usvc->af != AF_INET6)
3058 #else
3059         if (usvc->af != AF_INET)
3060 #endif
3061                 return -EAFNOSUPPORT;
3062
3063         if (nla_fwmark) {
3064                 usvc->protocol = IPPROTO_TCP;
3065                 usvc->fwmark = nla_get_u32(nla_fwmark);
3066         } else {
3067                 usvc->protocol = nla_get_u16(nla_protocol);
3068                 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
3069                 usvc->port = nla_get_u16(nla_port);
3070                 usvc->fwmark = 0;
3071         }
3072
3073         if (usvc->fwmark)
3074                 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
3075         else
3076                 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
3077                                            &usvc->addr, usvc->port);
3078         *ret_svc = svc;
3079
3080         /* If a full entry was requested, check for the additional fields */
3081         if (full_entry) {
3082                 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
3083                               *nla_netmask;
3084                 struct ip_vs_flags flags;
3085
3086                 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
3087                 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
3088                 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
3089                 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
3090                 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
3091
3092                 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
3093                         return -EINVAL;
3094
3095                 nla_memcpy(&flags, nla_flags, sizeof(flags));
3096
3097                 /* prefill flags from service if it already exists */
3098                 if (svc)
3099                         usvc->flags = svc->flags;
3100
3101                 /* set new flags from userland */
3102                 usvc->flags = (usvc->flags & ~flags.mask) |
3103                               (flags.flags & flags.mask);
3104                 usvc->sched_name = nla_data(nla_sched);
3105                 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
3106                 usvc->timeout = nla_get_u32(nla_timeout);
3107                 usvc->netmask = nla_get_u32(nla_netmask);
3108         }
3109
3110         return 0;
3111 }
3112
3113 static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
3114                                                      struct nlattr *nla)
3115 {
3116         struct ip_vs_service_user_kern usvc;
3117         struct ip_vs_service *svc;
3118         int ret;
3119
3120         ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
3121         return ret ? ERR_PTR(ret) : svc;
3122 }
3123
3124 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
3125 {
3126         struct nlattr *nl_dest;
3127
3128         nl_dest = nla_nest_start(skb, IPVS_CMD_ATTR_DEST);
3129         if (!nl_dest)
3130                 return -EMSGSIZE;
3131
3132         if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
3133             nla_put_u16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
3134             nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
3135                         (atomic_read(&dest->conn_flags) &
3136                          IP_VS_CONN_F_FWD_MASK)) ||
3137             nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
3138                         atomic_read(&dest->weight)) ||
3139             nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
3140             nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
3141             nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
3142                         atomic_read(&dest->activeconns)) ||
3143             nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
3144                         atomic_read(&dest->inactconns)) ||
3145             nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
3146                         atomic_read(&dest->persistconns)))
3147                 goto nla_put_failure;
3148         if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &dest->stats))
3149                 goto nla_put_failure;
3150
3151         nla_nest_end(skb, nl_dest);
3152
3153         return 0;
3154
3155 nla_put_failure:
3156         nla_nest_cancel(skb, nl_dest);
3157         return -EMSGSIZE;
3158 }
3159
3160 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
3161                                 struct netlink_callback *cb)
3162 {
3163         void *hdr;
3164
3165         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3166                           &ip_vs_genl_family, NLM_F_MULTI,
3167                           IPVS_CMD_NEW_DEST);
3168         if (!hdr)
3169                 return -EMSGSIZE;
3170
3171         if (ip_vs_genl_fill_dest(skb, dest) < 0)
3172                 goto nla_put_failure;
3173
3174         return genlmsg_end(skb, hdr);
3175
3176 nla_put_failure:
3177         genlmsg_cancel(skb, hdr);
3178         return -EMSGSIZE;
3179 }
3180
3181 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
3182                                  struct netlink_callback *cb)
3183 {
3184         int idx = 0;
3185         int start = cb->args[0];
3186         struct ip_vs_service *svc;
3187         struct ip_vs_dest *dest;
3188         struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
3189         struct net *net = skb_sknet(skb);
3190
3191         mutex_lock(&__ip_vs_mutex);
3192
3193         /* Try to find the service for which to dump destinations */
3194         if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs,
3195                         IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
3196                 goto out_err;
3197
3198
3199         svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
3200         if (IS_ERR(svc) || svc == NULL)
3201                 goto out_err;
3202
3203         /* Dump the destinations */
3204         list_for_each_entry(dest, &svc->destinations, n_list) {
3205                 if (++idx <= start)
3206                         continue;
3207                 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
3208                         idx--;
3209                         goto nla_put_failure;
3210                 }
3211         }
3212
3213 nla_put_failure:
3214         cb->args[0] = idx;
3215
3216 out_err:
3217         mutex_unlock(&__ip_vs_mutex);
3218
3219         return skb->len;
3220 }
3221
3222 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
3223                                  struct nlattr *nla, int full_entry)
3224 {
3225         struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
3226         struct nlattr *nla_addr, *nla_port;
3227
3228         /* Parse mandatory identifying destination fields first */
3229         if (nla == NULL ||
3230             nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy))
3231                 return -EINVAL;
3232
3233         nla_addr        = attrs[IPVS_DEST_ATTR_ADDR];
3234         nla_port        = attrs[IPVS_DEST_ATTR_PORT];
3235
3236         if (!(nla_addr && nla_port))
3237                 return -EINVAL;
3238
3239         memset(udest, 0, sizeof(*udest));
3240
3241         nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
3242         udest->port = nla_get_u16(nla_port);
3243
3244         /* If a full entry was requested, check for the additional fields */
3245         if (full_entry) {
3246                 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
3247                               *nla_l_thresh;
3248
3249                 nla_fwd         = attrs[IPVS_DEST_ATTR_FWD_METHOD];
3250                 nla_weight      = attrs[IPVS_DEST_ATTR_WEIGHT];
3251                 nla_u_thresh    = attrs[IPVS_DEST_ATTR_U_THRESH];
3252                 nla_l_thresh    = attrs[IPVS_DEST_ATTR_L_THRESH];
3253
3254                 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
3255                         return -EINVAL;
3256
3257                 udest->conn_flags = nla_get_u32(nla_fwd)
3258                                     & IP_VS_CONN_F_FWD_MASK;
3259                 udest->weight = nla_get_u32(nla_weight);
3260                 udest->u_threshold = nla_get_u32(nla_u_thresh);
3261                 udest->l_threshold = nla_get_u32(nla_l_thresh);
3262         }
3263
3264         return 0;
3265 }
3266
3267 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __be32 state,
3268                                   const char *mcast_ifn, __be32 syncid)
3269 {
3270         struct nlattr *nl_daemon;
3271
3272         nl_daemon = nla_nest_start(skb, IPVS_CMD_ATTR_DAEMON);
3273         if (!nl_daemon)
3274                 return -EMSGSIZE;
3275
3276         if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
3277             nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, mcast_ifn) ||
3278             nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, syncid))
3279                 goto nla_put_failure;
3280         nla_nest_end(skb, nl_daemon);
3281
3282         return 0;
3283
3284 nla_put_failure:
3285         nla_nest_cancel(skb, nl_daemon);
3286         return -EMSGSIZE;
3287 }
3288
3289 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __be32 state,
3290                                   const char *mcast_ifn, __be32 syncid,
3291                                   struct netlink_callback *cb)
3292 {
3293         void *hdr;
3294         hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3295                           &ip_vs_genl_family, NLM_F_MULTI,
3296                           IPVS_CMD_NEW_DAEMON);
3297         if (!hdr)
3298                 return -EMSGSIZE;
3299
3300         if (ip_vs_genl_fill_daemon(skb, state, mcast_ifn, syncid))
3301                 goto nla_put_failure;
3302
3303         return genlmsg_end(skb, hdr);
3304
3305 nla_put_failure:
3306         genlmsg_cancel(skb, hdr);
3307         return -EMSGSIZE;
3308 }
3309
3310 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3311                                    struct netlink_callback *cb)
3312 {
3313         struct net *net = skb_sknet(skb);
3314         struct netns_ipvs *ipvs = net_ipvs(net);
3315
3316         mutex_lock(&ipvs->sync_mutex);
3317         if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3318                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3319                                            ipvs->master_mcast_ifn,
3320                                            ipvs->master_syncid, cb) < 0)
3321                         goto nla_put_failure;
3322
3323                 cb->args[0] = 1;
3324         }
3325
3326         if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3327                 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3328                                            ipvs->backup_mcast_ifn,
3329                                            ipvs->backup_syncid, cb) < 0)
3330                         goto nla_put_failure;
3331
3332                 cb->args[1] = 1;
3333         }
3334
3335 nla_put_failure:
3336         mutex_unlock(&ipvs->sync_mutex);
3337
3338         return skb->len;
3339 }
3340
3341 static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3342 {
3343         if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3344               attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3345               attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3346                 return -EINVAL;
3347
3348         return start_sync_thread(net,
3349                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3350                                  nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3351                                  nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3352 }
3353
3354 static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3355 {
3356         if (!attrs[IPVS_DAEMON_ATTR_STATE])
3357                 return -EINVAL;
3358
3359         return stop_sync_thread(net,
3360                                 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3361 }
3362
3363 static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3364 {
3365         struct ip_vs_timeout_user t;
3366
3367         __ip_vs_get_timeouts(net, &t);
3368
3369         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3370                 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
3371
3372         if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
3373                 t.tcp_fin_timeout =
3374                         nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
3375
3376         if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3377                 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3378
3379         return ip_vs_set_timeout(net, &t);
3380 }
3381
3382 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
3383 {
3384         int ret = 0, cmd;
3385         struct net *net;
3386         struct netns_ipvs *ipvs;
3387
3388         net = skb_sknet(skb);
3389         ipvs = net_ipvs(net);
3390         cmd = info->genlhdr->cmd;
3391
3392         if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
3393                 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
3394
3395                 mutex_lock(&ipvs->sync_mutex);
3396                 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
3397                     nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX,
3398                                      info->attrs[IPVS_CMD_ATTR_DAEMON],
3399                                      ip_vs_daemon_policy)) {
3400                         ret = -EINVAL;
3401                         goto out;
3402                 }
3403
3404                 if (cmd == IPVS_CMD_NEW_DAEMON)
3405                         ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3406                 else
3407                         ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3408 out:
3409                 mutex_unlock(&ipvs->sync_mutex);
3410         }
3411         return ret;
3412 }
3413
3414 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3415 {
3416         struct ip_vs_service *svc = NULL;
3417         struct ip_vs_service_user_kern usvc;
3418         struct ip_vs_dest_user_kern udest;
3419         int ret = 0, cmd;
3420         int need_full_svc = 0, need_full_dest = 0;
3421         struct net *net;
3422
3423         net = skb_sknet(skb);
3424         cmd = info->genlhdr->cmd;
3425
3426         mutex_lock(&__ip_vs_mutex);
3427
3428         if (cmd == IPVS_CMD_FLUSH) {
3429                 ret = ip_vs_flush(net, false);
3430                 goto out;
3431         } else if (cmd == IPVS_CMD_SET_CONFIG) {
3432                 ret = ip_vs_genl_set_config(net, info->attrs);
3433                 goto out;
3434         } else if (cmd == IPVS_CMD_ZERO &&
3435                    !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3436                 ret = ip_vs_zero_all(net);
3437                 goto out;
3438         }
3439
3440         /* All following commands require a service argument, so check if we
3441          * received a valid one. We need a full service specification when
3442          * adding / editing a service. Only identifying members otherwise. */
3443         if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3444                 need_full_svc = 1;
3445
3446         ret = ip_vs_genl_parse_service(net, &usvc,
3447                                        info->attrs[IPVS_CMD_ATTR_SERVICE],
3448                                        need_full_svc, &svc);
3449         if (ret)
3450                 goto out;
3451
3452         /* Unless we're adding a new service, the service must already exist */
3453         if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
3454                 ret = -ESRCH;
3455                 goto out;
3456         }
3457
3458         /* Destination commands require a valid destination argument. For
3459          * adding / editing a destination, we need a full destination
3460          * specification. */
3461         if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
3462             cmd == IPVS_CMD_DEL_DEST) {
3463                 if (cmd != IPVS_CMD_DEL_DEST)
3464                         need_full_dest = 1;
3465
3466                 ret = ip_vs_genl_parse_dest(&udest,
3467                                             info->attrs[IPVS_CMD_ATTR_DEST],
3468                                             need_full_dest);
3469                 if (ret)
3470                         goto out;
3471         }
3472
3473         switch (cmd) {
3474         case IPVS_CMD_NEW_SERVICE:
3475                 if (svc == NULL)
3476                         ret = ip_vs_add_service(net, &usvc, &svc);
3477                 else
3478                         ret = -EEXIST;
3479                 break;
3480         case IPVS_CMD_SET_SERVICE:
3481                 ret = ip_vs_edit_service(svc, &usvc);
3482                 break;
3483         case IPVS_CMD_DEL_SERVICE:
3484                 ret = ip_vs_del_service(svc);
3485                 /* do not use svc, it can be freed */
3486                 break;
3487         case IPVS_CMD_NEW_DEST:
3488                 ret = ip_vs_add_dest(svc, &udest);
3489                 break;
3490         case IPVS_CMD_SET_DEST:
3491                 ret = ip_vs_edit_dest(svc, &udest);
3492                 break;
3493         case IPVS_CMD_DEL_DEST:
3494                 ret = ip_vs_del_dest(svc, &udest);
3495                 break;
3496         case IPVS_CMD_ZERO:
3497                 ret = ip_vs_zero_service(svc);
3498                 break;
3499         default:
3500                 ret = -EINVAL;
3501         }
3502
3503 out:
3504         mutex_unlock(&__ip_vs_mutex);
3505
3506         return ret;
3507 }
3508
3509 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3510 {
3511         struct sk_buff *msg;
3512         void *reply;
3513         int ret, cmd, reply_cmd;
3514         struct net *net;
3515
3516         net = skb_sknet(skb);
3517         cmd = info->genlhdr->cmd;
3518
3519         if (cmd == IPVS_CMD_GET_SERVICE)
3520                 reply_cmd = IPVS_CMD_NEW_SERVICE;
3521         else if (cmd == IPVS_CMD_GET_INFO)
3522                 reply_cmd = IPVS_CMD_SET_INFO;
3523         else if (cmd == IPVS_CMD_GET_CONFIG)
3524                 reply_cmd = IPVS_CMD_SET_CONFIG;
3525         else {
3526                 pr_err("unknown Generic Netlink command\n");
3527                 return -EINVAL;
3528         }
3529
3530         msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
3531         if (!msg)
3532                 return -ENOMEM;
3533
3534         mutex_lock(&__ip_vs_mutex);
3535
3536         reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
3537         if (reply == NULL)
3538                 goto nla_put_failure;
3539
3540         switch (cmd) {
3541         case IPVS_CMD_GET_SERVICE:
3542         {
3543                 struct ip_vs_service *svc;
3544
3545                 svc = ip_vs_genl_find_service(net,
3546                                               info->attrs[IPVS_CMD_ATTR_SERVICE]);
3547                 if (IS_ERR(svc)) {
3548                         ret = PTR_ERR(svc);
3549                         goto out_err;
3550                 } else if (svc) {
3551                         ret = ip_vs_genl_fill_service(msg, svc);
3552                         if (ret)
3553                                 goto nla_put_failure;
3554                 } else {
3555                         ret = -ESRCH;
3556                         goto out_err;
3557                 }
3558
3559                 break;
3560         }
3561
3562         case IPVS_CMD_GET_CONFIG:
3563         {
3564                 struct ip_vs_timeout_user t;
3565
3566                 __ip_vs_get_timeouts(net, &t);
3567 #ifdef CONFIG_IP_VS_PROTO_TCP
3568                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
3569                                 t.tcp_timeout) ||
3570                     nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
3571                                 t.tcp_fin_timeout))
3572                         goto nla_put_failure;
3573 #endif
3574 #ifdef CONFIG_IP_VS_PROTO_UDP
3575                 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
3576                         goto nla_put_failure;
3577 #endif
3578
3579                 break;
3580         }
3581
3582         case IPVS_CMD_GET_INFO:
3583                 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
3584                                 IP_VS_VERSION_CODE) ||
3585                     nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
3586                                 ip_vs_conn_tab_size))
3587                         goto nla_put_failure;
3588                 break;
3589         }
3590
3591         genlmsg_end(msg, reply);
3592         ret = genlmsg_reply(msg, info);
3593         goto out;
3594
3595 nla_put_failure:
3596         pr_err("not enough space in Netlink message\n");
3597         ret = -EMSGSIZE;
3598
3599 out_err:
3600         nlmsg_free(msg);
3601 out:
3602         mutex_unlock(&__ip_vs_mutex);
3603
3604         return ret;
3605 }
3606
3607
3608 static struct genl_ops ip_vs_genl_ops[] __read_mostly = {
3609         {
3610                 .cmd    = IPVS_CMD_NEW_SERVICE,
3611                 .flags  = GENL_ADMIN_PERM,
3612                 .policy = ip_vs_cmd_policy,
3613                 .doit   = ip_vs_genl_set_cmd,
3614         },
3615         {
3616                 .cmd    = IPVS_CMD_SET_SERVICE,
3617                 .flags  = GENL_ADMIN_PERM,
3618                 .policy = ip_vs_cmd_policy,
3619                 .doit   = ip_vs_genl_set_cmd,
3620         },
3621         {
3622                 .cmd    = IPVS_CMD_DEL_SERVICE,
3623                 .flags  = GENL_ADMIN_PERM,
3624                 .policy = ip_vs_cmd_policy,
3625                 .doit   = ip_vs_genl_set_cmd,
3626         },
3627         {
3628                 .cmd    = IPVS_CMD_GET_SERVICE,
3629                 .flags  = GENL_ADMIN_PERM,
3630                 .doit   = ip_vs_genl_get_cmd,
3631                 .dumpit = ip_vs_genl_dump_services,
3632                 .policy = ip_vs_cmd_policy,
3633         },
3634         {
3635                 .cmd    = IPVS_CMD_NEW_DEST,
3636                 .flags  = GENL_ADMIN_PERM,
3637                 .policy = ip_vs_cmd_policy,
3638                 .doit   = ip_vs_genl_set_cmd,
3639         },
3640         {
3641                 .cmd    = IPVS_CMD_SET_DEST,
3642                 .flags  = GENL_ADMIN_PERM,
3643                 .policy = ip_vs_cmd_policy,
3644                 .doit   = ip_vs_genl_set_cmd,
3645         },
3646         {
3647                 .cmd    = IPVS_CMD_DEL_DEST,
3648                 .flags  = GENL_ADMIN_PERM,
3649                 .policy = ip_vs_cmd_policy,
3650                 .doit   = ip_vs_genl_set_cmd,
3651         },
3652         {
3653                 .cmd    = IPVS_CMD_GET_DEST,
3654                 .flags  = GENL_ADMIN_PERM,
3655                 .policy = ip_vs_cmd_policy,
3656                 .dumpit = ip_vs_genl_dump_dests,
3657         },
3658         {
3659                 .cmd    = IPVS_CMD_NEW_DAEMON,
3660                 .flags  = GENL_ADMIN_PERM,
3661                 .policy = ip_vs_cmd_policy,
3662                 .doit   = ip_vs_genl_set_daemon,
3663         },
3664         {
3665                 .cmd    = IPVS_CMD_DEL_DAEMON,
3666                 .flags  = GENL_ADMIN_PERM,
3667                 .policy = ip_vs_cmd_policy,
3668                 .doit   = ip_vs_genl_set_daemon,
3669         },
3670         {
3671                 .cmd    = IPVS_CMD_GET_DAEMON,
3672                 .flags  = GENL_ADMIN_PERM,
3673                 .dumpit = ip_vs_genl_dump_daemons,
3674         },
3675         {
3676                 .cmd    = IPVS_CMD_SET_CONFIG,
3677                 .flags  = GENL_ADMIN_PERM,
3678                 .policy = ip_vs_cmd_policy,
3679                 .doit   = ip_vs_genl_set_cmd,
3680         },
3681         {
3682                 .cmd    = IPVS_CMD_GET_CONFIG,
3683                 .flags  = GENL_ADMIN_PERM,
3684                 .doit   = ip_vs_genl_get_cmd,
3685         },
3686         {
3687                 .cmd    = IPVS_CMD_GET_INFO,
3688                 .flags  = GENL_ADMIN_PERM,
3689                 .doit   = ip_vs_genl_get_cmd,
3690         },
3691         {
3692                 .cmd    = IPVS_CMD_ZERO,
3693                 .flags  = GENL_ADMIN_PERM,
3694                 .policy = ip_vs_cmd_policy,
3695                 .doit   = ip_vs_genl_set_cmd,
3696         },
3697         {
3698                 .cmd    = IPVS_CMD_FLUSH,
3699                 .flags  = GENL_ADMIN_PERM,
3700                 .doit   = ip_vs_genl_set_cmd,
3701         },
3702 };
3703
3704 static int __init ip_vs_genl_register(void)
3705 {
3706         return genl_register_family_with_ops(&ip_vs_genl_family,
3707                 ip_vs_genl_ops, ARRAY_SIZE(ip_vs_genl_ops));
3708 }
3709
3710 static void ip_vs_genl_unregister(void)
3711 {
3712         genl_unregister_family(&ip_vs_genl_family);
3713 }
3714
3715 /* End of Generic Netlink interface definitions */
3716
3717 /*
3718  * Per-netns init/exit functions.
3719  */
3720 #ifdef CONFIG_SYSCTL
3721 static int __net_init ip_vs_control_net_init_sysctl(struct net *net)
3722 {
3723         int idx;
3724         struct netns_ipvs *ipvs = net_ipvs(net);
3725         struct ctl_table *tbl;
3726
3727         atomic_set(&ipvs->dropentry, 0);
3728         spin_lock_init(&ipvs->dropentry_lock);
3729         spin_lock_init(&ipvs->droppacket_lock);
3730         spin_lock_init(&ipvs->securetcp_lock);
3731
3732         if (!net_eq(net, &init_net)) {
3733                 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3734                 if (tbl == NULL)
3735                         return -ENOMEM;
3736
3737                 /* Don't export sysctls to unprivileged users */
3738                 if (net->user_ns != &init_user_ns)
3739                         tbl[0].procname = NULL;
3740         } else
3741                 tbl = vs_vars;
3742         /* Initialize sysctl defaults */
3743         idx = 0;
3744         ipvs->sysctl_amemthresh = 1024;
3745         tbl[idx++].data = &ipvs->sysctl_amemthresh;
3746         ipvs->sysctl_am_droprate = 10;
3747         tbl[idx++].data = &ipvs->sysctl_am_droprate;
3748         tbl[idx++].data = &ipvs->sysctl_drop_entry;
3749         tbl[idx++].data = &ipvs->sysctl_drop_packet;
3750 #ifdef CONFIG_IP_VS_NFCT
3751         tbl[idx++].data = &ipvs->sysctl_conntrack;
3752 #endif
3753         tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3754         ipvs->sysctl_snat_reroute = 1;
3755         tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3756         ipvs->sysctl_sync_ver = 1;
3757         tbl[idx++].data = &ipvs->sysctl_sync_ver;
3758         ipvs->sysctl_sync_ports = 1;
3759         tbl[idx++].data = &ipvs->sysctl_sync_ports;
3760         ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
3761         tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
3762         ipvs->sysctl_sync_sock_size = 0;
3763         tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
3764         tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3765         tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3766         tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3767         ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
3768         ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
3769         tbl[idx].data = &ipvs->sysctl_sync_threshold;
3770         tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3771         ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
3772         tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
3773         ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
3774         tbl[idx++].data = &ipvs->sysctl_sync_retries;
3775         tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3776         ipvs->sysctl_pmtu_disc = 1;
3777         tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
3778         tbl[idx++].data = &ipvs->sysctl_backup_only;
3779
3780
3781         ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl);
3782         if (ipvs->sysctl_hdr == NULL) {
3783                 if (!net_eq(net, &init_net))
3784                         kfree(tbl);
3785                 return -ENOMEM;
3786         }
3787         ip_vs_start_estimator(net, &ipvs->tot_stats);
3788         ipvs->sysctl_tbl = tbl;
3789         /* Schedule defense work */
3790         INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3791         schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3792
3793         return 0;
3794 }
3795
3796 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
3797 {
3798         struct netns_ipvs *ipvs = net_ipvs(net);
3799
3800         cancel_delayed_work_sync(&ipvs->defense_work);
3801         cancel_work_sync(&ipvs->defense_work.work);
3802         unregister_net_sysctl_table(ipvs->sysctl_hdr);
3803 }
3804
3805 #else
3806
3807 static int __net_init ip_vs_control_net_init_sysctl(struct net *net) { return 0; }
3808 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net) { }
3809
3810 #endif
3811
3812 static struct notifier_block ip_vs_dst_notifier = {
3813         .notifier_call = ip_vs_dst_event,
3814 };
3815
3816 int __net_init ip_vs_control_net_init(struct net *net)
3817 {
3818         int idx;
3819         struct netns_ipvs *ipvs = net_ipvs(net);
3820
3821         /* Initialize rs_table */
3822         for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3823                 INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
3824
3825         INIT_LIST_HEAD(&ipvs->dest_trash);
3826         spin_lock_init(&ipvs->dest_trash_lock);
3827         setup_timer(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire,
3828                     (unsigned long) net);
3829         atomic_set(&ipvs->ftpsvc_counter, 0);
3830         atomic_set(&ipvs->nullsvc_counter, 0);
3831
3832         /* procfs stats */
3833         ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3834         if (!ipvs->tot_stats.cpustats)
3835                 return -ENOMEM;
3836
3837         spin_lock_init(&ipvs->tot_stats.lock);
3838
3839         proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
3840         proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
3841         proc_create("ip_vs_stats_percpu", 0, net->proc_net,
3842                     &ip_vs_stats_percpu_fops);
3843
3844         if (ip_vs_control_net_init_sysctl(net))
3845                 goto err;
3846
3847         return 0;
3848
3849 err:
3850         free_percpu(ipvs->tot_stats.cpustats);
3851         return -ENOMEM;
3852 }
3853
3854 void __net_exit ip_vs_control_net_cleanup(struct net *net)
3855 {
3856         struct netns_ipvs *ipvs = net_ipvs(net);
3857
3858         /* Some dest can be in grace period even before cleanup, we have to
3859          * defer ip_vs_trash_cleanup until ip_vs_dest_wait_readers is called.
3860          */
3861         rcu_barrier();
3862         ip_vs_trash_cleanup(net);
3863         ip_vs_stop_estimator(net, &ipvs->tot_stats);
3864         ip_vs_control_net_cleanup_sysctl(net);
3865         remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
3866         remove_proc_entry("ip_vs_stats", net->proc_net);
3867         remove_proc_entry("ip_vs", net->proc_net);
3868         free_percpu(ipvs->tot_stats.cpustats);
3869 }
3870
3871 int __init ip_vs_register_nl_ioctl(void)
3872 {
3873         int ret;
3874
3875         ret = nf_register_sockopt(&ip_vs_sockopts);
3876         if (ret) {
3877                 pr_err("cannot register sockopt.\n");
3878                 goto err_sock;
3879         }
3880
3881         ret = ip_vs_genl_register();
3882         if (ret) {
3883                 pr_err("cannot register Generic Netlink interface.\n");
3884                 goto err_genl;
3885         }
3886         return 0;
3887
3888 err_genl:
3889         nf_unregister_sockopt(&ip_vs_sockopts);
3890 err_sock:
3891         return ret;
3892 }
3893
3894 void ip_vs_unregister_nl_ioctl(void)
3895 {
3896         ip_vs_genl_unregister();
3897         nf_unregister_sockopt(&ip_vs_sockopts);
3898 }
3899
3900 int __init ip_vs_control_init(void)
3901 {
3902         int idx;
3903         int ret;
3904
3905         EnterFunction(2);
3906
3907         /* Initialize svc_table, ip_vs_svc_fwm_table */
3908         for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3909                 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3910                 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3911         }
3912
3913         smp_wmb();      /* Do we really need it now ? */
3914
3915         ret = register_netdevice_notifier(&ip_vs_dst_notifier);
3916         if (ret < 0)
3917                 return ret;
3918
3919         LeaveFunction(2);
3920         return 0;
3921 }
3922
3923
3924 void ip_vs_control_cleanup(void)
3925 {
3926         EnterFunction(2);
3927         unregister_netdevice_notifier(&ip_vs_dst_notifier);
3928         LeaveFunction(2);
3929 }