/*
 * IPVS         An implementation of the IP virtual server support for the
 *              LINUX operating system.  IPVS is now implemented as a module
 *              over the Netfilter framework. IPVS can be used to build a
 *              high-performance and highly available server based on a
 *              cluster of servers.
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Peter Kese <peter.kese@ijs.si>
 *              Julian Anastasov <ja@ssi.bg>
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of
 * kernel 2.2.
 *
 * Changes:
 *
 */

#define KMSG_COMPONENT "IPVS"
#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt

#include <linux/interrupt.h>
#include <linux/in.h>
#include <linux/net.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/proc_fs.h>              /* for proc_net_* */
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/jhash.h>
#include <linux/random.h>

#include <net/net_namespace.h>
#include <net/ip_vs.h>


#ifndef CONFIG_IP_VS_TAB_BITS
#define CONFIG_IP_VS_TAB_BITS   12
#endif

/*
 * Connection hash size. Default is what was selected at compile time.
 */
static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
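
/* The connection table has 2^conn_tab_bits buckets; the parameter is
 * exported read-only (perm 0444) and can only be set at module load time. */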

/* size and mask values */
int ip_vs_conn_tab_size __read_mostly;
static int ip_vs_conn_tab_mask __read_mostly;

/*
 *  Connection hash table: for input and output packet lookups of IPVS
 */
static struct hlist_head *ip_vs_conn_tab __read_mostly;

/*  SLAB cache for IPVS connections */
static struct kmem_cache *ip_vs_conn_cachep __read_mostly;

/*  counter for no client port connections */
static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);

/* random value for IPVS connection hash */
static unsigned int ip_vs_conn_rnd __read_mostly;

/*
 *  Fine locking granularity for big connection hash table
 */
#define CT_LOCKARRAY_BITS  5
#define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
#define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)

struct ip_vs_aligned_lock
{
        rwlock_t        l;
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

/* lock array for conn table */
static struct ip_vs_aligned_lock
__ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;

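/* The table is protected by an array of cache-line-aligned rwlocks rather
 * than a single lock: a bucket's hash value selects one of the
 * CT_LOCKARRAY_SIZE locks via key & CT_LOCKARRAY_MASK, so operations on
 * unrelated buckets do not contend with each other. */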
static inline void ct_read_lock(unsigned int key)
{
        read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock(unsigned int key)
{
        read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock(unsigned int key)
{
        write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock(unsigned int key)
{
        write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_lock_bh(unsigned int key)
{
        read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_read_unlock_bh(unsigned int key)
{
        read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_lock_bh(unsigned int key)
{
        write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}

static inline void ct_write_unlock_bh(unsigned int key)
{
        write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}


/*
 *      Returns hash value for IPVS connection entry
 */
static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned int proto,
                                       const union nf_inet_addr *addr,
                                       __be16 port)
{
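        /* Fold the netns pointer into the hash (low bits dropped, as they
         * carry little entropy due to allocation alignment) so that equal
         * flows from different namespaces spread over different buckets. */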
#ifdef CONFIG_IP_VS_IPV6
        if (af == AF_INET6)
                return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
                                    (__force u32)port, proto, ip_vs_conn_rnd) ^
                        ((size_t)net>>8)) & ip_vs_conn_tab_mask;
#endif
        return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
                            ip_vs_conn_rnd) ^
                ((size_t)net>>8)) & ip_vs_conn_tab_mask;
}

static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
                                             bool inverse)
{
        const union nf_inet_addr *addr;
        __be16 port;

        if (p->pe_data && p->pe->hashkey_raw)
                return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) &
                        ip_vs_conn_tab_mask;

        if (likely(!inverse)) {
                addr = p->caddr;
                port = p->cport;
        } else {
                addr = p->vaddr;
                port = p->vport;
        }

        return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
}

static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
{
        struct ip_vs_conn_param p;

        ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
                              &cp->caddr, cp->cport, NULL, 0, &p);

        if (cp->pe) {
                p.pe = cp->pe;
                p.pe_data = cp->pe_data;
                p.pe_data_len = cp->pe_data_len;
        }

        return ip_vs_conn_hashkey_param(&p, false);
}

/*
 *      Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
 *      returns bool success.
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
        unsigned int hash;
        int ret;

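        /* One-packet scheduling connections handle exactly one packet and
         * are never looked up again, so they are not added to the table. */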
        if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
                return 0;

        /* Hash by protocol, client address and port */
        hash = ip_vs_conn_hashkey_conn(cp);

        ct_write_lock(hash);
        spin_lock(&cp->lock);

        if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
                hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]);
                cp->flags |= IP_VS_CONN_F_HASHED;
                atomic_inc(&cp->refcnt);
                ret = 1;
        } else {
                pr_err("%s(): request for already hashed, called from %pF\n",
                       __func__, __builtin_return_address(0));
                ret = 0;
        }

        spin_unlock(&cp->lock);
        ct_write_unlock(hash);

        return ret;
}


/*
 *      UNhashes ip_vs_conn from ip_vs_conn_tab.
 *      returns bool success.
 */
static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
{
        unsigned int hash;
        int ret;

        /* unhash it and decrease its reference counter */
        hash = ip_vs_conn_hashkey_conn(cp);

        ct_write_lock(hash);
        spin_lock(&cp->lock);

        if (cp->flags & IP_VS_CONN_F_HASHED) {
                hlist_del(&cp->c_list);
                cp->flags &= ~IP_VS_CONN_F_HASHED;
                atomic_dec(&cp->refcnt);
                ret = 1;
        } else
                ret = 0;

        spin_unlock(&cp->lock);
        ct_write_unlock(hash);

        return ret;
}


/*
 *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *      p->caddr, p->cport: pkt source address (foreign host)
 *      p->vaddr, p->vport: pkt dest address (load balancer)
 */
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
        unsigned int hash;
        struct ip_vs_conn *cp;
        struct hlist_node *n;

        hash = ip_vs_conn_hashkey_param(p, false);

        ct_read_lock(hash);

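        /* A hit must match address family, protocol, netns and both
         * address/port pairs.  The XOR term below makes a lookup with a zero
         * client port match only entries created with IP_VS_CONN_F_NO_CPORT,
         * while a lookup with a real port matches only fully specified
         * entries. */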
        hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
                if (cp->af == p->af &&
                    p->cport == cp->cport && p->vport == cp->vport &&
                    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
                    ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
                    ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
                    p->protocol == cp->protocol &&
                    ip_vs_conn_net_eq(cp, p->net)) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ct_read_unlock(hash);
                        return cp;
                }
        }

        ct_read_unlock(hash);

        return NULL;
}

struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
        struct ip_vs_conn *cp;

        cp = __ip_vs_conn_in_get(p);
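        /* No exact match: if any no-cport entries exist at all, retry the
         * lookup with a zero client port so that connections created before
         * the client port was known (e.g. FTP) can still be found. */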
        if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) {
                struct ip_vs_conn_param cport_zero_p = *p;
                cport_zero_p.cport = 0;
                cp = __ip_vs_conn_in_get(&cport_zero_p);
        }

        IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
                      ip_vs_proto_name(p->protocol),
                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      cp ? "hit" : "not hit");

        return cp;
}

static int
ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
                            const struct ip_vs_iphdr *iph,
                            unsigned int proto_off, int inverse,
                            struct ip_vs_conn_param *p)
{
        __be16 _ports[2], *pptr;
        struct net *net = skb_net(skb);

        pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
        if (pptr == NULL)
                return 1;

        if (likely(!inverse))
                ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
                                      pptr[0], &iph->daddr, pptr[1], p);
        else
                ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
                                      pptr[1], &iph->saddr, pptr[0], p);
        return 0;
}

struct ip_vs_conn *
ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
                        const struct ip_vs_iphdr *iph,
                        unsigned int proto_off, int inverse)
{
        struct ip_vs_conn_param p;

        if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
                return NULL;

        return ip_vs_conn_in_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);

/* Get reference to connection template */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
        unsigned int hash;
        struct ip_vs_conn *cp;
        struct hlist_node *n;

        hash = ip_vs_conn_hashkey_param(p, false);

        ct_read_lock(hash);

        hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
                if (!ip_vs_conn_net_eq(cp, p->net))
                        continue;
                if (p->pe_data && p->pe->ct_match) {
                        if (p->pe == cp->pe && p->pe->ct_match(p, cp))
                                goto out;
                        continue;
                }

                if (cp->af == p->af &&
                    ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
                    /* protocol should only be IPPROTO_IP if
                     * p->vaddr is a fwmark */
                    ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC :
                                     p->af, p->vaddr, &cp->vaddr) &&
                    p->cport == cp->cport && p->vport == cp->vport &&
                    cp->flags & IP_VS_CONN_F_TEMPLATE &&
                    p->protocol == cp->protocol)
                        goto out;
        }
        cp = NULL;

  out:
        if (cp)
                atomic_inc(&cp->refcnt);
        ct_read_unlock(hash);

        IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
                      ip_vs_proto_name(p->protocol),
                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      cp ? "hit" : "not hit");

        return cp;
}

/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
 * Called for pkts coming from inside-to-OUTside.
 *      p->caddr, p->cport: pkt source address (inside host)
 *      p->vaddr, p->vport: pkt dest address (foreign host) */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
        unsigned int hash;
        struct ip_vs_conn *cp, *ret = NULL;
        struct hlist_node *n;

        /*
         *      Check for "full" addressed entries
         */
        hash = ip_vs_conn_hashkey_param(p, true);

        ct_read_lock(hash);

        hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
                if (cp->af == p->af &&
                    p->vport == cp->cport && p->cport == cp->dport &&
                    ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
                    ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
                    p->protocol == cp->protocol &&
                    ip_vs_conn_net_eq(cp, p->net)) {
                        /* HIT */
                        atomic_inc(&cp->refcnt);
                        ret = cp;
                        break;
                }
        }

        ct_read_unlock(hash);

        IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
                      ip_vs_proto_name(p->protocol),
                      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
                      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
                      ret ? "hit" : "not hit");

        return ret;
}

struct ip_vs_conn *
ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
                         const struct ip_vs_iphdr *iph,
                         unsigned int proto_off, int inverse)
{
        struct ip_vs_conn_param p;

        if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p))
                return NULL;

        return ip_vs_conn_out_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);

/*
 *      Put back the conn and restart its timer with its timeout
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
        unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
                0 : cp->timeout;
        mod_timer(&cp->timer, jiffies+t);

        __ip_vs_conn_put(cp);
}


/*
 *      Fill a no_client_port connection with a client port number
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
        if (ip_vs_conn_unhash(cp)) {
                spin_lock(&cp->lock);
                if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
                        atomic_dec(&ip_vs_conn_no_cport_cnt);
                        cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
                        cp->cport = cport;
                }
                spin_unlock(&cp->lock);

                /* hash on new cport */
                ip_vs_conn_hash(cp);
        }
}


/*
 *      Bind a connection entry with the corresponding packet_xmit.
 *      Called by ip_vs_conn_new.
 */
static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
{
        switch (IP_VS_FWD_METHOD(cp)) {
        case IP_VS_CONN_F_MASQ:
                cp->packet_xmit = ip_vs_nat_xmit;
                break;

        case IP_VS_CONN_F_TUNNEL:
                cp->packet_xmit = ip_vs_tunnel_xmit;
                break;

        case IP_VS_CONN_F_DROUTE:
                cp->packet_xmit = ip_vs_dr_xmit;
                break;

        case IP_VS_CONN_F_LOCALNODE:
                cp->packet_xmit = ip_vs_null_xmit;
                break;

        case IP_VS_CONN_F_BYPASS:
                cp->packet_xmit = ip_vs_bypass_xmit;
                break;
        }
}

#ifdef CONFIG_IP_VS_IPV6
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
        switch (IP_VS_FWD_METHOD(cp)) {
        case IP_VS_CONN_F_MASQ:
                cp->packet_xmit = ip_vs_nat_xmit_v6;
                break;

        case IP_VS_CONN_F_TUNNEL:
                cp->packet_xmit = ip_vs_tunnel_xmit_v6;
                break;

        case IP_VS_CONN_F_DROUTE:
                cp->packet_xmit = ip_vs_dr_xmit_v6;
                break;

        case IP_VS_CONN_F_LOCALNODE:
                cp->packet_xmit = ip_vs_null_xmit;
                break;

        case IP_VS_CONN_F_BYPASS:
                cp->packet_xmit = ip_vs_bypass_xmit_v6;
                break;
        }
}
#endif


static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
{
        return atomic_read(&dest->activeconns)
                + atomic_read(&dest->inactconns);
}

/*
 *      Bind a connection entry with a virtual service destination
 *      Called just after a new connection entry is created.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
        unsigned int conn_flags;
        __u32 flags;

        /* if dest is NULL, then return directly */
        if (!dest)
                return;

        /* Increase the refcnt counter of the dest */
        atomic_inc(&dest->refcnt);

        conn_flags = atomic_read(&dest->conn_flags);
        if (cp->protocol != IPPROTO_UDP)
                conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
        flags = cp->flags;
        /* Bind with the destination and its corresponding transmitter */
        if (flags & IP_VS_CONN_F_SYNC) {
                /* if the connection is not a template and is created
                 * by sync, preserve the activity flag.
                 */
                if (!(flags & IP_VS_CONN_F_TEMPLATE))
                        conn_flags &= ~IP_VS_CONN_F_INACTIVE;
                /* connections inherit forwarding method from dest */
                flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
        }
        flags |= conn_flags;
        cp->flags = flags;
        cp->dest = dest;

        IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
                      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
                      "dest->refcnt:%d\n",
                      ip_vs_proto_name(cp->protocol),
                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
                      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
                      ip_vs_fwd_tag(cp), cp->state,
                      cp->flags, atomic_read(&cp->refcnt),
                      atomic_read(&dest->refcnt));

        /* Update the connection counters */
        if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
                /* It is a normal connection, so modify the counters
                 * according to the flags; later the protocol can
                 * update them on state change
                 */
                if (!(flags & IP_VS_CONN_F_INACTIVE))
                        atomic_inc(&dest->activeconns);
                else
                        atomic_inc(&dest->inactconns);
        } else {
                /* It is a persistent connection/template, so increase
                   the persistent connection counter */
                atomic_inc(&dest->persistconns);
        }

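        /* Mark the destination as overloaded once its total connection count
         * reaches the configured upper threshold; ip_vs_unbind_dest clears
         * the flag again when the load drops. */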
        if (dest->u_threshold != 0 &&
            ip_vs_dest_totalconns(dest) >= dest->u_threshold)
                dest->flags |= IP_VS_DEST_F_OVERLOAD;
}


/*
 * Check if there is a destination for the connection, if so
 * bind the connection to the destination.
 */
struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
        struct ip_vs_dest *dest;

        dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
                               cp->dport, &cp->vaddr, cp->vport,
                               cp->protocol, cp->fwmark, cp->flags);
        if (dest) {
                struct ip_vs_proto_data *pd;

                spin_lock(&cp->lock);
                if (cp->dest) {
                        spin_unlock(&cp->lock);
                        return dest;
                }

                /* Applications work depending on the forwarding method,
                 * but it is better to always reassign them when binding
                 * the dest */
                if (cp->app)
                        ip_vs_unbind_app(cp);

                ip_vs_bind_dest(cp, dest);
                spin_unlock(&cp->lock);

                /* Update its packet transmitter */
                cp->packet_xmit = NULL;
#ifdef CONFIG_IP_VS_IPV6
                if (cp->af == AF_INET6)
                        ip_vs_bind_xmit_v6(cp);
                else
#endif
                        ip_vs_bind_xmit(cp);

                pd = ip_vs_proto_data_get(ip_vs_conn_net(cp), cp->protocol);
                if (pd && atomic_read(&pd->appcnt))
                        ip_vs_bind_app(cp, pd->pp);
        }
        return dest;
}


/*
 *      Unbind a connection entry with its VS destination
 *      Called by the ip_vs_conn_expire function.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
        struct ip_vs_dest *dest = cp->dest;

        if (!dest)
                return;

        IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
                      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
                      "dest->refcnt:%d\n",
                      ip_vs_proto_name(cp->protocol),
                      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
                      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
                      IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
                      ip_vs_fwd_tag(cp), cp->state,
                      cp->flags, atomic_read(&cp->refcnt),
                      atomic_read(&dest->refcnt));

        /* Update the connection counters */
        if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
                /* It is a normal connection, so decrease the inactconns
                   or activeconns counter */
                if (cp->flags & IP_VS_CONN_F_INACTIVE) {
                        atomic_dec(&dest->inactconns);
                } else {
                        atomic_dec(&dest->activeconns);
                }
        } else {
                /* It is a persistent connection/template, so decrease
                   the persistent connection counter */
                atomic_dec(&dest->persistconns);
        }

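        /* Clear the overload flag once the load has fallen far enough: below
         * the lower threshold if one is set, otherwise below 3/4 of the
         * upper threshold (a simple hysteresis), otherwise unconditionally. */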
        if (dest->l_threshold != 0) {
                if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
        } else if (dest->u_threshold != 0) {
                if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
        } else {
                if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                        dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
        }

        /*
         * Simply decrease the refcnt of the dest, because the
         * dest will be either in service's destination list
         * or in the trash.
         */
        atomic_dec(&dest->refcnt);
}

static int expire_quiescent_template(struct netns_ipvs *ipvs,
                                     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
        return ipvs->sysctl_expire_quiescent_template &&
                (atomic_read(&dest->weight) == 0);
#else
        return 0;
#endif
}

/*
 *      Check whether the destination of a connection template is available.
 *      If it is, return 1; otherwise invalidate this connection template
 *      and return 0.
 */
int ip_vs_check_template(struct ip_vs_conn *ct)
{
        struct ip_vs_dest *dest = ct->dest;
        struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));

        /*
         * Checking the dest server status.
         */
        if ((dest == NULL) ||
            !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
            expire_quiescent_template(ipvs, dest)) {
                IP_VS_DBG_BUF(9, "check_template: dest not available for "
                              "protocol %s s:%s:%d v:%s:%d "
                              "-> d:%s:%d\n",
                              ip_vs_proto_name(ct->protocol),
                              IP_VS_DBG_ADDR(ct->af, &ct->caddr),
                              ntohs(ct->cport),
                              IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
                              ntohs(ct->vport),
                              IP_VS_DBG_ADDR(ct->af, &ct->daddr),
                              ntohs(ct->dport));

                /*
                 * Invalidate the connection template
                 */
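                /* Rehash the template with ports no real lookup will use, so
                 * it stops matching new connections but still ages out
                 * normally through its timer. */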
                if (ct->vport != htons(0xffff)) {
                        if (ip_vs_conn_unhash(ct)) {
                                ct->dport = htons(0xffff);
                                ct->vport = htons(0xffff);
                                ct->cport = 0;
                                ip_vs_conn_hash(ct);
                        }
                }

                /*
                 * Simply decrease the refcnt of the template,
                 * don't restart its timer.
                 */
                atomic_dec(&ct->refcnt);
                return 0;
        }
        return 1;
}

static void ip_vs_conn_expire(unsigned long data)
{
        struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
        struct net *net = ip_vs_conn_net(cp);
        struct netns_ipvs *ipvs = net_ipvs(net);

        cp->timeout = 60*HZ;

        /*
         *      hey, I'm using it
         */
        atomic_inc(&cp->refcnt);

        /*
         *      do I control anybody?
         */
        if (atomic_read(&cp->n_control))
                goto expire_later;

        /*
         *      unhash it if it is hashed in the conn table
         */
        if (!ip_vs_conn_unhash(cp) && !(cp->flags & IP_VS_CONN_F_ONE_PACKET))
                goto expire_later;

        /*
         *      refcnt==1 implies I'm the only referrer
         */
        if (likely(atomic_read(&cp->refcnt) == 1)) {
                /* delete the timer if it is activated by other users */
                if (timer_pending(&cp->timer))
                        del_timer(&cp->timer);

                /* does anybody control me? */
                if (cp->control)
                        ip_vs_control_del(cp);

                if (cp->flags & IP_VS_CONN_F_NFCT) {
                        ip_vs_conn_drop_conntrack(cp);
                        /* Do not access conntracks during subsys cleanup
                         * because nf_conntrack_find_get can not be used after
                         * conntrack cleanup for the net.
                         */
                        smp_rmb();
                        if (ipvs->enable)
                                ip_vs_conn_drop_conntrack(cp);
                }

                ip_vs_pe_put(cp->pe);
                kfree(cp->pe_data);
                if (unlikely(cp->app != NULL))
                        ip_vs_unbind_app(cp);
                ip_vs_unbind_dest(cp);
                if (cp->flags & IP_VS_CONN_F_NO_CPORT)
                        atomic_dec(&ip_vs_conn_no_cport_cnt);
                atomic_dec(&ipvs->conn_count);

                kmem_cache_free(ip_vs_conn_cachep, cp);
                return;
        }

        /* hash it back to the table */
        ip_vs_conn_hash(cp);

  expire_later:
        IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
                  atomic_read(&cp->refcnt)-1,
                  atomic_read(&cp->n_control));

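        /* The connection stays alive for now: let a sync master propagate
         * its state to the backups before the timer is re-armed by
         * ip_vs_conn_put() below. */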
        if (ipvs->sync_state & IP_VS_STATE_MASTER)
                ip_vs_sync_conn(net, cp, sysctl_sync_threshold(ipvs));

        ip_vs_conn_put(cp);
}


void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
        if (del_timer(&cp->timer))
                mod_timer(&cp->timer, jiffies);
}


/*
 *      Create a new connection entry and hash it into the ip_vs_conn_tab
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p,
               const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
               struct ip_vs_dest *dest, __u32 fwmark)
{
        struct ip_vs_conn *cp;
        struct netns_ipvs *ipvs = net_ipvs(p->net);
        struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
                                                           p->protocol);

        cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
        if (cp == NULL) {
                IP_VS_ERR_RL("%s(): no memory\n", __func__);
                return NULL;
        }

        INIT_HLIST_NODE(&cp->c_list);
        setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
        ip_vs_conn_net_set(cp, p->net);
        cp->af             = p->af;
        cp->protocol       = p->protocol;
        ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
        cp->cport          = p->cport;
        ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr);
        cp->vport          = p->vport;
        /* proto should only be IPPROTO_IP if d_addr is a fwmark */
        ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
                        &cp->daddr, daddr);
        cp->dport          = dport;
        cp->flags          = flags;
        cp->fwmark         = fwmark;
        if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
                ip_vs_pe_get(p->pe);
                cp->pe = p->pe;
                cp->pe_data = p->pe_data;
                cp->pe_data_len = p->pe_data_len;
        }
        spin_lock_init(&cp->lock);

        /*
         * Mark the entry as referenced by the current thread before hashing
         * it in the table, so that another thread running
         * ip_vs_random_dropentry cannot drop this entry.
         */
        atomic_set(&cp->refcnt, 1);

        atomic_set(&cp->n_control, 0);
        atomic_set(&cp->in_pkts, 0);

        atomic_inc(&ipvs->conn_count);
        if (flags & IP_VS_CONN_F_NO_CPORT)
                atomic_inc(&ip_vs_conn_no_cport_cnt);

        /* Bind the connection with a destination server */
        ip_vs_bind_dest(cp, dest);

        /* Set its state and timeout */
        cp->state = 0;
        cp->timeout = 3*HZ;
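        /* the sync code uses the low bits of sync_endtime for its own
         * bookkeeping, so keep them clear here */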
        cp->sync_endtime = jiffies & ~3UL;

        /* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
        if (p->af == AF_INET6)
                ip_vs_bind_xmit_v6(cp);
        else
#endif
                ip_vs_bind_xmit(cp);

        if (unlikely(pd && atomic_read(&pd->appcnt)))
                ip_vs_bind_app(cp, pd->pp);

        /*
         * Allow conntrack to be preserved. By default, conntrack
         * is created and destroyed for every packet.
         * Sometimes keeping conntrack can be useful for
         * IP_VS_CONN_F_ONE_PACKET too.
         */

        if (ip_vs_conntrack_enabled(ipvs))
                cp->flags |= IP_VS_CONN_F_NFCT;

        /* Hash it in the ip_vs_conn_tab finally */
        ip_vs_conn_hash(cp);

        return cp;
}

/*
 *      /proc/net/ip_vs_conn entries
 */
#ifdef CONFIG_PROC_FS
struct ip_vs_iter_state {
        struct seq_net_private  p;
        struct hlist_head       *l;
};

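/* The seq_file iterator keeps the current hash bucket read-locked (bh)
 * between calls; iter->l records which bucket is locked so that
 * ip_vs_conn_seq_stop() (or a move to the next bucket) can release it. */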
static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
{
        int idx;
        struct ip_vs_conn *cp;
        struct ip_vs_iter_state *iter = seq->private;
        struct hlist_node *n;

        for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
                ct_read_lock_bh(idx);
                hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
                        if (pos-- == 0) {
                                iter->l = &ip_vs_conn_tab[idx];
                                return cp;
                        }
                }
                ct_read_unlock_bh(idx);
        }

        return NULL;
}

static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ip_vs_iter_state *iter = seq->private;

        iter->l = NULL;
        return *pos ? ip_vs_conn_array(seq, *pos - 1) : SEQ_START_TOKEN;
}

static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ip_vs_conn *cp = v;
        struct ip_vs_iter_state *iter = seq->private;
        struct hlist_node *e;
        struct hlist_head *l = iter->l;
        int idx;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ip_vs_conn_array(seq, 0);

        /* more on same hash chain? */
        if ((e = cp->c_list.next))
                return hlist_entry(e, struct ip_vs_conn, c_list);

        idx = l - ip_vs_conn_tab;
        ct_read_unlock_bh(idx);

        while (++idx < ip_vs_conn_tab_size) {
                ct_read_lock_bh(idx);
                hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) {
                        iter->l = &ip_vs_conn_tab[idx];
                        return cp;
                }
                ct_read_unlock_bh(idx);
        }
        iter->l = NULL;
        return NULL;
}

static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
{
        struct ip_vs_iter_state *iter = seq->private;
        struct hlist_head *l = iter->l;

        if (l)
                ct_read_unlock_bh(l - ip_vs_conn_tab);
}

static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{

        if (v == SEQ_START_TOKEN)
                seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
        else {
                const struct ip_vs_conn *cp = v;
                struct net *net = seq_file_net(seq);
                char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
                size_t len = 0;

                if (!ip_vs_conn_net_eq(cp, net))
                        return 0;
                if (cp->pe_data) {
                        pe_data[0] = ' ';
                        len = strlen(cp->pe->name);
                        memcpy(pe_data + 1, cp->pe->name, len);
                        pe_data[len + 1] = ' ';
                        len += 2;
                        len += cp->pe->show_pe_data(cp, pe_data + len);
                }
                pe_data[len] = '\0';

#ifdef CONFIG_IP_VS_IPV6
                if (cp->af == AF_INET6)
                        seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
                                "%pI6 %04X %-11s %7lu%s\n",
                                ip_vs_proto_name(cp->protocol),
                                &cp->caddr.in6, ntohs(cp->cport),
                                &cp->vaddr.in6, ntohs(cp->vport),
                                &cp->daddr.in6, ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
                                (cp->timer.expires-jiffies)/HZ, pe_data);
                else
#endif
                        seq_printf(seq,
                                "%-3s %08X %04X %08X %04X"
                                " %08X %04X %-11s %7lu%s\n",
                                ip_vs_proto_name(cp->protocol),
                                ntohl(cp->caddr.ip), ntohs(cp->cport),
                                ntohl(cp->vaddr.ip), ntohs(cp->vport),
                                ntohl(cp->daddr.ip), ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
                                (cp->timer.expires-jiffies)/HZ, pe_data);
        }
        return 0;
}

static const struct seq_operations ip_vs_conn_seq_ops = {
        .start = ip_vs_conn_seq_start,
        .next  = ip_vs_conn_seq_next,
        .stop  = ip_vs_conn_seq_stop,
        .show  = ip_vs_conn_seq_show,
};

static int ip_vs_conn_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
                            sizeof(struct ip_vs_iter_state));
}

static const struct file_operations ip_vs_conn_fops = {
        .owner   = THIS_MODULE,
        .open    = ip_vs_conn_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};

static const char *ip_vs_origin_name(unsigned int flags)
{
        if (flags & IP_VS_CONN_F_SYNC)
                return "SYNC";
        else
                return "LOCAL";
}

static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
{

        if (v == SEQ_START_TOKEN)
                seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
        else {
                const struct ip_vs_conn *cp = v;
                struct net *net = seq_file_net(seq);

                if (!ip_vs_conn_net_eq(cp, net))
                        return 0;

#ifdef CONFIG_IP_VS_IPV6
                if (cp->af == AF_INET6)
                        seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %-6s %7lu\n",
                                ip_vs_proto_name(cp->protocol),
                                &cp->caddr.in6, ntohs(cp->cport),
                                &cp->vaddr.in6, ntohs(cp->vport),
                                &cp->daddr.in6, ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
                                ip_vs_origin_name(cp->flags),
                                (cp->timer.expires-jiffies)/HZ);
                else
#endif
                        seq_printf(seq,
                                "%-3s %08X %04X %08X %04X "
                                "%08X %04X %-11s %-6s %7lu\n",
                                ip_vs_proto_name(cp->protocol),
                                ntohl(cp->caddr.ip), ntohs(cp->cport),
                                ntohl(cp->vaddr.ip), ntohs(cp->vport),
                                ntohl(cp->daddr.ip), ntohs(cp->dport),
                                ip_vs_state_name(cp->protocol, cp->state),
                                ip_vs_origin_name(cp->flags),
                                (cp->timer.expires-jiffies)/HZ);
        }
        return 0;
}

static const struct seq_operations ip_vs_conn_sync_seq_ops = {
        .start = ip_vs_conn_seq_start,
        .next  = ip_vs_conn_seq_next,
        .stop  = ip_vs_conn_seq_stop,
        .show  = ip_vs_conn_sync_seq_show,
};

static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
                            sizeof(struct ip_vs_iter_state));
}

static const struct file_operations ip_vs_conn_sync_fops = {
        .owner   = THIS_MODULE,
        .open    = ip_vs_conn_sync_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};

#endif


/*
 *      Randomly drop connection entries before running out of memory
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
        /*
         * The drop rate array needs tuning for real environments.
         * Called from timer bh only => no locking
         */
        static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
        static char todrop_counter[9] = {0};
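        /* Entries that have seen no packets are never dropped here; otherwise
         * one out of every todrop_rate[i] candidates with i packets is
         * dropped, so lightly used connections are dropped first. */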
        int i;

        /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
           This will leave enough time for normal connections to get
           through. */
        if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
                return 0;

        /* Don't drop the entry if its number of incoming packets is not
           in the range [0, 8] */
        i = atomic_read(&cp->in_pkts);
        if (i > 8 || i < 0) return 0;

        if (!todrop_rate[i]) return 0;
        if (--todrop_counter[i] > 0) return 0;

        todrop_counter[i] = todrop_rate[i];
        return 1;
}

/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct net *net)
{
        int idx;
        struct ip_vs_conn *cp;

        /*
         * Randomly scan 1/32 of the whole table every second
         */
        for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) {
                unsigned int hash = net_random() & ip_vs_conn_tab_mask;
                struct hlist_node *n;

                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock_bh(hash);

                hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) {
                        if (cp->flags & IP_VS_CONN_F_TEMPLATE)
                                /* connection template */
                                continue;
                        if (!ip_vs_conn_net_eq(cp, net))
                                continue;
                        if (cp->protocol == IPPROTO_TCP) {
                                switch (cp->state) {
                                case IP_VS_TCP_S_SYN_RECV:
                                case IP_VS_TCP_S_SYNACK:
                                        break;

                                case IP_VS_TCP_S_ESTABLISHED:
                                        if (todrop_entry(cp))
                                                break;
                                        continue;

                                default:
                                        continue;
                                }
                        } else {
                                if (!todrop_entry(cp))
                                        continue;
                        }

                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (cp->control) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(cp->control);
                        }
                }
                ct_write_unlock_bh(hash);
        }
}


/*
 *      Flush all the connection entries in the ip_vs_conn_tab
 */
static void ip_vs_conn_flush(struct net *net)
{
        int idx;
        struct ip_vs_conn *cp;
        struct netns_ipvs *ipvs = net_ipvs(net);

flush_again:
        for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
                struct hlist_node *n;

                /*
                 *  Lock is actually needed in this loop.
                 */
                ct_write_lock_bh(idx);

                hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) {
                        if (!ip_vs_conn_net_eq(cp, net))
                                continue;
                        IP_VS_DBG(4, "del connection\n");
                        ip_vs_conn_expire_now(cp);
                        if (cp->control) {
                                IP_VS_DBG(4, "del conn template\n");
                                ip_vs_conn_expire_now(cp->control);
                        }
                }
                ct_write_unlock_bh(idx);
        }

        /* the counter may not be zero, because some conn entries may still
           be run by the slow timer handler or may be unhashed but still
           referenced */
        if (atomic_read(&ipvs->conn_count) != 0) {
                schedule();
                goto flush_again;
        }
}

/*
 * per netns init and exit
 */
int __net_init ip_vs_conn_net_init(struct net *net)
{
        struct netns_ipvs *ipvs = net_ipvs(net);

        atomic_set(&ipvs->conn_count, 0);

        proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
        proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
        return 0;
}

void __net_exit ip_vs_conn_net_cleanup(struct net *net)
{
        /* flush all the connection entries first */
        ip_vs_conn_flush(net);
        proc_net_remove(net, "ip_vs_conn");
        proc_net_remove(net, "ip_vs_conn_sync");
}

int __init ip_vs_conn_init(void)
{
        int idx;

        /* Compute size and mask */
        ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
        ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1;

        /*
         * Allocate the connection hash table and initialize its list heads
         */
        ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab));
        if (!ip_vs_conn_tab)
                return -ENOMEM;

        /* Allocate ip_vs_conn slab cache */
        ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
                                              sizeof(struct ip_vs_conn), 0,
                                              SLAB_HWCACHE_ALIGN, NULL);
        if (!ip_vs_conn_cachep) {
                vfree(ip_vs_conn_tab);
                return -ENOMEM;
        }

        pr_info("Connection hash table configured "
                "(size=%d, memory=%ldKbytes)\n",
                ip_vs_conn_tab_size,
                (long)(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)) / 1024);
        IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
                  sizeof(struct ip_vs_conn));

        for (idx = 0; idx < ip_vs_conn_tab_size; idx++)
                INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]);

        for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) {
                rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
        }

        /* calculate the random value for connection hash */
        get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));

        return 0;
}

void ip_vs_conn_cleanup(void)
{
        /* Release the empty cache */
        kmem_cache_destroy(ip_vs_conn_cachep);
        vfree(ip_vs_conn_tab);
}