]> Pileus Git - ~andy/linux/blob - net/ipv4/ipvs/ip_vs_conn.c
[IPVS]: Cleanup IP_VS_DBG statements.
[~andy/linux] / net / ipv4 / ipvs / ip_vs_conn.c
1 /*
2  * IPVS         An implementation of the IP virtual server support for the
3  *              LINUX operating system.  IPVS is now implemented as a module
4  *              over the Netfilter framework. IPVS can be used to build a
5  *              high-performance and highly available server based on a
6  *              cluster of servers.
7  *
8  * Version:     $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $
9  *
10  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
11  *              Peter Kese <peter.kese@ijs.si>
12  *              Julian Anastasov <ja@ssi.bg>
13  *
14  *              This program is free software; you can redistribute it and/or
15  *              modify it under the terms of the GNU General Public License
16  *              as published by the Free Software Foundation; either version
17  *              2 of the License, or (at your option) any later version.
18  *
19  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
20  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
21  * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
22  *
23  * Changes:
24  *
25  */
26
27 #include <linux/in.h>
28 #include <linux/kernel.h>
29 #include <linux/module.h>
30 #include <linux/vmalloc.h>
31 #include <linux/proc_fs.h>              /* for proc_net_* */
32 #include <linux/seq_file.h>
33 #include <linux/jhash.h>
34 #include <linux/random.h>
35
36 #include <net/ip_vs.h>
37
38
39 /*
40  *  Connection hash table: for input and output packets lookups of IPVS
41  */
42 static struct list_head *ip_vs_conn_tab;
43
44 /*  SLAB cache for IPVS connections */
45 static kmem_cache_t *ip_vs_conn_cachep __read_mostly;
46
47 /*  counter for current IPVS connections */
48 static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
49
50 /*  counter for no client port connections */
51 static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
52
53 /* random value for IPVS connection hash */
54 static unsigned int ip_vs_conn_rnd;
55
56 /*
57  *  Fine locking granularity for big connection hash table
58  */
59 #define CT_LOCKARRAY_BITS  4
60 #define CT_LOCKARRAY_SIZE  (1<<CT_LOCKARRAY_BITS)
61 #define CT_LOCKARRAY_MASK  (CT_LOCKARRAY_SIZE-1)
62
63 struct ip_vs_aligned_lock
64 {
65         rwlock_t        l;
66 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
67
68 /* lock array for conn table */
69 static struct ip_vs_aligned_lock
70 __ip_vs_conntbl_lock_array[CT_LOCKARRAY_SIZE] __cacheline_aligned;
71
72 static inline void ct_read_lock(unsigned key)
73 {
74         read_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
75 }
76
77 static inline void ct_read_unlock(unsigned key)
78 {
79         read_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
80 }
81
82 static inline void ct_write_lock(unsigned key)
83 {
84         write_lock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
85 }
86
87 static inline void ct_write_unlock(unsigned key)
88 {
89         write_unlock(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
90 }
91
92 static inline void ct_read_lock_bh(unsigned key)
93 {
94         read_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
95 }
96
97 static inline void ct_read_unlock_bh(unsigned key)
98 {
99         read_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
100 }
101
102 static inline void ct_write_lock_bh(unsigned key)
103 {
104         write_lock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
105 }
106
107 static inline void ct_write_unlock_bh(unsigned key)
108 {
109         write_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
110 }
111
112
113 /*
114  *      Returns hash value for IPVS connection entry
115  */
116 static unsigned int ip_vs_conn_hashkey(unsigned proto, __u32 addr, __u16 port)
117 {
118         return jhash_3words(addr, port, proto, ip_vs_conn_rnd)
119                 & IP_VS_CONN_TAB_MASK;
120 }
121
122
123 /*
124  *      Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port.
125  *      returns bool success.
126  */
127 static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
128 {
129         unsigned hash;
130         int ret;
131
132         /* Hash by protocol, client address and port */
133         hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
134
135         ct_write_lock(hash);
136
137         if (!(cp->flags & IP_VS_CONN_F_HASHED)) {
138                 list_add(&cp->c_list, &ip_vs_conn_tab[hash]);
139                 cp->flags |= IP_VS_CONN_F_HASHED;
140                 atomic_inc(&cp->refcnt);
141                 ret = 1;
142         } else {
143                 IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, "
144                           "called from %p\n", __builtin_return_address(0));
145                 ret = 0;
146         }
147
148         ct_write_unlock(hash);
149
150         return ret;
151 }
152
153
154 /*
155  *      UNhashes ip_vs_conn from ip_vs_conn_tab.
156  *      returns bool success.
157  */
158 static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp)
159 {
160         unsigned hash;
161         int ret;
162
163         /* unhash it and decrease its reference counter */
164         hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport);
165
166         ct_write_lock(hash);
167
168         if (cp->flags & IP_VS_CONN_F_HASHED) {
169                 list_del(&cp->c_list);
170                 cp->flags &= ~IP_VS_CONN_F_HASHED;
171                 atomic_dec(&cp->refcnt);
172                 ret = 1;
173         } else
174                 ret = 0;
175
176         ct_write_unlock(hash);
177
178         return ret;
179 }
180
181
182 /*
183  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
184  *  Called for pkts coming from OUTside-to-INside.
185  *      s_addr, s_port: pkt source address (foreign host)
186  *      d_addr, d_port: pkt dest address (load balancer)
187  */
188 static inline struct ip_vs_conn *__ip_vs_conn_in_get
189 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
190 {
191         unsigned hash;
192         struct ip_vs_conn *cp;
193
194         hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
195
196         ct_read_lock(hash);
197
198         list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
199                 if (s_addr==cp->caddr && s_port==cp->cport &&
200                     d_port==cp->vport && d_addr==cp->vaddr &&
201                     ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
202                     protocol==cp->protocol) {
203                         /* HIT */
204                         atomic_inc(&cp->refcnt);
205                         ct_read_unlock(hash);
206                         return cp;
207                 }
208         }
209
210         ct_read_unlock(hash);
211
212         return NULL;
213 }
214
215 struct ip_vs_conn *ip_vs_conn_in_get
216 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
217 {
218         struct ip_vs_conn *cp;
219
220         cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port);
221         if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt))
222                 cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port);
223
224         IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
225                   ip_vs_proto_name(protocol),
226                   NIPQUAD(s_addr), ntohs(s_port),
227                   NIPQUAD(d_addr), ntohs(d_port),
228                   cp?"hit":"not hit");
229
230         return cp;
231 }
232
233 /* Get reference to connection template */
234 struct ip_vs_conn *ip_vs_ct_in_get
235 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
236 {
237         unsigned hash;
238         struct ip_vs_conn *cp;
239
240         hash = ip_vs_conn_hashkey(protocol, s_addr, s_port);
241
242         ct_read_lock(hash);
243
244         list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
245                 if (s_addr==cp->caddr && s_port==cp->cport &&
246                     d_port==cp->vport && d_addr==cp->vaddr &&
247                     cp->flags & IP_VS_CONN_F_TEMPLATE &&
248                     protocol==cp->protocol) {
249                         /* HIT */
250                         atomic_inc(&cp->refcnt);
251                         goto out;
252                 }
253         }
254         cp = NULL;
255
256   out:
257         ct_read_unlock(hash);
258
259         IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
260                   ip_vs_proto_name(protocol),
261                   NIPQUAD(s_addr), ntohs(s_port),
262                   NIPQUAD(d_addr), ntohs(d_port),
263                   cp?"hit":"not hit");
264
265         return cp;
266 }
267
268 /*
269  *  Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab.
270  *  Called for pkts coming from inside-to-OUTside.
271  *      s_addr, s_port: pkt source address (inside host)
272  *      d_addr, d_port: pkt dest address (foreign host)
273  */
274 struct ip_vs_conn *ip_vs_conn_out_get
275 (int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port)
276 {
277         unsigned hash;
278         struct ip_vs_conn *cp, *ret=NULL;
279
280         /*
281          *      Check for "full" addressed entries
282          */
283         hash = ip_vs_conn_hashkey(protocol, d_addr, d_port);
284
285         ct_read_lock(hash);
286
287         list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
288                 if (d_addr == cp->caddr && d_port == cp->cport &&
289                     s_port == cp->dport && s_addr == cp->daddr &&
290                     protocol == cp->protocol) {
291                         /* HIT */
292                         atomic_inc(&cp->refcnt);
293                         ret = cp;
294                         break;
295                 }
296         }
297
298         ct_read_unlock(hash);
299
300         IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n",
301                   ip_vs_proto_name(protocol),
302                   NIPQUAD(s_addr), ntohs(s_port),
303                   NIPQUAD(d_addr), ntohs(d_port),
304                   ret?"hit":"not hit");
305
306         return ret;
307 }
308
309
310 /*
311  *      Put back the conn and restart its timer with its timeout
312  */
313 void ip_vs_conn_put(struct ip_vs_conn *cp)
314 {
315         /* reset it expire in its timeout */
316         mod_timer(&cp->timer, jiffies+cp->timeout);
317
318         __ip_vs_conn_put(cp);
319 }
320
321
322 /*
323  *      Fill a no_client_port connection with a client port number
324  */
325 void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport)
326 {
327         if (ip_vs_conn_unhash(cp)) {
328                 spin_lock(&cp->lock);
329                 if (cp->flags & IP_VS_CONN_F_NO_CPORT) {
330                         atomic_dec(&ip_vs_conn_no_cport_cnt);
331                         cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
332                         cp->cport = cport;
333                 }
334                 spin_unlock(&cp->lock);
335
336                 /* hash on new dport */
337                 ip_vs_conn_hash(cp);
338         }
339 }
340
341
342 /*
343  *      Bind a connection entry with the corresponding packet_xmit.
344  *      Called by ip_vs_conn_new.
345  */
346 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
347 {
348         switch (IP_VS_FWD_METHOD(cp)) {
349         case IP_VS_CONN_F_MASQ:
350                 cp->packet_xmit = ip_vs_nat_xmit;
351                 break;
352
353         case IP_VS_CONN_F_TUNNEL:
354                 cp->packet_xmit = ip_vs_tunnel_xmit;
355                 break;
356
357         case IP_VS_CONN_F_DROUTE:
358                 cp->packet_xmit = ip_vs_dr_xmit;
359                 break;
360
361         case IP_VS_CONN_F_LOCALNODE:
362                 cp->packet_xmit = ip_vs_null_xmit;
363                 break;
364
365         case IP_VS_CONN_F_BYPASS:
366                 cp->packet_xmit = ip_vs_bypass_xmit;
367                 break;
368         }
369 }
370
371
372 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
373 {
374         return atomic_read(&dest->activeconns)
375                 + atomic_read(&dest->inactconns);
376 }
377
378 /*
379  *      Bind a connection entry with a virtual service destination
380  *      Called just after a new connection entry is created.
381  */
382 static inline void
383 ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
384 {
385         /* if dest is NULL, then return directly */
386         if (!dest)
387                 return;
388
389         /* Increase the refcnt counter of the dest */
390         atomic_inc(&dest->refcnt);
391
392         /* Bind with the destination and its corresponding transmitter */
393         cp->flags |= atomic_read(&dest->conn_flags);
394         cp->dest = dest;
395
396         IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
397                   "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
398                   "dest->refcnt:%d\n",
399                   ip_vs_proto_name(cp->protocol),
400                   NIPQUAD(cp->caddr), ntohs(cp->cport),
401                   NIPQUAD(cp->vaddr), ntohs(cp->vport),
402                   NIPQUAD(cp->daddr), ntohs(cp->dport),
403                   ip_vs_fwd_tag(cp), cp->state,
404                   cp->flags, atomic_read(&cp->refcnt),
405                   atomic_read(&dest->refcnt));
406
407         /* Update the connection counters */
408         if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
409                 /* It is a normal connection, so increase the inactive
410                    connection counter because it is in TCP SYNRECV
411                    state (inactive) or other protocol inacive state */
412                 atomic_inc(&dest->inactconns);
413         } else {
414                 /* It is a persistent connection/template, so increase
415                    the peristent connection counter */
416                 atomic_inc(&dest->persistconns);
417         }
418
419         if (dest->u_threshold != 0 &&
420             ip_vs_dest_totalconns(dest) >= dest->u_threshold)
421                 dest->flags |= IP_VS_DEST_F_OVERLOAD;
422 }
423
424
425 /*
426  *      Unbind a connection entry with its VS destination
427  *      Called by the ip_vs_conn_expire function.
428  */
429 static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
430 {
431         struct ip_vs_dest *dest = cp->dest;
432
433         if (!dest)
434                 return;
435
436         IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
437                   "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
438                   "dest->refcnt:%d\n",
439                   ip_vs_proto_name(cp->protocol),
440                   NIPQUAD(cp->caddr), ntohs(cp->cport),
441                   NIPQUAD(cp->vaddr), ntohs(cp->vport),
442                   NIPQUAD(cp->daddr), ntohs(cp->dport),
443                   ip_vs_fwd_tag(cp), cp->state,
444                   cp->flags, atomic_read(&cp->refcnt),
445                   atomic_read(&dest->refcnt));
446
447         /* Update the connection counters */
448         if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
449                 /* It is a normal connection, so decrease the inactconns
450                    or activeconns counter */
451                 if (cp->flags & IP_VS_CONN_F_INACTIVE) {
452                         atomic_dec(&dest->inactconns);
453                 } else {
454                         atomic_dec(&dest->activeconns);
455                 }
456         } else {
457                 /* It is a persistent connection/template, so decrease
458                    the peristent connection counter */
459                 atomic_dec(&dest->persistconns);
460         }
461
462         if (dest->l_threshold != 0) {
463                 if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
464                         dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
465         } else if (dest->u_threshold != 0) {
466                 if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
467                         dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
468         } else {
469                 if (dest->flags & IP_VS_DEST_F_OVERLOAD)
470                         dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
471         }
472
473         /*
474          * Simply decrease the refcnt of the dest, because the
475          * dest will be either in service's destination list
476          * or in the trash.
477          */
478         atomic_dec(&dest->refcnt);
479 }
480
481
482 /*
483  *      Checking if the destination of a connection template is available.
484  *      If available, return 1, otherwise invalidate this connection
485  *      template and return 0.
486  */
487 int ip_vs_check_template(struct ip_vs_conn *ct)
488 {
489         struct ip_vs_dest *dest = ct->dest;
490
491         /*
492          * Checking the dest server status.
493          */
494         if ((dest == NULL) ||
495             !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 
496             (sysctl_ip_vs_expire_quiescent_template && 
497              (atomic_read(&dest->weight) == 0))) {
498                 IP_VS_DBG(9, "check_template: dest not available for "
499                           "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d "
500                           "-> d:%u.%u.%u.%u:%d\n",
501                           ip_vs_proto_name(ct->protocol),
502                           NIPQUAD(ct->caddr), ntohs(ct->cport),
503                           NIPQUAD(ct->vaddr), ntohs(ct->vport),
504                           NIPQUAD(ct->daddr), ntohs(ct->dport));
505
506                 /*
507                  * Invalidate the connection template
508                  */
509                 if (ct->vport != 65535) {
510                         if (ip_vs_conn_unhash(ct)) {
511                                 ct->dport = 65535;
512                                 ct->vport = 65535;
513                                 ct->cport = 0;
514                                 ip_vs_conn_hash(ct);
515                         }
516                 }
517
518                 /*
519                  * Simply decrease the refcnt of the template,
520                  * don't restart its timer.
521                  */
522                 atomic_dec(&ct->refcnt);
523                 return 0;
524         }
525         return 1;
526 }
527
528 static void ip_vs_conn_expire(unsigned long data)
529 {
530         struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
531
532         cp->timeout = 60*HZ;
533
534         /*
535          *      hey, I'm using it
536          */
537         atomic_inc(&cp->refcnt);
538
539         /*
540          *      do I control anybody?
541          */
542         if (atomic_read(&cp->n_control))
543                 goto expire_later;
544
545         /*
546          *      unhash it if it is hashed in the conn table
547          */
548         if (!ip_vs_conn_unhash(cp))
549                 goto expire_later;
550
551         /*
552          *      refcnt==1 implies I'm the only one referrer
553          */
554         if (likely(atomic_read(&cp->refcnt) == 1)) {
555                 /* delete the timer if it is activated by other users */
556                 if (timer_pending(&cp->timer))
557                         del_timer(&cp->timer);
558
559                 /* does anybody control me? */
560                 if (cp->control)
561                         ip_vs_control_del(cp);
562
563                 if (unlikely(cp->app != NULL))
564                         ip_vs_unbind_app(cp);
565                 ip_vs_unbind_dest(cp);
566                 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
567                         atomic_dec(&ip_vs_conn_no_cport_cnt);
568                 atomic_dec(&ip_vs_conn_count);
569
570                 kmem_cache_free(ip_vs_conn_cachep, cp);
571                 return;
572         }
573
574         /* hash it back to the table */
575         ip_vs_conn_hash(cp);
576
577   expire_later:
578         IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n",
579                   atomic_read(&cp->refcnt)-1,
580                   atomic_read(&cp->n_control));
581
582         ip_vs_conn_put(cp);
583 }
584
585
586 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
587 {
588         if (del_timer(&cp->timer))
589                 mod_timer(&cp->timer, jiffies);
590 }
591
592
593 /*
594  *      Create a new connection entry and hash it into the ip_vs_conn_tab
595  */
596 struct ip_vs_conn *
597 ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport,
598                __u32 daddr, __u16 dport, unsigned flags,
599                struct ip_vs_dest *dest)
600 {
601         struct ip_vs_conn *cp;
602         struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
603
604         cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
605         if (cp == NULL) {
606                 IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n");
607                 return NULL;
608         }
609
610         memset(cp, 0, sizeof(*cp));
611         INIT_LIST_HEAD(&cp->c_list);
612         init_timer(&cp->timer);
613         cp->timer.data     = (unsigned long)cp;
614         cp->timer.function = ip_vs_conn_expire;
615         cp->protocol       = proto;
616         cp->caddr          = caddr;
617         cp->cport          = cport;
618         cp->vaddr          = vaddr;
619         cp->vport          = vport;
620         cp->daddr          = daddr;
621         cp->dport          = dport;
622         cp->flags          = flags;
623         spin_lock_init(&cp->lock);
624
625         /*
626          * Set the entry is referenced by the current thread before hashing
627          * it in the table, so that other thread run ip_vs_random_dropentry
628          * but cannot drop this entry.
629          */
630         atomic_set(&cp->refcnt, 1);
631
632         atomic_set(&cp->n_control, 0);
633         atomic_set(&cp->in_pkts, 0);
634
635         atomic_inc(&ip_vs_conn_count);
636         if (flags & IP_VS_CONN_F_NO_CPORT)
637                 atomic_inc(&ip_vs_conn_no_cport_cnt);
638
639         /* Bind the connection with a destination server */
640         ip_vs_bind_dest(cp, dest);
641
642         /* Set its state and timeout */
643         cp->state = 0;
644         cp->timeout = 3*HZ;
645
646         /* Bind its packet transmitter */
647         ip_vs_bind_xmit(cp);
648
649         if (unlikely(pp && atomic_read(&pp->appcnt)))
650                 ip_vs_bind_app(cp, pp);
651
652         /* Hash it in the ip_vs_conn_tab finally */
653         ip_vs_conn_hash(cp);
654
655         return cp;
656 }
657
658
659 /*
660  *      /proc/net/ip_vs_conn entries
661  */
662 #ifdef CONFIG_PROC_FS
663
664 static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
665 {
666         int idx;
667         struct ip_vs_conn *cp;
668         
669         for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
670                 ct_read_lock_bh(idx);
671                 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
672                         if (pos-- == 0) {
673                                 seq->private = &ip_vs_conn_tab[idx];
674                                 return cp;
675                         }
676                 }
677                 ct_read_unlock_bh(idx);
678         }
679
680         return NULL;
681 }
682
683 static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
684 {
685         seq->private = NULL;
686         return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
687 }
688
689 static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
690 {
691         struct ip_vs_conn *cp = v;
692         struct list_head *e, *l = seq->private;
693         int idx;
694
695         ++*pos;
696         if (v == SEQ_START_TOKEN) 
697                 return ip_vs_conn_array(seq, 0);
698
699         /* more on same hash chain? */
700         if ((e = cp->c_list.next) != l)
701                 return list_entry(e, struct ip_vs_conn, c_list);
702
703         idx = l - ip_vs_conn_tab;
704         ct_read_unlock_bh(idx);
705
706         while (++idx < IP_VS_CONN_TAB_SIZE) {
707                 ct_read_lock_bh(idx);
708                 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
709                         seq->private = &ip_vs_conn_tab[idx];
710                         return cp;
711                 }       
712                 ct_read_unlock_bh(idx);
713         }
714         seq->private = NULL;
715         return NULL;
716 }
717
718 static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
719 {
720         struct list_head *l = seq->private;
721
722         if (l)
723                 ct_read_unlock_bh(l - ip_vs_conn_tab);
724 }
725
726 static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
727 {
728
729         if (v == SEQ_START_TOKEN)
730                 seq_puts(seq,
731    "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires\n");
732         else {
733                 const struct ip_vs_conn *cp = v;
734
735                 seq_printf(seq,
736                         "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n",
737                                 ip_vs_proto_name(cp->protocol),
738                                 ntohl(cp->caddr), ntohs(cp->cport),
739                                 ntohl(cp->vaddr), ntohs(cp->vport),
740                                 ntohl(cp->daddr), ntohs(cp->dport),
741                                 ip_vs_state_name(cp->protocol, cp->state),
742                                 (cp->timer.expires-jiffies)/HZ);
743         }
744         return 0;
745 }
746
747 static struct seq_operations ip_vs_conn_seq_ops = {
748         .start = ip_vs_conn_seq_start,
749         .next  = ip_vs_conn_seq_next,
750         .stop  = ip_vs_conn_seq_stop,
751         .show  = ip_vs_conn_seq_show,
752 };
753
754 static int ip_vs_conn_open(struct inode *inode, struct file *file)
755 {
756         return seq_open(file, &ip_vs_conn_seq_ops);
757 }
758
759 static struct file_operations ip_vs_conn_fops = {
760         .owner   = THIS_MODULE,
761         .open    = ip_vs_conn_open,
762         .read    = seq_read,
763         .llseek  = seq_lseek,
764         .release = seq_release,
765 };
766 #endif
767
768
769 /*
770  *      Randomly drop connection entries before running out of memory
771  */
772 static inline int todrop_entry(struct ip_vs_conn *cp)
773 {
774         /*
775          * The drop rate array needs tuning for real environments.
776          * Called from timer bh only => no locking
777          */
778         static const char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8};
779         static char todrop_counter[9] = {0};
780         int i;
781
782         /* if the conn entry hasn't lasted for 60 seconds, don't drop it.
783            This will leave enough time for normal connection to get
784            through. */
785         if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
786                 return 0;
787
788         /* Don't drop the entry if its number of incoming packets is not
789            located in [0, 8] */
790         i = atomic_read(&cp->in_pkts);
791         if (i > 8 || i < 0) return 0;
792
793         if (!todrop_rate[i]) return 0;
794         if (--todrop_counter[i] > 0) return 0;
795
796         todrop_counter[i] = todrop_rate[i];
797         return 1;
798 }
799
800 /* Called from keventd and must protect itself from softirqs */
801 void ip_vs_random_dropentry(void)
802 {
803         int idx;
804         struct ip_vs_conn *cp;
805
806         /*
807          * Randomly scan 1/32 of the whole table every second
808          */
809         for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) {
810                 unsigned hash = net_random() & IP_VS_CONN_TAB_MASK;
811
812                 /*
813                  *  Lock is actually needed in this loop.
814                  */
815                 ct_write_lock_bh(hash);
816
817                 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
818                         if (cp->flags & IP_VS_CONN_F_TEMPLATE)
819                                 /* connection template */
820                                 continue;
821
822                         if (cp->protocol == IPPROTO_TCP) {
823                                 switch(cp->state) {
824                                 case IP_VS_TCP_S_SYN_RECV:
825                                 case IP_VS_TCP_S_SYNACK:
826                                         break;
827
828                                 case IP_VS_TCP_S_ESTABLISHED:
829                                         if (todrop_entry(cp))
830                                                 break;
831                                         continue;
832
833                                 default:
834                                         continue;
835                                 }
836                         } else {
837                                 if (!todrop_entry(cp))
838                                         continue;
839                         }
840
841                         IP_VS_DBG(4, "del connection\n");
842                         ip_vs_conn_expire_now(cp);
843                         if (cp->control) {
844                                 IP_VS_DBG(4, "del conn template\n");
845                                 ip_vs_conn_expire_now(cp->control);
846                         }
847                 }
848                 ct_write_unlock_bh(hash);
849         }
850 }
851
852
853 /*
854  *      Flush all the connection entries in the ip_vs_conn_tab
855  */
856 static void ip_vs_conn_flush(void)
857 {
858         int idx;
859         struct ip_vs_conn *cp;
860
861   flush_again:
862         for (idx=0; idx<IP_VS_CONN_TAB_SIZE; idx++) {
863                 /*
864                  *  Lock is actually needed in this loop.
865                  */
866                 ct_write_lock_bh(idx);
867
868                 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
869
870                         IP_VS_DBG(4, "del connection\n");
871                         ip_vs_conn_expire_now(cp);
872                         if (cp->control) {
873                                 IP_VS_DBG(4, "del conn template\n");
874                                 ip_vs_conn_expire_now(cp->control);
875                         }
876                 }
877                 ct_write_unlock_bh(idx);
878         }
879
880         /* the counter may be not NULL, because maybe some conn entries
881            are run by slow timer handler or unhashed but still referred */
882         if (atomic_read(&ip_vs_conn_count) != 0) {
883                 schedule();
884                 goto flush_again;
885         }
886 }
887
888
889 int ip_vs_conn_init(void)
890 {
891         int idx;
892
893         /*
894          * Allocate the connection hash table and initialize its list heads
895          */
896         ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head));
897         if (!ip_vs_conn_tab)
898                 return -ENOMEM;
899
900         /* Allocate ip_vs_conn slab cache */
901         ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
902                                               sizeof(struct ip_vs_conn), 0,
903                                               SLAB_HWCACHE_ALIGN, NULL, NULL);
904         if (!ip_vs_conn_cachep) {
905                 vfree(ip_vs_conn_tab);
906                 return -ENOMEM;
907         }
908
909         IP_VS_INFO("Connection hash table configured "
910                    "(size=%d, memory=%ldKbytes)\n",
911                    IP_VS_CONN_TAB_SIZE,
912                    (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024);
913         IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n",
914                   sizeof(struct ip_vs_conn));
915
916         for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) {
917                 INIT_LIST_HEAD(&ip_vs_conn_tab[idx]);
918         }
919
920         for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++)  {
921                 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
922         }
923
924         proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops);
925
926         /* calculate the random value for connection hash */
927         get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
928
929         return 0;
930 }
931
932
933 void ip_vs_conn_cleanup(void)
934 {
935         /* flush all the connection entries first */
936         ip_vs_conn_flush();
937
938         /* Release the empty cache */
939         kmem_cache_destroy(ip_vs_conn_cachep);
940         proc_net_remove("ip_vs_conn");
941         vfree(ip_vs_conn_tab);
942 }