/* Expectation handling for nf_conntrack. */

/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
 * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/skbuff.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stddef.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_zones.h>

unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);

unsigned int nf_ct_expect_max __read_mostly;

static struct kmem_cache *nf_ct_expect_cachep __read_mostly;

/* nf_conntrack_expect helper functions */
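/* Unlink @exp from the hash table and its master's list and report its
 * destruction. Caller must hold nf_conntrack_lock and the expectation's
 * timeout timer must no longer be pending. Drops the hash table's
 * reference on @exp. */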
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
                                u32 pid, int report)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct net *net = nf_ct_exp_net(exp);

        NF_CT_ASSERT(master_help);
        NF_CT_ASSERT(!timer_pending(&exp->timeout));

        hlist_del_rcu(&exp->hnode);
        net->ct.expect_count--;

        hlist_del(&exp->lnode);
        master_help->expecting[exp->class]--;

        nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report);
        nf_ct_expect_put(exp);

        NF_CT_STAT_INC(net, expect_delete);
}
EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report);

static void nf_ct_expectation_timed_out(unsigned long ul_expect)
{
        struct nf_conntrack_expect *exp = (void *)ul_expect;

        spin_lock_bh(&nf_conntrack_lock);
        nf_ct_unlink_expect(exp);
        spin_unlock_bh(&nf_conntrack_lock);
        nf_ct_expect_put(exp);
}

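/* Map a tuple's destination part to a bucket in [0, nf_ct_expect_hsize). */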
static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
{
        unsigned int hash;

        if (unlikely(!nf_conntrack_hash_rnd))
                init_nf_conntrack_hash_rnd();

        hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
                      (((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
                       (__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
        return ((u64)hash * nf_ct_expect_hsize) >> 32;
}

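/* Lockless lookup: caller must hold rcu_read_lock(). The returned
 * expectation's reference count is not bumped; use nf_ct_expect_find_get()
 * if a reference is needed. */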
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net, u16 zone,
                    const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;
        unsigned int h;

        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
                if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
                    nf_ct_zone(i->master) == zone)
                        return i;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(__nf_ct_expect_find);

/* Just find an expectation corresponding to a tuple. */
struct nf_conntrack_expect *
nf_ct_expect_find_get(struct net *net, u16 zone,
                      const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i;

        rcu_read_lock();
        i = __nf_ct_expect_find(net, zone, tuple);
        if (i && !atomic_inc_not_zero(&i->use))
                i = NULL;
        rcu_read_unlock();

        return i;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_find_get);

/* If an expectation for this connection is found, it gets deleted from
 * the global list and then returned. Caller must hold nf_conntrack_lock. */
struct nf_conntrack_expect *
nf_ct_find_expectation(struct net *net, u16 zone,
                       const struct nf_conntrack_tuple *tuple)
{
        struct nf_conntrack_expect *i, *exp = NULL;
        unsigned int h;

        if (!net->ct.expect_count)
                return NULL;

        h = nf_ct_expect_dst_hash(tuple);
        hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
                if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
                    nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
                    nf_ct_zone(i->master) == zone) {
                        exp = i;
                        break;
                }
        }
        if (!exp)
                return NULL;

        /* If the master is not in the hash table yet (i.e. the packet
           hasn't left this machine yet), how could the other end know
           about the expected connection? Hence these are not the droids
           you are looking for (if the master ct never got confirmed,
           we'd hold a reference to it and weird things would happen to
           future packets). */
        if (!nf_ct_is_confirmed(exp->master))
                return NULL;

        if (exp->flags & NF_CT_EXPECT_PERMANENT) {
                atomic_inc(&exp->use);
                return exp;
        } else if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                return exp;
        }

        return NULL;
}

/* delete all expectations for this conntrack */
void nf_ct_remove_expectations(struct nf_conn *ct)
{
        struct nf_conn_help *help = nfct_help(ct);
        struct nf_conntrack_expect *exp;
        struct hlist_node *next;

        /* Optimization: most connections never expect any others. */
        if (!help)
                return;

        hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) {
                if (del_timer(&exp->timeout)) {
                        nf_ct_unlink_expect(exp);
                        nf_ct_expect_put(exp);
                }
        }
}
EXPORT_SYMBOL_GPL(nf_ct_remove_expectations);

/* Would two expected things clash? */
static inline int expect_clash(const struct nf_conntrack_expect *a,
                               const struct nf_conntrack_expect *b)
{
        /* The expectations clash if their tuples agree on the part
           covered by the intersection of the two masks. */
        struct nf_conntrack_tuple_mask intersect_mask;
        int count;

        intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all;

        for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++) {
                intersect_mask.src.u3.all[count] =
                        a->mask.src.u3.all[count] & b->mask.src.u3.all[count];
        }

        return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
}

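/* True if @a and @b describe the same expectation: same master, class,
 * tuple, mask and conntrack zone. */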
static inline int expect_matches(const struct nf_conntrack_expect *a,
                                 const struct nf_conntrack_expect *b)
{
        return a->master == b->master && a->class == b->class &&
                nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
                nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
                nf_ct_zone(a->master) == nf_ct_zone(b->master);
}

/* Generally a bad idea to call this: could have matched already. */
void nf_ct_unexpect_related(struct nf_conntrack_expect *exp)
{
        spin_lock_bh(&nf_conntrack_lock);
        if (del_timer(&exp->timeout)) {
                nf_ct_unlink_expect(exp);
                nf_ct_expect_put(exp);
        }
        spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(nf_ct_unexpect_related);

/* We don't increase the master conntrack refcount for unfulfilled
 * expectations. During conntrack destruction, the expectations are
 * always killed before the conntrack itself. */
struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me)
{
        struct nf_conntrack_expect *new;

        new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC);
        if (!new)
                return NULL;

        new->master = me;
        atomic_set(&new->use, 1);
        return new;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_alloc);

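/* Fill in @exp's tuple and mask for the given address family and protocol.
 * @saddr and @src may be NULL to wildcard the source address and port;
 * @daddr and @dst are required. For AF_INET only the first 4 address bytes
 * are used and the remainder is zeroed so nf_ct_tuple_equal() works. */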
void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class,
                       u_int8_t family,
                       const union nf_inet_addr *saddr,
                       const union nf_inet_addr *daddr,
                       u_int8_t proto, const __be16 *src, const __be16 *dst)
{
        int len;

        if (family == AF_INET)
                len = 4;
        else
                len = 16;

        exp->flags = 0;
        exp->class = class;
        exp->expectfn = NULL;
        exp->helper = NULL;
        exp->tuple.src.l3num = family;
        exp->tuple.dst.protonum = proto;

        if (saddr) {
                memcpy(&exp->tuple.src.u3, saddr, len);
                if (sizeof(exp->tuple.src.u3) > len)
                        /* address needs to be cleared for nf_ct_tuple_equal */
                        memset((void *)&exp->tuple.src.u3 + len, 0x00,
                               sizeof(exp->tuple.src.u3) - len);
                memset(&exp->mask.src.u3, 0xFF, len);
                if (sizeof(exp->mask.src.u3) > len)
                        memset((void *)&exp->mask.src.u3 + len, 0x00,
                               sizeof(exp->mask.src.u3) - len);
        } else {
                memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3));
                memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3));
        }

        if (src) {
                exp->tuple.src.u.all = *src;
                exp->mask.src.u.all = htons(0xFFFF);
        } else {
                exp->tuple.src.u.all = 0;
                exp->mask.src.u.all = 0;
        }

        memcpy(&exp->tuple.dst.u3, daddr, len);
        if (sizeof(exp->tuple.dst.u3) > len)
                /* address needs to be cleared for nf_ct_tuple_equal */
                memset((void *)&exp->tuple.dst.u3 + len, 0x00,
                       sizeof(exp->tuple.dst.u3) - len);

        exp->tuple.dst.u.all = *dst;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_init);
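
/* A typical conntrack helper creates an expectation roughly like this
 * (sketch only, error handling elided; see e.g. nf_conntrack_tftp.c):
 *
 *      exp = nf_ct_expect_alloc(ct);
 *      if (exp == NULL)
 *              return NF_DROP;
 *      tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
 *      nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct),
 *                        &tuple->src.u3, &tuple->dst.u3,
 *                        IPPROTO_UDP, NULL, &tuple->dst.u.udp.port);
 *      ret = nf_ct_expect_related(exp) == 0 ? NF_ACCEPT : NF_DROP;
 *      nf_ct_expect_put(exp);
 *      return ret;
 */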

static void nf_ct_expect_free_rcu(struct rcu_head *head)
{
        struct nf_conntrack_expect *exp;

        exp = container_of(head, struct nf_conntrack_expect, rcu);
        kmem_cache_free(nf_ct_expect_cachep, exp);
}

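/* Drop a reference; the expectation is freed via RCU once the last
 * reference is gone. */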
void nf_ct_expect_put(struct nf_conntrack_expect *exp)
{
        if (atomic_dec_and_test(&exp->use))
                call_rcu(&exp->rcu, nf_ct_expect_free_rcu);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_put);

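/* Caller must hold nf_conntrack_lock. Takes two extra references on @exp:
 * one for the hash table, one for the timeout timer. */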
static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
{
        struct nf_conn_help *master_help = nfct_help(exp->master);
        struct nf_conntrack_helper *helper;
        struct net *net = nf_ct_exp_net(exp);
        unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);

        /* two references: one for hash insert, one for the timer */
        atomic_add(2, &exp->use);

        hlist_add_head(&exp->lnode, &master_help->expectations);
        master_help->expecting[exp->class]++;

        hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
        net->ct.expect_count++;

        setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
                    (unsigned long)exp);
        helper = rcu_dereference_protected(master_help->helper,
                                           lockdep_is_held(&nf_conntrack_lock));
        if (helper) {
                exp->timeout.expires = jiffies +
                        helper->expect_policy[exp->class].timeout * HZ;
        }
        add_timer(&exp->timeout);

        NF_CT_STAT_INC(net, expect_create);
        return 0;
}

/* A race with the expectation being used could leave us nothing to evict;
 * that's OK. */
static void evict_oldest_expect(struct nf_conn *master,
                                struct nf_conntrack_expect *new)
{
        struct nf_conn_help *master_help = nfct_help(master);
        struct nf_conntrack_expect *exp, *last = NULL;

        hlist_for_each_entry(exp, &master_help->expectations, lnode) {
                if (exp->class == new->class)
                        last = exp;
        }

        if (last && del_timer(&last->timeout)) {
                nf_ct_unlink_expect(last);
                nf_ct_expect_put(last);
        }
}

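/* Check whether @expect may be inserted: an identical pending expectation
 * is removed first, a clashing one is an error, and the per-helper and
 * global limits are enforced. Returns 1 if the caller may insert, a
 * negative errno otherwise. Caller must hold nf_conntrack_lock. */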
static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
{
        const struct nf_conntrack_expect_policy *p;
        struct nf_conntrack_expect *i;
        struct nf_conn *master = expect->master;
        struct nf_conn_help *master_help = nfct_help(master);
        struct nf_conntrack_helper *helper;
        struct net *net = nf_ct_exp_net(expect);
        struct hlist_node *next;
        unsigned int h;
        int ret = 1;

        if (!master_help) {
                ret = -ESHUTDOWN;
                goto out;
        }
        h = nf_ct_expect_dst_hash(&expect->tuple);
        hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
                if (expect_matches(i, expect)) {
                        if (del_timer(&i->timeout)) {
                                nf_ct_unlink_expect(i);
                                nf_ct_expect_put(i);
                                break;
                        }
                } else if (expect_clash(i, expect)) {
                        ret = -EBUSY;
                        goto out;
                }
        }
        /* Will be over limit? */
        helper = rcu_dereference_protected(master_help->helper,
                                           lockdep_is_held(&nf_conntrack_lock));
        if (helper) {
                p = &helper->expect_policy[expect->class];
                if (p->max_expected &&
                    master_help->expecting[expect->class] >= p->max_expected) {
                        evict_oldest_expect(master, expect);
                        if (master_help->expecting[expect->class]
                                                >= p->max_expected) {
                                ret = -EMFILE;
                                goto out;
                        }
                }
        }

        if (net->ct.expect_count >= nf_ct_expect_max) {
                net_warn_ratelimited("nf_conntrack: expectation table full\n");
                ret = -EMFILE;
        }
out:
        return ret;
}

int nf_ct_expect_related_report(struct nf_conntrack_expect *expect,
                                u32 pid, int report)
{
        int ret;

        spin_lock_bh(&nf_conntrack_lock);
        ret = __nf_ct_expect_check(expect);
        if (ret <= 0)
                goto out;

        ret = nf_ct_expect_insert(expect);
        if (ret < 0)
                goto out;
        spin_unlock_bh(&nf_conntrack_lock);
        nf_ct_expect_event_report(IPEXP_NEW, expect, pid, report);
        return ret;
out:
        spin_unlock_bh(&nf_conntrack_lock);
        return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_expect_related_report);

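/* seq_file interface for /proc/net/nf_conntrack_expect, dumping the
 * expectation table under rcu_read_lock(). */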
#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct ct_expect_iter_state {
        struct seq_net_private p;
        unsigned int bucket;
};

static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
        struct net *net = seq_file_net(seq);
        struct ct_expect_iter_state *st = seq->private;
        struct hlist_node *n;

        for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
                n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
                if (n)
                        return n;
        }
        return NULL;
}

static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
                                             struct hlist_node *head)
{
        struct net *net = seq_file_net(seq);
        struct ct_expect_iter_state *st = seq->private;

        head = rcu_dereference(hlist_next_rcu(head));
        while (head == NULL) {
                if (++st->bucket >= nf_ct_expect_hsize)
                        return NULL;
                head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
        }
        return head;
}

static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos)
{
        struct hlist_node *head = ct_expect_get_first(seq);

        if (head)
                while (pos && (head = ct_expect_get_next(seq, head)))
                        pos--;
        return pos ? NULL : head;
}

static void *exp_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(RCU)
{
        rcu_read_lock();
        return ct_expect_get_idx(seq, *pos);
}

static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        (*pos)++;
        return ct_expect_get_next(seq, v);
}

static void exp_seq_stop(struct seq_file *seq, void *v)
        __releases(RCU)
{
        rcu_read_unlock();
}

static int exp_seq_show(struct seq_file *s, void *v)
{
        struct nf_conntrack_expect *expect;
        struct nf_conntrack_helper *helper;
        struct hlist_node *n = v;
        char *delim = "";

        expect = hlist_entry(n, struct nf_conntrack_expect, hnode);

        if (expect->timeout.function)
                seq_printf(s, "%ld ", timer_pending(&expect->timeout)
                           ? (long)(expect->timeout.expires - jiffies)/HZ : 0);
        else
                seq_printf(s, "- ");
        seq_printf(s, "l3proto = %u proto=%u ",
                   expect->tuple.src.l3num,
                   expect->tuple.dst.protonum);
        print_tuple(s, &expect->tuple,
                    __nf_ct_l3proto_find(expect->tuple.src.l3num),
                    __nf_ct_l4proto_find(expect->tuple.src.l3num,
                                         expect->tuple.dst.protonum));

        if (expect->flags & NF_CT_EXPECT_PERMANENT) {
                seq_printf(s, "PERMANENT");
                delim = ",";
        }
        if (expect->flags & NF_CT_EXPECT_INACTIVE) {
                seq_printf(s, "%sINACTIVE", delim);
                delim = ",";
        }
        if (expect->flags & NF_CT_EXPECT_USERSPACE)
                seq_printf(s, "%sUSERSPACE", delim);

        helper = rcu_dereference(nfct_help(expect->master)->helper);
        if (helper) {
                seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name);
                if (helper->expect_policy[expect->class].name)
                        seq_printf(s, "/%s",
                                   helper->expect_policy[expect->class].name);
        }

        return seq_putc(s, '\n');
}

static const struct seq_operations exp_seq_ops = {
        .start = exp_seq_start,
        .next = exp_seq_next,
        .stop = exp_seq_stop,
        .show = exp_seq_show
};

static int exp_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &exp_seq_ops,
                        sizeof(struct ct_expect_iter_state));
}

static const struct file_operations exp_file_ops = {
        .owner   = THIS_MODULE,
        .open    = exp_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_net,
};
#endif /* CONFIG_NF_CONNTRACK_PROCFS */

static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        struct proc_dir_entry *proc;

        proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
                           &exp_file_ops);
        if (!proc)
                return -ENOMEM;
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
        return 0;
}

static void exp_proc_remove(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
        remove_proc_entry("nf_conntrack_expect", net->proc_net);
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
}

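/* The expectation hash size can only be set at module load time (the
 * parameter is 0400: visible in sysfs but not writable at runtime). */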
module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);

int nf_conntrack_expect_pernet_init(struct net *net)
{
        int err = -ENOMEM;

        net->ct.expect_count = 0;
        net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
        if (net->ct.expect_hash == NULL)
                goto err1;

        err = exp_proc_init(net);
        if (err < 0)
                goto err2;

        return 0;
err2:
        nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
        return err;
}

void nf_conntrack_expect_pernet_fini(struct net *net)
{
        exp_proc_remove(net);
        nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}

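/* If no hash size was given, size the expectation table off the conntrack
 * hash: one bucket per 256 conntrack buckets, minimum 1; the global
 * expectation limit is four times the number of buckets. */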
int nf_conntrack_expect_init(void)
{
        if (!nf_ct_expect_hsize) {
                nf_ct_expect_hsize = nf_conntrack_htable_size / 256;
                if (!nf_ct_expect_hsize)
                        nf_ct_expect_hsize = 1;
        }
        nf_ct_expect_max = nf_ct_expect_hsize * 4;
        nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
                                sizeof(struct nf_conntrack_expect),
                                0, 0, NULL);
        if (!nf_ct_expect_cachep)
                return -ENOMEM;
        return 0;
}

void nf_conntrack_expect_fini(void)
{
        rcu_barrier(); /* Wait for call_rcu() before destroy */
        kmem_cache_destroy(nf_ct_expect_cachep);
}