]> Pileus Git - ~andy/linux/blob - net/sched/sch_api.c
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph...
[~andy/linux] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
/* Forward declarations of the notification helpers. qdisc_notify() is
 * called by notify_and_destroy() before any definition of it is visible.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
                        struct nlmsghdr *n, u32 clid,
                        struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
                         struct nlmsghdr *n, struct Qdisc *q,
                         unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split all the packets to "traffic classes",
63      using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform sanity
71    checks and the part of the work that is common to all qdiscs,
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns
93    a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
/* Protects the list of registered TC modules (qdisc_base); it is also held
 * around reads and updates of default_qdisc_ops. It is a pure SMP lock.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
/* Head of the list of all installed queueing disciplines. Entries are
 * chained through Qdisc_ops::next; NULL means no qdisc is registered.
 */

static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* Get default qdisc if not otherwise specified */
204 void qdisc_get_default(char *name, size_t len)
205 {
206         read_lock(&qdisc_mod_lock);
207         strlcpy(name, default_qdisc_ops->id, len);
208         read_unlock(&qdisc_mod_lock);
209 }
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
226 /* Set new default qdisc to use */
227 int qdisc_set_default(const char *name)
228 {
229         const struct Qdisc_ops *ops;
230
231         if (!capable(CAP_NET_ADMIN))
232                 return -EPERM;
233
234         write_lock(&qdisc_mod_lock);
235         ops = qdisc_lookup_default(name);
236         if (!ops) {
237                 /* Not found, drop lock and try to load module */
238                 write_unlock(&qdisc_mod_lock);
239                 request_module("sch_%s", name);
240                 write_lock(&qdisc_mod_lock);
241
242                 ops = qdisc_lookup_default(name);
243         }
244
245         if (ops) {
246                 /* Set new default */
247                 module_put(default_qdisc_ops->owner);
248                 default_qdisc_ops = ops;
249         }
250         write_unlock(&qdisc_mod_lock);
251
252         return ops ? 0 : -ENOENT;
253 }
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
277                 struct Qdisc *root = qdisc_dev(q)->qdisc;
278
279                 WARN_ON_ONCE(root == &noop_qdisc);
280                 list_add_tail(&q->list, &root->list);
281         }
282 }
283 EXPORT_SYMBOL(qdisc_list_add);
284
285 void qdisc_list_del(struct Qdisc *q)
286 {
287         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
288                 list_del(&q->list);
289 }
290 EXPORT_SYMBOL(qdisc_list_del);
291
292 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
293 {
294         struct Qdisc *q;
295
296         q = qdisc_match_from_root(dev->qdisc, handle);
297         if (q)
298                 goto out;
299
300         if (dev_ingress_queue(dev))
301                 q = qdisc_match_from_root(
302                         dev_ingress_queue(dev)->qdisc_sleeping,
303                         handle);
304 out:
305         return q;
306 }
307
308 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
309 {
310         unsigned long cl;
311         struct Qdisc *leaf;
312         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
313
314         if (cops == NULL)
315                 return NULL;
316         cl = cops->get(p, classid);
317
318         if (cl == 0)
319                 return NULL;
320         leaf = cops->leaf(p, cl);
321         cops->put(p, cl);
322         return leaf;
323 }
324
325 /* Find queueing discipline by name */
326
327 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
328 {
329         struct Qdisc_ops *q = NULL;
330
331         if (kind) {
332                 read_lock(&qdisc_mod_lock);
333                 for (q = qdisc_base; q; q = q->next) {
334                         if (nla_strcmp(kind, q->id) == 0) {
335                                 if (!try_module_get(q->owner))
336                                         q = NULL;
337                                 break;
338                         }
339                 }
340                 read_unlock(&qdisc_mod_lock);
341         }
342         return q;
343 }
344
345 /* The linklayer setting was not transferred from iproute2 in older
346  * versions, and the rate table lookup system has been dropped in
347  * the kernel. To stay backward compatible with older iproute2 tc
348  * utils, we detect the linklayer setting by detecting whether the
349  * rate table was modified.
350  *
351  * For linklayer ATM table entries, the rate table will be aligned to
352  * 48 bytes, thus some table entries will contain the same value.  The
353  * mpu (min packet unit) is also encoded into the old rate table, thus
354  * starting from the mpu, we find low and high table entries for
355  * mapping this cell.  If these entries contain the same value, then
356  * the rate tables have been modified for linklayer ATM.
357  *
358  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
359  * and then roundup to the next cell, calc the table entry one below,
360  * and compare.
361  */
362 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
363 {
364         int low       = roundup(r->mpu, 48);
365         int high      = roundup(low+1, 48);
366         int cell_low  = low >> r->cell_log;
367         int cell_high = (high >> r->cell_log) - 1;
368
369         /* rtab is too inaccurate at rates > 100Mbit/s */
370         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
371                 pr_debug("TC linklayer: Giving up ATM detection\n");
372                 return TC_LINKLAYER_ETHERNET;
373         }
374
375         if ((cell_high > cell_low) && (cell_high < 256)
376             && (rtab[cell_low] == rtab[cell_high])) {
377                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
378                          cell_low, cell_high, rtab[cell_high]);
379                 return TC_LINKLAYER_ATM;
380         }
381         return TC_LINKLAYER_ETHERNET;
382 }
383
/* Head of the list of rate tables handed out by qdisc_get_rtab(), chained
 * through ->next. No lock is taken around it in this file.
 * NOTE(review): presumably serialised by the caller (RTNL) -- confirm.
 */
static struct qdisc_rate_table *qdisc_rtab_list;
385
386 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
387 {
388         struct qdisc_rate_table *rtab;
389
390         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
391             nla_len(tab) != TC_RTAB_SIZE)
392                 return NULL;
393
394         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
395                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
396                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
397                         rtab->refcnt++;
398                         return rtab;
399                 }
400         }
401
402         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
403         if (rtab) {
404                 rtab->rate = *r;
405                 rtab->refcnt = 1;
406                 memcpy(rtab->data, nla_data(tab), 1024);
407                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
408                         r->linklayer = __detect_linklayer(r, rtab->data);
409                 rtab->next = qdisc_rtab_list;
410                 qdisc_rtab_list = rtab;
411         }
412         return rtab;
413 }
414 EXPORT_SYMBOL(qdisc_get_rtab);
415
416 void qdisc_put_rtab(struct qdisc_rate_table *tab)
417 {
418         struct qdisc_rate_table *rtab, **rtabp;
419
420         if (!tab || --tab->refcnt)
421                 return;
422
423         for (rtabp = &qdisc_rtab_list;
424              (rtab = *rtabp) != NULL;
425              rtabp = &rtab->next) {
426                 if (rtab == tab) {
427                         *rtabp = rtab->next;
428                         kfree(rtab);
429                         return;
430                 }
431         }
432 }
433 EXPORT_SYMBOL(qdisc_put_rtab);
434
/* All size tables currently referenced; entries are shared by refcount
 * (see qdisc_get_stab() / qdisc_put_stab()).
 */
static LIST_HEAD(qdisc_stab_list);
/* Taken around every walk or modification of qdisc_stab_list. */
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy used to parse the attributes nested inside TCA_STAB. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
        [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
        [TCA_STAB_DATA] = { .type = NLA_BINARY },
};
442
443 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
444 {
445         struct nlattr *tb[TCA_STAB_MAX + 1];
446         struct qdisc_size_table *stab;
447         struct tc_sizespec *s;
448         unsigned int tsize = 0;
449         u16 *tab = NULL;
450         int err;
451
452         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
453         if (err < 0)
454                 return ERR_PTR(err);
455         if (!tb[TCA_STAB_BASE])
456                 return ERR_PTR(-EINVAL);
457
458         s = nla_data(tb[TCA_STAB_BASE]);
459
460         if (s->tsize > 0) {
461                 if (!tb[TCA_STAB_DATA])
462                         return ERR_PTR(-EINVAL);
463                 tab = nla_data(tb[TCA_STAB_DATA]);
464                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
465         }
466
467         if (tsize != s->tsize || (!tab && tsize > 0))
468                 return ERR_PTR(-EINVAL);
469
470         spin_lock(&qdisc_stab_lock);
471
472         list_for_each_entry(stab, &qdisc_stab_list, list) {
473                 if (memcmp(&stab->szopts, s, sizeof(*s)))
474                         continue;
475                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
476                         continue;
477                 stab->refcnt++;
478                 spin_unlock(&qdisc_stab_lock);
479                 return stab;
480         }
481
482         spin_unlock(&qdisc_stab_lock);
483
484         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
485         if (!stab)
486                 return ERR_PTR(-ENOMEM);
487
488         stab->refcnt = 1;
489         stab->szopts = *s;
490         if (tsize > 0)
491                 memcpy(stab->data, tab, tsize * sizeof(u16));
492
493         spin_lock(&qdisc_stab_lock);
494         list_add_tail(&stab->list, &qdisc_stab_list);
495         spin_unlock(&qdisc_stab_lock);
496
497         return stab;
498 }
499
500 static void stab_kfree_rcu(struct rcu_head *head)
501 {
502         kfree(container_of(head, struct qdisc_size_table, rcu));
503 }
504
505 void qdisc_put_stab(struct qdisc_size_table *tab)
506 {
507         if (!tab)
508                 return;
509
510         spin_lock(&qdisc_stab_lock);
511
512         if (--tab->refcnt == 0) {
513                 list_del(&tab->list);
514                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
515         }
516
517         spin_unlock(&qdisc_stab_lock);
518 }
519 EXPORT_SYMBOL(qdisc_put_stab);
520
521 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
522 {
523         struct nlattr *nest;
524
525         nest = nla_nest_start(skb, TCA_STAB);
526         if (nest == NULL)
527                 goto nla_put_failure;
528         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
529                 goto nla_put_failure;
530         nla_nest_end(skb, nest);
531
532         return skb->len;
533
534 nla_put_failure:
535         return -1;
536 }
537
538 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
539 {
540         int pkt_len, slot;
541
542         pkt_len = skb->len + stab->szopts.overhead;
543         if (unlikely(!stab->szopts.tsize))
544                 goto out;
545
546         slot = pkt_len + stab->szopts.cell_align;
547         if (unlikely(slot < 0))
548                 slot = 0;
549
550         slot >>= stab->szopts.cell_log;
551         if (likely(slot < stab->szopts.tsize))
552                 pkt_len = stab->data[slot];
553         else
554                 pkt_len = stab->data[stab->szopts.tsize - 1] *
555                                 (slot / stab->szopts.tsize) +
556                                 stab->data[slot % stab->szopts.tsize];
557
558         pkt_len <<= stab->szopts.size_log;
559 out:
560         if (unlikely(pkt_len < 1))
561                 pkt_len = 1;
562         qdisc_skb_cb(skb)->pkt_len = pkt_len;
563 }
564 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
565
566 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
567 {
568         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
569                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
570                         txt, qdisc->ops->id, qdisc->handle >> 16);
571                 qdisc->flags |= TCQ_F_WARN_NONWC;
572         }
573 }
574 EXPORT_SYMBOL(qdisc_warn_nonwc);
575
576 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
577 {
578         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
579                                                  timer);
580
581         qdisc_unthrottled(wd->qdisc);
582         __netif_schedule(qdisc_root(wd->qdisc));
583
584         return HRTIMER_NORESTART;
585 }
586
587 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
588 {
589         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
590         wd->timer.function = qdisc_watchdog;
591         wd->qdisc = qdisc;
592 }
593 EXPORT_SYMBOL(qdisc_watchdog_init);
594
595 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
596 {
597         if (test_bit(__QDISC_STATE_DEACTIVATED,
598                      &qdisc_root_sleeping(wd->qdisc)->state))
599                 return;
600
601         qdisc_throttled(wd->qdisc);
602
603         hrtimer_start(&wd->timer,
604                       ns_to_ktime(expires),
605                       HRTIMER_MODE_ABS);
606 }
607 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
608
/*
 * Stop watchdog @wd: cancel its hrtimer, then call qdisc_unthrottled()
 * on the qdisc recorded in @wd.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
        hrtimer_cancel(&wd->timer);
        qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
615
616 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
617 {
618         unsigned int size = n * sizeof(struct hlist_head), i;
619         struct hlist_head *h;
620
621         if (size <= PAGE_SIZE)
622                 h = kmalloc(size, GFP_KERNEL);
623         else
624                 h = (struct hlist_head *)
625                         __get_free_pages(GFP_KERNEL, get_order(size));
626
627         if (h != NULL) {
628                 for (i = 0; i < n; i++)
629                         INIT_HLIST_HEAD(&h[i]);
630         }
631         return h;
632 }
633
634 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
635 {
636         unsigned int size = n * sizeof(struct hlist_head);
637
638         if (size <= PAGE_SIZE)
639                 kfree(h);
640         else
641                 free_pages((unsigned long)h, get_order(size));
642 }
643
644 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
645 {
646         struct Qdisc_class_common *cl;
647         struct hlist_node *next;
648         struct hlist_head *nhash, *ohash;
649         unsigned int nsize, nmask, osize;
650         unsigned int i, h;
651
652         /* Rehash when load factor exceeds 0.75 */
653         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
654                 return;
655         nsize = clhash->hashsize * 2;
656         nmask = nsize - 1;
657         nhash = qdisc_class_hash_alloc(nsize);
658         if (nhash == NULL)
659                 return;
660
661         ohash = clhash->hash;
662         osize = clhash->hashsize;
663
664         sch_tree_lock(sch);
665         for (i = 0; i < osize; i++) {
666                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
667                         h = qdisc_class_hash(cl->classid, nmask);
668                         hlist_add_head(&cl->hnode, &nhash[h]);
669                 }
670         }
671         clhash->hash     = nhash;
672         clhash->hashsize = nsize;
673         clhash->hashmask = nmask;
674         sch_tree_unlock(sch);
675
676         qdisc_class_hash_free(ohash, osize);
677 }
678 EXPORT_SYMBOL(qdisc_class_hash_grow);
679
680 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
681 {
682         unsigned int size = 4;
683
684         clhash->hash = qdisc_class_hash_alloc(size);
685         if (clhash->hash == NULL)
686                 return -ENOMEM;
687         clhash->hashsize  = size;
688         clhash->hashmask  = size - 1;
689         clhash->hashelems = 0;
690         return 0;
691 }
692 EXPORT_SYMBOL(qdisc_class_hash_init);
693
/*
 * Free the bucket array of @clhash. Only the array itself is released;
 * nothing is done here for classes that may still be linked in it.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
        qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
699
700 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
701                              struct Qdisc_class_common *cl)
702 {
703         unsigned int h;
704
705         INIT_HLIST_NODE(&cl->hnode);
706         h = qdisc_class_hash(cl->classid, clhash->hashmask);
707         hlist_add_head(&cl->hnode, &clhash->hash[h]);
708         clhash->hashelems++;
709 }
710 EXPORT_SYMBOL(qdisc_class_hash_insert);
711
/*
 * Unlink class @cl from its bucket and decrement the element count of
 * @clhash. The bucket array is never shrunk here.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
                             struct Qdisc_class_common *cl)
{
        hlist_del(&cl->hnode);
        clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
719
720 /* Allocate an unique handle from space managed by kernel
721  * Possible range is [8000-FFFF]:0000 (0x8000 values)
722  */
723 static u32 qdisc_alloc_handle(struct net_device *dev)
724 {
725         int i = 0x8000;
726         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
727
728         do {
729                 autohandle += TC_H_MAKE(0x10000U, 0);
730                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
731                         autohandle = TC_H_MAKE(0x80000000U, 0);
732                 if (!qdisc_lookup(dev, autohandle))
733                         return autohandle;
734                 cond_resched();
735         } while (--i > 0);
736
737         return 0;
738 }
739
740 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
741 {
742         const struct Qdisc_class_ops *cops;
743         unsigned long cl;
744         u32 parentid;
745         int drops;
746
747         if (n == 0)
748                 return;
749         drops = max_t(int, n, 0);
750         while ((parentid = sch->parent)) {
751                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
752                         return;
753
754                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
755                 if (sch == NULL) {
756                         WARN_ON(parentid != TC_H_ROOT);
757                         return;
758                 }
759                 cops = sch->ops->cl_ops;
760                 if (cops->qlen_notify) {
761                         cl = cops->get(sch, parentid);
762                         cops->qlen_notify(sch, cl);
763                         cops->put(sch, cl);
764                 }
765                 sch->q.qlen -= n;
766                 sch->qstats.drops += drops;
767         }
768 }
769 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
770
771 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
772                                struct nlmsghdr *n, u32 clid,
773                                struct Qdisc *old, struct Qdisc *new)
774 {
775         if (new || old)
776                 qdisc_notify(net, skb, n, clid, old, new);
777
778         if (old)
779                 qdisc_destroy(old);
780 }
781
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, destroy old qdisc.
 *
 * Returns 0 on success. With parent == NULL the only failure is -ENOENT,
 * when an ingress qdisc is involved but the device has no ingress queue.
 * With a parent: -EOPNOTSUPP if the parent has no class ops or no graft
 * hook, -ENOENT if the class is not found, otherwise whatever the
 * parent's graft hook returns.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
                       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
                       struct Qdisc *new, struct Qdisc *old)
{
        struct Qdisc *q = old;
        struct net *net = dev_net(dev);
        int err = 0;

        if (parent == NULL) {
                /* No parent qdisc: graft directly onto the device queues. */
                unsigned int i, num_q, ingress;

                ingress = 0;
                num_q = dev->num_tx_queues;
                /* If either the old or the new qdisc is an ingress qdisc,
                 * only the single ingress queue is touched.
                 */
                if ((q && q->flags & TCQ_F_INGRESS) ||
                    (new && new->flags & TCQ_F_INGRESS)) {
                        num_q = 1;
                        ingress = 1;
                        if (!dev_ingress_queue(dev))
                                return -ENOENT;
                }

                /* A running device is deactivated for the swap and
                 * re-activated at the end of this branch.
                 */
                if (dev->flags & IFF_UP)
                        dev_deactivate(dev);

                /* A qdisc with its own attach hook handles the queues
                 * itself, so the per-queue loop below is skipped.
                 */
                if (new && new->ops->attach) {
                        new->ops->attach(new);
                        num_q = 0;
                }

                for (i = 0; i < num_q; i++) {
                        struct netdev_queue *dev_queue = dev_ingress_queue(dev);

                        if (!ingress)
                                dev_queue = netdev_get_tx_queue(dev, i);

                        old = dev_graft_qdisc(dev_queue, new);
                        /* One extra reference for every queue after the
                         * first that now points at "new".
                         */
                        if (new && i > 0)
                                atomic_inc(&new->refcnt);

                        if (!ingress)
                                qdisc_destroy(old);
                }

                if (!ingress) {
                        /* Egress: the qdisc reported as replaced (and then
                         * destroyed) is the device's previous dev->qdisc.
                         */
                        notify_and_destroy(net, skb, n, classid,
                                           dev->qdisc, new);
                        /* Reference for dev->qdisc, taken here only when
                         * "new" has no attach hook.
                         */
                        if (new && !new->ops->attach)
                                atomic_inc(&new->refcnt);
                        dev->qdisc = new ? : &noop_qdisc;
                } else {
                        /* Ingress: "old" is what dev_graft_qdisc() returned
                         * for the ingress queue.
                         */
                        notify_and_destroy(net, skb, n, classid, old, new);
                }

                if (dev->flags & IFF_UP)
                        dev_activate(dev);
        } else {
                /* Graft onto a class of "parent" through its class ops. */
                const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

                err = -EOPNOTSUPP;
                if (cops && cops->graft) {
                        unsigned long cl = cops->get(parent, classid);
                        if (cl) {
                                /* graft() passes back the previous leaf
                                 * through &old.
                                 */
                                err = cops->graft(parent, cl, new, &old);
                                cops->put(parent, cl);
                        } else
                                err = -ENOENT;
                }
                if (!err)
                        notify_and_destroy(net, skb, n, classid, old, new);
        }
        return err;
}
863
/* Lockdep classes assigned to a new qdisc's lock in qdisc_create():
 * qdisc_rx_lock for ingress qdiscs, qdisc_tx_lock for all others.
 * lockdep annotation is needed for ingress; egress gets it only for name
 */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;
867
868 /*
869    Allocate and initialize new qdisc.
870
871    Parameters are passed via opt.
872  */
873
874 static struct Qdisc *
875 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
876              struct Qdisc *p, u32 parent, u32 handle,
877              struct nlattr **tca, int *errp)
878 {
879         int err;
880         struct nlattr *kind = tca[TCA_KIND];
881         struct Qdisc *sch;
882         struct Qdisc_ops *ops;
883         struct qdisc_size_table *stab;
884
885         ops = qdisc_lookup_ops(kind);
886 #ifdef CONFIG_MODULES
887         if (ops == NULL && kind != NULL) {
888                 char name[IFNAMSIZ];
889                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
890                         /* We dropped the RTNL semaphore in order to
891                          * perform the module load.  So, even if we
892                          * succeeded in loading the module we have to
893                          * tell the caller to replay the request.  We
894                          * indicate this using -EAGAIN.
895                          * We replay the request because the device may
896                          * go away in the mean time.
897                          */
898                         rtnl_unlock();
899                         request_module("sch_%s", name);
900                         rtnl_lock();
901                         ops = qdisc_lookup_ops(kind);
902                         if (ops != NULL) {
903                                 /* We will try again qdisc_lookup_ops,
904                                  * so don't keep a reference.
905                                  */
906                                 module_put(ops->owner);
907                                 err = -EAGAIN;
908                                 goto err_out;
909                         }
910                 }
911         }
912 #endif
913
914         err = -ENOENT;
915         if (ops == NULL)
916                 goto err_out;
917
918         sch = qdisc_alloc(dev_queue, ops);
919         if (IS_ERR(sch)) {
920                 err = PTR_ERR(sch);
921                 goto err_out2;
922         }
923
924         sch->parent = parent;
925
926         if (handle == TC_H_INGRESS) {
927                 sch->flags |= TCQ_F_INGRESS;
928                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
929                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
930         } else {
931                 if (handle == 0) {
932                         handle = qdisc_alloc_handle(dev);
933                         err = -ENOMEM;
934                         if (handle == 0)
935                                 goto err_out3;
936                 }
937                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
938                 if (!netif_is_multiqueue(dev))
939                         sch->flags |= TCQ_F_ONETXQUEUE;
940         }
941
942         sch->handle = handle;
943
944         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
945                 if (tca[TCA_STAB]) {
946                         stab = qdisc_get_stab(tca[TCA_STAB]);
947                         if (IS_ERR(stab)) {
948                                 err = PTR_ERR(stab);
949                                 goto err_out4;
950                         }
951                         rcu_assign_pointer(sch->stab, stab);
952                 }
953                 if (tca[TCA_RATE]) {
954                         spinlock_t *root_lock;
955
956                         err = -EOPNOTSUPP;
957                         if (sch->flags & TCQ_F_MQROOT)
958                                 goto err_out4;
959
960                         if ((sch->parent != TC_H_ROOT) &&
961                             !(sch->flags & TCQ_F_INGRESS) &&
962                             (!p || !(p->flags & TCQ_F_MQROOT)))
963                                 root_lock = qdisc_root_sleeping_lock(sch);
964                         else
965                                 root_lock = qdisc_lock(sch);
966
967                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
968                                                 root_lock, tca[TCA_RATE]);
969                         if (err)
970                                 goto err_out4;
971                 }
972
973                 qdisc_list_add(sch);
974
975                 return sch;
976         }
977 err_out3:
978         dev_put(dev);
979         kfree((char *) sch - sch->padded);
980 err_out2:
981         module_put(ops->owner);
982 err_out:
983         *errp = err;
984         return NULL;
985
986 err_out4:
987         /*
988          * Any broken qdiscs that would require a ops->reset() here?
989          * The qdisc was never in action so it shouldn't be necessary.
990          */
991         qdisc_put_stab(rtnl_dereference(sch->stab));
992         if (ops->destroy)
993                 ops->destroy(sch);
994         goto err_out3;
995 }
996
/*
 * Apply a change request to the existing qdisc @sch.
 *
 * - TCA_OPTIONS: forwarded to sch->ops->change(); -EINVAL if the qdisc
 *   has no change operation, otherwise any error from it is returned.
 * - TCA_STAB: a new size table is obtained with qdisc_get_stab().  The
 *   previous sch->stab is always released; when TCA_STAB is absent the
 *   pointer is replaced by NULL.
 * - TCA_RATE: the rate estimator is replaced unless the qdisc carries
 *   TCQ_F_MQROOT; the result of gen_replace_estimator() is ignored.
 *
 * Returns 0 on success or a negative errno.
 */
997 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
998 {
999         struct qdisc_size_table *ostab, *stab = NULL;
1000         int err = 0;
1001
1002         if (tca[TCA_OPTIONS]) {
1003                 if (sch->ops->change == NULL)
1004                         return -EINVAL;
1005                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
1006                 if (err)
1007                         return err;
1008         }
1009
1010         if (tca[TCA_STAB]) {
1011                 stab = qdisc_get_stab(tca[TCA_STAB]);
1012                 if (IS_ERR(stab))
1013                         return PTR_ERR(stab);
1014         }
1015
             /* Publish the new table (or NULL) first, then drop the old one. */
1016         ostab = rtnl_dereference(sch->stab);
1017         rcu_assign_pointer(sch->stab, stab);
1018         qdisc_put_stab(ostab);
1019
1020         if (tca[TCA_RATE]) {
1021                 /* NB: ignores errors from replace_estimator
1022                    because change can't be undone. */
1023                 if (sch->flags & TCQ_F_MQROOT)
1024                         goto out;
1025                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
1026                                             qdisc_root_sleeping_lock(sch),
1027                                             tca[TCA_RATE]);
1028         }
1029 out:
1030         return 0;
1031 }
1032
/*
 * Walker context used by check_loop().  The qdisc_walker is the first
 * member; check_loop_fn() casts the walker pointer it receives back to
 * this structure.
 */
1033 struct check_loop_arg {
1034         struct qdisc_walker     w;
1035         struct Qdisc            *p;     /* qdisc each leaf is compared against */
1036         int                     depth;  /* nesting level of the current walk */
1037 };
1038
/* Forward declaration: check_loop() and check_loop_fn() call each other. */
1039 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1040
/*
 * Walk the classes of @q with check_loop_fn(), looking for @p among the
 * leaf qdiscs.  @depth is the current nesting level and is passed on to
 * the callback through the walker context.
 *
 * Returns 0 if @q has no class operations or the walker's stop flag is
 * clear after the walk, and -ELOOP if the stop flag is set.
 * NOTE(review): the stop flag is set by the qdisc's walk() implementation,
 * presumably when the callback returns a negative value - confirm.
 */
1041 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1042 {
1043         struct check_loop_arg   arg;
1044
1045         if (q->ops->cl_ops == NULL)
1046                 return 0;
1047
1048         arg.w.stop = arg.w.skip = arg.w.count = 0;
1049         arg.w.fn = check_loop_fn;
1050         arg.depth = depth;
1051         arg.p = p;
1052         q->ops->cl_ops->walk(q, &arg.w);
1053         return arg.w.stop ? -ELOOP : 0;
1054 }
1055
/*
 * Per-class callback for check_loop().  Looks up the leaf qdisc of class
 * @cl on @q; returns -ELOOP if that leaf is the qdisc being searched for
 * or the nesting depth exceeds 7, otherwise recurses into the leaf with
 * depth + 1.  Returns 0 when the class has no leaf qdisc.
 */
1056 static int
1057 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1058 {
1059         struct Qdisc *leaf;
1060         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
             /* Valid because the walker is the first member of check_loop_arg. */
1061         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1062
1063         leaf = cops->leaf(q, cl);
1064         if (leaf) {
1065                 if (leaf == arg->p || arg->depth > 7)
1066                         return -ELOOP;
1067                 return check_loop(leaf, arg->p, arg->depth + 1);
1068         }
1069         return 0;
1070 }
1071
1072 /*
1073  * Delete/get qdisc.
1074  */
1075
/*
 * Netlink handler shared by the "get qdisc" and "delete qdisc" requests.
 *
 * Every message type other than RTM_GETQDISC requires CAP_NET_ADMIN.
 * The qdisc is located through tcm_parent when it is non-zero (device
 * root, ingress queue, or the leaf of a class on the parent qdisc) and
 * through tcm_handle otherwise.  RTM_DELQDISC hands the qdisc to
 * qdisc_graft() with a NULL replacement; any other type sends a
 * notification describing it (the result of qdisc_notify() is ignored).
 *
 * Returns 0 on success, or -EPERM, the nlmsg_parse() error, -ENODEV,
 * -ENOENT, -EINVAL or the qdisc_graft() error.
 */
1076 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1077 {
1078         struct net *net = sock_net(skb->sk);
1079         struct tcmsg *tcm = nlmsg_data(n);
1080         struct nlattr *tca[TCA_MAX + 1];
1081         struct net_device *dev;
1082         u32 clid;
1083         struct Qdisc *q = NULL;
1084         struct Qdisc *p = NULL;
1085         int err;
1086
1087         if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
1088                 return -EPERM;
1089
1090         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1091         if (err < 0)
1092                 return err;
1093
1094         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1095         if (!dev)
1096                 return -ENODEV;
1097
1098         clid = tcm->tcm_parent;
1099         if (clid) {
1100                 if (clid != TC_H_ROOT) {
1101                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1102                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1103                                 if (!p)
1104                                         return -ENOENT;
1105                                 q = qdisc_leaf(p, clid);
1106                         } else if (dev_ingress_queue(dev)) {
1107                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1108                         }
1109                 } else {
1110                         q = dev->qdisc;
1111                 }
1112                 if (!q)
1113                         return -ENOENT;
1114
                     /* A handle, if given, must match the qdisc found via the parent. */
1115                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1116                         return -EINVAL;
1117         } else {
1118                 q = qdisc_lookup(dev, tcm->tcm_handle);
1119                 if (!q)
1120                         return -ENOENT;
1121         }
1122
             /* A kind, if given, must match the id of the qdisc's ops. */
1123         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1124                 return -EINVAL;
1125
1126         if (n->nlmsg_type == RTM_DELQDISC) {
                     /* Delete requests must name the parent. */
1127                 if (!clid)
1128                         return -EINVAL;
                     /* A qdisc with handle 0 is reported as not found. */
1129                 if (q->handle == 0)
1130                         return -ENOENT;
1131                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1132                 if (err != 0)
1133                         return err;
1134         } else {
1135                 qdisc_notify(net, skb, n, clid, NULL, q);
1136         }
1137         return 0;
1138 }
1139
1140 /*
1141  * Create/change qdisc.
1142  */
1143
/*
 * Netlink handler that creates a qdisc or changes an existing one.
 * Requires CAP_NET_ADMIN.
 *
 * Depending on tcm_parent, tcm_handle and the NLM_F_* flags the request
 * ends in one of three ways:
 *  - the parameters of an existing qdisc are changed with qdisc_change()
 *    and a notification is sent;
 *  - an existing qdisc found by handle is grafted under the requested
 *    parent (label "graft"), after a loop check and taking a reference;
 *  - a new qdisc is built with qdisc_create() and then grafted (label
 *    "create_n_graft").
 *
 * When qdisc_create() reports -EAGAIN the whole request is parsed and
 * processed again from the "replay" label.
 *
 * Returns 0 on success or a negative errno.
 */
1144 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
1145 {
1146         struct net *net = sock_net(skb->sk);
1147         struct tcmsg *tcm;
1148         struct nlattr *tca[TCA_MAX + 1];
1149         struct net_device *dev;
1150         u32 clid;
1151         struct Qdisc *q, *p;
1152         int err;
1153
1154         if (!capable(CAP_NET_ADMIN))
1155                 return -EPERM;
1156
1157 replay:
1158         /* Reinit, just in case something touches this. */
1159         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1160         if (err < 0)
1161                 return err;
1162
1163         tcm = nlmsg_data(n);
1164         clid = tcm->tcm_parent;
1165         q = p = NULL;
1166
1167         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1168         if (!dev)
1169                 return -ENODEV;
1170
1171
1172         if (clid) {
                     /* Find the qdisc currently attached at the requested parent. */
1173                 if (clid != TC_H_ROOT) {
1174                         if (clid != TC_H_INGRESS) {
1175                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1176                                 if (!p)
1177                                         return -ENOENT;
1178                                 q = qdisc_leaf(p, clid);
1179                         } else if (dev_ingress_queue_create(dev)) {
1180                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1181                         }
1182                 } else {
1183                         q = dev->qdisc;
1184                 }
1185
1186                 /* It may be default qdisc, ignore it */
1187                 if (q && q->handle == 0)
1188                         q = NULL;
1189
1190                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1191                         if (tcm->tcm_handle) {
                                     /* Something else sits at the parent: only REPLACE may displace it. */
1192                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1193                                         return -EEXIST;
                                     /* A qdisc handle must have a zero minor part. */
1194                                 if (TC_H_MIN(tcm->tcm_handle))
1195                                         return -EINVAL;
1196                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1197                                 if (!q)
1198                                         goto create_n_graft;
1199                                 if (n->nlmsg_flags & NLM_F_EXCL)
1200                                         return -EEXIST;
1201                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1202                                         return -EINVAL;
                                     /* Refuse to graft a qdisc under itself or under one of its own leaves. */
1203                                 if (q == p ||
1204                                     (p && check_loop(q, p, 0)))
1205                                         return -ELOOP;
1206                                 atomic_inc(&q->refcnt);
1207                                 goto graft;
1208                         } else {
1209                                 if (!q)
1210                                         goto create_n_graft;
1211
1212                                 /* This magic test requires explanation.
1213                                  *
1214                                  *   We know, that some child q is already
1215                                  *   attached to this parent and have choice:
1216                                  *   either to change it or to create/graft new one.
1217                                  *
1218                                  *   1. We are allowed to create/graft only
1219                                  *   if CREATE and REPLACE flags are set.
1220                                  *
1221                                  *   2. If EXCL is set, requestor wanted to say,
1222                                  *   that qdisc tcm_handle is not expected
1223                                  *   to exist, so that we choose create/graft too.
1224                                  *
1225                                  *   3. The last case is when no flags are set.
1226                                  *   Alas, it is sort of hole in API, we
1227                                  *   cannot decide what to do unambiguously.
1228                                  *   For now we select create/graft, if
1229                                  *   user gave KIND, which does not match existing.
1230                                  */
1231                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1232                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1233                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1234                                      (tca[TCA_KIND] &&
1235                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1236                                         goto create_n_graft;
1237                         }
1238                 }
1239         } else {
                     /* No parent given: the qdisc can only be identified by its handle. */
1240                 if (!tcm->tcm_handle)
1241                         return -EINVAL;
1242                 q = qdisc_lookup(dev, tcm->tcm_handle);
1243         }
1244
1245         /* Change qdisc parameters */
1246         if (q == NULL)
1247                 return -ENOENT;
1248         if (n->nlmsg_flags & NLM_F_EXCL)
1249                 return -EEXIST;
1250         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1251                 return -EINVAL;
1252         err = qdisc_change(q, tca);
1253         if (err == 0)
1254                 qdisc_notify(net, skb, n, clid, NULL, q);
1255         return err;
1256
1257 create_n_graft:
1258         if (!(n->nlmsg_flags & NLM_F_CREATE))
1259                 return -ENOENT;
1260         if (clid == TC_H_INGRESS) {
                     /* For ingress, tcm_parent is passed as both parent and handle. */
1261                 if (dev_ingress_queue(dev))
1262                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1263                                          tcm->tcm_parent, tcm->tcm_parent,
1264                                          tca, &err);
1265                 else
1266                         err = -ENOENT;
1267         } else {
1268                 struct netdev_queue *dev_queue;
1269
                     /*
                      * Pick the device queue: the parent's select_queue() class
                      * operation if it has one, else the parent's own queue,
                      * else tx queue 0 of the device.
                      */
1270                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1271                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1272                 else if (p)
1273                         dev_queue = p->dev_queue;
1274                 else
1275                         dev_queue = netdev_get_tx_queue(dev, 0);
1276
1277                 q = qdisc_create(dev, dev_queue, p,
1278                                  tcm->tcm_parent, tcm->tcm_handle,
1279                                  tca, &err);
1280         }
1281         if (q == NULL) {
1282                 if (err == -EAGAIN)
1283                         goto replay;
1284                 return err;
1285         }
1286
1287 graft:
1288         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1289         if (err) {
                     /* Grafting failed: destroy the qdisc created or referenced above. */
1290                 if (q)
1291                         qdisc_destroy(q);
1292                 return err;
1293         }
1294
1295         return 0;
1296 }
1297
/*
 * Append one netlink message of type @event describing qdisc @q to @skb.
 *
 * The message carries a tcmsg header (parent taken from @clid, handle
 * from q->handle, tcm_info set to the current reference count), the
 * TCA_KIND string, whatever the qdisc's dump() operation adds, the size
 * table if one is attached, and the statistics.
 *
 * Side effect: q->qstats.qlen is refreshed from q->q.qlen.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back
 * to where it was on entry and -1 is returned.
 */
1298 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1299                          u32 portid, u32 seq, u16 flags, int event)
1300 {
1301         struct tcmsg *tcm;
1302         struct nlmsghdr  *nlh;
             /* Remember the tail so a partial message can be trimmed on error. */
1303         unsigned char *b = skb_tail_pointer(skb);
1304         struct gnet_dump d;
1305         struct qdisc_size_table *stab;
1306
1307         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1308         if (!nlh)
1309                 goto out_nlmsg_trim;
1310         tcm = nlmsg_data(nlh);
1311         tcm->tcm_family = AF_UNSPEC;
1312         tcm->tcm__pad1 = 0;
1313         tcm->tcm__pad2 = 0;
1314         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1315         tcm->tcm_parent = clid;
1316         tcm->tcm_handle = q->handle;
1317         tcm->tcm_info = atomic_read(&q->refcnt);
1318         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1319                 goto nla_put_failure;
1320         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1321                 goto nla_put_failure;
1322         q->qstats.qlen = q->q.qlen;
1323
1324         stab = rtnl_dereference(q->stab);
1325         if (stab && qdisc_dump_stab(skb, stab) < 0)
1326                 goto nla_put_failure;
1327
1328         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1329                                          qdisc_root_sleeping_lock(q), &d) < 0)
1330                 goto nla_put_failure;
1331
1332         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1333                 goto nla_put_failure;
1334
1335         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1336             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1337             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1338                 goto nla_put_failure;
1339
1340         if (gnet_stats_finish_copy(&d) < 0)
1341                 goto nla_put_failure;
1342
             /* The message length is everything appended since entry. */
1343         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1344         return skb->len;
1345
1346 out_nlmsg_trim:
1347 nla_put_failure:
1348         nlmsg_trim(skb, b);
1349         return -1;
1350 }
1351
/*
 * Return true when @q has the TCQ_F_BUILTIN flag set.  The dump and
 * notification code in this file skips qdiscs for which this is true.
 */
1352 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1353 {
1354         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1355 }
1356
/*
 * Build and send a qdisc change notification to the RTNLGRP_TC group.
 *
 * An RTM_DELQDISC message is added for @old and an RTM_NEWQDISC message
 * for @new (flagged NLM_F_REPLACE when @old is also given); either may be
 * NULL, and qdiscs rejected by tc_qdisc_dump_ignore() are left out.  The
 * destination port id is taken from @oskb, or is 0 when @oskb is NULL.
 *
 * Returns the result of rtnetlink_send(), -ENOBUFS if the skb cannot be
 * allocated, or -EINVAL if filling a message fails or nothing was
 * written to the skb.
 */
1357 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1358                         struct nlmsghdr *n, u32 clid,
1359                         struct Qdisc *old, struct Qdisc *new)
1360 {
1361         struct sk_buff *skb;
1362         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1363
1364         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1365         if (!skb)
1366                 return -ENOBUFS;
1367
1368         if (old && !tc_qdisc_dump_ignore(old)) {
1369                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1370                                   0, RTM_DELQDISC) < 0)
1371                         goto err_out;
1372         }
1373         if (new && !tc_qdisc_dump_ignore(new)) {
1374                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1375                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1376                         goto err_out;
1377         }
1378
1379         if (skb->len)
1380                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1381                                       n->nlmsg_flags & NLM_F_ECHO);
1382
             /* Also reached with an empty skb, i.e. when both qdiscs were skipped. */
1383 err_out:
1384         kfree_skb(skb);
1385         return -EINVAL;
1386 }
1387
/*
 * Dump @root and every qdisc on root->list into @skb as RTM_NEWQDISC
 * messages flagged NLM_F_MULTI.
 *
 * @q_idx_p: running qdisc index, read on entry and written back on exit.
 * @s_q_idx: index to start from; entries with a lower index are only
 *           counted, not dumped.
 *
 * Qdiscs rejected by tc_qdisc_dump_ignore() are counted but not dumped.
 *
 * Returns 0 when everything was processed (or @root is NULL, in which
 * case *q_idx_p is left untouched) and -1 as soon as tc_fill_qdisc()
 * returns a value <= 0; the index of the entry that failed is stored so
 * the dump can resume there.
 */
1388 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1389                               struct netlink_callback *cb,
1390                               int *q_idx_p, int s_q_idx)
1391 {
1392         int ret = 0, q_idx = *q_idx_p;
1393         struct Qdisc *q;
1394
1395         if (!root)
1396                 return 0;
1397
             /* The root itself is handled first; it is not visited by the list walk below. */
1398         q = root;
1399         if (q_idx < s_q_idx) {
1400                 q_idx++;
1401         } else {
1402                 if (!tc_qdisc_dump_ignore(q) &&
1403                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1404                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1405                         goto done;
1406                 q_idx++;
1407         }
1408         list_for_each_entry(q, &root->list, list) {
1409                 if (q_idx < s_q_idx) {
1410                         q_idx++;
1411                         continue;
1412                 }
1413                 if (!tc_qdisc_dump_ignore(q) &&
1414                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1415                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1416                         goto done;
1417                 q_idx++;
1418         }
1419
1420 out:
1421         *q_idx_p = q_idx;
1422         return ret;
1423 done:
1424         ret = -1;
1425         goto out;
1426 }
1427
/*
 * Netlink dump callback listing the qdiscs of every device in the
 * socket's network namespace.
 *
 * Resume state is kept in @cb: args[0] is the device index and args[1]
 * the qdisc index within that device.  For each device at or past the
 * saved index, the qdisc tree under dev->qdisc is dumped, followed by the
 * one under the ingress queue if the device has one.  The device list is
 * walked under rcu_read_lock().
 *
 * Returns skb->len.
 */
1428 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1429 {
1430         struct net *net = sock_net(skb->sk);
1431         int idx, q_idx;
1432         int s_idx, s_q_idx;
1433         struct net_device *dev;
1434
1435         s_idx = cb->args[0];
1436         s_q_idx = q_idx = cb->args[1];
1437
1438         rcu_read_lock();
1439         idx = 0;
1440         for_each_netdev_rcu(net, dev) {
1441                 struct netdev_queue *dev_queue;
1442
                     /* Devices before the saved index are skipped. */
1443                 if (idx < s_idx)
1444                         goto cont;
                     /* Past the saved device: start from its first qdisc. */
1445                 if (idx > s_idx)
1446                         s_q_idx = 0;
1447                 q_idx = 0;
1448
1449                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1450                         goto done;
1451
1452                 dev_queue = dev_ingress_queue(dev);
1453                 if (dev_queue &&
1454                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1455                                        &q_idx, s_q_idx) < 0)
1456                         goto done;
1457
1458 cont:
1459                 idx++;
1460         }
1461
1462 done:
1463         rcu_read_unlock();
1464
             /* Save the position reached for the next invocation. */
1465         cb->args[0] = idx;
1466         cb->args[1] = q_idx;
1467
1468         return skb->len;
1469 }
1470
1471
1472
1473 /************************************************
1474  *      Traffic classes manipulation.           *
1475  ************************************************/
1476
1477
1478
/*
 * Netlink handler for traffic class requests (RTM_NEWTCLASS,
 * RTM_DELTCLASS and RTM_GETTCLASS).
 *
 * Every message type other than RTM_GETTCLASS requires CAP_NET_ADMIN.
 * The qdisc handle is derived from tcm_parent and tcm_handle, the qdisc
 * is looked up and must support classes, and the class is then fetched
 * with the qdisc's get() class operation.  An existing class is deleted,
 * reported or changed according to the message type; a missing class is
 * created through the change() class operation only for RTM_NEWTCLASS
 * with NLM_F_CREATE.
 *
 * Note: the local variable "portid" holds tcm_parent, i.e. the parent
 * class id; it is not a netlink port id.
 *
 * Returns 0 on success or a negative errno.
 */
1479 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
1480 {
1481         struct net *net = sock_net(skb->sk);
1482         struct tcmsg *tcm = nlmsg_data(n);
1483         struct nlattr *tca[TCA_MAX + 1];
1484         struct net_device *dev;
1485         struct Qdisc *q = NULL;
1486         const struct Qdisc_class_ops *cops;
1487         unsigned long cl = 0;
1488         unsigned long new_cl;
1489         u32 portid;
1490         u32 clid;
1491         u32 qid;
1492         int err;
1493
1494         if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1495                 return -EPERM;
1496
1497         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1498         if (err < 0)
1499                 return err;
1500
1501         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1502         if (!dev)
1503                 return -ENODEV;
1504
1505         /*
1506            parent == TC_H_UNSPEC - unspecified parent.
1507            parent == TC_H_ROOT   - class is root, which has no parent.
1508            parent == X:0         - parent is root class.
1509            parent == X:Y         - parent is a node in hierarchy.
1510            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1511
1512            handle == 0:0         - generate handle from kernel pool.
1513            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1514            handle == X:Y         - clear.
1515            handle == X:0         - root class.
1516          */
1517
1518         /* Step 1. Determine qdisc handle X:0 */
1519
1520         portid = tcm->tcm_parent;
1521         clid = tcm->tcm_handle;
1522         qid = TC_H_MAJ(clid);
1523
1524         if (portid != TC_H_ROOT) {
1525                 u32 qid1 = TC_H_MAJ(portid);
1526
1527                 if (qid && qid1) {
1528                         /* If both majors are known, they must be identical. */
1529                         if (qid != qid1)
1530                                 return -EINVAL;
1531                 } else if (qid1) {
1532                         qid = qid1;
1533                 } else if (qid == 0)
1534                         qid = dev->qdisc->handle;
1535
1536                 /* Now qid is genuine qdisc handle consistent
1537                  * both with parent and child.
1538                  *
1539                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1540                  */
1541                 if (portid)
1542                         portid = TC_H_MAKE(qid, portid);
1543         } else {
1544                 if (qid == 0)
1545                         qid = dev->qdisc->handle;
1546         }
1547
1548         /* OK. Locate qdisc */
1549         q = qdisc_lookup(dev, qid);
1550         if (!q)
1551                 return -ENOENT;
1552
1553         /* And check that it supports classes */
1554         cops = q->ops->cl_ops;
1555         if (cops == NULL)
1556                 return -EINVAL;
1557
1558         /* Now try to get class */
1559         if (clid == 0) {
1560                 if (portid == TC_H_ROOT)
1561                         clid = qid;
1562         } else
1563                 clid = TC_H_MAKE(qid, clid);
1564
1565         if (clid)
1566                 cl = cops->get(q, clid);
1567
1568         if (cl == 0) {
                     /* No such class: only RTM_NEWTCLASS with NLM_F_CREATE may continue. */
1569                 err = -ENOENT;
1570                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1571                     !(n->nlmsg_flags & NLM_F_CREATE))
1572                         goto out;
1573         } else {
1574                 switch (n->nlmsg_type) {
1575                 case RTM_NEWTCLASS:
1576                         err = -EEXIST;
1577                         if (n->nlmsg_flags & NLM_F_EXCL)
1578                                 goto out;
1579                         break;
1580                 case RTM_DELTCLASS:
1581                         err = -EOPNOTSUPP;
1582                         if (cops->delete)
1583                                 err = cops->delete(q, cl);
1584                         if (err == 0)
1585                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1586                         goto out;
1587                 case RTM_GETTCLASS:
1588                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1589                         goto out;
1590                 default:
1591                         err = -EINVAL;
1592                         goto out;
1593                 }
1594         }
1595
             /* Create or change the class; -EOPNOTSUPP if the qdisc has no change(). */
1596         new_cl = cl;
1597         err = -EOPNOTSUPP;
1598         if (cops->change)
1599                 err = cops->change(q, clid, portid, tca, &new_cl);
1600         if (err == 0)
1601                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1602
1603 out:
             /* Hand back the class obtained from cops->get(), if any. */
1604         if (cl)
1605                 cops->put(q, cl);
1606
1607         return err;
1608 }
1609
1610
/*
 * Append one netlink message of type @event describing class @cl of
 * qdisc @q to @skb.
 *
 * The tcmsg header is initialised with q->handle as both parent and
 * handle before being passed to the class dump() operation together with
 * the skb.  NOTE(review): dump() presumably fills in the class-specific
 * parent/handle values - confirm against the individual qdiscs.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back
 * to where it was on entry and -1 is returned.
 */
1611 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1612                           unsigned long cl,
1613                           u32 portid, u32 seq, u16 flags, int event)
1614 {
1615         struct tcmsg *tcm;
1616         struct nlmsghdr  *nlh;
             /* Remember the tail so a partial message can be trimmed on error. */
1617         unsigned char *b = skb_tail_pointer(skb);
1618         struct gnet_dump d;
1619         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1620
1621         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1622         if (!nlh)
1623                 goto out_nlmsg_trim;
1624         tcm = nlmsg_data(nlh);
1625         tcm->tcm_family = AF_UNSPEC;
1626         tcm->tcm__pad1 = 0;
1627         tcm->tcm__pad2 = 0;
1628         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1629         tcm->tcm_parent = q->handle;
1630         tcm->tcm_handle = q->handle;
1631         tcm->tcm_info = 0;
1632         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1633                 goto nla_put_failure;
1634         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1635                 goto nla_put_failure;
1636
1637         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1638                                          qdisc_root_sleeping_lock(q), &d) < 0)
1639                 goto nla_put_failure;
1640
1641         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1642                 goto nla_put_failure;
1643
1644         if (gnet_stats_finish_copy(&d) < 0)
1645                 goto nla_put_failure;
1646
             /* The message length is everything appended since entry. */
1647         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1648         return skb->len;
1649
1650 out_nlmsg_trim:
1651 nla_put_failure:
1652         nlmsg_trim(skb, b);
1653         return -1;
1654 }
1655
/*
 * Build a message of type @event describing class @cl of qdisc @q and
 * send it to the RTNLGRP_TC group.  The destination port id is taken
 * from @oskb, or is 0 when @oskb is NULL.
 *
 * Returns the result of rtnetlink_send(), -ENOBUFS if the skb cannot be
 * allocated, or -EINVAL if tc_fill_tclass() fails.
 */
1656 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1657                          struct nlmsghdr *n, struct Qdisc *q,
1658                          unsigned long cl, int event)
1659 {
1660         struct sk_buff *skb;
1661         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1662
1663         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1664         if (!skb)
1665                 return -ENOBUFS;
1666
1667         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1668                 kfree_skb(skb);
1669                 return -EINVAL;
1670         }
1671
1672         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1673                               n->nlmsg_flags & NLM_F_ECHO);
1674 }
1675
/*
 * Walker context used when dumping classes.  The qdisc_walker is the
 * first member; qdisc_class_dump() casts the walker pointer it receives
 * back to this structure.
 */
1676 struct qdisc_dump_args {
1677         struct qdisc_walker     w;
1678         struct sk_buff          *skb;   /* skb the class messages are written to */
1679         struct netlink_callback *cb;    /* dump callback state of the request */
1680 };
1681
/*
 * Walker callback for class dumps: writes class @cl of @q into the skb
 * held in the walker context as an RTM_NEWTCLASS message flagged
 * NLM_F_MULTI, and returns the result of tc_fill_tclass().
 */
1682 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1683 {
1684         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1685
1686         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1687                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1688 }
1689
/*
 * Dump the classes of a single qdisc @q into @skb.
 *
 * @t_p: running qdisc index, incremented once the qdisc is done with.
 * @s_t: qdisc index to start from.
 *
 * The qdisc is only counted, not dumped, when it is rejected by
 * tc_qdisc_dump_ignore(), lies before @s_t, has no class operations, or
 * the request names a parent whose major does not match q->handle.
 * cb->args[1] carries the number of classes to skip into the walk and
 * receives the walker's count afterwards.
 *
 * Returns 0 when the qdisc was skipped or fully walked, and -1 (without
 * advancing *t_p) when the walker's stop flag is set.
 */
1690 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1691                                 struct tcmsg *tcm, struct netlink_callback *cb,
1692                                 int *t_p, int s_t)
1693 {
1694         struct qdisc_dump_args arg;
1695
1696         if (tc_qdisc_dump_ignore(q) ||
1697             *t_p < s_t || !q->ops->cl_ops ||
1698             (tcm->tcm_parent &&
1699              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1700                 (*t_p)++;
1701                 return 0;
1702         }
             /* Past the saved qdisc: clear args[1] and everything after it. */
1703         if (*t_p > s_t)
1704                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1705         arg.w.fn = qdisc_class_dump;
1706         arg.skb = skb;
1707         arg.cb = cb;
1708         arg.w.stop  = 0;
1709         arg.w.skip = cb->args[1];
1710         arg.w.count = 0;
1711         q->ops->cl_ops->walk(q, &arg.w);
1712         cb->args[1] = arg.w.count;
1713         if (arg.w.stop)
1714                 return -1;
1715         (*t_p)++;
1716         return 0;
1717 }
1718
/*
 * Dump the classes of @root and of every qdisc on root->list by calling
 * tc_dump_tclass_qdisc() for each of them.
 *
 * Returns 0 when all qdiscs were processed or @root is NULL, and -1 as
 * soon as tc_dump_tclass_qdisc() reports a failure.
 */
1719 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1720                                struct tcmsg *tcm, struct netlink_callback *cb,
1721                                int *t_p, int s_t)
1722 {
1723         struct Qdisc *q;
1724
1725         if (!root)
1726                 return 0;
1727
1728         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1729                 return -1;
1730
1731         list_for_each_entry(q, &root->list, list) {
1732                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1733                         return -1;
1734         }
1735
1736         return 0;
1737 }
1738
/*
 * Netlink dump callback listing the traffic classes of the device named
 * by tcm_ifindex in the request.
 *
 * cb->args[0] holds the index of the qdisc to resume from and is updated
 * with the index reached.  The qdisc tree under dev->qdisc is dumped
 * first, followed by the one under the ingress queue if the device has
 * one.  The device reference taken with dev_get_by_index() is dropped
 * before returning.
 *
 * Returns 0 if the request is shorter than a tcmsg or the device does
 * not exist, otherwise skb->len.
 */
1739 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1740 {
1741         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1742         struct net *net = sock_net(skb->sk);
1743         struct netdev_queue *dev_queue;
1744         struct net_device *dev;
1745         int t, s_t;
1746
1747         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1748                 return 0;
1749         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1750         if (!dev)
1751                 return 0;
1752
1753         s_t = cb->args[0];
1754         t = 0;
1755
1756         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1757                 goto done;
1758
1759         dev_queue = dev_ingress_queue(dev);
1760         if (dev_queue &&
1761             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1762                                 &t, s_t) < 0)
1763                 goto done;
1764
1765 done:
1766         cb->args[0] = t;
1767
1768         dev_put(dev);
1769         return skb->len;
1770 }
1771
1772 /* Main classifier routine: scans classifier chain attached
1773  * to this qdisc, (optionally) tests for protocol and asks
1774  * specific classifiers.
1775  */
1776 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1777                        struct tcf_result *res)
1778 {
1779         __be16 protocol = skb->protocol;
1780         int err;
1781
1782         for (; tp; tp = tp->next) {
1783                 if (tp->protocol != protocol &&
1784                     tp->protocol != htons(ETH_P_ALL))
1785                         continue;
1786                 err = tp->classify(skb, tp, res);
1787
1788                 if (err >= 0) {
1789 #ifdef CONFIG_NET_CLS_ACT
1790                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1791                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1792 #endif
1793                         return err;
1794                 }
1795         }
1796         return -1;
1797 }
1798 EXPORT_SYMBOL(tc_classify_compat);
1799
/*
 * Classify @skb against the chain starting at @tp.
 *
 * Without CONFIG_NET_CLS_ACT this is a plain call to
 * tc_classify_compat().  With it, a TC_ACT_RECLASSIFY result restarts
 * classification from the head of the chain.  The restart count is
 * kept in the verdict field of skb->tc_verd; once it has reached
 * MAX_REC_LOOP a rate-limited notice is logged and TC_ACT_SHOT is
 * returned.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res)
{
#ifdef CONFIG_NET_CLS_ACT
	for (;;) {
		int verdict = tc_classify_compat(skb, tp, res);
		u32 depth;

		if (verdict != TC_ACT_RECLASSIFY)
			return verdict;

		/* tp is never advanced here, so it still names the chain head. */
		depth = G_TC_VERD(skb->tc_verd);
		if (depth++ >= MAX_REC_LOOP) {
			net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
					       tp->q->ops->id,
					       tp->prio & 0xffff,
					       ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, depth);
	}
#else
	return tc_classify_compat(skb, tp, res);
#endif
}
EXPORT_SYMBOL(tc_classify);
1829
1830 void tcf_destroy(struct tcf_proto *tp)
1831 {
1832         tp->ops->destroy(tp);
1833         module_put(tp->ops->owner);
1834         kfree(tp);
1835 }
1836
1837 void tcf_destroy_chain(struct tcf_proto **fl)
1838 {
1839         struct tcf_proto *tp;
1840
1841         while ((tp = *fl) != NULL) {
1842                 *fl = tp->next;
1843                 tcf_destroy(tp);
1844         }
1845 }
1846 EXPORT_SYMBOL(tcf_destroy_chain);
1847
1848 #ifdef CONFIG_PROC_FS
1849 static int psched_show(struct seq_file *seq, void *v)
1850 {
1851         struct timespec ts;
1852
1853         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1854         seq_printf(seq, "%08x %08x %08x %08x\n",
1855                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1856                    1000000,
1857                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1858
1859         return 0;
1860 }
1861
1862 static int psched_open(struct inode *inode, struct file *file)
1863 {
1864         return single_open(file, psched_show, NULL);
1865 }
1866
1867 static const struct file_operations psched_fops = {
1868         .owner = THIS_MODULE,
1869         .open = psched_open,
1870         .read  = seq_read,
1871         .llseek = seq_lseek,
1872         .release = single_release,
1873 };
1874
1875 static int __net_init psched_net_init(struct net *net)
1876 {
1877         struct proc_dir_entry *e;
1878
1879         e = proc_create("psched", 0, net->proc_net, &psched_fops);
1880         if (e == NULL)
1881                 return -ENOMEM;
1882
1883         return 0;
1884 }
1885
1886 static void __net_exit psched_net_exit(struct net *net)
1887 {
1888         remove_proc_entry("psched", net->proc_net);
1889 }
1890 #else
1891 static int __net_init psched_net_init(struct net *net)
1892 {
1893         return 0;
1894 }
1895
1896 static void __net_exit psched_net_exit(struct net *net)
1897 {
1898 }
1899 #endif
1900
1901 static struct pernet_operations psched_net_ops = {
1902         .init = psched_net_init,
1903         .exit = psched_net_exit,
1904 };
1905
1906 static int __init pktsched_init(void)
1907 {
1908         int err;
1909
1910         err = register_pernet_subsys(&psched_net_ops);
1911         if (err) {
1912                 pr_err("pktsched_init: "
1913                        "cannot initialize per netns operations\n");
1914                 return err;
1915         }
1916
1917         register_qdisc(&pfifo_fast_ops);
1918         register_qdisc(&pfifo_qdisc_ops);
1919         register_qdisc(&bfifo_qdisc_ops);
1920         register_qdisc(&pfifo_head_drop_qdisc_ops);
1921         register_qdisc(&mq_qdisc_ops);
1922
1923         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1924         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1925         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1926         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1927         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1928         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1929
1930         return 0;
1931 }
1932
1933 subsys_initcall(pktsched_init);