Pileus Git - ~andy/linux/blob - net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26
  27 #include <net/netlink.h>
  28 #include <net/pkt_sched.h>
  29 #include <net/inet_ecn.h>
  30
  31 #define VERSION "1.3"
  32
  33 /*      Network Emulation Queuing algorithm.
  34         ====================================
  35
  36         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
   37                  Network Emulation Tool"
  38                  [2] Luigi Rizzo, DummyNet for FreeBSD
  39
  40          ----------------------------------------------------------------
  41
  42          This started out as a simple way to delay outgoing packets to
  43          test TCP but has grown to include most of the functionality
  44          of a full blown network emulator like NISTnet. It can delay
  45          packets and add random jitter (and correlation). The random
  46          distribution can be loaded from a table as well to provide
  47          normal, Pareto, or experimental curves. Packet loss,
  48          duplication, and reordering can also be emulated.
  49
  50          This qdisc does not do classification that can be handled in
  51          layering other disciplines.  It does not need to do bandwidth
  52          control either since that can be handled by using token
  53          bucket or other rate control.
  54
  55      Correlated Loss Generator models
  56
  57         Added generation of correlated loss according to the
  58         "Gilbert-Elliot" model, a 4-state markov model.
  59
  60         References:
  61         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  62         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  63         and intuitive loss model for packet networks and its implementation
  64         in the Netem module in the Linux kernel", available in [1]
  65
   66         Authors: Stefano Salsano <stefano.salsano at uniroma2.it>
  67                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  68 */
  69
  70 struct netem_sched_data {
  71         /* internal t(ime)fifo qdisc uses sch->q and sch->limit */
  72
  73         /* optional qdisc for classful handling (NULL at netem init) */
  74         struct Qdisc    *qdisc;
  75
  76         struct qdisc_watchdog watchdog;
  77
  78         psched_tdiff_t latency;
  79         psched_tdiff_t jitter;
  80
  81         u32 loss;
  82         u32 ecn;
  83         u32 limit;
  84         u32 counter;
  85         u32 gap;
  86         u32 duplicate;
  87         u32 reorder;
  88         u32 corrupt;
  89         u32 rate;
  90         s32 packet_overhead;
  91         u32 cell_size;
  92         u32 cell_size_reciprocal;
  93         s32 cell_overhead;
  94
  95         struct crndstate {
  96                 u32 last;
  97                 u32 rho;
  98         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
  99
 100         struct disttable {
 101                 u32  size;
 102                 s16 table[0];
 103         } *delay_dist;
 104
 105         enum  {
 106                 CLG_RANDOM,
 107                 CLG_4_STATES,
 108                 CLG_GILB_ELL,
 109         } loss_model;
 110
 111         /* Correlated Loss Generation models */
 112         struct clgstate {
 113                 /* state of the Markov chain */
 114                 u8 state;
 115
 116                 /* 4-states and Gilbert-Elliot models */
 117                 u32 a1; /* p13 for 4-states or p for GE */
 118                 u32 a2; /* p31 for 4-states or r for GE */
 119                 u32 a3; /* p32 for 4-states or h for GE */
 120                 u32 a4; /* p14 for 4-states or 1-k for GE */
 121                 u32 a5; /* p23 used only in 4-states */
 122         } clg;
 123
 124 };
 125
 126 /* Time stamp put into socket buffer control block
 127  * Only valid when skbs are in our internal t(ime)fifo queue.
 128  */
 129 struct netem_skb_cb {
 130         psched_time_t   time_to_send;
 131 };
 132
 133 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 134 {
 135         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 136         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 137 }
 138
 139 /* init_crandom - initialize correlated random number generator
 140  * Use entropy source for initial seed.
 141  */
 142 static void init_crandom(struct crndstate *state, unsigned long rho)
 143 {
 144         state->rho = rho;
 145         state->last = net_random();
 146 }
 147
 148 /* get_crandom - correlated random number generator
 149  * Next number depends on last value.
 150  * rho is scaled to avoid floating point.
 151  */
 152 static u32 get_crandom(struct crndstate *state)
 153 {
 154         u64 value, rho;
 155         unsigned long answer;
 156
 157         if (state->rho == 0)    /* no correlation */
 158                 return net_random();
 159
 160         value = net_random();
 161         rho = (u64)state->rho + 1;
 162         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 163         state->last = answer;
 164         return answer;
 165 }
 166
 167 /* loss_4state - 4-state model loss generator
 168  * Generates losses according to the 4-state Markov chain adopted in
 169  * the GI (General and Intuitive) loss model.
 170  */
 171 static bool loss_4state(struct netem_sched_data *q)
 172 {
 173         struct clgstate *clg = &q->clg;
 174         u32 rnd = net_random();
 175
 176         /*
 177          * Makes a comparison between rnd and the transition
 178          * probabilities outgoing from the current state, then decides the
 179          * next state and if the next packet has to be transmitted or lost.
 180          * The four states correspond to:
 181          *   1 => successfully transmitted packets within a gap period
 182          *   4 => isolated losses within a gap period
 183          *   3 => lost packets within a burst period
 184          *   2 => successfully transmitted packets within a burst period
 185          */
 186         switch (clg->state) {
 187         case 1:
 188                 if (rnd < clg->a4) {
 189                         clg->state = 4;
 190                         return true;
 191                 } else if (clg->a4 < rnd && rnd < clg->a1) {
 192                         clg->state = 3;
 193                         return true;
 194                 } else if (clg->a1 < rnd)
 195                         clg->state = 1;
 196
 197                 break;
 198         case 2:
 199                 if (rnd < clg->a5) {
 200                         clg->state = 3;
 201                         return true;
 202                 } else
 203                         clg->state = 2;
 204
 205                 break;
 206         case 3:
 207                 if (rnd < clg->a3)
 208                         clg->state = 2;
 209                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 210                         clg->state = 1;
 211                         return true;
 212                 } else if (clg->a2 + clg->a3 < rnd) {
 213                         clg->state = 3;
 214                         return true;
 215                 }
 216                 break;
 217         case 4:
 218                 clg->state = 1;
 219                 break;
 220         }
 221
 222         return false;
 223 }
 224
 225 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 226  * Generates losses according to the Gilbert-Elliot loss model or
 227  * its special cases  (Gilbert or Simple Gilbert)
 228  *
 229  * Makes a comparison between random number and the transition
 230  * probabilities outgoing from the current state, then decides the
 231  * next state. A second random number is extracted and the comparison
 232  * with the loss probability of the current state decides if the next
 233  * packet will be transmitted or lost.
 234  */
 235 static bool loss_gilb_ell(struct netem_sched_data *q)
 236 {
 237         struct clgstate *clg = &q->clg;
 238
 239         switch (clg->state) {
 240         case 1:
 241                 if (net_random() < clg->a1)
 242                         clg->state = 2;
 243                 if (net_random() < clg->a4)
 244                         return true;
 245         case 2:
 246                 if (net_random() < clg->a2)
 247                         clg->state = 1;
 248                 if (clg->a3 > net_random())
 249                         return true;
 250         }
 251
 252         return false;
 253 }
 254
 255 static bool loss_event(struct netem_sched_data *q)
 256 {
 257         switch (q->loss_model) {
 258         case CLG_RANDOM:
 259                 /* Random packet drop 0 => none, ~0 => all */
 260                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 261
 262         case CLG_4_STATES:
 263                 /* 4state loss model algorithm (used also for GI model)
 264                 * Extracts a value from the markov 4 state loss generator,
 265                 * if it is 1 drops a packet and if needed writes the event in
 266                 * the kernel logs
 267                 */
 268                 return loss_4state(q);
 269
 270         case CLG_GILB_ELL:
 271                 /* Gilbert-Elliot loss model algorithm
 272                 * Extracts a value from the Gilbert-Elliot loss generator,
 273                 * if it is 1 drops a packet and if needed writes the event in
 274                 * the kernel logs
 275                 */
 276                 return loss_gilb_ell(q);
 277         }
 278
 279         return false;   /* not reached */
 280 }
 281
 282
 283 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 284  * std deviation sigma.  Uses table lookup to approximate the desired
 285  * distribution, and a uniformly-distributed pseudo-random source.
 286  */
 287 static psched_tdiff_t tabledist(psched_tdiff_t mu, psched_tdiff_t sigma,
 288                                 struct crndstate *state,
 289                                 const struct disttable *dist)
 290 {
 291         psched_tdiff_t x;
 292         long t;
 293         u32 rnd;
 294
 295         if (sigma == 0)
 296                 return mu;
 297
 298         rnd = get_crandom(state);
 299
 300         /* default uniform distribution */
 301         if (dist == NULL)
 302                 return (rnd % (2*sigma)) - sigma + mu;
 303
 304         t = dist->table[rnd % dist->size];
 305         x = (sigma % NETEM_DIST_SCALE) * t;
 306         if (x >= 0)
 307                 x += NETEM_DIST_SCALE/2;
 308         else
 309                 x -= NETEM_DIST_SCALE/2;
 310
 311         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 312 }
 313
 314 static psched_time_t packet_len_2_sched_time(unsigned int len, struct netem_sched_data *q)
 315 {
 316         u64 ticks;
 317
 318         len += q->packet_overhead;
 319
 320         if (q->cell_size) {
 321                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 322
 323                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 324                         cells++;
 325                 len = cells * (q->cell_size + q->cell_overhead);
 326         }
 327
 328         ticks = (u64)len * NSEC_PER_SEC;
 329
 330         do_div(ticks, q->rate);
 331         return PSCHED_NS2TICKS(ticks);
 332 }
 333
 334 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 335 {
 336         struct sk_buff_head *list = &sch->q;
 337         psched_time_t tnext = netem_skb_cb(nskb)->time_to_send;
 338         struct sk_buff *skb = skb_peek_tail(list);
 339
 340         /* Optimize for add at tail */
 341         if (likely(!skb || tnext >= netem_skb_cb(skb)->time_to_send))
 342                 return __skb_queue_tail(list, nskb);
 343
 344         skb_queue_reverse_walk(list, skb) {
 345                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 346                         break;
 347         }
 348
 349         __skb_queue_after(list, skb, nskb);
 350 }
 351
 352 /*
 353  * Insert one skb into qdisc.
 354  * Note: parent depends on return value to account for queue length.
 355  *      NET_XMIT_DROP: queue length didn't change.
 356  *      NET_XMIT_SUCCESS: one skb was queued.
 357  */
 358 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 359 {
 360         struct netem_sched_data *q = qdisc_priv(sch);
 361         /* We don't fill cb now as skb_unshare() may invalidate it */
 362         struct netem_skb_cb *cb;
 363         struct sk_buff *skb2;
 364         int count = 1;
 365
 366         /* Random duplication */
 367         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 368                 ++count;
 369
 370         /* Drop packet? */
 371         if (loss_event(q)) {
 372                 if (q->ecn && INET_ECN_set_ce(skb))
 373                         sch->qstats.drops++; /* mark packet */
 374                 else
 375                         --count;
 376         }
 377         if (count == 0) {
 378                 sch->qstats.drops++;
 379                 kfree_skb(skb);
 380                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 381         }
 382
 383         skb_orphan(skb);
 384
 385         /*
 386          * If we need to duplicate packet, then re-insert at top of the
 387          * qdisc tree, since parent queuer expects that only one
 388          * skb will be queued.
 389          */
 390         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 391                 struct Qdisc *rootq = qdisc_root(sch);
 392                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 393                 q->duplicate = 0;
 394
 395                 qdisc_enqueue_root(skb2, rootq);
 396                 q->duplicate = dupsave;
 397         }
 398
 399         /*
 400          * Randomized packet corruption.
 401          * Make copy if needed since we are modifying
 402          * If packet is going to be hardware checksummed, then
 403          * do it now in software before we mangle it.
 404          */
 405         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 406                 if (!(skb = skb_unshare(skb, GFP_ATOMIC)) ||
 407                     (skb->ip_summed == CHECKSUM_PARTIAL &&
 408                      skb_checksum_help(skb)))
 409                         return qdisc_drop(skb, sch);
 410
 411                 skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8);
 412         }
 413
 414         if (unlikely(skb_queue_len(&sch->q) >= sch->limit))
 415                 return qdisc_reshape_fail(skb, sch);
 416
 417         sch->qstats.backlog += qdisc_pkt_len(skb);
 418
 419         cb = netem_skb_cb(skb);
 420         if (q->gap == 0 ||              /* not doing reordering */
 421             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 422             q->reorder < get_crandom(&q->reorder_cor)) {
 423                 psched_time_t now;
 424                 psched_tdiff_t delay;
 425
 426                 delay = tabledist(q->latency, q->jitter,
 427                                   &q->delay_cor, q->delay_dist);
 428
 429                 now = psched_get_time();
 430
 431                 if (q->rate) {
 432                         struct sk_buff_head *list = &sch->q;
 433
 434                         delay += packet_len_2_sched_time(skb->len, q);
 435
 436                         if (!skb_queue_empty(list)) {
 437                                 /*
 438                                  * Last packet in queue is reference point (now).
 439                                  * First packet in queue is already in flight,
 440                                  * calculate this time bonus and substract
 441                                  * from delay.
 442                                  */
 443                                 delay -= now - netem_skb_cb(skb_peek(list))->time_to_send;
 444                                 now = netem_skb_cb(skb_peek_tail(list))->time_to_send;
 445                         }
 446                 }
 447
 448                 cb->time_to_send = now + delay;
 449                 ++q->counter;
 450                 tfifo_enqueue(skb, sch);
 451         } else {
 452                 /*
 453                  * Do re-ordering by putting one out of N packets at the front
 454                  * of the queue.
 455                  */
 456                 cb->time_to_send = psched_get_time();
 457                 q->counter = 0;
 458
 459                 __skb_queue_head(&sch->q, skb);
 460                 sch->qstats.requeues++;
 461         }
 462
 463         return NET_XMIT_SUCCESS;
 464 }
 465
 466 static unsigned int netem_drop(struct Qdisc *sch)
 467 {
 468         struct netem_sched_data *q = qdisc_priv(sch);
 469         unsigned int len;
 470
 471         len = qdisc_queue_drop(sch);
 472         if (!len && q->qdisc && q->qdisc->ops->drop)
 473             len = q->qdisc->ops->drop(q->qdisc);
 474         if (len)
 475                 sch->qstats.drops++;
 476
 477         return len;
 478 }
 479
 480 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 481 {
 482         struct netem_sched_data *q = qdisc_priv(sch);
 483         struct sk_buff *skb;
 484
 485         if (qdisc_is_throttled(sch))
 486                 return NULL;
 487
 488 tfifo_dequeue:
 489         skb = qdisc_peek_head(sch);
 490         if (skb) {
 491                 const struct netem_skb_cb *cb = netem_skb_cb(skb);
 492
 493                 /* if more time remaining? */
 494                 if (cb->time_to_send <= psched_get_time()) {
 495                         __skb_unlink(skb, &sch->q);
 496                         sch->qstats.backlog -= qdisc_pkt_len(skb);
 497
 498 #ifdef CONFIG_NET_CLS_ACT
 499                         /*
 500                          * If it's at ingress let's pretend the delay is
 501                          * from the network (tstamp will be updated).
 502                          */
 503                         if (G_TC_FROM(skb->tc_verd) & AT_INGRESS)
 504                                 skb->tstamp.tv64 = 0;
 505 #endif
 506
 507                         if (q->qdisc) {
 508                                 int err = qdisc_enqueue(skb, q->qdisc);
 509
 510                                 if (unlikely(err != NET_XMIT_SUCCESS)) {
 511                                         if (net_xmit_drop_count(err)) {
 512                                                 sch->qstats.drops++;
 513                                                 qdisc_tree_decrease_qlen(sch, 1);
 514                                         }
 515                                 }
 516                                 goto tfifo_dequeue;
 517                         }
 518 deliver:
 519                         qdisc_unthrottled(sch);
 520                         qdisc_bstats_update(sch, skb);
 521                         return skb;
 522                 }
 523
 524                 if (q->qdisc) {
 525                         skb = q->qdisc->ops->dequeue(q->qdisc);
 526                         if (skb)
 527                                 goto deliver;
 528                 }
 529                 qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send);
 530         }
 531
 532         if (q->qdisc) {
 533                 skb = q->qdisc->ops->dequeue(q->qdisc);
 534                 if (skb)
 535                         goto deliver;
 536         }
 537         return NULL;
 538 }
 539
 540 static void netem_reset(struct Qdisc *sch)
 541 {
 542         struct netem_sched_data *q = qdisc_priv(sch);
 543
 544         qdisc_reset_queue(sch);
 545         if (q->qdisc)
 546                 qdisc_reset(q->qdisc);
 547         qdisc_watchdog_cancel(&q->watchdog);
 548 }
 549
 550 static void dist_free(struct disttable *d)
 551 {
 552         if (d) {
 553                 if (is_vmalloc_addr(d))
 554                         vfree(d);
 555                 else
 556                         kfree(d);
 557         }
 558 }
 559
 560 /*
 561  * Distribution data is a variable size payload containing
 562  * signed 16 bit values.
 563  */
 564 static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr)
 565 {
 566         struct netem_sched_data *q = qdisc_priv(sch);
 567         size_t n = nla_len(attr)/sizeof(__s16);
 568         const __s16 *data = nla_data(attr);
 569         spinlock_t *root_lock;
 570         struct disttable *d;
 571         int i;
 572         size_t s;
 573
 574         if (n > NETEM_DIST_MAX)
 575                 return -EINVAL;
 576
 577         s = sizeof(struct disttable) + n * sizeof(s16);
 578         d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN);
 579         if (!d)
 580                 d = vmalloc(s);
 581         if (!d)
 582                 return -ENOMEM;
 583
 584         d->size = n;
 585         for (i = 0; i < n; i++)
 586                 d->table[i] = data[i];
 587
 588         root_lock = qdisc_root_sleeping_lock(sch);
 589
 590         spin_lock_bh(root_lock);
 591         swap(q->delay_dist, d);
 592         spin_unlock_bh(root_lock);
 593
 594         dist_free(d);
 595         return 0;
 596 }
 597
 598 static void get_correlation(struct Qdisc *sch, const struct nlattr *attr)
 599 {
 600         struct netem_sched_data *q = qdisc_priv(sch);
 601         const struct tc_netem_corr *c = nla_data(attr);
 602
 603         init_crandom(&q->delay_cor, c->delay_corr);
 604         init_crandom(&q->loss_cor, c->loss_corr);
 605         init_crandom(&q->dup_cor, c->dup_corr);
 606 }
 607
 608 static void get_reorder(struct Qdisc *sch, const struct nlattr *attr)
 609 {
 610         struct netem_sched_data *q = qdisc_priv(sch);
 611         const struct tc_netem_reorder *r = nla_data(attr);
 612
 613         q->reorder = r->probability;
 614         init_crandom(&q->reorder_cor, r->correlation);
 615 }
 616
 617 static void get_corrupt(struct Qdisc *sch, const struct nlattr *attr)
 618 {
 619         struct netem_sched_data *q = qdisc_priv(sch);
 620         const struct tc_netem_corrupt *r = nla_data(attr);
 621
 622         q->corrupt = r->probability;
 623         init_crandom(&q->corrupt_cor, r->correlation);
 624 }
 625
 626 static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 627 {
 628         struct netem_sched_data *q = qdisc_priv(sch);
 629         const struct tc_netem_rate *r = nla_data(attr);
 630
 631         q->rate = r->rate;
 632         q->packet_overhead = r->packet_overhead;
 633         q->cell_size = r->cell_size;
 634         if (q->cell_size)
 635                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 636         q->cell_overhead = r->cell_overhead;
 637 }
 638
 639 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)
 640 {
 641         struct netem_sched_data *q = qdisc_priv(sch);
 642         const struct nlattr *la;
 643         int rem;
 644
 645         nla_for_each_nested(la, attr, rem) {
 646                 u16 type = nla_type(la);
 647
 648                 switch(type) {
 649                 case NETEM_LOSS_GI: {
 650                         const struct tc_netem_gimodel *gi = nla_data(la);
 651
 652                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 653                                 pr_info("netem: incorrect gi model size\n");
 654                                 return -EINVAL;
 655                         }
 656
 657                         q->loss_model = CLG_4_STATES;
 658
 659                         q->clg.state = 1;
 660                         q->clg.a1 = gi->p13;
 661                         q->clg.a2 = gi->p31;
 662                         q->clg.a3 = gi->p32;
 663                         q->clg.a4 = gi->p14;
 664                         q->clg.a5 = gi->p23;
 665                         break;
 666                 }
 667
 668                 case NETEM_LOSS_GE: {
 669                         const struct tc_netem_gemodel *ge = nla_data(la);
 670
 671                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 672                                 pr_info("netem: incorrect ge model size\n");
 673                                 return -EINVAL;
 674                         }
 675
 676                         q->loss_model = CLG_GILB_ELL;
 677                         q->clg.state = 1;
 678                         q->clg.a1 = ge->p;
 679                         q->clg.a2 = ge->r;
 680                         q->clg.a3 = ge->h;
 681                         q->clg.a4 = ge->k1;
 682                         break;
 683                 }
 684
 685                 default:
 686                         pr_info("netem: unknown loss type %u\n", type);
 687                         return -EINVAL;
 688                 }
 689         }
 690
 691         return 0;
 692 }
 693
 694 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 695         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 696         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 697         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 698         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 699         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 700         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 701 };
 702
 703 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 704                       const struct nla_policy *policy, int len)
 705 {
 706         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 707
 708         if (nested_len < 0) {
 709                 pr_info("netem: invalid attributes len %d\n", nested_len);
 710                 return -EINVAL;
 711         }
 712
 713         if (nested_len >= nla_attr_size(0))
 714                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 715                                  nested_len, policy);
 716
 717         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 718         return 0;
 719 }
 720
 721 /* Parse netlink message to set options */
 722 static int netem_change(struct Qdisc *sch, struct nlattr *opt)
 723 {
 724         struct netem_sched_data *q = qdisc_priv(sch);
 725         struct nlattr *tb[TCA_NETEM_MAX + 1];
 726         struct tc_netem_qopt *qopt;
 727         int ret;
 728
 729         if (opt == NULL)
 730                 return -EINVAL;
 731
 732         qopt = nla_data(opt);
 733         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 734         if (ret < 0)
 735                 return ret;
 736
 737         sch->limit = qopt->limit;
 738
 739         q->latency = qopt->latency;
 740         q->jitter = qopt->jitter;
 741         q->limit = qopt->limit;
 742         q->gap = qopt->gap;
 743         q->counter = 0;
 744         q->loss = qopt->loss;
 745         q->duplicate = qopt->duplicate;
 746
 747         /* for compatibility with earlier versions.
 748          * if gap is set, need to assume 100% probability
 749          */
 750         if (q->gap)
 751                 q->reorder = ~0;
 752
 753         if (tb[TCA_NETEM_CORR])
 754                 get_correlation(sch, tb[TCA_NETEM_CORR]);
 755
 756         if (tb[TCA_NETEM_DELAY_DIST]) {
 757                 ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST]);
 758                 if (ret)
 759                         return ret;
 760         }
 761
 762         if (tb[TCA_NETEM_REORDER])
 763                 get_reorder(sch, tb[TCA_NETEM_REORDER]);
 764
 765         if (tb[TCA_NETEM_CORRUPT])
 766                 get_corrupt(sch, tb[TCA_NETEM_CORRUPT]);
 767
 768         if (tb[TCA_NETEM_RATE])
 769                 get_rate(sch, tb[TCA_NETEM_RATE]);
 770
 771         if (tb[TCA_NETEM_ECN])
 772                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
 773
 774         q->loss_model = CLG_RANDOM;
 775         if (tb[TCA_NETEM_LOSS])
 776                 ret = get_loss_clg(sch, tb[TCA_NETEM_LOSS]);
 777
 778         return ret;
 779 }
 780
 781 static int netem_init(struct Qdisc *sch, struct nlattr *opt)
 782 {
 783         struct netem_sched_data *q = qdisc_priv(sch);
 784         int ret;
 785
 786         if (!opt)
 787                 return -EINVAL;
 788
 789         qdisc_watchdog_init(&q->watchdog, sch);
 790
 791         q->loss_model = CLG_RANDOM;
 792         ret = netem_change(sch, opt);
 793         if (ret)
 794                 pr_info("netem: change failed\n");
 795         return ret;
 796 }
 797
 798 static void netem_destroy(struct Qdisc *sch)
 799 {
 800         struct netem_sched_data *q = qdisc_priv(sch);
 801
 802         qdisc_watchdog_cancel(&q->watchdog);
 803         if (q->qdisc)
 804                 qdisc_destroy(q->qdisc);
 805         dist_free(q->delay_dist);
 806 }
 807
 808 static int dump_loss_model(const struct netem_sched_data *q,
 809                            struct sk_buff *skb)
 810 {
 811         struct nlattr *nest;
 812
 813         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
 814         if (nest == NULL)
 815                 goto nla_put_failure;
 816
 817         switch (q->loss_model) {
 818         case CLG_RANDOM:
 819                 /* legacy loss model */
 820                 nla_nest_cancel(skb, nest);
 821                 return 0;       /* no data */
 822
 823         case CLG_4_STATES: {
 824                 struct tc_netem_gimodel gi = {
 825                         .p13 = q->clg.a1,
 826                         .p31 = q->clg.a2,
 827                         .p32 = q->clg.a3,
 828                         .p14 = q->clg.a4,
 829                         .p23 = q->clg.a5,
 830                 };
 831
 832                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
 833                         goto nla_put_failure;
 834                 break;
 835         }
 836         case CLG_GILB_ELL: {
 837                 struct tc_netem_gemodel ge = {
 838                         .p = q->clg.a1,
 839                         .r = q->clg.a2,
 840                         .h = q->clg.a3,
 841                         .k1 = q->clg.a4,
 842                 };
 843
 844                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
 845                         goto nla_put_failure;
 846                 break;
 847         }
 848         }
 849
 850         nla_nest_end(skb, nest);
 851         return 0;
 852
 853 nla_put_failure:
 854         nla_nest_cancel(skb, nest);
 855         return -1;
 856 }
 857
 858 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
 859 {
 860         const struct netem_sched_data *q = qdisc_priv(sch);
 861         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
 862         struct tc_netem_qopt qopt;
 863         struct tc_netem_corr cor;
 864         struct tc_netem_reorder reorder;
 865         struct tc_netem_corrupt corrupt;
 866         struct tc_netem_rate rate;
 867
 868         qopt.latency = q->latency;
 869         qopt.jitter = q->jitter;
 870         qopt.limit = q->limit;
 871         qopt.loss = q->loss;
 872         qopt.gap = q->gap;
 873         qopt.duplicate = q->duplicate;
 874         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
 875                 goto nla_put_failure;
 876
 877         cor.delay_corr = q->delay_cor.rho;
 878         cor.loss_corr = q->loss_cor.rho;
 879         cor.dup_corr = q->dup_cor.rho;
 880         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
 881                 goto nla_put_failure;
 882
 883         reorder.probability = q->reorder;
 884         reorder.correlation = q->reorder_cor.rho;
 885         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
 886                 goto nla_put_failure;
 887
 888         corrupt.probability = q->corrupt;
 889         corrupt.correlation = q->corrupt_cor.rho;
 890         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
 891                 goto nla_put_failure;
 892
 893         rate.rate = q->rate;
 894         rate.packet_overhead = q->packet_overhead;
 895         rate.cell_size = q->cell_size;
 896         rate.cell_overhead = q->cell_overhead;
 897         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
 898                 goto nla_put_failure;
 899
 900         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
 901                 goto nla_put_failure;
 902
 903         if (dump_loss_model(q, skb) != 0)
 904                 goto nla_put_failure;
 905
 906         return nla_nest_end(skb, nla);
 907
 908 nla_put_failure:
 909         nlmsg_trim(skb, nla);
 910         return -1;
 911 }
 912
 913 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
 914                           struct sk_buff *skb, struct tcmsg *tcm)
 915 {
 916         struct netem_sched_data *q = qdisc_priv(sch);
 917
 918         if (cl != 1 || !q->qdisc)       /* only one class */
 919                 return -ENOENT;
 920
 921         tcm->tcm_handle |= TC_H_MIN(1);
 922         tcm->tcm_info = q->qdisc->handle;
 923
 924         return 0;
 925 }
 926
 927 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
 928                      struct Qdisc **old)
 929 {
 930         struct netem_sched_data *q = qdisc_priv(sch);
 931
 932         sch_tree_lock(sch);
 933         *old = q->qdisc;
 934         q->qdisc = new;
 935         if (*old) {
 936                 qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
 937                 qdisc_reset(*old);
 938         }
 939         sch_tree_unlock(sch);
 940
 941         return 0;
 942 }
 943
 944 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
 945 {
 946         struct netem_sched_data *q = qdisc_priv(sch);
 947         return q->qdisc;
 948 }
 949
 950 static unsigned long netem_get(struct Qdisc *sch, u32 classid)
 951 {
 952         return 1;
 953 }
 954
 /*
  * Installed as the .put class operation; intentionally a no-op.
  * Neither @sch nor @arg is used.
  */
 static void netem_put(struct Qdisc *sch, unsigned long arg)
 {
         /* nothing to do */
 }
 958
 959 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 960 {
 961         if (!walker->stop) {
 962                 if (walker->count >= walker->skip)
 963                         if (walker->fn(sch, 1, walker) < 0) {
 964                                 walker->stop = 1;
 965                                 return;
 966                         }
 967                 walker->count++;
 968         }
 969 }
 970
 971 static const struct Qdisc_class_ops netem_class_ops = {
 972         .graft          =       netem_graft,
 973         .leaf           =       netem_leaf,
 974         .get            =       netem_get,
 975         .put            =       netem_put,
 976         .walk           =       netem_walk,
 977         .dump           =       netem_dump_class,
 978 };
 979
 980 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
 981         .id             =       "netem",
 982         .cl_ops         =       &netem_class_ops,
 983         .priv_size      =       sizeof(struct netem_sched_data),
 984         .enqueue        =       netem_enqueue,
 985         .dequeue        =       netem_dequeue,
 986         .peek           =       qdisc_peek_dequeued,
 987         .drop           =       netem_drop,
 988         .init           =       netem_init,
 989         .reset          =       netem_reset,
 990         .destroy        =       netem_destroy,
 991         .change         =       netem_change,
 992         .dump           =       netem_dump,
 993         .owner          =       THIS_MODULE,
 994 };
 995
 996
 997 static int __init netem_module_init(void)
 998 {
 999         pr_info("netem: version " VERSION "\n");
1000         return register_qdisc(&netem_qdisc_ops);
1001 }
1002 static void __exit netem_module_exit(void)
1003 {
1004         unregister_qdisc(&netem_qdisc_ops);
1005 }
1006 module_init(netem_module_init)
1007 module_exit(netem_module_exit)
1008 MODULE_LICENSE("GPL");