Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137 #include <net/flow_keys.h>
 138
 139 #include "net-sysfs.h"
 140
 141 /* Instead of increasing this, you should create a hash table. */
 142 #define MAX_GRO_SKBS 8
 143
 144 /* This should be increased if a protocol with a bigger head is added. */
 145 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 /*
 148  *      The list of packet types we will receive (as opposed to discard)
 149  *      and the routines to invoke.
 150  *
 151  *      Why 16. Because with 16 the only overlap we get on a hash of the
 152  *      low nibble of the protocol value is RARP/SNAP/X.25.
 153  *
 154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155  *             sure which should go first, but I bet it won't make much
 156  *             difference if we are running VLANs.  The good news is that
 157  *             this protocol won't be in the list unless compiled in, so
 158  *             the average user (w/out VLANs) will not be adversely affected.
 159  *             --BLG
 160  *
 161  *              0800    IP
 162  *              8100    802.1Q VLAN
 163  *              0001    802.3
 164  *              0002    AX.25
 165  *              0004    802.2
 166  *              8035    RARP
 167  *              0005    SNAP
 168  *              0805    X.25
 169  *              0806    ARP
 170  *              8137    IPX
 171  *              0009    Localtalk
 172  *              86DD    IPv6
 173  */
 174
 175 #define PTYPE_HASH_SIZE (16)
 176 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 177
 178 static DEFINE_SPINLOCK(ptype_lock);
 179 static DEFINE_SPINLOCK(offload_lock);
 180 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 181 static struct list_head ptype_all __read_mostly;        /* Taps */
 182 static struct list_head offload_base __read_mostly;
 183
 184 /*
 185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186  * semaphore.
 187  *
 188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189  *
 190  * Writers must hold the rtnl semaphore while they loop through the
 191  * dev_base_head list, and hold dev_base_lock for writing when they do the
 192  * actual updates.  This allows pure readers to access the list even
 193  * while a writer is preparing to update it.
 194  *
 195  * To put it another way, dev_base_lock is held for writing only to
 196  * protect against pure readers; the rtnl semaphore provides the
 197  * protection against other writers.
 198  *
 199  * See, for example usages, register_netdevice() and
 200  * unregister_netdevice(), which must be called with the rtnl
 201  * semaphore held.
 202  */
 203 DEFINE_RWLOCK(dev_base_lock);
 204 EXPORT_SYMBOL(dev_base_lock);
 205
 206 static inline void dev_base_seq_inc(struct net *net)
 207 {
 208         while (++net->dev_base_seq == 0);
 209 }
 210
 211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212 {
 213         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
 273 /*
 274  *      Our notifier list
 275  */
 276
 277 static RAW_NOTIFIER_HEAD(netdev_chain);
 278
 279 /*
 280  *      Device drivers call our routines to queue packets here. We empty the
 281  *      queue in the local softnet handler.
 282  */
 283
 284 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 285 EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
 288 /*
 289  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 290  * according to dev->type
 291  */
 292 static const unsigned short netdev_lock_type[] =
 293         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 294          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 295          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 296          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 297          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 298          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 299          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 300          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 301          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 302          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 303          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 304          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 305          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 306          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 307          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 308
 309 static const char *const netdev_lock_name[] =
 310         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 311          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 312          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 313          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 314          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 315          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 316          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 317          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 318          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 319          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 320          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 321          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 322          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 323          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 324          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 325
 326 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 327 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
 360 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 361                                                  unsigned short dev_type)
 362 {
 363 }
 364 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 365 {
 366 }
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
 375 /*
 376  *      Add a protocol ID to the list. Now that the input handler is
 377  *      smarter we can dispense with all the messy stuff that used to be
 378  *      here.
 379  *
 380  *      BEWARE!!! Protocol handlers, mangling input packets,
 381  *      MUST BE last in hash buckets and checking protocol handlers
 382  *      MUST start from promiscuous ptype_all chain in net_bh.
 383  *      It is true now, do not change it.
 384  *      Explanation follows: if protocol handler, mangling packet, will
 385  *      be the first on list, it is not able to sense, that packet
 386  *      is cloned and should be copied-on-write, so that it will
 387  *      change it and subsequent readers will get broken packet.
 388  *                                                      --ANK (980803)
 389  */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         pr_warn("dev_remove_pack: %p not found\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
 469         __dev_remove_pack(pt);
 470
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475
 476 /**
 477  *      dev_add_offload - register offload handlers
 478  *      @po: protocol offload declaration
 479  *
 480  *      Add protocol offload handlers to the networking stack. The passed
 481  *      &proto_offload is linked into kernel lists and may not be freed until
 482  *      it has been removed from the kernel lists.
 483  *
 484  *      This call does not sleep therefore it can not
 485  *      guarantee all CPU's that are in middle of receiving packets
 486  *      will see the new offload handlers (until the next received packet).
 487  */
 488 void dev_add_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491
 492         spin_lock(&offload_lock);
 493         list_add_rcu(&po->list, head);
 494         spin_unlock(&offload_lock);
 495 }
 496 EXPORT_SYMBOL(dev_add_offload);
 497
 498 /**
 499  *      __dev_remove_offload     - remove offload handler
 500  *      @po: packet offload declaration
 501  *
 502  *      Remove a protocol offload handler that was previously added to the
 503  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 504  *      is removed from the kernel lists and can be freed or reused once this
 505  *      function returns.
 506  *
 507  *      The packet type might still be in use by receivers
 508  *      and must not be freed until after all the CPU's have gone
 509  *      through a quiescent state.
 510  */
 511 void __dev_remove_offload(struct packet_offload *po)
 512 {
 513         struct list_head *head = &offload_base;
 514         struct packet_offload *po1;
 515
 516         spin_lock(&ptype_lock);
 517
 518         list_for_each_entry(po1, head, list) {
 519                 if (po == po1) {
 520                         list_del_rcu(&po->list);
 521                         goto out;
 522                 }
 523         }
 524
 525         pr_warn("dev_remove_offload: %p not found\n", po);
 526 out:
 527         spin_unlock(&ptype_lock);
 528 }
 529 EXPORT_SYMBOL(__dev_remove_offload);
 530
 531 /**
 532  *      dev_remove_offload       - remove packet offload handler
 533  *      @po: packet offload declaration
 534  *
 535  *      Remove a packet offload handler that was previously added to the kernel
 536  *      offload handlers by dev_add_offload(). The passed &offload_type is
 537  *      removed from the kernel lists and can be freed or reused once this
 538  *      function returns.
 539  *
 540  *      This call sleeps to guarantee that no CPU is looking at the packet
 541  *      type after return.
 542  */
 543 void dev_remove_offload(struct packet_offload *po)
 544 {
 545         __dev_remove_offload(po);
 546
 547         synchronize_net();
 548 }
 549 EXPORT_SYMBOL(dev_remove_offload);
 550
 551 /******************************************************************************
 552
 553                       Device Boot-time Settings Routines
 554
 555 *******************************************************************************/
 556
 557 /* Boot time configuration table */
 558 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 559
 560 /**
 561  *      netdev_boot_setup_add   - add new setup entry
 562  *      @name: name of the device
 563  *      @map: configured settings for the device
 564  *
 565  *      Adds new setup entry to the dev_boot_setup list.  The function
 566  *      returns 0 on error and 1 on success.  This is a generic routine to
 567  *      all netdevices.
 568  */
 569 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 570 {
 571         struct netdev_boot_setup *s;
 572         int i;
 573
 574         s = dev_boot_setup;
 575         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 576                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 577                         memset(s[i].name, 0, sizeof(s[i].name));
 578                         strlcpy(s[i].name, name, IFNAMSIZ);
 579                         memcpy(&s[i].map, map, sizeof(s[i].map));
 580                         break;
 581                 }
 582         }
 583
 584         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 585 }
 586
 587 /**
 588  *      netdev_boot_setup_check - check boot time settings
 589  *      @dev: the netdevice
 590  *
 591  *      Check boot time settings for the device.
 592  *      The found settings are set for the device to be used
 593  *      later in the device probing.
 594  *      Returns 0 if no settings found, 1 if they are.
 595  */
 596 int netdev_boot_setup_check(struct net_device *dev)
 597 {
 598         struct netdev_boot_setup *s = dev_boot_setup;
 599         int i;
 600
 601         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 602                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 603                     !strcmp(dev->name, s[i].name)) {
 604                         dev->irq        = s[i].map.irq;
 605                         dev->base_addr  = s[i].map.base_addr;
 606                         dev->mem_start  = s[i].map.mem_start;
 607                         dev->mem_end    = s[i].map.mem_end;
 608                         return 1;
 609                 }
 610         }
 611         return 0;
 612 }
 613 EXPORT_SYMBOL(netdev_boot_setup_check);
 614
 615
 616 /**
 617  *      netdev_boot_base        - get address from boot time settings
 618  *      @prefix: prefix for network device
 619  *      @unit: id for network device
 620  *
 621  *      Check boot time settings for the base address of device.
 622  *      The found settings are set for the device to be used
 623  *      later in the device probing.
 624  *      Returns 0 if no settings found.
 625  */
 626 unsigned long netdev_boot_base(const char *prefix, int unit)
 627 {
 628         const struct netdev_boot_setup *s = dev_boot_setup;
 629         char name[IFNAMSIZ];
 630         int i;
 631
 632         sprintf(name, "%s%d", prefix, unit);
 633
 634         /*
 635          * If device already registered then return base of 1
 636          * to indicate not to probe for this interface
 637          */
 638         if (__dev_get_by_name(&init_net, name))
 639                 return 1;
 640
 641         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 642                 if (!strcmp(name, s[i].name))
 643                         return s[i].map.base_addr;
 644         return 0;
 645 }
 646
 647 /*
 648  * Saves at boot time configured settings for any netdevice.
 649  */
 650 int __init netdev_boot_setup(char *str)
 651 {
 652         int ints[5];
 653         struct ifmap map;
 654
 655         str = get_options(str, ARRAY_SIZE(ints), ints);
 656         if (!str || !*str)
 657                 return 0;
 658
 659         /* Save settings */
 660         memset(&map, 0, sizeof(map));
 661         if (ints[0] > 0)
 662                 map.irq = ints[1];
 663         if (ints[0] > 1)
 664                 map.base_addr = ints[2];
 665         if (ints[0] > 2)
 666                 map.mem_start = ints[3];
 667         if (ints[0] > 3)
 668                 map.mem_end = ints[4];
 669
 670         /* Add new entry to the list */
 671         return netdev_boot_setup_add(str, &map);
 672 }
 673
 674 __setup("netdev=", netdev_boot_setup);
 675
 676 /*******************************************************************************
 677
 678                             Device Interface Subroutines
 679
 680 *******************************************************************************/
 681
 682 /**
 683  *      __dev_get_by_name       - find a device by its name
 684  *      @net: the applicable net namespace
 685  *      @name: name to find
 686  *
 687  *      Find an interface by name. Must be called under RTNL semaphore
 688  *      or @dev_base_lock. If the name is found a pointer to the device
 689  *      is returned. If the name is not found then %NULL is returned. The
 690  *      reference counters are not incremented so the caller must be
 691  *      careful with locks.
 692  */
 693
 694 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 695 {
 696         struct hlist_node *p;
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, p, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_name_hash(net, name);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 727                 if (!strncmp(dev->name, name, IFNAMSIZ))
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name_rcu);
 733
 734 /**
 735  *      dev_get_by_name         - find a device by its name
 736  *      @net: the applicable net namespace
 737  *      @name: name to find
 738  *
 739  *      Find an interface by name. This can be called from any
 740  *      context and does its own locking. The returned handle has
 741  *      the usage count incremented and the caller must use dev_put() to
 742  *      release it when it is no longer needed. %NULL is returned if no
 743  *      matching device is found.
 744  */
 745
 746 struct net_device *dev_get_by_name(struct net *net, const char *name)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_name_rcu(net, name);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_name);
 758
 759 /**
 760  *      __dev_get_by_index - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold either the RTNL semaphore
 768  *      or @dev_base_lock.
 769  */
 770
 771 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 772 {
 773         struct hlist_node *p;
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry(dev, p, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_get_by_index);
 784
 785 /**
 786  *      dev_get_by_index_rcu - find a device by its ifindex
 787  *      @net: the applicable net namespace
 788  *      @ifindex: index of device
 789  *
 790  *      Search for an interface by index. Returns %NULL if the device
 791  *      is not found or a pointer to the device. The device has not
 792  *      had its reference counter increased so the caller must be careful
 793  *      about locking. The caller must hold RCU lock.
 794  */
 795
 796 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 797 {
 798         struct hlist_node *p;
 799         struct net_device *dev;
 800         struct hlist_head *head = dev_index_hash(net, ifindex);
 801
 802         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 803                 if (dev->ifindex == ifindex)
 804                         return dev;
 805
 806         return NULL;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index_rcu);
 809
 810
 811 /**
 812  *      dev_get_by_index - find a device by its ifindex
 813  *      @net: the applicable net namespace
 814  *      @ifindex: index of device
 815  *
 816  *      Search for an interface by index. Returns NULL if the device
 817  *      is not found or a pointer to the device. The device returned has
 818  *      had a reference added and the pointer is safe until the user calls
 819  *      dev_put to indicate they have finished with it.
 820  */
 821
 822 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 823 {
 824         struct net_device *dev;
 825
 826         rcu_read_lock();
 827         dev = dev_get_by_index_rcu(net, ifindex);
 828         if (dev)
 829                 dev_hold(dev);
 830         rcu_read_unlock();
 831         return dev;
 832 }
 833 EXPORT_SYMBOL(dev_get_by_index);
 834
 835 /**
 836  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 837  *      @net: the applicable net namespace
 838  *      @type: media type of device
 839  *      @ha: hardware address
 840  *
 841  *      Search for an interface by MAC address. Returns NULL if the device
 842  *      is not found or a pointer to the device.
 843  *      The caller must hold RCU or RTNL.
 844  *      The returned device has not had its ref count increased
 845  *      and the caller must therefore be careful about locking
 846  *
 847  */
 848
 849 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 850                                        const char *ha)
 851 {
 852         struct net_device *dev;
 853
 854         for_each_netdev_rcu(net, dev)
 855                 if (dev->type == type &&
 856                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 857                         return dev;
 858
 859         return NULL;
 860 }
 861 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 862
 863 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 864 {
 865         struct net_device *dev;
 866
 867         ASSERT_RTNL();
 868         for_each_netdev(net, dev)
 869                 if (dev->type == type)
 870                         return dev;
 871
 872         return NULL;
 873 }
 874 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 875
 876 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 877 {
 878         struct net_device *dev, *ret = NULL;
 879
 880         rcu_read_lock();
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type) {
 883                         dev_hold(dev);
 884                         ret = dev;
 885                         break;
 886                 }
 887         rcu_read_unlock();
 888         return ret;
 889 }
 890 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 891
 892 /**
 893  *      dev_get_by_flags_rcu - find any device with given flags
 894  *      @net: the applicable net namespace
 895  *      @if_flags: IFF_* values
 896  *      @mask: bitmask of bits in if_flags to check
 897  *
 898  *      Search for any interface with the given flags. Returns NULL if a device
 899  *      is not found or a pointer to the device. Must be called inside
 900  *      rcu_read_lock(), and result refcount is unchanged.
 901  */
 902
 903 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 904                                     unsigned short mask)
 905 {
 906         struct net_device *dev, *ret;
 907
 908         ret = NULL;
 909         for_each_netdev_rcu(net, dev) {
 910                 if (((dev->flags ^ if_flags) & mask) == 0) {
 911                         ret = dev;
 912                         break;
 913                 }
 914         }
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 918
 919 /**
 920  *      dev_valid_name - check if name is okay for network device
 921  *      @name: name string
 922  *
 923  *      Network device names need to be valid file names to
 924  *      to allow sysfs to work.  We also disallow any kind of
 925  *      whitespace.
 926  */
 927 bool dev_valid_name(const char *name)
 928 {
 929         if (*name == '\0')
 930                 return false;
 931         if (strlen(name) >= IFNAMSIZ)
 932                 return false;
 933         if (!strcmp(name, ".") || !strcmp(name, ".."))
 934                 return false;
 935
 936         while (*name) {
 937                 if (*name == '/' || isspace(*name))
 938                         return false;
 939                 name++;
 940         }
 941         return true;
 942 }
 943 EXPORT_SYMBOL(dev_valid_name);
 944
 945 /**
 946  *      __dev_alloc_name - allocate a name for a device
 947  *      @net: network namespace to allocate the device name in
 948  *      @name: name format string
 949  *      @buf:  scratch buffer and result name string
 950  *
 951  *      Passed a format string - eg "lt%d" it will try and find a suitable
 952  *      id. It scans list of devices to build up a free map, then chooses
 953  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 954  *      while allocating the name and adding the device in order to avoid
 955  *      duplicates.
 956  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 957  *      Returns the number of the unit assigned or a negative errno code.
 958  */
 959
 960 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 961 {
 962         int i = 0;
 963         const char *p;
 964         const int max_netdevices = 8*PAGE_SIZE;
 965         unsigned long *inuse;
 966         struct net_device *d;
 967
 968         p = strnchr(name, IFNAMSIZ-1, '%');
 969         if (p) {
 970                 /*
 971                  * Verify the string as this thing may have come from
 972                  * the user.  There must be either one "%d" and no other "%"
 973                  * characters.
 974                  */
 975                 if (p[1] != 'd' || strchr(p + 2, '%'))
 976                         return -EINVAL;
 977
 978                 /* Use one page as a bit array of possible slots */
 979                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 980                 if (!inuse)
 981                         return -ENOMEM;
 982
 983                 for_each_netdev(net, d) {
 984                         if (!sscanf(d->name, name, &i))
 985                                 continue;
 986                         if (i < 0 || i >= max_netdevices)
 987                                 continue;
 988
 989                         /*  avoid cases where sscanf is not exact inverse of printf */
 990                         snprintf(buf, IFNAMSIZ, name, i);
 991                         if (!strncmp(buf, d->name, IFNAMSIZ))
 992                                 set_bit(i, inuse);
 993                 }
 994
 995                 i = find_first_zero_bit(inuse, max_netdevices);
 996                 free_page((unsigned long) inuse);
 997         }
 998
 999         if (buf != name)
1000                 snprintf(buf, IFNAMSIZ, name, i);
1001         if (!__dev_get_by_name(net, buf))
1002                 return i;
1003
1004         /* It is possible to run out of possible slots
1005          * when the name is long and there isn't enough space left
1006          * for the digits, or if all bits are used.
1007          */
1008         return -ENFILE;
1009 }
1010
1011 /**
1012  *      dev_alloc_name - allocate a name for a device
1013  *      @dev: device
1014  *      @name: name format string
1015  *
1016  *      Passed a format string - eg "lt%d" it will try and find a suitable
1017  *      id. It scans list of devices to build up a free map, then chooses
1018  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *      while allocating the name and adding the device in order to avoid
1020  *      duplicates.
1021  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *      Returns the number of the unit assigned or a negative errno code.
1023  */
1024
1025 int dev_alloc_name(struct net_device *dev, const char *name)
1026 {
1027         char buf[IFNAMSIZ];
1028         struct net *net;
1029         int ret;
1030
1031         BUG_ON(!dev_net(dev));
1032         net = dev_net(dev);
1033         ret = __dev_alloc_name(net, name, buf);
1034         if (ret >= 0)
1035                 strlcpy(dev->name, buf, IFNAMSIZ);
1036         return ret;
1037 }
1038 EXPORT_SYMBOL(dev_alloc_name);
1039
1040 static int dev_alloc_name_ns(struct net *net,
1041                              struct net_device *dev,
1042                              const char *name)
1043 {
1044         char buf[IFNAMSIZ];
1045         int ret;
1046
1047         ret = __dev_alloc_name(net, name, buf);
1048         if (ret >= 0)
1049                 strlcpy(dev->name, buf, IFNAMSIZ);
1050         return ret;
1051 }
1052
1053 static int dev_get_valid_name(struct net *net,
1054                               struct net_device *dev,
1055                               const char *name)
1056 {
1057         BUG_ON(!net);
1058
1059         if (!dev_valid_name(name))
1060                 return -EINVAL;
1061
1062         if (strchr(name, '%'))
1063                 return dev_alloc_name_ns(net, dev, name);
1064         else if (__dev_get_by_name(net, name))
1065                 return -EEXIST;
1066         else if (dev->name != name)
1067                 strlcpy(dev->name, name, IFNAMSIZ);
1068
1069         return 0;
1070 }
1071
1072 /**
1073  *      dev_change_name - change name of a device
1074  *      @dev: device
1075  *      @newname: name (or format string) must be at least IFNAMSIZ
1076  *
1077  *      Change name of a device, can pass format strings "eth%d".
1078  *      for wildcarding.
1079  */
1080 int dev_change_name(struct net_device *dev, const char *newname)
1081 {
1082         char oldname[IFNAMSIZ];
1083         int err = 0;
1084         int ret;
1085         struct net *net;
1086
1087         ASSERT_RTNL();
1088         BUG_ON(!dev_net(dev));
1089
1090         net = dev_net(dev);
1091         if (dev->flags & IFF_UP)
1092                 return -EBUSY;
1093
1094         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1095                 return 0;
1096
1097         memcpy(oldname, dev->name, IFNAMSIZ);
1098
1099         err = dev_get_valid_name(net, dev, newname);
1100         if (err < 0)
1101                 return err;
1102
1103 rollback:
1104         ret = device_rename(&dev->dev, dev->name);
1105         if (ret) {
1106                 memcpy(dev->name, oldname, IFNAMSIZ);
1107                 return ret;
1108         }
1109
1110         write_lock_bh(&dev_base_lock);
1111         hlist_del_rcu(&dev->name_hlist);
1112         write_unlock_bh(&dev_base_lock);
1113
1114         synchronize_rcu();
1115
1116         write_lock_bh(&dev_base_lock);
1117         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1118         write_unlock_bh(&dev_base_lock);
1119
1120         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1121         ret = notifier_to_errno(ret);
1122
1123         if (ret) {
1124                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1125                 if (err >= 0) {
1126                         err = ret;
1127                         memcpy(dev->name, oldname, IFNAMSIZ);
1128                         goto rollback;
1129                 } else {
1130                         pr_err("%s: name change rollback failed: %d\n",
1131                                dev->name, ret);
1132                 }
1133         }
1134
1135         return err;
1136 }
1137
1138 /**
1139  *      dev_set_alias - change ifalias of a device
1140  *      @dev: device
1141  *      @alias: name up to IFALIASZ
1142  *      @len: limit of bytes to copy from info
1143  *
1144  *      Set ifalias for a device,
1145  */
1146 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1147 {
1148         char *new_ifalias;
1149
1150         ASSERT_RTNL();
1151
1152         if (len >= IFALIASZ)
1153                 return -EINVAL;
1154
1155         if (!len) {
1156                 if (dev->ifalias) {
1157                         kfree(dev->ifalias);
1158                         dev->ifalias = NULL;
1159                 }
1160                 return 0;
1161         }
1162
1163         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1164         if (!new_ifalias)
1165                 return -ENOMEM;
1166         dev->ifalias = new_ifalias;
1167
1168         strlcpy(dev->ifalias, alias, len+1);
1169         return len;
1170 }
1171
1172
1173 /**
1174  *      netdev_features_change - device changes features
1175  *      @dev: device to cause notification
1176  *
1177  *      Called to indicate a device has changed features.
1178  */
1179 void netdev_features_change(struct net_device *dev)
1180 {
1181         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1182 }
1183 EXPORT_SYMBOL(netdev_features_change);
1184
1185 /**
1186  *      netdev_state_change - device changes state
1187  *      @dev: device to cause notification
1188  *
1189  *      Called to indicate a device has changed state. This function calls
1190  *      the notifier chains for netdev_chain and sends a NEWLINK message
1191  *      to the routing socket.
1192  */
1193 void netdev_state_change(struct net_device *dev)
1194 {
1195         if (dev->flags & IFF_UP) {
1196                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1197                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1198         }
1199 }
1200 EXPORT_SYMBOL(netdev_state_change);
1201
1202 /**
1203  *      netdev_notify_peers - notify network peers about existence of @dev
1204  *      @dev: network device
1205  *
1206  * Generate traffic such that interested network peers are aware of
1207  * @dev, such as by generating a gratuitous ARP. This may be used when
1208  * a device wants to inform the rest of the network about some sort of
1209  * reconfiguration such as a failover event or virtual machine
1210  * migration.
1211  */
1212 void netdev_notify_peers(struct net_device *dev)
1213 {
1214         rtnl_lock();
1215         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1216         rtnl_unlock();
1217 }
1218 EXPORT_SYMBOL(netdev_notify_peers);
1219
1220 /**
1221  *      dev_load        - load a network module
1222  *      @net: the applicable net namespace
1223  *      @name: name of interface
1224  *
1225  *      If a network interface is not present and the process has suitable
1226  *      privileges this function loads the module. If module loading is not
1227  *      available in this kernel then it becomes a nop.
1228  */
1229
1230 void dev_load(struct net *net, const char *name)
1231 {
1232         struct net_device *dev;
1233         int no_module;
1234
1235         rcu_read_lock();
1236         dev = dev_get_by_name_rcu(net, name);
1237         rcu_read_unlock();
1238
1239         no_module = !dev;
1240         if (no_module && capable(CAP_NET_ADMIN))
1241                 no_module = request_module("netdev-%s", name);
1242         if (no_module && capable(CAP_SYS_MODULE)) {
1243                 if (!request_module("%s", name))
1244                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1245                                 name);
1246         }
1247 }
1248 EXPORT_SYMBOL(dev_load);
1249
1250 static int __dev_open(struct net_device *dev)
1251 {
1252         const struct net_device_ops *ops = dev->netdev_ops;
1253         int ret;
1254
1255         ASSERT_RTNL();
1256
1257         if (!netif_device_present(dev))
1258                 return -ENODEV;
1259
1260         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1261         ret = notifier_to_errno(ret);
1262         if (ret)
1263                 return ret;
1264
1265         set_bit(__LINK_STATE_START, &dev->state);
1266
1267         if (ops->ndo_validate_addr)
1268                 ret = ops->ndo_validate_addr(dev);
1269
1270         if (!ret && ops->ndo_open)
1271                 ret = ops->ndo_open(dev);
1272
1273         if (ret)
1274                 clear_bit(__LINK_STATE_START, &dev->state);
1275         else {
1276                 dev->flags |= IFF_UP;
1277                 net_dmaengine_get();
1278                 dev_set_rx_mode(dev);
1279                 dev_activate(dev);
1280                 add_device_randomness(dev->dev_addr, dev->addr_len);
1281         }
1282
1283         return ret;
1284 }
1285
1286 /**
1287  *      dev_open        - prepare an interface for use.
1288  *      @dev:   device to open
1289  *
1290  *      Takes a device from down to up state. The device's private open
1291  *      function is invoked and then the multicast lists are loaded. Finally
1292  *      the device is moved into the up state and a %NETDEV_UP message is
1293  *      sent to the netdev notifier chain.
1294  *
1295  *      Calling this function on an active interface is a nop. On a failure
1296  *      a negative errno code is returned.
1297  */
1298 int dev_open(struct net_device *dev)
1299 {
1300         int ret;
1301
1302         if (dev->flags & IFF_UP)
1303                 return 0;
1304
1305         ret = __dev_open(dev);
1306         if (ret < 0)
1307                 return ret;
1308
1309         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1310         call_netdevice_notifiers(NETDEV_UP, dev);
1311
1312         return ret;
1313 }
1314 EXPORT_SYMBOL(dev_open);
1315
1316 static int __dev_close_many(struct list_head *head)
1317 {
1318         struct net_device *dev;
1319
1320         ASSERT_RTNL();
1321         might_sleep();
1322
1323         list_for_each_entry(dev, head, unreg_list) {
1324                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1325
1326                 clear_bit(__LINK_STATE_START, &dev->state);
1327
1328                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1329                  * can be even on different cpu. So just clear netif_running().
1330                  *
1331                  * dev->stop() will invoke napi_disable() on all of it's
1332                  * napi_struct instances on this device.
1333                  */
1334                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1335         }
1336
1337         dev_deactivate_many(head);
1338
1339         list_for_each_entry(dev, head, unreg_list) {
1340                 const struct net_device_ops *ops = dev->netdev_ops;
1341
1342                 /*
1343                  *      Call the device specific close. This cannot fail.
1344                  *      Only if device is UP
1345                  *
1346                  *      We allow it to be called even after a DETACH hot-plug
1347                  *      event.
1348                  */
1349                 if (ops->ndo_stop)
1350                         ops->ndo_stop(dev);
1351
1352                 dev->flags &= ~IFF_UP;
1353                 net_dmaengine_put();
1354         }
1355
1356         return 0;
1357 }
1358
1359 static int __dev_close(struct net_device *dev)
1360 {
1361         int retval;
1362         LIST_HEAD(single);
1363
1364         list_add(&dev->unreg_list, &single);
1365         retval = __dev_close_many(&single);
1366         list_del(&single);
1367         return retval;
1368 }
1369
1370 static int dev_close_many(struct list_head *head)
1371 {
1372         struct net_device *dev, *tmp;
1373         LIST_HEAD(tmp_list);
1374
1375         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1376                 if (!(dev->flags & IFF_UP))
1377                         list_move(&dev->unreg_list, &tmp_list);
1378
1379         __dev_close_many(head);
1380
1381         list_for_each_entry(dev, head, unreg_list) {
1382                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1383                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1384         }
1385
1386         /* rollback_registered_many needs the complete original list */
1387         list_splice(&tmp_list, head);
1388         return 0;
1389 }
1390
1391 /**
1392  *      dev_close - shutdown an interface.
1393  *      @dev: device to shutdown
1394  *
1395  *      This function moves an active device into down state. A
1396  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1397  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1398  *      chain.
1399  */
1400 int dev_close(struct net_device *dev)
1401 {
1402         if (dev->flags & IFF_UP) {
1403                 LIST_HEAD(single);
1404
1405                 list_add(&dev->unreg_list, &single);
1406                 dev_close_many(&single);
1407                 list_del(&single);
1408         }
1409         return 0;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415  *      dev_disable_lro - disable Large Receive Offload on a device
1416  *      @dev: device
1417  *
1418  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1419  *      called under RTNL.  This is needed if received packets may be
1420  *      forwarded to another interface.
1421  */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424         /*
1425          * If we're trying to disable lro on a vlan device
1426          * use the underlying physical device instead
1427          */
1428         if (is_vlan_dev(dev))
1429                 dev = vlan_dev_real_dev(dev);
1430
1431         dev->wanted_features &= ~NETIF_F_LRO;
1432         netdev_update_features(dev);
1433
1434         if (unlikely(dev->features & NETIF_F_LRO))
1435                 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
1440 static int dev_boot_phase = 1;
1441
1442 /**
1443  *      register_netdevice_notifier - register a network notifier block
1444  *      @nb: notifier
1445  *
1446  *      Register a notifier to be called when network device events occur.
1447  *      The notifier passed is linked into the kernel structures and must
1448  *      not be reused until it has been unregistered. A negative errno code
1449  *      is returned on a failure.
1450  *
1451  *      When registered all registration and up events are replayed
1452  *      to the new notifier to allow device to have a race free
1453  *      view of the network device list.
1454  */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458         struct net_device *dev;
1459         struct net_device *last;
1460         struct net *net;
1461         int err;
1462
1463         rtnl_lock();
1464         err = raw_notifier_chain_register(&netdev_chain, nb);
1465         if (err)
1466                 goto unlock;
1467         if (dev_boot_phase)
1468                 goto unlock;
1469         for_each_net(net) {
1470                 for_each_netdev(net, dev) {
1471                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472                         err = notifier_to_errno(err);
1473                         if (err)
1474                                 goto rollback;
1475
1476                         if (!(dev->flags & IFF_UP))
1477                                 continue;
1478
1479                         nb->notifier_call(nb, NETDEV_UP, dev);
1480                 }
1481         }
1482
1483 unlock:
1484         rtnl_unlock();
1485         return err;
1486
1487 rollback:
1488         last = dev;
1489         for_each_net(net) {
1490                 for_each_netdev(net, dev) {
1491                         if (dev == last)
1492                                 goto outroll;
1493
1494                         if (dev->flags & IFF_UP) {
1495                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497                         }
1498                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499                 }
1500         }
1501
1502 outroll:
1503         raw_notifier_chain_unregister(&netdev_chain, nb);
1504         goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
1508 /**
1509  *      unregister_netdevice_notifier - unregister a network notifier block
1510  *      @nb: notifier
1511  *
1512  *      Unregister a notifier previously registered by
1513  *      register_netdevice_notifier(). The notifier is unlinked into the
1514  *      kernel structures and may then be reused. A negative errno code
1515  *      is returned on a failure.
1516  *
1517  *      After unregistering unregister and down device events are synthesized
1518  *      for all devices on the device list to the removed notifier to remove
1519  *      the need for special case cleanup code.
1520  */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524         struct net_device *dev;
1525         struct net *net;
1526         int err;
1527
1528         rtnl_lock();
1529         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530         if (err)
1531                 goto unlock;
1532
1533         for_each_net(net) {
1534                 for_each_netdev(net, dev) {
1535                         if (dev->flags & IFF_UP) {
1536                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538                         }
1539                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540                 }
1541         }
1542 unlock:
1543         rtnl_unlock();
1544         return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549  *      call_netdevice_notifiers - call all network notifier blocks
1550  *      @val: value passed unmodified to notifier function
1551  *      @dev: net_device pointer passed unmodified to notifier function
1552  *
1553  *      Call all network notifier blocks.  Parameters and return value
1554  *      are as for raw_notifier_call_chain().
1555  */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559         ASSERT_RTNL();
1560         return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
1564 static struct static_key netstamp_needed __read_mostly;
1565 #ifdef HAVE_JUMP_LABEL
1566 /* We are not allowed to call static_key_slow_dec() from irq context
1567  * If net_disable_timestamp() is called from irq context, defer the
1568  * static_key_slow_dec() calls.
1569  */
1570 static atomic_t netstamp_needed_deferred;
1571 #endif
1572
1573 void net_enable_timestamp(void)
1574 {
1575 #ifdef HAVE_JUMP_LABEL
1576         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1577
1578         if (deferred) {
1579                 while (--deferred)
1580                         static_key_slow_dec(&netstamp_needed);
1581                 return;
1582         }
1583 #endif
1584         WARN_ON(in_interrupt());
1585         static_key_slow_inc(&netstamp_needed);
1586 }
1587 EXPORT_SYMBOL(net_enable_timestamp);
1588
1589 void net_disable_timestamp(void)
1590 {
1591 #ifdef HAVE_JUMP_LABEL
1592         if (in_interrupt()) {
1593                 atomic_inc(&netstamp_needed_deferred);
1594                 return;
1595         }
1596 #endif
1597         static_key_slow_dec(&netstamp_needed);
1598 }
1599 EXPORT_SYMBOL(net_disable_timestamp);
1600
1601 static inline void net_timestamp_set(struct sk_buff *skb)
1602 {
1603         skb->tstamp.tv64 = 0;
1604         if (static_key_false(&netstamp_needed))
1605                 __net_timestamp(skb);
1606 }
1607
1608 #define net_timestamp_check(COND, SKB)                  \
1609         if (static_key_false(&netstamp_needed)) {               \
1610                 if ((COND) && !(SKB)->tstamp.tv64)      \
1611                         __net_timestamp(SKB);           \
1612         }                                               \
1613
1614 static int net_hwtstamp_validate(struct ifreq *ifr)
1615 {
1616         struct hwtstamp_config cfg;
1617         enum hwtstamp_tx_types tx_type;
1618         enum hwtstamp_rx_filters rx_filter;
1619         int tx_type_valid = 0;
1620         int rx_filter_valid = 0;
1621
1622         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1623                 return -EFAULT;
1624
1625         if (cfg.flags) /* reserved for future extensions */
1626                 return -EINVAL;
1627
1628         tx_type = cfg.tx_type;
1629         rx_filter = cfg.rx_filter;
1630
1631         switch (tx_type) {
1632         case HWTSTAMP_TX_OFF:
1633         case HWTSTAMP_TX_ON:
1634         case HWTSTAMP_TX_ONESTEP_SYNC:
1635                 tx_type_valid = 1;
1636                 break;
1637         }
1638
1639         switch (rx_filter) {
1640         case HWTSTAMP_FILTER_NONE:
1641         case HWTSTAMP_FILTER_ALL:
1642         case HWTSTAMP_FILTER_SOME:
1643         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1644         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1645         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1646         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1647         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1648         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1649         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1650         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1651         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1652         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1653         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1654         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1655                 rx_filter_valid = 1;
1656                 break;
1657         }
1658
1659         if (!tx_type_valid || !rx_filter_valid)
1660                 return -ERANGE;
1661
1662         return 0;
1663 }
1664
1665 static inline bool is_skb_forwardable(struct net_device *dev,
1666                                       struct sk_buff *skb)
1667 {
1668         unsigned int len;
1669
1670         if (!(dev->flags & IFF_UP))
1671                 return false;
1672
1673         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1674         if (skb->len <= len)
1675                 return true;
1676
1677         /* if TSO is enabled, we don't care about the length as the packet
1678          * could be forwarded without being segmented before
1679          */
1680         if (skb_is_gso(skb))
1681                 return true;
1682
1683         return false;
1684 }
1685
1686 /**
1687  * dev_forward_skb - loopback an skb to another netif
1688  *
1689  * @dev: destination network device
1690  * @skb: buffer to forward
1691  *
1692  * return values:
1693  *      NET_RX_SUCCESS  (no congestion)
1694  *      NET_RX_DROP     (packet was dropped, but freed)
1695  *
1696  * dev_forward_skb can be used for injecting an skb from the
1697  * start_xmit function of one device into the receive queue
1698  * of another device.
1699  *
1700  * The receiving device may be in another namespace, so
1701  * we have to clear all information in the skb that could
1702  * impact namespace isolation.
1703  */
1704 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1705 {
1706         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1707                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1708                         atomic_long_inc(&dev->rx_dropped);
1709                         kfree_skb(skb);
1710                         return NET_RX_DROP;
1711                 }
1712         }
1713
1714         skb_orphan(skb);
1715         nf_reset(skb);
1716
1717         if (unlikely(!is_skb_forwardable(dev, skb))) {
1718                 atomic_long_inc(&dev->rx_dropped);
1719                 kfree_skb(skb);
1720                 return NET_RX_DROP;
1721         }
1722         skb->skb_iif = 0;
1723         skb->dev = dev;
1724         skb_dst_drop(skb);
1725         skb->tstamp.tv64 = 0;
1726         skb->pkt_type = PACKET_HOST;
1727         skb->protocol = eth_type_trans(skb, dev);
1728         skb->mark = 0;
1729         secpath_reset(skb);
1730         nf_reset(skb);
1731         return netif_rx(skb);
1732 }
1733 EXPORT_SYMBOL_GPL(dev_forward_skb);
1734
1735 static inline int deliver_skb(struct sk_buff *skb,
1736                               struct packet_type *pt_prev,
1737                               struct net_device *orig_dev)
1738 {
1739         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1740                 return -ENOMEM;
1741         atomic_inc(&skb->users);
1742         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1743 }
1744
1745 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1746 {
1747         if (!ptype->af_packet_priv || !skb->sk)
1748                 return false;
1749
1750         if (ptype->id_match)
1751                 return ptype->id_match(ptype, skb->sk);
1752         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1753                 return true;
1754
1755         return false;
1756 }
1757
1758 /*
1759  *      Support routine. Sends outgoing frames to any network
1760  *      taps currently in use.
1761  */
1762
1763 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1764 {
1765         struct packet_type *ptype;
1766         struct sk_buff *skb2 = NULL;
1767         struct packet_type *pt_prev = NULL;
1768
1769         rcu_read_lock();
1770         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1771                 /* Never send packets back to the socket
1772                  * they originated from - MvS (miquels@drinkel.ow.org)
1773                  */
1774                 if ((ptype->dev == dev || !ptype->dev) &&
1775                     (!skb_loop_sk(ptype, skb))) {
1776                         if (pt_prev) {
1777                                 deliver_skb(skb2, pt_prev, skb->dev);
1778                                 pt_prev = ptype;
1779                                 continue;
1780                         }
1781
1782                         skb2 = skb_clone(skb, GFP_ATOMIC);
1783                         if (!skb2)
1784                                 break;
1785
1786                         net_timestamp_set(skb2);
1787
1788                         /* skb->nh should be correctly
1789                            set by sender, so that the second statement is
1790                            just protection against buggy protocols.
1791                          */
1792                         skb_reset_mac_header(skb2);
1793
1794                         if (skb_network_header(skb2) < skb2->data ||
1795                             skb2->network_header > skb2->tail) {
1796                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1797                                                      ntohs(skb2->protocol),
1798                                                      dev->name);
1799                                 skb_reset_network_header(skb2);
1800                         }
1801
1802                         skb2->transport_header = skb2->network_header;
1803                         skb2->pkt_type = PACKET_OUTGOING;
1804                         pt_prev = ptype;
1805                 }
1806         }
1807         if (pt_prev)
1808                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1809         rcu_read_unlock();
1810 }
1811
1812 /**
1813  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1814  * @dev: Network device
1815  * @txq: number of queues available
1816  *
1817  * If real_num_tx_queues is changed the tc mappings may no longer be
1818  * valid. To resolve this verify the tc mapping remains valid and if
1819  * not NULL the mapping. With no priorities mapping to this
1820  * offset/count pair it will no longer be used. In the worst case TC0
1821  * is invalid nothing can be done so disable priority mappings. If is
1822  * expected that drivers will fix this mapping if they can before
1823  * calling netif_set_real_num_tx_queues.
1824  */
1825 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1826 {
1827         int i;
1828         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1829
1830         /* If TC0 is invalidated disable TC mapping */
1831         if (tc->offset + tc->count > txq) {
1832                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1833                 dev->num_tc = 0;
1834                 return;
1835         }
1836
1837         /* Invalidated prio to tc mappings set to TC0 */
1838         for (i = 1; i < TC_BITMASK + 1; i++) {
1839                 int q = netdev_get_prio_tc_map(dev, i);
1840
1841                 tc = &dev->tc_to_txq[q];
1842                 if (tc->offset + tc->count > txq) {
1843                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1844                                 i, q);
1845                         netdev_set_prio_tc_map(dev, i, 0);
1846                 }
1847         }
1848 }
1849
1850 /*
1851  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1852  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1853  */
1854 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1855 {
1856         int rc;
1857
1858         if (txq < 1 || txq > dev->num_tx_queues)
1859                 return -EINVAL;
1860
1861         if (dev->reg_state == NETREG_REGISTERED ||
1862             dev->reg_state == NETREG_UNREGISTERING) {
1863                 ASSERT_RTNL();
1864
1865                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1866                                                   txq);
1867                 if (rc)
1868                         return rc;
1869
1870                 if (dev->num_tc)
1871                         netif_setup_tc(dev, txq);
1872
1873                 if (txq < dev->real_num_tx_queues)
1874                         qdisc_reset_all_tx_gt(dev, txq);
1875         }
1876
1877         dev->real_num_tx_queues = txq;
1878         return 0;
1879 }
1880 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1881
1882 #ifdef CONFIG_RPS
1883 /**
1884  *      netif_set_real_num_rx_queues - set actual number of RX queues used
1885  *      @dev: Network device
1886  *      @rxq: Actual number of RX queues
1887  *
1888  *      This must be called either with the rtnl_lock held or before
1889  *      registration of the net device.  Returns 0 on success, or a
1890  *      negative error code.  If called before registration, it always
1891  *      succeeds.
1892  */
1893 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1894 {
1895         int rc;
1896
1897         if (rxq < 1 || rxq > dev->num_rx_queues)
1898                 return -EINVAL;
1899
1900         if (dev->reg_state == NETREG_REGISTERED) {
1901                 ASSERT_RTNL();
1902
1903                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1904                                                   rxq);
1905                 if (rc)
1906                         return rc;
1907         }
1908
1909         dev->real_num_rx_queues = rxq;
1910         return 0;
1911 }
1912 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1913 #endif
1914
1915 /**
1916  * netif_get_num_default_rss_queues - default number of RSS queues
1917  *
1918  * This routine should set an upper limit on the number of RSS queues
1919  * used by default by multiqueue devices.
1920  */
1921 int netif_get_num_default_rss_queues(void)
1922 {
1923         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1924 }
1925 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1926
1927 static inline void __netif_reschedule(struct Qdisc *q)
1928 {
1929         struct softnet_data *sd;
1930         unsigned long flags;
1931
1932         local_irq_save(flags);
1933         sd = &__get_cpu_var(softnet_data);
1934         q->next_sched = NULL;
1935         *sd->output_queue_tailp = q;
1936         sd->output_queue_tailp = &q->next_sched;
1937         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1938         local_irq_restore(flags);
1939 }
1940
1941 void __netif_schedule(struct Qdisc *q)
1942 {
1943         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1944                 __netif_reschedule(q);
1945 }
1946 EXPORT_SYMBOL(__netif_schedule);
1947
1948 void dev_kfree_skb_irq(struct sk_buff *skb)
1949 {
1950         if (atomic_dec_and_test(&skb->users)) {
1951                 struct softnet_data *sd;
1952                 unsigned long flags;
1953
1954                 local_irq_save(flags);
1955                 sd = &__get_cpu_var(softnet_data);
1956                 skb->next = sd->completion_queue;
1957                 sd->completion_queue = skb;
1958                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1959                 local_irq_restore(flags);
1960         }
1961 }
1962 EXPORT_SYMBOL(dev_kfree_skb_irq);
1963
1964 void dev_kfree_skb_any(struct sk_buff *skb)
1965 {
1966         if (in_irq() || irqs_disabled())
1967                 dev_kfree_skb_irq(skb);
1968         else
1969                 dev_kfree_skb(skb);
1970 }
1971 EXPORT_SYMBOL(dev_kfree_skb_any);
1972
1973
1974 /**
1975  * netif_device_detach - mark device as removed
1976  * @dev: network device
1977  *
1978  * Mark device as removed from system and therefore no longer available.
1979  */
1980 void netif_device_detach(struct net_device *dev)
1981 {
1982         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1983             netif_running(dev)) {
1984                 netif_tx_stop_all_queues(dev);
1985         }
1986 }
1987 EXPORT_SYMBOL(netif_device_detach);
1988
1989 /**
1990  * netif_device_attach - mark device as attached
1991  * @dev: network device
1992  *
1993  * Mark device as attached from system and restart if needed.
1994  */
1995 void netif_device_attach(struct net_device *dev)
1996 {
1997         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1998             netif_running(dev)) {
1999                 netif_tx_wake_all_queues(dev);
2000                 __netdev_watchdog_up(dev);
2001         }
2002 }
2003 EXPORT_SYMBOL(netif_device_attach);
2004
2005 static void skb_warn_bad_offload(const struct sk_buff *skb)
2006 {
2007         static const netdev_features_t null_features = 0;
2008         struct net_device *dev = skb->dev;
2009         const char *driver = "";
2010
2011         if (dev && dev->dev.parent)
2012                 driver = dev_driver_string(dev->dev.parent);
2013
2014         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2015              "gso_type=%d ip_summed=%d\n",
2016              driver, dev ? &dev->features : &null_features,
2017              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2018              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2019              skb_shinfo(skb)->gso_type, skb->ip_summed);
2020 }
2021
2022 /*
2023  * Invalidate hardware checksum when packet is to be mangled, and
2024  * complete checksum manually on outgoing path.
2025  */
2026 int skb_checksum_help(struct sk_buff *skb)
2027 {
2028         __wsum csum;
2029         int ret = 0, offset;
2030
2031         if (skb->ip_summed == CHECKSUM_COMPLETE)
2032                 goto out_set_summed;
2033
2034         if (unlikely(skb_shinfo(skb)->gso_size)) {
2035                 skb_warn_bad_offload(skb);
2036                 return -EINVAL;
2037         }
2038
2039         offset = skb_checksum_start_offset(skb);
2040         BUG_ON(offset >= skb_headlen(skb));
2041         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2042
2043         offset += skb->csum_offset;
2044         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2045
2046         if (skb_cloned(skb) &&
2047             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2048                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2049                 if (ret)
2050                         goto out;
2051         }
2052
2053         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2054 out_set_summed:
2055         skb->ip_summed = CHECKSUM_NONE;
2056 out:
2057         return ret;
2058 }
2059 EXPORT_SYMBOL(skb_checksum_help);
2060
2061 /**
2062  *      skb_gso_segment - Perform segmentation on skb.
2063  *      @skb: buffer to segment
2064  *      @features: features for the output path (see dev->features)
2065  *
2066  *      This function segments the given skb and returns a list of segments.
2067  *
2068  *      It may return NULL if the skb requires no segmentation.  This is
2069  *      only possible when GSO is used for verifying header integrity.
2070  */
2071 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2072         netdev_features_t features)
2073 {
2074         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2075         struct packet_type *ptype;
2076         __be16 type = skb->protocol;
2077         int vlan_depth = ETH_HLEN;
2078         int err;
2079
2080         while (type == htons(ETH_P_8021Q)) {
2081                 struct vlan_hdr *vh;
2082
2083                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2084                         return ERR_PTR(-EINVAL);
2085
2086                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2087                 type = vh->h_vlan_encapsulated_proto;
2088                 vlan_depth += VLAN_HLEN;
2089         }
2090
2091         skb_reset_mac_header(skb);
2092         skb->mac_len = skb->network_header - skb->mac_header;
2093         __skb_pull(skb, skb->mac_len);
2094
2095         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2096                 skb_warn_bad_offload(skb);
2097
2098                 if (skb_header_cloned(skb) &&
2099                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2100                         return ERR_PTR(err);
2101         }
2102
2103         rcu_read_lock();
2104         list_for_each_entry_rcu(ptype,
2105                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2106                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
2107                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2108                                 err = ptype->gso_send_check(skb);
2109                                 segs = ERR_PTR(err);
2110                                 if (err || skb_gso_ok(skb, features))
2111                                         break;
2112                                 __skb_push(skb, (skb->data -
2113                                                  skb_network_header(skb)));
2114                         }
2115                         segs = ptype->gso_segment(skb, features);
2116                         break;
2117                 }
2118         }
2119         rcu_read_unlock();
2120
2121         __skb_push(skb, skb->data - skb_mac_header(skb));
2122
2123         return segs;
2124 }
2125 EXPORT_SYMBOL(skb_gso_segment);
2126
2127 /* Take action when hardware reception checksum errors are detected. */
2128 #ifdef CONFIG_BUG
2129 void netdev_rx_csum_fault(struct net_device *dev)
2130 {
2131         if (net_ratelimit()) {
2132                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2133                 dump_stack();
2134         }
2135 }
2136 EXPORT_SYMBOL(netdev_rx_csum_fault);
2137 #endif
2138
2139 /* Actually, we should eliminate this check as soon as we know, that:
2140  * 1. IOMMU is present and allows to map all the memory.
2141  * 2. No high memory really exists on this machine.
2142  */
2143
2144 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2145 {
2146 #ifdef CONFIG_HIGHMEM
2147         int i;
2148         if (!(dev->features & NETIF_F_HIGHDMA)) {
2149                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2150                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2151                         if (PageHighMem(skb_frag_page(frag)))
2152                                 return 1;
2153                 }
2154         }
2155
2156         if (PCI_DMA_BUS_IS_PHYS) {
2157                 struct device *pdev = dev->dev.parent;
2158
2159                 if (!pdev)
2160                         return 0;
2161                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2162                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2163                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2164                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2165                                 return 1;
2166                 }
2167         }
2168 #endif
2169         return 0;
2170 }
2171
2172 struct dev_gso_cb {
2173         void (*destructor)(struct sk_buff *skb);
2174 };
2175
2176 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2177
2178 static void dev_gso_skb_destructor(struct sk_buff *skb)
2179 {
2180         struct dev_gso_cb *cb;
2181
2182         do {
2183                 struct sk_buff *nskb = skb->next;
2184
2185                 skb->next = nskb->next;
2186                 nskb->next = NULL;
2187                 kfree_skb(nskb);
2188         } while (skb->next);
2189
2190         cb = DEV_GSO_CB(skb);
2191         if (cb->destructor)
2192                 cb->destructor(skb);
2193 }
2194
2195 /**
2196  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2197  *      @skb: buffer to segment
2198  *      @features: device features as applicable to this skb
2199  *
2200  *      This function segments the given skb and stores the list of segments
2201  *      in skb->next.
2202  */
2203 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2204 {
2205         struct sk_buff *segs;
2206
2207         segs = skb_gso_segment(skb, features);
2208
2209         /* Verifying header integrity only. */
2210         if (!segs)
2211                 return 0;
2212
2213         if (IS_ERR(segs))
2214                 return PTR_ERR(segs);
2215
2216         skb->next = segs;
2217         DEV_GSO_CB(skb)->destructor = skb->destructor;
2218         skb->destructor = dev_gso_skb_destructor;
2219
2220         return 0;
2221 }
2222
2223 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2224 {
2225         return ((features & NETIF_F_GEN_CSUM) ||
2226                 ((features & NETIF_F_V4_CSUM) &&
2227                  protocol == htons(ETH_P_IP)) ||
2228                 ((features & NETIF_F_V6_CSUM) &&
2229                  protocol == htons(ETH_P_IPV6)) ||
2230                 ((features & NETIF_F_FCOE_CRC) &&
2231                  protocol == htons(ETH_P_FCOE)));
2232 }
2233
2234 static netdev_features_t harmonize_features(struct sk_buff *skb,
2235         __be16 protocol, netdev_features_t features)
2236 {
2237         if (skb->ip_summed != CHECKSUM_NONE &&
2238             !can_checksum_protocol(features, protocol)) {
2239                 features &= ~NETIF_F_ALL_CSUM;
2240                 features &= ~NETIF_F_SG;
2241         } else if (illegal_highdma(skb->dev, skb)) {
2242                 features &= ~NETIF_F_SG;
2243         }
2244
2245         return features;
2246 }
2247
2248 netdev_features_t netif_skb_features(struct sk_buff *skb)
2249 {
2250         __be16 protocol = skb->protocol;
2251         netdev_features_t features = skb->dev->features;
2252
2253         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2254                 features &= ~NETIF_F_GSO_MASK;
2255
2256         if (protocol == htons(ETH_P_8021Q)) {
2257                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2258                 protocol = veh->h_vlan_encapsulated_proto;
2259         } else if (!vlan_tx_tag_present(skb)) {
2260                 return harmonize_features(skb, protocol, features);
2261         }
2262
2263         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2264
2265         if (protocol != htons(ETH_P_8021Q)) {
2266                 return harmonize_features(skb, protocol, features);
2267         } else {
2268                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2269                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2270                 return harmonize_features(skb, protocol, features);
2271         }
2272 }
2273 EXPORT_SYMBOL(netif_skb_features);
2274
2275 /*
2276  * Returns true if either:
2277  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2278  *      2. skb is fragmented and the device does not support SG.
2279  */
2280 static inline int skb_needs_linearize(struct sk_buff *skb,
2281                                       int features)
2282 {
2283         return skb_is_nonlinear(skb) &&
2284                         ((skb_has_frag_list(skb) &&
2285                                 !(features & NETIF_F_FRAGLIST)) ||
2286                         (skb_shinfo(skb)->nr_frags &&
2287                                 !(features & NETIF_F_SG)));
2288 }
2289
2290 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2291                         struct netdev_queue *txq)
2292 {
2293         const struct net_device_ops *ops = dev->netdev_ops;
2294         int rc = NETDEV_TX_OK;
2295         unsigned int skb_len;
2296
2297         if (likely(!skb->next)) {
2298                 netdev_features_t features;
2299
2300                 /*
2301                  * If device doesn't need skb->dst, release it right now while
2302                  * its hot in this cpu cache
2303                  */
2304                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2305                         skb_dst_drop(skb);
2306
2307                 features = netif_skb_features(skb);
2308
2309                 if (vlan_tx_tag_present(skb) &&
2310                     !(features & NETIF_F_HW_VLAN_TX)) {
2311                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2312                         if (unlikely(!skb))
2313                                 goto out;
2314
2315                         skb->vlan_tci = 0;
2316                 }
2317
2318                 if (netif_needs_gso(skb, features)) {
2319                         if (unlikely(dev_gso_segment(skb, features)))
2320                                 goto out_kfree_skb;
2321                         if (skb->next)
2322                                 goto gso;
2323                 } else {
2324                         if (skb_needs_linearize(skb, features) &&
2325                             __skb_linearize(skb))
2326                                 goto out_kfree_skb;
2327
2328                         /* If packet is not checksummed and device does not
2329                          * support checksumming for this protocol, complete
2330                          * checksumming here.
2331                          */
2332                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2333                                 skb_set_transport_header(skb,
2334                                         skb_checksum_start_offset(skb));
2335                                 if (!(features & NETIF_F_ALL_CSUM) &&
2336                                      skb_checksum_help(skb))
2337                                         goto out_kfree_skb;
2338                         }
2339                 }
2340
2341                 if (!list_empty(&ptype_all))
2342                         dev_queue_xmit_nit(skb, dev);
2343
2344                 skb_len = skb->len;
2345                 rc = ops->ndo_start_xmit(skb, dev);
2346                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2347                 if (rc == NETDEV_TX_OK)
2348                         txq_trans_update(txq);
2349                 return rc;
2350         }
2351
2352 gso:
2353         do {
2354                 struct sk_buff *nskb = skb->next;
2355
2356                 skb->next = nskb->next;
2357                 nskb->next = NULL;
2358
2359                 /*
2360                  * If device doesn't need nskb->dst, release it right now while
2361                  * its hot in this cpu cache
2362                  */
2363                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2364                         skb_dst_drop(nskb);
2365
2366                 if (!list_empty(&ptype_all))
2367                         dev_queue_xmit_nit(nskb, dev);
2368
2369                 skb_len = nskb->len;
2370                 rc = ops->ndo_start_xmit(nskb, dev);
2371                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2372                 if (unlikely(rc != NETDEV_TX_OK)) {
2373                         if (rc & ~NETDEV_TX_MASK)
2374                                 goto out_kfree_gso_skb;
2375                         nskb->next = skb->next;
2376                         skb->next = nskb;
2377                         return rc;
2378                 }
2379                 txq_trans_update(txq);
2380                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2381                         return NETDEV_TX_BUSY;
2382         } while (skb->next);
2383
2384 out_kfree_gso_skb:
2385         if (likely(skb->next == NULL))
2386                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2387 out_kfree_skb:
2388         kfree_skb(skb);
2389 out:
2390         return rc;
2391 }
2392
2393 static u32 hashrnd __read_mostly;
2394
2395 /*
2396  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2397  * to be used as a distribution range.
2398  */
2399 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2400                   unsigned int num_tx_queues)
2401 {
2402         u32 hash;
2403         u16 qoffset = 0;
2404         u16 qcount = num_tx_queues;
2405
2406         if (skb_rx_queue_recorded(skb)) {
2407                 hash = skb_get_rx_queue(skb);
2408                 while (unlikely(hash >= num_tx_queues))
2409                         hash -= num_tx_queues;
2410                 return hash;
2411         }
2412
2413         if (dev->num_tc) {
2414                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2415                 qoffset = dev->tc_to_txq[tc].offset;
2416                 qcount = dev->tc_to_txq[tc].count;
2417         }
2418
2419         if (skb->sk && skb->sk->sk_hash)
2420                 hash = skb->sk->sk_hash;
2421         else
2422                 hash = (__force u16) skb->protocol;
2423         hash = jhash_1word(hash, hashrnd);
2424
2425         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2426 }
2427 EXPORT_SYMBOL(__skb_tx_hash);
2428
2429 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2430 {
2431         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2432                 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2433                                      dev->name, queue_index,
2434                                      dev->real_num_tx_queues);
2435                 return 0;
2436         }
2437         return queue_index;
2438 }
2439
2440 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2441 {
2442 #ifdef CONFIG_XPS
2443         struct xps_dev_maps *dev_maps;
2444         struct xps_map *map;
2445         int queue_index = -1;
2446
2447         rcu_read_lock();
2448         dev_maps = rcu_dereference(dev->xps_maps);
2449         if (dev_maps) {
2450                 map = rcu_dereference(
2451                     dev_maps->cpu_map[raw_smp_processor_id()]);
2452                 if (map) {
2453                         if (map->len == 1)
2454                                 queue_index = map->queues[0];
2455                         else {
2456                                 u32 hash;
2457                                 if (skb->sk && skb->sk->sk_hash)
2458                                         hash = skb->sk->sk_hash;
2459                                 else
2460                                         hash = (__force u16) skb->protocol ^
2461                                             skb->rxhash;
2462                                 hash = jhash_1word(hash, hashrnd);
2463                                 queue_index = map->queues[
2464                                     ((u64)hash * map->len) >> 32];
2465                         }
2466                         if (unlikely(queue_index >= dev->real_num_tx_queues))
2467                                 queue_index = -1;
2468                 }
2469         }
2470         rcu_read_unlock();
2471
2472         return queue_index;
2473 #else
2474         return -1;
2475 #endif
2476 }
2477
2478 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2479                                     struct sk_buff *skb)
2480 {
2481         int queue_index;
2482         const struct net_device_ops *ops = dev->netdev_ops;
2483
2484         if (dev->real_num_tx_queues == 1)
2485                 queue_index = 0;
2486         else if (ops->ndo_select_queue) {
2487                 queue_index = ops->ndo_select_queue(dev, skb);
2488                 queue_index = dev_cap_txqueue(dev, queue_index);
2489         } else {
2490                 struct sock *sk = skb->sk;
2491                 queue_index = sk_tx_queue_get(sk);
2492
2493                 if (queue_index < 0 || skb->ooo_okay ||
2494                     queue_index >= dev->real_num_tx_queues) {
2495                         int old_index = queue_index;
2496
2497                         queue_index = get_xps_queue(dev, skb);
2498                         if (queue_index < 0)
2499                                 queue_index = skb_tx_hash(dev, skb);
2500
2501                         if (queue_index != old_index && sk) {
2502                                 struct dst_entry *dst =
2503                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2504
2505                                 if (dst && skb_dst(skb) == dst)
2506                                         sk_tx_queue_set(sk, queue_index);
2507                         }
2508                 }
2509         }
2510
2511         skb_set_queue_mapping(skb, queue_index);
2512         return netdev_get_tx_queue(dev, queue_index);
2513 }
2514
2515 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2516                                  struct net_device *dev,
2517                                  struct netdev_queue *txq)
2518 {
2519         spinlock_t *root_lock = qdisc_lock(q);
2520         bool contended;
2521         int rc;
2522
2523         qdisc_skb_cb(skb)->pkt_len = skb->len;
2524         qdisc_calculate_pkt_len(skb, q);
2525         /*
2526          * Heuristic to force contended enqueues to serialize on a
2527          * separate lock before trying to get qdisc main lock.
2528          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2529          * and dequeue packets faster.
2530          */
2531         contended = qdisc_is_running(q);
2532         if (unlikely(contended))
2533                 spin_lock(&q->busylock);
2534
2535         spin_lock(root_lock);
2536         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2537                 kfree_skb(skb);
2538                 rc = NET_XMIT_DROP;
2539         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2540                    qdisc_run_begin(q)) {
2541                 /*
2542                  * This is a work-conserving queue; there are no old skbs
2543                  * waiting to be sent out; and the qdisc is not running -
2544                  * xmit the skb directly.
2545                  */
2546                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2547                         skb_dst_force(skb);
2548
2549                 qdisc_bstats_update(q, skb);
2550
2551                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2552                         if (unlikely(contended)) {
2553                                 spin_unlock(&q->busylock);
2554                                 contended = false;
2555                         }
2556                         __qdisc_run(q);
2557                 } else
2558                         qdisc_run_end(q);
2559
2560                 rc = NET_XMIT_SUCCESS;
2561         } else {
2562                 skb_dst_force(skb);
2563                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2564                 if (qdisc_run_begin(q)) {
2565                         if (unlikely(contended)) {
2566                                 spin_unlock(&q->busylock);
2567                                 contended = false;
2568                         }
2569                         __qdisc_run(q);
2570                 }
2571         }
2572         spin_unlock(root_lock);
2573         if (unlikely(contended))
2574                 spin_unlock(&q->busylock);
2575         return rc;
2576 }
2577
2578 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2579 static void skb_update_prio(struct sk_buff *skb)
2580 {
2581         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2582
2583         if (!skb->priority && skb->sk && map) {
2584                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2585
2586                 if (prioidx < map->priomap_len)
2587                         skb->priority = map->priomap[prioidx];
2588         }
2589 }
2590 #else
2591 #define skb_update_prio(skb)
2592 #endif
2593
2594 static DEFINE_PER_CPU(int, xmit_recursion);
2595 #define RECURSION_LIMIT 10
2596
2597 /**
2598  *      dev_loopback_xmit - loop back @skb
2599  *      @skb: buffer to transmit
2600  */
2601 int dev_loopback_xmit(struct sk_buff *skb)
2602 {
2603         skb_reset_mac_header(skb);
2604         __skb_pull(skb, skb_network_offset(skb));
2605         skb->pkt_type = PACKET_LOOPBACK;
2606         skb->ip_summed = CHECKSUM_UNNECESSARY;
2607         WARN_ON(!skb_dst(skb));
2608         skb_dst_force(skb);
2609         netif_rx_ni(skb);
2610         return 0;
2611 }
2612 EXPORT_SYMBOL(dev_loopback_xmit);
2613
2614 /**
2615  *      dev_queue_xmit - transmit a buffer
2616  *      @skb: buffer to transmit
2617  *
2618  *      Queue a buffer for transmission to a network device. The caller must
2619  *      have set the device and priority and built the buffer before calling
2620  *      this function. The function can be called from an interrupt.
2621  *
2622  *      A negative errno code is returned on a failure. A success does not
2623  *      guarantee the frame will be transmitted as it may be dropped due
2624  *      to congestion or traffic shaping.
2625  *
2626  * -----------------------------------------------------------------------------------
2627  *      I notice this method can also return errors from the queue disciplines,
2628  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2629  *      be positive.
2630  *
2631  *      Regardless of the return value, the skb is consumed, so it is currently
2632  *      difficult to retry a send to this method.  (You can bump the ref count
2633  *      before sending to hold a reference for retry if you are careful.)
2634  *
2635  *      When calling this method, interrupts MUST be enabled.  This is because
2636  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2637  *          --BLG
2638  */
2639 int dev_queue_xmit(struct sk_buff *skb)
2640 {
2641         struct net_device *dev = skb->dev;
2642         struct netdev_queue *txq;
2643         struct Qdisc *q;
2644         int rc = -ENOMEM;
2645
2646         /* Disable soft irqs for various locks below. Also
2647          * stops preemption for RCU.
2648          */
2649         rcu_read_lock_bh();
2650
2651         skb_update_prio(skb);
2652
2653         txq = netdev_pick_tx(dev, skb);
2654         q = rcu_dereference_bh(txq->qdisc);
2655
2656 #ifdef CONFIG_NET_CLS_ACT
2657         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2658 #endif
2659         trace_net_dev_queue(skb);
2660         if (q->enqueue) {
2661                 rc = __dev_xmit_skb(skb, q, dev, txq);
2662                 goto out;
2663         }
2664
2665         /* The device has no queue. Common case for software devices:
2666            loopback, all the sorts of tunnels...
2667
2668            Really, it is unlikely that netif_tx_lock protection is necessary
2669            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2670            counters.)
2671            However, it is possible, that they rely on protection
2672            made by us here.
2673
2674            Check this and shot the lock. It is not prone from deadlocks.
2675            Either shot noqueue qdisc, it is even simpler 8)
2676          */
2677         if (dev->flags & IFF_UP) {
2678                 int cpu = smp_processor_id(); /* ok because BHs are off */
2679
2680                 if (txq->xmit_lock_owner != cpu) {
2681
2682                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2683                                 goto recursion_alert;
2684
2685                         HARD_TX_LOCK(dev, txq, cpu);
2686
2687                         if (!netif_xmit_stopped(txq)) {
2688                                 __this_cpu_inc(xmit_recursion);
2689                                 rc = dev_hard_start_xmit(skb, dev, txq);
2690                                 __this_cpu_dec(xmit_recursion);
2691                                 if (dev_xmit_complete(rc)) {
2692                                         HARD_TX_UNLOCK(dev, txq);
2693                                         goto out;
2694                                 }
2695                         }
2696                         HARD_TX_UNLOCK(dev, txq);
2697                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2698                                              dev->name);
2699                 } else {
2700                         /* Recursion is detected! It is possible,
2701                          * unfortunately
2702                          */
2703 recursion_alert:
2704                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2705                                              dev->name);
2706                 }
2707         }
2708
2709         rc = -ENETDOWN;
2710         rcu_read_unlock_bh();
2711
2712         kfree_skb(skb);
2713         return rc;
2714 out:
2715         rcu_read_unlock_bh();
2716         return rc;
2717 }
2718 EXPORT_SYMBOL(dev_queue_xmit);
2719
2720
2721 /*=======================================================================
2722                         Receiver routines
2723   =======================================================================*/
2724
2725 int netdev_max_backlog __read_mostly = 1000;
2726 EXPORT_SYMBOL(netdev_max_backlog);
2727
2728 int netdev_tstamp_prequeue __read_mostly = 1;
2729 int netdev_budget __read_mostly = 300;
2730 int weight_p __read_mostly = 64;            /* old backlog weight */
2731
2732 /* Called with irq disabled */
2733 static inline void ____napi_schedule(struct softnet_data *sd,
2734                                      struct napi_struct *napi)
2735 {
2736         list_add_tail(&napi->poll_list, &sd->poll_list);
2737         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2738 }
2739
2740 /*
2741  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2742  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2743  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2744  * if hash is a canonical 4-tuple hash over transport ports.
2745  */
2746 void __skb_get_rxhash(struct sk_buff *skb)
2747 {
2748         struct flow_keys keys;
2749         u32 hash;
2750
2751         if (!skb_flow_dissect(skb, &keys))
2752                 return;
2753
2754         if (keys.ports)
2755                 skb->l4_rxhash = 1;
2756
2757         /* get a consistent hash (same value on both flow directions) */
2758         if (((__force u32)keys.dst < (__force u32)keys.src) ||
2759             (((__force u32)keys.dst == (__force u32)keys.src) &&
2760              ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2761                 swap(keys.dst, keys.src);
2762                 swap(keys.port16[0], keys.port16[1]);
2763         }
2764
2765         hash = jhash_3words((__force u32)keys.dst,
2766                             (__force u32)keys.src,
2767                             (__force u32)keys.ports, hashrnd);
2768         if (!hash)
2769                 hash = 1;
2770
2771         skb->rxhash = hash;
2772 }
2773 EXPORT_SYMBOL(__skb_get_rxhash);
2774
2775 #ifdef CONFIG_RPS
2776
2777 /* One global table that all flow-based protocols share. */
2778 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2779 EXPORT_SYMBOL(rps_sock_flow_table);
2780
2781 struct static_key rps_needed __read_mostly;
2782
2783 static struct rps_dev_flow *
2784 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2785             struct rps_dev_flow *rflow, u16 next_cpu)
2786 {
2787         if (next_cpu != RPS_NO_CPU) {
2788 #ifdef CONFIG_RFS_ACCEL
2789                 struct netdev_rx_queue *rxqueue;
2790                 struct rps_dev_flow_table *flow_table;
2791                 struct rps_dev_flow *old_rflow;
2792                 u32 flow_id;
2793                 u16 rxq_index;
2794                 int rc;
2795
2796                 /* Should we steer this flow to a different hardware queue? */
2797                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2798                     !(dev->features & NETIF_F_NTUPLE))
2799                         goto out;
2800                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2801                 if (rxq_index == skb_get_rx_queue(skb))
2802                         goto out;
2803
2804                 rxqueue = dev->_rx + rxq_index;
2805                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2806                 if (!flow_table)
2807                         goto out;
2808                 flow_id = skb->rxhash & flow_table->mask;
2809                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2810                                                         rxq_index, flow_id);
2811                 if (rc < 0)
2812                         goto out;
2813                 old_rflow = rflow;
2814                 rflow = &flow_table->flows[flow_id];
2815                 rflow->filter = rc;
2816                 if (old_rflow->filter == rflow->filter)
2817                         old_rflow->filter = RPS_NO_FILTER;
2818         out:
2819 #endif
2820                 rflow->last_qtail =
2821                         per_cpu(softnet_data, next_cpu).input_queue_head;
2822         }
2823
2824         rflow->cpu = next_cpu;
2825         return rflow;
2826 }
2827
2828 /*
2829  * get_rps_cpu is called from netif_receive_skb and returns the target
2830  * CPU from the RPS map of the receiving queue for a given skb.
2831  * rcu_read_lock must be held on entry.
2832  */
2833 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2834                        struct rps_dev_flow **rflowp)
2835 {
2836         struct netdev_rx_queue *rxqueue;
2837         struct rps_map *map;
2838         struct rps_dev_flow_table *flow_table;
2839         struct rps_sock_flow_table *sock_flow_table;
2840         int cpu = -1;
2841         u16 tcpu;
2842
2843         if (skb_rx_queue_recorded(skb)) {
2844                 u16 index = skb_get_rx_queue(skb);
2845                 if (unlikely(index >= dev->real_num_rx_queues)) {
2846                         WARN_ONCE(dev->real_num_rx_queues > 1,
2847                                   "%s received packet on queue %u, but number "
2848                                   "of RX queues is %u\n",
2849                                   dev->name, index, dev->real_num_rx_queues);
2850                         goto done;
2851                 }
2852                 rxqueue = dev->_rx + index;
2853         } else
2854                 rxqueue = dev->_rx;
2855
2856         map = rcu_dereference(rxqueue->rps_map);
2857         if (map) {
2858                 if (map->len == 1 &&
2859                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2860                         tcpu = map->cpus[0];
2861                         if (cpu_online(tcpu))
2862                                 cpu = tcpu;
2863                         goto done;
2864                 }
2865         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2866                 goto done;
2867         }
2868
2869         skb_reset_network_header(skb);
2870         if (!skb_get_rxhash(skb))
2871                 goto done;
2872
2873         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2874         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2875         if (flow_table && sock_flow_table) {
2876                 u16 next_cpu;
2877                 struct rps_dev_flow *rflow;
2878
2879                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2880                 tcpu = rflow->cpu;
2881
2882                 next_cpu = sock_flow_table->ents[skb->rxhash &
2883                     sock_flow_table->mask];
2884
2885                 /*
2886                  * If the desired CPU (where last recvmsg was done) is
2887                  * different from current CPU (one in the rx-queue flow
2888                  * table entry), switch if one of the following holds:
2889                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2890                  *   - Current CPU is offline.
2891                  *   - The current CPU's queue tail has advanced beyond the
2892                  *     last packet that was enqueued using this table entry.
2893                  *     This guarantees that all previous packets for the flow
2894                  *     have been dequeued, thus preserving in order delivery.
2895                  */
2896                 if (unlikely(tcpu != next_cpu) &&
2897                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2898                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2899                       rflow->last_qtail)) >= 0))
2900                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2901
2902                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2903                         *rflowp = rflow;
2904                         cpu = tcpu;
2905                         goto done;
2906                 }
2907         }
2908
2909         if (map) {
2910                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2911
2912                 if (cpu_online(tcpu)) {
2913                         cpu = tcpu;
2914                         goto done;
2915                 }
2916         }
2917
2918 done:
2919         return cpu;
2920 }
2921
2922 #ifdef CONFIG_RFS_ACCEL
2923
2924 /**
2925  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2926  * @dev: Device on which the filter was set
2927  * @rxq_index: RX queue index
2928  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2929  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2930  *
2931  * Drivers that implement ndo_rx_flow_steer() should periodically call
2932  * this function for each installed filter and remove the filters for
2933  * which it returns %true.
2934  */
2935 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2936                          u32 flow_id, u16 filter_id)
2937 {
2938         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2939         struct rps_dev_flow_table *flow_table;
2940         struct rps_dev_flow *rflow;
2941         bool expire = true;
2942         int cpu;
2943
2944         rcu_read_lock();
2945         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2946         if (flow_table && flow_id <= flow_table->mask) {
2947                 rflow = &flow_table->flows[flow_id];
2948                 cpu = ACCESS_ONCE(rflow->cpu);
2949                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2950                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2951                            rflow->last_qtail) <
2952                      (int)(10 * flow_table->mask)))
2953                         expire = false;
2954         }
2955         rcu_read_unlock();
2956         return expire;
2957 }
2958 EXPORT_SYMBOL(rps_may_expire_flow);
2959
2960 #endif /* CONFIG_RFS_ACCEL */
2961
2962 /* Called from hardirq (IPI) context */
2963 static void rps_trigger_softirq(void *data)
2964 {
2965         struct softnet_data *sd = data;
2966
2967         ____napi_schedule(sd, &sd->backlog);
2968         sd->received_rps++;
2969 }
2970
2971 #endif /* CONFIG_RPS */
2972
2973 /*
2974  * Check if this softnet_data structure is another cpu one
2975  * If yes, queue it to our IPI list and return 1
2976  * If no, return 0
2977  */
2978 static int rps_ipi_queued(struct softnet_data *sd)
2979 {
2980 #ifdef CONFIG_RPS
2981         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2982
2983         if (sd != mysd) {
2984                 sd->rps_ipi_next = mysd->rps_ipi_list;
2985                 mysd->rps_ipi_list = sd;
2986
2987                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2988                 return 1;
2989         }
2990 #endif /* CONFIG_RPS */
2991         return 0;
2992 }
2993
2994 /*
2995  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2996  * queue (may be a remote CPU queue).
2997  */
2998 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2999                               unsigned int *qtail)
3000 {
3001         struct softnet_data *sd;
3002         unsigned long flags;
3003
3004         sd = &per_cpu(softnet_data, cpu);
3005
3006         local_irq_save(flags);
3007
3008         rps_lock(sd);
3009         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3010                 if (skb_queue_len(&sd->input_pkt_queue)) {
3011 enqueue:
3012                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3013                         input_queue_tail_incr_save(sd, qtail);
3014                         rps_unlock(sd);
3015                         local_irq_restore(flags);
3016                         return NET_RX_SUCCESS;
3017                 }
3018
3019                 /* Schedule NAPI for backlog device
3020                  * We can use non atomic operation since we own the queue lock
3021                  */
3022                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3023                         if (!rps_ipi_queued(sd))
3024                                 ____napi_schedule(sd, &sd->backlog);
3025                 }
3026                 goto enqueue;
3027         }
3028
3029         sd->dropped++;
3030         rps_unlock(sd);
3031
3032         local_irq_restore(flags);
3033
3034         atomic_long_inc(&skb->dev->rx_dropped);
3035         kfree_skb(skb);
3036         return NET_RX_DROP;
3037 }
3038
3039 /**
3040  *      netif_rx        -       post buffer to the network code
3041  *      @skb: buffer to post
3042  *
3043  *      This function receives a packet from a device driver and queues it for
3044  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3045  *      may be dropped during processing for congestion control or by the
3046  *      protocol layers.
3047  *
3048  *      return values:
3049  *      NET_RX_SUCCESS  (no congestion)
3050  *      NET_RX_DROP     (packet was dropped)
3051  *
3052  */
3053
3054 int netif_rx(struct sk_buff *skb)
3055 {
3056         int ret;
3057
3058         /* if netpoll wants it, pretend we never saw it */
3059         if (netpoll_rx(skb))
3060                 return NET_RX_DROP;
3061
3062         net_timestamp_check(netdev_tstamp_prequeue, skb);
3063
3064         trace_netif_rx(skb);
3065 #ifdef CONFIG_RPS
3066         if (static_key_false(&rps_needed)) {
3067                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3068                 int cpu;
3069
3070                 preempt_disable();
3071                 rcu_read_lock();
3072
3073                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3074                 if (cpu < 0)
3075                         cpu = smp_processor_id();
3076
3077                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3078
3079                 rcu_read_unlock();
3080                 preempt_enable();
3081         } else
3082 #endif
3083         {
3084                 unsigned int qtail;
3085                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3086                 put_cpu();
3087         }
3088         return ret;
3089 }
3090 EXPORT_SYMBOL(netif_rx);
3091
3092 int netif_rx_ni(struct sk_buff *skb)
3093 {
3094         int err;
3095
3096         preempt_disable();
3097         err = netif_rx(skb);
3098         if (local_softirq_pending())
3099                 do_softirq();
3100         preempt_enable();
3101
3102         return err;
3103 }
3104 EXPORT_SYMBOL(netif_rx_ni);
3105
3106 static void net_tx_action(struct softirq_action *h)
3107 {
3108         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3109
3110         if (sd->completion_queue) {
3111                 struct sk_buff *clist;
3112
3113                 local_irq_disable();
3114                 clist = sd->completion_queue;
3115                 sd->completion_queue = NULL;
3116                 local_irq_enable();
3117
3118                 while (clist) {
3119                         struct sk_buff *skb = clist;
3120                         clist = clist->next;
3121
3122                         WARN_ON(atomic_read(&skb->users));
3123                         trace_kfree_skb(skb, net_tx_action);
3124                         __kfree_skb(skb);
3125                 }
3126         }
3127
3128         if (sd->output_queue) {
3129                 struct Qdisc *head;
3130
3131                 local_irq_disable();
3132                 head = sd->output_queue;
3133                 sd->output_queue = NULL;
3134                 sd->output_queue_tailp = &sd->output_queue;
3135                 local_irq_enable();
3136
3137                 while (head) {
3138                         struct Qdisc *q = head;
3139                         spinlock_t *root_lock;
3140
3141                         head = head->next_sched;
3142
3143                         root_lock = qdisc_lock(q);
3144                         if (spin_trylock(root_lock)) {
3145                                 smp_mb__before_clear_bit();
3146                                 clear_bit(__QDISC_STATE_SCHED,
3147                                           &q->state);
3148                                 qdisc_run(q);
3149                                 spin_unlock(root_lock);
3150                         } else {
3151                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3152                                               &q->state)) {
3153                                         __netif_reschedule(q);
3154                                 } else {
3155                                         smp_mb__before_clear_bit();
3156                                         clear_bit(__QDISC_STATE_SCHED,
3157                                                   &q->state);
3158                                 }
3159                         }
3160                 }
3161         }
3162 }
3163
3164 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3165     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3166 /* This hook is defined here for ATM LANE */
3167 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3168                              unsigned char *addr) __read_mostly;
3169 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3170 #endif
3171
3172 #ifdef CONFIG_NET_CLS_ACT
3173 /* TODO: Maybe we should just force sch_ingress to be compiled in
3174  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3175  * a compare and 2 stores extra right now if we dont have it on
3176  * but have CONFIG_NET_CLS_ACT
3177  * NOTE: This doesn't stop any functionality; if you dont have
3178  * the ingress scheduler, you just can't add policies on ingress.
3179  *
3180  */
3181 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3182 {
3183         struct net_device *dev = skb->dev;
3184         u32 ttl = G_TC_RTTL(skb->tc_verd);
3185         int result = TC_ACT_OK;
3186         struct Qdisc *q;
3187
3188         if (unlikely(MAX_RED_LOOP < ttl++)) {
3189                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3190                                      skb->skb_iif, dev->ifindex);
3191                 return TC_ACT_SHOT;
3192         }
3193
3194         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3195         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3196
3197         q = rxq->qdisc;
3198         if (q != &noop_qdisc) {
3199                 spin_lock(qdisc_lock(q));
3200                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3201                         result = qdisc_enqueue_root(skb, q);
3202                 spin_unlock(qdisc_lock(q));
3203         }
3204
3205         return result;
3206 }
3207
3208 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3209                                          struct packet_type **pt_prev,
3210                                          int *ret, struct net_device *orig_dev)
3211 {
3212         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3213
3214         if (!rxq || rxq->qdisc == &noop_qdisc)
3215                 goto out;
3216
3217         if (*pt_prev) {
3218                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3219                 *pt_prev = NULL;
3220         }
3221
3222         switch (ing_filter(skb, rxq)) {
3223         case TC_ACT_SHOT:
3224         case TC_ACT_STOLEN:
3225                 kfree_skb(skb);
3226                 return NULL;
3227         }
3228
3229 out:
3230         skb->tc_verd = 0;
3231         return skb;
3232 }
3233 #endif
3234
3235 /**
3236  *      netdev_rx_handler_register - register receive handler
3237  *      @dev: device to register a handler for
3238  *      @rx_handler: receive handler to register
3239  *      @rx_handler_data: data pointer that is used by rx handler
3240  *
3241  *      Register a receive hander for a device. This handler will then be
3242  *      called from __netif_receive_skb. A negative errno code is returned
3243  *      on a failure.
3244  *
3245  *      The caller must hold the rtnl_mutex.
3246  *
3247  *      For a general description of rx_handler, see enum rx_handler_result.
3248  */
3249 int netdev_rx_handler_register(struct net_device *dev,
3250                                rx_handler_func_t *rx_handler,
3251                                void *rx_handler_data)
3252 {
3253         ASSERT_RTNL();
3254
3255         if (dev->rx_handler)
3256                 return -EBUSY;
3257
3258         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3259         rcu_assign_pointer(dev->rx_handler, rx_handler);
3260
3261         return 0;
3262 }
3263 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3264
3265 /**
3266  *      netdev_rx_handler_unregister - unregister receive handler
3267  *      @dev: device to unregister a handler from
3268  *
3269  *      Unregister a receive hander from a device.
3270  *
3271  *      The caller must hold the rtnl_mutex.
3272  */
3273 void netdev_rx_handler_unregister(struct net_device *dev)
3274 {
3275
3276         ASSERT_RTNL();
3277         RCU_INIT_POINTER(dev->rx_handler, NULL);
3278         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3279 }
3280 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3281
3282 /*
3283  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3284  * the special handling of PFMEMALLOC skbs.
3285  */
3286 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3287 {
3288         switch (skb->protocol) {
3289         case __constant_htons(ETH_P_ARP):
3290         case __constant_htons(ETH_P_IP):
3291         case __constant_htons(ETH_P_IPV6):
3292         case __constant_htons(ETH_P_8021Q):
3293                 return true;
3294         default:
3295                 return false;
3296         }
3297 }
3298
3299 static int __netif_receive_skb(struct sk_buff *skb)
3300 {
3301         struct packet_type *ptype, *pt_prev;
3302         rx_handler_func_t *rx_handler;
3303         struct net_device *orig_dev;
3304         struct net_device *null_or_dev;
3305         bool deliver_exact = false;
3306         int ret = NET_RX_DROP;
3307         __be16 type;
3308         unsigned long pflags = current->flags;
3309
3310         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3311
3312         trace_netif_receive_skb(skb);
3313
3314         /*
3315          * PFMEMALLOC skbs are special, they should
3316          * - be delivered to SOCK_MEMALLOC sockets only
3317          * - stay away from userspace
3318          * - have bounded memory usage
3319          *
3320          * Use PF_MEMALLOC as this saves us from propagating the allocation
3321          * context down to all allocation sites.
3322          */
3323         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3324                 current->flags |= PF_MEMALLOC;
3325
3326         /* if we've gotten here through NAPI, check netpoll */
3327         if (netpoll_receive_skb(skb))
3328                 goto out;
3329
3330         orig_dev = skb->dev;
3331
3332         skb_reset_network_header(skb);
3333         skb_reset_transport_header(skb);
3334         skb_reset_mac_len(skb);
3335
3336         pt_prev = NULL;
3337
3338         rcu_read_lock();
3339
3340 another_round:
3341         skb->skb_iif = skb->dev->ifindex;
3342
3343         __this_cpu_inc(softnet_data.processed);
3344
3345         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3346                 skb = vlan_untag(skb);
3347                 if (unlikely(!skb))
3348                         goto unlock;
3349         }
3350
3351 #ifdef CONFIG_NET_CLS_ACT
3352         if (skb->tc_verd & TC_NCLS) {
3353                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3354                 goto ncls;
3355         }
3356 #endif
3357
3358         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3359                 goto skip_taps;
3360
3361         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3362                 if (!ptype->dev || ptype->dev == skb->dev) {
3363                         if (pt_prev)
3364                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3365                         pt_prev = ptype;
3366                 }
3367         }
3368
3369 skip_taps:
3370 #ifdef CONFIG_NET_CLS_ACT
3371         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3372         if (!skb)
3373                 goto unlock;
3374 ncls:
3375 #endif
3376
3377         if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3378                                 && !skb_pfmemalloc_protocol(skb))
3379                 goto drop;
3380
3381         if (vlan_tx_tag_present(skb)) {
3382                 if (pt_prev) {
3383                         ret = deliver_skb(skb, pt_prev, orig_dev);
3384                         pt_prev = NULL;
3385                 }
3386                 if (vlan_do_receive(&skb))
3387                         goto another_round;
3388                 else if (unlikely(!skb))
3389                         goto unlock;
3390         }
3391
3392         rx_handler = rcu_dereference(skb->dev->rx_handler);
3393         if (rx_handler) {
3394                 if (pt_prev) {
3395                         ret = deliver_skb(skb, pt_prev, orig_dev);
3396                         pt_prev = NULL;
3397                 }
3398                 switch (rx_handler(&skb)) {
3399                 case RX_HANDLER_CONSUMED:
3400                         goto unlock;
3401                 case RX_HANDLER_ANOTHER:
3402                         goto another_round;
3403                 case RX_HANDLER_EXACT:
3404                         deliver_exact = true;
3405                 case RX_HANDLER_PASS:
3406                         break;
3407                 default:
3408                         BUG();
3409                 }
3410         }
3411
3412         if (vlan_tx_nonzero_tag_present(skb))
3413                 skb->pkt_type = PACKET_OTHERHOST;
3414
3415         /* deliver only exact match when indicated */
3416         null_or_dev = deliver_exact ? skb->dev : NULL;
3417
3418         type = skb->protocol;
3419         list_for_each_entry_rcu(ptype,
3420                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3421                 if (ptype->type == type &&
3422                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3423                      ptype->dev == orig_dev)) {
3424                         if (pt_prev)
3425                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3426                         pt_prev = ptype;
3427                 }
3428         }
3429
3430         if (pt_prev) {
3431                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3432                         goto drop;
3433                 else
3434                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3435         } else {
3436 drop:
3437                 atomic_long_inc(&skb->dev->rx_dropped);
3438                 kfree_skb(skb);
3439                 /* Jamal, now you will not able to escape explaining
3440                  * me how you were going to use this. :-)
3441                  */
3442                 ret = NET_RX_DROP;
3443         }
3444
3445 unlock:
3446         rcu_read_unlock();
3447 out:
3448         tsk_restore_flags(current, pflags, PF_MEMALLOC);
3449         return ret;
3450 }
3451
3452 /**
3453  *      netif_receive_skb - process receive buffer from network
3454  *      @skb: buffer to process
3455  *
3456  *      netif_receive_skb() is the main receive data processing function.
3457  *      It always succeeds. The buffer may be dropped during processing
3458  *      for congestion control or by the protocol layers.
3459  *
3460  *      This function may only be called from softirq context and interrupts
3461  *      should be enabled.
3462  *
3463  *      Return values (usually ignored):
3464  *      NET_RX_SUCCESS: no congestion
3465  *      NET_RX_DROP: packet was dropped
3466  */
3467 int netif_receive_skb(struct sk_buff *skb)
3468 {
3469         net_timestamp_check(netdev_tstamp_prequeue, skb);
3470
3471         if (skb_defer_rx_timestamp(skb))
3472                 return NET_RX_SUCCESS;
3473
3474 #ifdef CONFIG_RPS
3475         if (static_key_false(&rps_needed)) {
3476                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3477                 int cpu, ret;
3478
3479                 rcu_read_lock();
3480
3481                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3482
3483                 if (cpu >= 0) {
3484                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3485                         rcu_read_unlock();
3486                         return ret;
3487                 }
3488                 rcu_read_unlock();
3489         }
3490 #endif
3491         return __netif_receive_skb(skb);
3492 }
3493 EXPORT_SYMBOL(netif_receive_skb);
3494
3495 /* Network device is going away, flush any packets still pending
3496  * Called with irqs disabled.
3497  */
3498 static void flush_backlog(void *arg)
3499 {
3500         struct net_device *dev = arg;
3501         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3502         struct sk_buff *skb, *tmp;
3503
3504         rps_lock(sd);
3505         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3506                 if (skb->dev == dev) {
3507                         __skb_unlink(skb, &sd->input_pkt_queue);
3508                         kfree_skb(skb);
3509                         input_queue_head_incr(sd);
3510                 }
3511         }
3512         rps_unlock(sd);
3513
3514         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3515                 if (skb->dev == dev) {
3516                         __skb_unlink(skb, &sd->process_queue);
3517                         kfree_skb(skb);
3518                         input_queue_head_incr(sd);
3519                 }
3520         }
3521 }
3522
3523 static int napi_gro_complete(struct sk_buff *skb)
3524 {
3525         struct packet_type *ptype;
3526         __be16 type = skb->protocol;
3527         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3528         int err = -ENOENT;
3529
3530         if (NAPI_GRO_CB(skb)->count == 1) {
3531                 skb_shinfo(skb)->gso_size = 0;
3532                 goto out;
3533         }
3534
3535         rcu_read_lock();
3536         list_for_each_entry_rcu(ptype, head, list) {
3537                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3538                         continue;
3539
3540                 err = ptype->gro_complete(skb);
3541                 break;
3542         }
3543         rcu_read_unlock();
3544
3545         if (err) {
3546                 WARN_ON(&ptype->list == head);
3547                 kfree_skb(skb);
3548                 return NET_RX_SUCCESS;
3549         }
3550
3551 out:
3552         return netif_receive_skb(skb);
3553 }
3554
3555 /* napi->gro_list contains packets ordered by age.
3556  * youngest packets at the head of it.
3557  * Complete skbs in reverse order to reduce latencies.
3558  */
3559 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3560 {
3561         struct sk_buff *skb, *prev = NULL;
3562
3563         /* scan list and build reverse chain */
3564         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3565                 skb->prev = prev;
3566                 prev = skb;
3567         }
3568
3569         for (skb = prev; skb; skb = prev) {
3570                 skb->next = NULL;
3571
3572                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3573                         return;
3574
3575                 prev = skb->prev;
3576                 napi_gro_complete(skb);
3577                 napi->gro_count--;
3578         }
3579
3580         napi->gro_list = NULL;
3581 }
3582 EXPORT_SYMBOL(napi_gro_flush);
3583
3584 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3585 {
3586         struct sk_buff **pp = NULL;
3587         struct packet_type *ptype;
3588         __be16 type = skb->protocol;
3589         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3590         int same_flow;
3591         int mac_len;
3592         enum gro_result ret;
3593
3594         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3595                 goto normal;
3596
3597         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3598                 goto normal;
3599
3600         rcu_read_lock();
3601         list_for_each_entry_rcu(ptype, head, list) {
3602                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3603                         continue;
3604
3605                 skb_set_network_header(skb, skb_gro_offset(skb));
3606                 mac_len = skb->network_header - skb->mac_header;
3607                 skb->mac_len = mac_len;
3608                 NAPI_GRO_CB(skb)->same_flow = 0;
3609                 NAPI_GRO_CB(skb)->flush = 0;
3610                 NAPI_GRO_CB(skb)->free = 0;
3611
3612                 pp = ptype->gro_receive(&napi->gro_list, skb);
3613                 break;
3614         }
3615         rcu_read_unlock();
3616
3617         if (&ptype->list == head)
3618                 goto normal;
3619
3620         same_flow = NAPI_GRO_CB(skb)->same_flow;
3621         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3622
3623         if (pp) {
3624                 struct sk_buff *nskb = *pp;
3625
3626                 *pp = nskb->next;
3627                 nskb->next = NULL;
3628                 napi_gro_complete(nskb);
3629                 napi->gro_count--;
3630         }
3631
3632         if (same_flow)
3633                 goto ok;
3634
3635         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3636                 goto normal;
3637
3638         napi->gro_count++;
3639         NAPI_GRO_CB(skb)->count = 1;
3640         NAPI_GRO_CB(skb)->age = jiffies;
3641         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3642         skb->next = napi->gro_list;
3643         napi->gro_list = skb;
3644         ret = GRO_HELD;
3645
3646 pull:
3647         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3648                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3649
3650                 BUG_ON(skb->end - skb->tail < grow);
3651
3652                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3653
3654                 skb->tail += grow;
3655                 skb->data_len -= grow;
3656
3657                 skb_shinfo(skb)->frags[0].page_offset += grow;
3658                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3659
3660                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3661                         skb_frag_unref(skb, 0);
3662                         memmove(skb_shinfo(skb)->frags,
3663                                 skb_shinfo(skb)->frags + 1,
3664                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3665                 }
3666         }
3667
3668 ok:
3669         return ret;
3670
3671 normal:
3672         ret = GRO_NORMAL;
3673         goto pull;
3674 }
3675 EXPORT_SYMBOL(dev_gro_receive);
3676
3677 static inline gro_result_t
3678 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3679 {
3680         struct sk_buff *p;
3681         unsigned int maclen = skb->dev->hard_header_len;
3682
3683         for (p = napi->gro_list; p; p = p->next) {
3684                 unsigned long diffs;
3685
3686                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3687                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3688                 if (maclen == ETH_HLEN)
3689                         diffs |= compare_ether_header(skb_mac_header(p),
3690                                                       skb_gro_mac_header(skb));
3691                 else if (!diffs)
3692                         diffs = memcmp(skb_mac_header(p),
3693                                        skb_gro_mac_header(skb),
3694                                        maclen);
3695                 NAPI_GRO_CB(p)->same_flow = !diffs;
3696                 NAPI_GRO_CB(p)->flush = 0;
3697         }
3698
3699         return dev_gro_receive(napi, skb);
3700 }
3701
3702 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3703 {
3704         switch (ret) {
3705         case GRO_NORMAL:
3706                 if (netif_receive_skb(skb))
3707                         ret = GRO_DROP;
3708                 break;
3709
3710         case GRO_DROP:
3711                 kfree_skb(skb);
3712                 break;
3713
3714         case GRO_MERGED_FREE:
3715                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3716                         kmem_cache_free(skbuff_head_cache, skb);
3717                 else
3718                         __kfree_skb(skb);
3719                 break;
3720
3721         case GRO_HELD:
3722         case GRO_MERGED:
3723                 break;
3724         }
3725
3726         return ret;
3727 }
3728 EXPORT_SYMBOL(napi_skb_finish);
3729
3730 static void skb_gro_reset_offset(struct sk_buff *skb)
3731 {
3732         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3733         const skb_frag_t *frag0 = &pinfo->frags[0];
3734
3735         NAPI_GRO_CB(skb)->data_offset = 0;
3736         NAPI_GRO_CB(skb)->frag0 = NULL;
3737         NAPI_GRO_CB(skb)->frag0_len = 0;
3738
3739         if (skb->mac_header == skb->tail &&
3740             pinfo->nr_frags &&
3741             !PageHighMem(skb_frag_page(frag0))) {
3742                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3743                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3744         }
3745 }
3746
3747 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3748 {
3749         skb_gro_reset_offset(skb);
3750
3751         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3752 }
3753 EXPORT_SYMBOL(napi_gro_receive);
3754
3755 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3756 {
3757         __skb_pull(skb, skb_headlen(skb));
3758         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3759         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3760         skb->vlan_tci = 0;
3761         skb->dev = napi->dev;
3762         skb->skb_iif = 0;
3763
3764         napi->skb = skb;
3765 }
3766
3767 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3768 {
3769         struct sk_buff *skb = napi->skb;
3770
3771         if (!skb) {
3772                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3773                 if (skb)
3774                         napi->skb = skb;
3775         }
3776         return skb;
3777 }
3778 EXPORT_SYMBOL(napi_get_frags);
3779
3780 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3781                                gro_result_t ret)
3782 {
3783         switch (ret) {
3784         case GRO_NORMAL:
3785         case GRO_HELD:
3786                 skb->protocol = eth_type_trans(skb, skb->dev);
3787
3788                 if (ret == GRO_HELD)
3789                         skb_gro_pull(skb, -ETH_HLEN);
3790                 else if (netif_receive_skb(skb))
3791                         ret = GRO_DROP;
3792                 break;
3793
3794         case GRO_DROP:
3795         case GRO_MERGED_FREE:
3796                 napi_reuse_skb(napi, skb);
3797                 break;
3798
3799         case GRO_MERGED:
3800                 break;
3801         }
3802
3803         return ret;
3804 }
3805 EXPORT_SYMBOL(napi_frags_finish);
3806
3807 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3808 {
3809         struct sk_buff *skb = napi->skb;
3810         struct ethhdr *eth;
3811         unsigned int hlen;
3812         unsigned int off;
3813
3814         napi->skb = NULL;
3815
3816         skb_reset_mac_header(skb);
3817         skb_gro_reset_offset(skb);
3818
3819         off = skb_gro_offset(skb);
3820         hlen = off + sizeof(*eth);
3821         eth = skb_gro_header_fast(skb, off);
3822         if (skb_gro_header_hard(skb, hlen)) {
3823                 eth = skb_gro_header_slow(skb, hlen, off);
3824                 if (unlikely(!eth)) {
3825                         napi_reuse_skb(napi, skb);
3826                         skb = NULL;
3827                         goto out;
3828                 }
3829         }
3830
3831         skb_gro_pull(skb, sizeof(*eth));
3832
3833         /*
3834          * This works because the only protocols we care about don't require
3835          * special handling.  We'll fix it up properly at the end.
3836          */
3837         skb->protocol = eth->h_proto;
3838
3839 out:
3840         return skb;
3841 }
3842
3843 gro_result_t napi_gro_frags(struct napi_struct *napi)
3844 {
3845         struct sk_buff *skb = napi_frags_skb(napi);
3846
3847         if (!skb)
3848                 return GRO_DROP;
3849
3850         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3851 }
3852 EXPORT_SYMBOL(napi_gro_frags);
3853
3854 /*
3855  * net_rps_action sends any pending IPI's for rps.
3856  * Note: called with local irq disabled, but exits with local irq enabled.
3857  */
3858 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3859 {
3860 #ifdef CONFIG_RPS
3861         struct softnet_data *remsd = sd->rps_ipi_list;
3862
3863         if (remsd) {
3864                 sd->rps_ipi_list = NULL;
3865
3866                 local_irq_enable();
3867
3868                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3869                 while (remsd) {
3870                         struct softnet_data *next = remsd->rps_ipi_next;
3871
3872                         if (cpu_online(remsd->cpu))
3873                                 __smp_call_function_single(remsd->cpu,
3874                                                            &remsd->csd, 0);
3875                         remsd = next;
3876                 }
3877         } else
3878 #endif
3879                 local_irq_enable();
3880 }
3881
3882 static int process_backlog(struct napi_struct *napi, int quota)
3883 {
3884         int work = 0;
3885         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3886
3887 #ifdef CONFIG_RPS
3888         /* Check if we have pending ipi, its better to send them now,
3889          * not waiting net_rx_action() end.
3890          */
3891         if (sd->rps_ipi_list) {
3892                 local_irq_disable();
3893                 net_rps_action_and_irq_enable(sd);
3894         }
3895 #endif
3896         napi->weight = weight_p;
3897         local_irq_disable();
3898         while (work < quota) {
3899                 struct sk_buff *skb;
3900                 unsigned int qlen;
3901
3902                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3903                         local_irq_enable();
3904                         __netif_receive_skb(skb);
3905                         local_irq_disable();
3906                         input_queue_head_incr(sd);
3907                         if (++work >= quota) {
3908                                 local_irq_enable();
3909                                 return work;
3910                         }
3911                 }
3912
3913                 rps_lock(sd);
3914                 qlen = skb_queue_len(&sd->input_pkt_queue);
3915                 if (qlen)
3916                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3917                                                    &sd->process_queue);
3918
3919                 if (qlen < quota - work) {
3920                         /*
3921                          * Inline a custom version of __napi_complete().
3922                          * only current cpu owns and manipulates this napi,
3923                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3924                          * we can use a plain write instead of clear_bit(),
3925                          * and we dont need an smp_mb() memory barrier.
3926                          */
3927                         list_del(&napi->poll_list);
3928                         napi->state = 0;
3929
3930                         quota = work + qlen;
3931                 }
3932                 rps_unlock(sd);
3933         }
3934         local_irq_enable();
3935
3936         return work;
3937 }
3938
3939 /**
3940  * __napi_schedule - schedule for receive
3941  * @n: entry to schedule
3942  *
3943  * The entry's receive function will be scheduled to run
3944  */
3945 void __napi_schedule(struct napi_struct *n)
3946 {
3947         unsigned long flags;
3948
3949         local_irq_save(flags);
3950         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3951         local_irq_restore(flags);
3952 }
3953 EXPORT_SYMBOL(__napi_schedule);
3954
3955 void __napi_complete(struct napi_struct *n)
3956 {
3957         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3958         BUG_ON(n->gro_list);
3959
3960         list_del(&n->poll_list);
3961         smp_mb__before_clear_bit();
3962         clear_bit(NAPI_STATE_SCHED, &n->state);
3963 }
3964 EXPORT_SYMBOL(__napi_complete);
3965
3966 void napi_complete(struct napi_struct *n)
3967 {
3968         unsigned long flags;
3969
3970         /*
3971          * don't let napi dequeue from the cpu poll list
3972          * just in case its running on a different cpu
3973          */
3974         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3975                 return;
3976
3977         napi_gro_flush(n, false);
3978         local_irq_save(flags);
3979         __napi_complete(n);
3980         local_irq_restore(flags);
3981 }
3982 EXPORT_SYMBOL(napi_complete);
3983
3984 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3985                     int (*poll)(struct napi_struct *, int), int weight)
3986 {
3987         INIT_LIST_HEAD(&napi->poll_list);
3988         napi->gro_count = 0;
3989         napi->gro_list = NULL;
3990         napi->skb = NULL;
3991         napi->poll = poll;
3992         napi->weight = weight;
3993         list_add(&napi->dev_list, &dev->napi_list);
3994         napi->dev = dev;
3995 #ifdef CONFIG_NETPOLL
3996         spin_lock_init(&napi->poll_lock);
3997         napi->poll_owner = -1;
3998 #endif
3999         set_bit(NAPI_STATE_SCHED, &napi->state);
4000 }
4001 EXPORT_SYMBOL(netif_napi_add);
4002
4003 void netif_napi_del(struct napi_struct *napi)
4004 {
4005         struct sk_buff *skb, *next;
4006
4007         list_del_init(&napi->dev_list);
4008         napi_free_frags(napi);
4009
4010         for (skb = napi->gro_list; skb; skb = next) {
4011                 next = skb->next;
4012                 skb->next = NULL;
4013                 kfree_skb(skb);
4014         }
4015
4016         napi->gro_list = NULL;
4017         napi->gro_count = 0;
4018 }
4019 EXPORT_SYMBOL(netif_napi_del);
4020
4021 static void net_rx_action(struct softirq_action *h)
4022 {
4023         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4024         unsigned long time_limit = jiffies + 2;
4025         int budget = netdev_budget;
4026         void *have;
4027
4028         local_irq_disable();
4029
4030         while (!list_empty(&sd->poll_list)) {
4031                 struct napi_struct *n;
4032                 int work, weight;
4033
4034                 /* If softirq window is exhuasted then punt.
4035                  * Allow this to run for 2 jiffies since which will allow
4036                  * an average latency of 1.5/HZ.
4037                  */
4038                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4039                         goto softnet_break;
4040
4041                 local_irq_enable();
4042
4043                 /* Even though interrupts have been re-enabled, this
4044                  * access is safe because interrupts can only add new
4045                  * entries to the tail of this list, and only ->poll()
4046                  * calls can remove this head entry from the list.
4047                  */
4048                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4049
4050                 have = netpoll_poll_lock(n);
4051
4052                 weight = n->weight;
4053
4054                 /* This NAPI_STATE_SCHED test is for avoiding a race
4055                  * with netpoll's poll_napi().  Only the entity which
4056                  * obtains the lock and sees NAPI_STATE_SCHED set will
4057                  * actually make the ->poll() call.  Therefore we avoid
4058                  * accidentally calling ->poll() when NAPI is not scheduled.
4059                  */
4060                 work = 0;
4061                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4062                         work = n->poll(n, weight);
4063                         trace_napi_poll(n);
4064                 }
4065
4066                 WARN_ON_ONCE(work > weight);
4067
4068                 budget -= work;
4069
4070                 local_irq_disable();
4071
4072                 /* Drivers must not modify the NAPI state if they
4073                  * consume the entire weight.  In such cases this code
4074                  * still "owns" the NAPI instance and therefore can
4075                  * move the instance around on the list at-will.
4076                  */
4077                 if (unlikely(work == weight)) {
4078                         if (unlikely(napi_disable_pending(n))) {
4079                                 local_irq_enable();
4080                                 napi_complete(n);
4081                                 local_irq_disable();
4082                         } else {
4083                                 if (n->gro_list) {
4084                                         /* flush too old packets
4085                                          * If HZ < 1000, flush all packets.
4086                                          */
4087                                         local_irq_enable();
4088                                         napi_gro_flush(n, HZ >= 1000);
4089                                         local_irq_disable();
4090                                 }
4091                                 list_move_tail(&n->poll_list, &sd->poll_list);
4092                         }
4093                 }
4094
4095                 netpoll_poll_unlock(have);
4096         }
4097 out:
4098         net_rps_action_and_irq_enable(sd);
4099
4100 #ifdef CONFIG_NET_DMA
4101         /*
4102          * There may not be any more sk_buffs coming right now, so push
4103          * any pending DMA copies to hardware
4104          */
4105         dma_issue_pending_all();
4106 #endif
4107
4108         return;
4109
4110 softnet_break:
4111         sd->time_squeeze++;
4112         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4113         goto out;
4114 }
4115
4116 static gifconf_func_t *gifconf_list[NPROTO];
4117
4118 /**
4119  *      register_gifconf        -       register a SIOCGIF handler
4120  *      @family: Address family
4121  *      @gifconf: Function handler
4122  *
4123  *      Register protocol dependent address dumping routines. The handler
4124  *      that is passed must not be freed or reused until it has been replaced
4125  *      by another handler.
4126  */
4127 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4128 {
4129         if (family >= NPROTO)
4130                 return -EINVAL;
4131         gifconf_list[family] = gifconf;
4132         return 0;
4133 }
4134 EXPORT_SYMBOL(register_gifconf);
4135
4136
4137 /*
4138  *      Map an interface index to its name (SIOCGIFNAME)
4139  */
4140
4141 /*
4142  *      We need this ioctl for efficient implementation of the
4143  *      if_indextoname() function required by the IPv6 API.  Without
4144  *      it, we would have to search all the interfaces to find a
4145  *      match.  --pb
4146  */
4147
4148 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4149 {
4150         struct net_device *dev;
4151         struct ifreq ifr;
4152
4153         /*
4154          *      Fetch the caller's info block.
4155          */
4156
4157         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4158                 return -EFAULT;
4159
4160         rcu_read_lock();
4161         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4162         if (!dev) {
4163                 rcu_read_unlock();
4164                 return -ENODEV;
4165         }
4166
4167         strcpy(ifr.ifr_name, dev->name);
4168         rcu_read_unlock();
4169
4170         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4171                 return -EFAULT;
4172         return 0;
4173 }
4174
4175 /*
4176  *      Perform a SIOCGIFCONF call. This structure will change
4177  *      size eventually, and there is nothing I can do about it.
4178  *      Thus we will need a 'compatibility mode'.
4179  */
4180
4181 static int dev_ifconf(struct net *net, char __user *arg)
4182 {
4183         struct ifconf ifc;
4184         struct net_device *dev;
4185         char __user *pos;
4186         int len;
4187         int total;
4188         int i;
4189
4190         /*
4191          *      Fetch the caller's info block.
4192          */
4193
4194         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4195                 return -EFAULT;
4196
4197         pos = ifc.ifc_buf;
4198         len = ifc.ifc_len;
4199
4200         /*
4201          *      Loop over the interfaces, and write an info block for each.
4202          */
4203
4204         total = 0;
4205         for_each_netdev(net, dev) {
4206                 for (i = 0; i < NPROTO; i++) {
4207                         if (gifconf_list[i]) {
4208                                 int done;
4209                                 if (!pos)
4210                                         done = gifconf_list[i](dev, NULL, 0);
4211                                 else
4212                                         done = gifconf_list[i](dev, pos + total,
4213                                                                len - total);
4214                                 if (done < 0)
4215                                         return -EFAULT;
4216                                 total += done;
4217                         }
4218                 }
4219         }
4220
4221         /*
4222          *      All done.  Write the updated control block back to the caller.
4223          */
4224         ifc.ifc_len = total;
4225
4226         /*
4227          *      Both BSD and Solaris return 0 here, so we do too.
4228          */
4229         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4230 }
4231
4232 #ifdef CONFIG_PROC_FS
4233
4234 #define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
4235
4236 #define get_bucket(x) ((x) >> BUCKET_SPACE)
4237 #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4238 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4239
4240 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4241 {
4242         struct net *net = seq_file_net(seq);
4243         struct net_device *dev;
4244         struct hlist_node *p;
4245         struct hlist_head *h;
4246         unsigned int count = 0, offset = get_offset(*pos);
4247
4248         h = &net->dev_name_head[get_bucket(*pos)];
4249         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4250                 if (++count == offset)
4251                         return dev;
4252         }
4253
4254         return NULL;
4255 }
4256
4257 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4258 {
4259         struct net_device *dev;
4260         unsigned int bucket;
4261
4262         do {
4263                 dev = dev_from_same_bucket(seq, pos);
4264                 if (dev)
4265                         return dev;
4266
4267                 bucket = get_bucket(*pos) + 1;
4268                 *pos = set_bucket_offset(bucket, 1);
4269         } while (bucket < NETDEV_HASHENTRIES);
4270
4271         return NULL;
4272 }
4273
4274 /*
4275  *      This is invoked by the /proc filesystem handler to display a device
4276  *      in detail.
4277  */
4278 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4279         __acquires(RCU)
4280 {
4281         rcu_read_lock();
4282         if (!*pos)
4283                 return SEQ_START_TOKEN;
4284
4285         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4286                 return NULL;
4287
4288         return dev_from_bucket(seq, pos);
4289 }
4290
4291 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4292 {
4293         ++*pos;
4294         return dev_from_bucket(seq, pos);
4295 }
4296
4297 void dev_seq_stop(struct seq_file *seq, void *v)
4298         __releases(RCU)
4299 {
4300         rcu_read_unlock();
4301 }
4302
4303 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4304 {
4305         struct rtnl_link_stats64 temp;
4306         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4307
4308         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4309                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4310                    dev->name, stats->rx_bytes, stats->rx_packets,
4311                    stats->rx_errors,
4312                    stats->rx_dropped + stats->rx_missed_errors,
4313                    stats->rx_fifo_errors,
4314                    stats->rx_length_errors + stats->rx_over_errors +
4315                     stats->rx_crc_errors + stats->rx_frame_errors,
4316                    stats->rx_compressed, stats->multicast,
4317                    stats->tx_bytes, stats->tx_packets,
4318                    stats->tx_errors, stats->tx_dropped,
4319                    stats->tx_fifo_errors, stats->collisions,
4320                    stats->tx_carrier_errors +
4321                     stats->tx_aborted_errors +
4322                     stats->tx_window_errors +
4323                     stats->tx_heartbeat_errors,
4324                    stats->tx_compressed);
4325 }
4326
4327 /*
4328  *      Called from the PROCfs module. This now uses the new arbitrary sized
4329  *      /proc/net interface to create /proc/net/dev
4330  */
4331 static int dev_seq_show(struct seq_file *seq, void *v)
4332 {
4333         if (v == SEQ_START_TOKEN)
4334                 seq_puts(seq, "Inter-|   Receive                            "
4335                               "                    |  Transmit\n"
4336                               " face |bytes    packets errs drop fifo frame "
4337                               "compressed multicast|bytes    packets errs "
4338                               "drop fifo colls carrier compressed\n");
4339         else
4340                 dev_seq_printf_stats(seq, v);
4341         return 0;
4342 }
4343
4344 static struct softnet_data *softnet_get_online(loff_t *pos)
4345 {
4346         struct softnet_data *sd = NULL;
4347
4348         while (*pos < nr_cpu_ids)
4349                 if (cpu_online(*pos)) {
4350                         sd = &per_cpu(softnet_data, *pos);
4351                         break;
4352                 } else
4353                         ++*pos;
4354         return sd;
4355 }
4356
4357 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4358 {
4359         return softnet_get_online(pos);
4360 }
4361
4362 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4363 {
4364         ++*pos;
4365         return softnet_get_online(pos);
4366 }
4367
4368 static void softnet_seq_stop(struct seq_file *seq, void *v)
4369 {
4370 }
4371
4372 static int softnet_seq_show(struct seq_file *seq, void *v)
4373 {
4374         struct softnet_data *sd = v;
4375
4376         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4377                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4378                    0, 0, 0, 0, /* was fastroute */
4379                    sd->cpu_collision, sd->received_rps);
4380         return 0;
4381 }
4382
4383 static const struct seq_operations dev_seq_ops = {
4384         .start = dev_seq_start,
4385         .next  = dev_seq_next,
4386         .stop  = dev_seq_stop,
4387         .show  = dev_seq_show,
4388 };
4389
4390 static int dev_seq_open(struct inode *inode, struct file *file)
4391 {
4392         return seq_open_net(inode, file, &dev_seq_ops,
4393                             sizeof(struct seq_net_private));
4394 }
4395
4396 static const struct file_operations dev_seq_fops = {
4397         .owner   = THIS_MODULE,
4398         .open    = dev_seq_open,
4399         .read    = seq_read,
4400         .llseek  = seq_lseek,
4401         .release = seq_release_net,
4402 };
4403
4404 static const struct seq_operations softnet_seq_ops = {
4405         .start = softnet_seq_start,
4406         .next  = softnet_seq_next,
4407         .stop  = softnet_seq_stop,
4408         .show  = softnet_seq_show,
4409 };
4410
4411 static int softnet_seq_open(struct inode *inode, struct file *file)
4412 {
4413         return seq_open(file, &softnet_seq_ops);
4414 }
4415
4416 static const struct file_operations softnet_seq_fops = {
4417         .owner   = THIS_MODULE,
4418         .open    = softnet_seq_open,
4419         .read    = seq_read,
4420         .llseek  = seq_lseek,
4421         .release = seq_release,
4422 };
4423
4424 static void *ptype_get_idx(loff_t pos)
4425 {
4426         struct packet_type *pt = NULL;
4427         loff_t i = 0;
4428         int t;
4429
4430         list_for_each_entry_rcu(pt, &ptype_all, list) {
4431                 if (i == pos)
4432                         return pt;
4433                 ++i;
4434         }
4435
4436         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4437                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4438                         if (i == pos)
4439                                 return pt;
4440                         ++i;
4441                 }
4442         }
4443         return NULL;
4444 }
4445
4446 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4447         __acquires(RCU)
4448 {
4449         rcu_read_lock();
4450         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4451 }
4452
4453 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4454 {
4455         struct packet_type *pt;
4456         struct list_head *nxt;
4457         int hash;
4458
4459         ++*pos;
4460         if (v == SEQ_START_TOKEN)
4461                 return ptype_get_idx(0);
4462
4463         pt = v;
4464         nxt = pt->list.next;
4465         if (pt->type == htons(ETH_P_ALL)) {
4466                 if (nxt != &ptype_all)
4467                         goto found;
4468                 hash = 0;
4469                 nxt = ptype_base[0].next;
4470         } else
4471                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4472
4473         while (nxt == &ptype_base[hash]) {
4474                 if (++hash >= PTYPE_HASH_SIZE)
4475                         return NULL;
4476                 nxt = ptype_base[hash].next;
4477         }
4478 found:
4479         return list_entry(nxt, struct packet_type, list);
4480 }
4481
4482 static void ptype_seq_stop(struct seq_file *seq, void *v)
4483         __releases(RCU)
4484 {
4485         rcu_read_unlock();
4486 }
4487
4488 static int ptype_seq_show(struct seq_file *seq, void *v)
4489 {
4490         struct packet_type *pt = v;
4491
4492         if (v == SEQ_START_TOKEN)
4493                 seq_puts(seq, "Type Device      Function\n");
4494         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4495                 if (pt->type == htons(ETH_P_ALL))
4496                         seq_puts(seq, "ALL ");
4497                 else
4498                         seq_printf(seq, "%04x", ntohs(pt->type));
4499
4500                 seq_printf(seq, " %-8s %pF\n",
4501                            pt->dev ? pt->dev->name : "", pt->func);
4502         }
4503
4504         return 0;
4505 }
4506
4507 static const struct seq_operations ptype_seq_ops = {
4508         .start = ptype_seq_start,
4509         .next  = ptype_seq_next,
4510         .stop  = ptype_seq_stop,
4511         .show  = ptype_seq_show,
4512 };
4513
4514 static int ptype_seq_open(struct inode *inode, struct file *file)
4515 {
4516         return seq_open_net(inode, file, &ptype_seq_ops,
4517                         sizeof(struct seq_net_private));
4518 }
4519
4520 static const struct file_operations ptype_seq_fops = {
4521         .owner   = THIS_MODULE,
4522         .open    = ptype_seq_open,
4523         .read    = seq_read,
4524         .llseek  = seq_lseek,
4525         .release = seq_release_net,
4526 };
4527
4528
4529 static int __net_init dev_proc_net_init(struct net *net)
4530 {
4531         int rc = -ENOMEM;
4532
4533         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4534                 goto out;
4535         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4536                 goto out_dev;
4537         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4538                 goto out_softnet;
4539
4540         if (wext_proc_init(net))
4541                 goto out_ptype;
4542         rc = 0;
4543 out:
4544         return rc;
4545 out_ptype:
4546         proc_net_remove(net, "ptype");
4547 out_softnet:
4548         proc_net_remove(net, "softnet_stat");
4549 out_dev:
4550         proc_net_remove(net, "dev");
4551         goto out;
4552 }
4553
4554 static void __net_exit dev_proc_net_exit(struct net *net)
4555 {
4556         wext_proc_exit(net);
4557
4558         proc_net_remove(net, "ptype");
4559         proc_net_remove(net, "softnet_stat");
4560         proc_net_remove(net, "dev");
4561 }
4562
4563 static struct pernet_operations __net_initdata dev_proc_ops = {
4564         .init = dev_proc_net_init,
4565         .exit = dev_proc_net_exit,
4566 };
4567
4568 static int __init dev_proc_init(void)
4569 {
4570         return register_pernet_subsys(&dev_proc_ops);
4571 }
4572 #else
4573 #define dev_proc_init() 0
4574 #endif  /* CONFIG_PROC_FS */
4575
4576
4577 /**
4578  *      netdev_set_master       -       set up master pointer
4579  *      @slave: slave device
4580  *      @master: new master device
4581  *
4582  *      Changes the master device of the slave. Pass %NULL to break the
4583  *      bonding. The caller must hold the RTNL semaphore. On a failure
4584  *      a negative errno code is returned. On success the reference counts
4585  *      are adjusted and the function returns zero.
4586  */
4587 int netdev_set_master(struct net_device *slave, struct net_device *master)
4588 {
4589         struct net_device *old = slave->master;
4590
4591         ASSERT_RTNL();
4592
4593         if (master) {
4594                 if (old)
4595                         return -EBUSY;
4596                 dev_hold(master);
4597         }
4598
4599         slave->master = master;
4600
4601         if (old)
4602                 dev_put(old);
4603         return 0;
4604 }
4605 EXPORT_SYMBOL(netdev_set_master);
4606
4607 /**
4608  *      netdev_set_bond_master  -       set up bonding master/slave pair
4609  *      @slave: slave device
4610  *      @master: new master device
4611  *
4612  *      Changes the master device of the slave. Pass %NULL to break the
4613  *      bonding. The caller must hold the RTNL semaphore. On a failure
4614  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4615  *      to the routing socket and the function returns zero.
4616  */
4617 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4618 {
4619         int err;
4620
4621         ASSERT_RTNL();
4622
4623         err = netdev_set_master(slave, master);
4624         if (err)
4625                 return err;
4626         if (master)
4627                 slave->flags |= IFF_SLAVE;
4628         else
4629                 slave->flags &= ~IFF_SLAVE;
4630
4631         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4632         return 0;
4633 }
4634 EXPORT_SYMBOL(netdev_set_bond_master);
4635
4636 static void dev_change_rx_flags(struct net_device *dev, int flags)
4637 {
4638         const struct net_device_ops *ops = dev->netdev_ops;
4639
4640         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4641                 ops->ndo_change_rx_flags(dev, flags);
4642 }
4643
4644 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4645 {
4646         unsigned int old_flags = dev->flags;
4647         kuid_t uid;
4648         kgid_t gid;
4649
4650         ASSERT_RTNL();
4651
4652         dev->flags |= IFF_PROMISC;
4653         dev->promiscuity += inc;
4654         if (dev->promiscuity == 0) {
4655                 /*
4656                  * Avoid overflow.
4657                  * If inc causes overflow, untouch promisc and return error.
4658                  */
4659                 if (inc < 0)
4660                         dev->flags &= ~IFF_PROMISC;
4661                 else {
4662                         dev->promiscuity -= inc;
4663                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4664                                 dev->name);
4665                         return -EOVERFLOW;
4666                 }
4667         }
4668         if (dev->flags != old_flags) {
4669                 pr_info("device %s %s promiscuous mode\n",
4670                         dev->name,
4671                         dev->flags & IFF_PROMISC ? "entered" : "left");
4672                 if (audit_enabled) {
4673                         current_uid_gid(&uid, &gid);
4674                         audit_log(current->audit_context, GFP_ATOMIC,
4675                                 AUDIT_ANOM_PROMISCUOUS,
4676                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4677                                 dev->name, (dev->flags & IFF_PROMISC),
4678                                 (old_flags & IFF_PROMISC),
4679                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4680                                 from_kuid(&init_user_ns, uid),
4681                                 from_kgid(&init_user_ns, gid),
4682                                 audit_get_sessionid(current));
4683                 }
4684
4685                 dev_change_rx_flags(dev, IFF_PROMISC);
4686         }
4687         return 0;
4688 }
4689
4690 /**
4691  *      dev_set_promiscuity     - update promiscuity count on a device
4692  *      @dev: device
4693  *      @inc: modifier
4694  *
4695  *      Add or remove promiscuity from a device. While the count in the device
4696  *      remains above zero the interface remains promiscuous. Once it hits zero
4697  *      the device reverts back to normal filtering operation. A negative inc
4698  *      value is used to drop promiscuity on the device.
4699  *      Return 0 if successful or a negative errno code on error.
4700  */
4701 int dev_set_promiscuity(struct net_device *dev, int inc)
4702 {
4703         unsigned int old_flags = dev->flags;
4704         int err;
4705
4706         err = __dev_set_promiscuity(dev, inc);
4707         if (err < 0)
4708                 return err;
4709         if (dev->flags != old_flags)
4710                 dev_set_rx_mode(dev);
4711         return err;
4712 }
4713 EXPORT_SYMBOL(dev_set_promiscuity);
4714
4715 /**
4716  *      dev_set_allmulti        - update allmulti count on a device
4717  *      @dev: device
4718  *      @inc: modifier
4719  *
4720  *      Add or remove reception of all multicast frames to a device. While the
4721  *      count in the device remains above zero the interface remains listening
4722  *      to all interfaces. Once it hits zero the device reverts back to normal
4723  *      filtering operation. A negative @inc value is used to drop the counter
4724  *      when releasing a resource needing all multicasts.
4725  *      Return 0 if successful or a negative errno code on error.
4726  */
4727
4728 int dev_set_allmulti(struct net_device *dev, int inc)
4729 {
4730         unsigned int old_flags = dev->flags;
4731
4732         ASSERT_RTNL();
4733
4734         dev->flags |= IFF_ALLMULTI;
4735         dev->allmulti += inc;
4736         if (dev->allmulti == 0) {
4737                 /*
4738                  * Avoid overflow.
4739                  * If inc causes overflow, untouch allmulti and return error.
4740                  */
4741                 if (inc < 0)
4742                         dev->flags &= ~IFF_ALLMULTI;
4743                 else {
4744                         dev->allmulti -= inc;
4745                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4746                                 dev->name);
4747                         return -EOVERFLOW;
4748                 }
4749         }
4750         if (dev->flags ^ old_flags) {
4751                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4752                 dev_set_rx_mode(dev);
4753         }
4754         return 0;
4755 }
4756 EXPORT_SYMBOL(dev_set_allmulti);
4757
4758 /*
4759  *      Upload unicast and multicast address lists to device and
4760  *      configure RX filtering. When the device doesn't support unicast
4761  *      filtering it is put in promiscuous mode while unicast addresses
4762  *      are present.
4763  */
4764 void __dev_set_rx_mode(struct net_device *dev)
4765 {
4766         const struct net_device_ops *ops = dev->netdev_ops;
4767
4768         /* dev_open will call this function so the list will stay sane. */
4769         if (!(dev->flags&IFF_UP))
4770                 return;
4771
4772         if (!netif_device_present(dev))
4773                 return;
4774
4775         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4776                 /* Unicast addresses changes may only happen under the rtnl,
4777                  * therefore calling __dev_set_promiscuity here is safe.
4778                  */
4779                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4780                         __dev_set_promiscuity(dev, 1);
4781                         dev->uc_promisc = true;
4782                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4783                         __dev_set_promiscuity(dev, -1);
4784                         dev->uc_promisc = false;
4785                 }
4786         }
4787
4788         if (ops->ndo_set_rx_mode)
4789                 ops->ndo_set_rx_mode(dev);
4790 }
4791
4792 void dev_set_rx_mode(struct net_device *dev)
4793 {
4794         netif_addr_lock_bh(dev);
4795         __dev_set_rx_mode(dev);
4796         netif_addr_unlock_bh(dev);
4797 }
4798
4799 /**
4800  *      dev_get_flags - get flags reported to userspace
4801  *      @dev: device
4802  *
4803  *      Get the combination of flag bits exported through APIs to userspace.
4804  */
4805 unsigned int dev_get_flags(const struct net_device *dev)
4806 {
4807         unsigned int flags;
4808
4809         flags = (dev->flags & ~(IFF_PROMISC |
4810                                 IFF_ALLMULTI |
4811                                 IFF_RUNNING |
4812                                 IFF_LOWER_UP |
4813                                 IFF_DORMANT)) |
4814                 (dev->gflags & (IFF_PROMISC |
4815                                 IFF_ALLMULTI));
4816
4817         if (netif_running(dev)) {
4818                 if (netif_oper_up(dev))
4819                         flags |= IFF_RUNNING;
4820                 if (netif_carrier_ok(dev))
4821                         flags |= IFF_LOWER_UP;
4822                 if (netif_dormant(dev))
4823                         flags |= IFF_DORMANT;
4824         }
4825
4826         return flags;
4827 }
4828 EXPORT_SYMBOL(dev_get_flags);
4829
4830 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4831 {
4832         unsigned int old_flags = dev->flags;
4833         int ret;
4834
4835         ASSERT_RTNL();
4836
4837         /*
4838          *      Set the flags on our device.
4839          */
4840
4841         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4842                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4843                                IFF_AUTOMEDIA)) |
4844                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4845                                     IFF_ALLMULTI));
4846
4847         /*
4848          *      Load in the correct multicast list now the flags have changed.
4849          */
4850
4851         if ((old_flags ^ flags) & IFF_MULTICAST)
4852                 dev_change_rx_flags(dev, IFF_MULTICAST);
4853
4854         dev_set_rx_mode(dev);
4855
4856         /*
4857          *      Have we downed the interface. We handle IFF_UP ourselves
4858          *      according to user attempts to set it, rather than blindly
4859          *      setting it.
4860          */
4861
4862         ret = 0;
4863         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4864                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4865
4866                 if (!ret)
4867                         dev_set_rx_mode(dev);
4868         }
4869
4870         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4871                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4872
4873                 dev->gflags ^= IFF_PROMISC;
4874                 dev_set_promiscuity(dev, inc);
4875         }
4876
4877         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4878            is important. Some (broken) drivers set IFF_PROMISC, when
4879            IFF_ALLMULTI is requested not asking us and not reporting.
4880          */
4881         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4882                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4883
4884                 dev->gflags ^= IFF_ALLMULTI;
4885                 dev_set_allmulti(dev, inc);
4886         }
4887
4888         return ret;
4889 }
4890
4891 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4892 {
4893         unsigned int changes = dev->flags ^ old_flags;
4894
4895         if (changes & IFF_UP) {
4896                 if (dev->flags & IFF_UP)
4897                         call_netdevice_notifiers(NETDEV_UP, dev);
4898                 else
4899                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4900         }
4901
4902         if (dev->flags & IFF_UP &&
4903             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4904                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4905 }
4906
4907 /**
4908  *      dev_change_flags - change device settings
4909  *      @dev: device
4910  *      @flags: device state flags
4911  *
4912  *      Change settings on device based state flags. The flags are
4913  *      in the userspace exported format.
4914  */
4915 int dev_change_flags(struct net_device *dev, unsigned int flags)
4916 {
4917         int ret;
4918         unsigned int changes, old_flags = dev->flags;
4919
4920         ret = __dev_change_flags(dev, flags);
4921         if (ret < 0)
4922                 return ret;
4923
4924         changes = old_flags ^ dev->flags;
4925         if (changes)
4926                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4927
4928         __dev_notify_flags(dev, old_flags);
4929         return ret;
4930 }
4931 EXPORT_SYMBOL(dev_change_flags);
4932
4933 /**
4934  *      dev_set_mtu - Change maximum transfer unit
4935  *      @dev: device
4936  *      @new_mtu: new transfer unit
4937  *
4938  *      Change the maximum transfer size of the network device.
4939  */
4940 int dev_set_mtu(struct net_device *dev, int new_mtu)
4941 {
4942         const struct net_device_ops *ops = dev->netdev_ops;
4943         int err;
4944
4945         if (new_mtu == dev->mtu)
4946                 return 0;
4947
4948         /*      MTU must be positive.    */
4949         if (new_mtu < 0)
4950                 return -EINVAL;
4951
4952         if (!netif_device_present(dev))
4953                 return -ENODEV;
4954
4955         err = 0;
4956         if (ops->ndo_change_mtu)
4957                 err = ops->ndo_change_mtu(dev, new_mtu);
4958         else
4959                 dev->mtu = new_mtu;
4960
4961         if (!err && dev->flags & IFF_UP)
4962                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4963         return err;
4964 }
4965 EXPORT_SYMBOL(dev_set_mtu);
4966
4967 /**
4968  *      dev_set_group - Change group this device belongs to
4969  *      @dev: device
4970  *      @new_group: group this device should belong to
4971  */
4972 void dev_set_group(struct net_device *dev, int new_group)
4973 {
4974         dev->group = new_group;
4975 }
4976 EXPORT_SYMBOL(dev_set_group);
4977
4978 /**
4979  *      dev_set_mac_address - Change Media Access Control Address
4980  *      @dev: device
4981  *      @sa: new address
4982  *
4983  *      Change the hardware (MAC) address of the device
4984  */
4985 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4986 {
4987         const struct net_device_ops *ops = dev->netdev_ops;
4988         int err;
4989
4990         if (!ops->ndo_set_mac_address)
4991                 return -EOPNOTSUPP;
4992         if (sa->sa_family != dev->type)
4993                 return -EINVAL;
4994         if (!netif_device_present(dev))
4995                 return -ENODEV;
4996         err = ops->ndo_set_mac_address(dev, sa);
4997         if (!err)
4998                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4999         add_device_randomness(dev->dev_addr, dev->addr_len);
5000         return err;
5001 }
5002 EXPORT_SYMBOL(dev_set_mac_address);
5003
5004 /*
5005  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5006  */
5007 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5008 {
5009         int err;
5010         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5011
5012         if (!dev)
5013                 return -ENODEV;
5014
5015         switch (cmd) {
5016         case SIOCGIFFLAGS:      /* Get interface flags */
5017                 ifr->ifr_flags = (short) dev_get_flags(dev);
5018                 return 0;
5019
5020         case SIOCGIFMETRIC:     /* Get the metric on the interface
5021                                    (currently unused) */
5022                 ifr->ifr_metric = 0;
5023                 return 0;
5024
5025         case SIOCGIFMTU:        /* Get the MTU of a device */
5026                 ifr->ifr_mtu = dev->mtu;
5027                 return 0;
5028
5029         case SIOCGIFHWADDR:
5030                 if (!dev->addr_len)
5031                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5032                 else
5033                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5034                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5035                 ifr->ifr_hwaddr.sa_family = dev->type;
5036                 return 0;
5037
5038         case SIOCGIFSLAVE:
5039                 err = -EINVAL;
5040                 break;
5041
5042         case SIOCGIFMAP:
5043                 ifr->ifr_map.mem_start = dev->mem_start;
5044                 ifr->ifr_map.mem_end   = dev->mem_end;
5045                 ifr->ifr_map.base_addr = dev->base_addr;
5046                 ifr->ifr_map.irq       = dev->irq;
5047                 ifr->ifr_map.dma       = dev->dma;
5048                 ifr->ifr_map.port      = dev->if_port;
5049                 return 0;
5050
5051         case SIOCGIFINDEX:
5052                 ifr->ifr_ifindex = dev->ifindex;
5053                 return 0;
5054
5055         case SIOCGIFTXQLEN:
5056                 ifr->ifr_qlen = dev->tx_queue_len;
5057                 return 0;
5058
5059         default:
5060                 /* dev_ioctl() should ensure this case
5061                  * is never reached
5062                  */
5063                 WARN_ON(1);
5064                 err = -ENOTTY;
5065                 break;
5066
5067         }
5068         return err;
5069 }
5070
5071 /*
5072  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5073  */
5074 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5075 {
5076         int err;
5077         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5078         const struct net_device_ops *ops;
5079
5080         if (!dev)
5081                 return -ENODEV;
5082
5083         ops = dev->netdev_ops;
5084
5085         switch (cmd) {
5086         case SIOCSIFFLAGS:      /* Set interface flags */
5087                 return dev_change_flags(dev, ifr->ifr_flags);
5088
5089         case SIOCSIFMETRIC:     /* Set the metric on the interface
5090                                    (currently unused) */
5091                 return -EOPNOTSUPP;
5092
5093         case SIOCSIFMTU:        /* Set the MTU of a device */
5094                 return dev_set_mtu(dev, ifr->ifr_mtu);
5095
5096         case SIOCSIFHWADDR:
5097                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5098
5099         case SIOCSIFHWBROADCAST:
5100                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5101                         return -EINVAL;
5102                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5103                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5104                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5105                 return 0;
5106
5107         case SIOCSIFMAP:
5108                 if (ops->ndo_set_config) {
5109                         if (!netif_device_present(dev))
5110                                 return -ENODEV;
5111                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5112                 }
5113                 return -EOPNOTSUPP;
5114
5115         case SIOCADDMULTI:
5116                 if (!ops->ndo_set_rx_mode ||
5117                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5118                         return -EINVAL;
5119                 if (!netif_device_present(dev))
5120                         return -ENODEV;
5121                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5122
5123         case SIOCDELMULTI:
5124                 if (!ops->ndo_set_rx_mode ||
5125                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5126                         return -EINVAL;
5127                 if (!netif_device_present(dev))
5128                         return -ENODEV;
5129                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5130
5131         case SIOCSIFTXQLEN:
5132                 if (ifr->ifr_qlen < 0)
5133                         return -EINVAL;
5134                 dev->tx_queue_len = ifr->ifr_qlen;
5135                 return 0;
5136
5137         case SIOCSIFNAME:
5138                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5139                 return dev_change_name(dev, ifr->ifr_newname);
5140
5141         case SIOCSHWTSTAMP:
5142                 err = net_hwtstamp_validate(ifr);
5143                 if (err)
5144                         return err;
5145                 /* fall through */
5146
5147         /*
5148          *      Unknown or private ioctl
5149          */
5150         default:
5151                 if ((cmd >= SIOCDEVPRIVATE &&
5152                     cmd <= SIOCDEVPRIVATE + 15) ||
5153                     cmd == SIOCBONDENSLAVE ||
5154                     cmd == SIOCBONDRELEASE ||
5155                     cmd == SIOCBONDSETHWADDR ||
5156                     cmd == SIOCBONDSLAVEINFOQUERY ||
5157                     cmd == SIOCBONDINFOQUERY ||
5158                     cmd == SIOCBONDCHANGEACTIVE ||
5159                     cmd == SIOCGMIIPHY ||
5160                     cmd == SIOCGMIIREG ||
5161                     cmd == SIOCSMIIREG ||
5162                     cmd == SIOCBRADDIF ||
5163                     cmd == SIOCBRDELIF ||
5164                     cmd == SIOCSHWTSTAMP ||
5165                     cmd == SIOCWANDEV) {
5166                         err = -EOPNOTSUPP;
5167                         if (ops->ndo_do_ioctl) {
5168                                 if (netif_device_present(dev))
5169                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5170                                 else
5171                                         err = -ENODEV;
5172                         }
5173                 } else
5174                         err = -EINVAL;
5175
5176         }
5177         return err;
5178 }
5179
5180 /*
5181  *      This function handles all "interface"-type I/O control requests. The actual
5182  *      'doing' part of this is dev_ifsioc above.
5183  */
5184
5185 /**
5186  *      dev_ioctl       -       network device ioctl
5187  *      @net: the applicable net namespace
5188  *      @cmd: command to issue
5189  *      @arg: pointer to a struct ifreq in user space
5190  *
5191  *      Issue ioctl functions to devices. This is normally called by the
5192  *      user space syscall interfaces but can sometimes be useful for
5193  *      other purposes. The return value is the return from the syscall if
5194  *      positive or a negative errno code on error.
5195  */
5196
5197 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
5198 {
5199         struct ifreq ifr;
5200         int ret;
5201         char *colon;
5202
5203         /* One special case: SIOCGIFCONF takes ifconf argument
5204            and requires shared lock, because it sleeps writing
5205            to user space.
5206          */
5207
5208         if (cmd == SIOCGIFCONF) {
5209                 rtnl_lock();
5210                 ret = dev_ifconf(net, (char __user *) arg);
5211                 rtnl_unlock();
5212                 return ret;
5213         }
5214         if (cmd == SIOCGIFNAME)
5215                 return dev_ifname(net, (struct ifreq __user *)arg);
5216
5217         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5218                 return -EFAULT;
5219
5220         ifr.ifr_name[IFNAMSIZ-1] = 0;
5221
5222         colon = strchr(ifr.ifr_name, ':');
5223         if (colon)
5224                 *colon = 0;
5225
5226         /*
5227          *      See which interface the caller is talking about.
5228          */
5229
5230         switch (cmd) {
5231         /*
5232          *      These ioctl calls:
5233          *      - can be done by all.
5234          *      - atomic and do not require locking.
5235          *      - return a value
5236          */
5237         case SIOCGIFFLAGS:
5238         case SIOCGIFMETRIC:
5239         case SIOCGIFMTU:
5240         case SIOCGIFHWADDR:
5241         case SIOCGIFSLAVE:
5242         case SIOCGIFMAP:
5243         case SIOCGIFINDEX:
5244         case SIOCGIFTXQLEN:
5245                 dev_load(net, ifr.ifr_name);
5246                 rcu_read_lock();
5247                 ret = dev_ifsioc_locked(net, &ifr, cmd);
5248                 rcu_read_unlock();
5249                 if (!ret) {
5250                         if (colon)
5251                                 *colon = ':';
5252                         if (copy_to_user(arg, &ifr,
5253                                          sizeof(struct ifreq)))
5254                                 ret = -EFAULT;
5255                 }
5256                 return ret;
5257
5258         case SIOCETHTOOL:
5259                 dev_load(net, ifr.ifr_name);
5260                 rtnl_lock();
5261                 ret = dev_ethtool(net, &ifr);
5262                 rtnl_unlock();
5263                 if (!ret) {
5264                         if (colon)
5265                                 *colon = ':';
5266                         if (copy_to_user(arg, &ifr,
5267                                          sizeof(struct ifreq)))
5268                                 ret = -EFAULT;
5269                 }
5270                 return ret;
5271
5272         /*
5273          *      These ioctl calls:
5274          *      - require superuser power.
5275          *      - require strict serialization.
5276          *      - return a value
5277          */
5278         case SIOCGMIIPHY:
5279         case SIOCGMIIREG:
5280         case SIOCSIFNAME:
5281                 if (!capable(CAP_NET_ADMIN))
5282                         return -EPERM;
5283                 dev_load(net, ifr.ifr_name);
5284                 rtnl_lock();
5285                 ret = dev_ifsioc(net, &ifr, cmd);
5286                 rtnl_unlock();
5287                 if (!ret) {
5288                         if (colon)
5289                                 *colon = ':';
5290                         if (copy_to_user(arg, &ifr,
5291                                          sizeof(struct ifreq)))
5292                                 ret = -EFAULT;
5293                 }
5294                 return ret;
5295
5296         /*
5297          *      These ioctl calls:
5298          *      - require superuser power.
5299          *      - require strict serialization.
5300          *      - do not return a value
5301          */
5302         case SIOCSIFFLAGS:
5303         case SIOCSIFMETRIC:
5304         case SIOCSIFMTU:
5305         case SIOCSIFMAP:
5306         case SIOCSIFHWADDR:
5307         case SIOCSIFSLAVE:
5308         case SIOCADDMULTI:
5309         case SIOCDELMULTI:
5310         case SIOCSIFHWBROADCAST:
5311         case SIOCSIFTXQLEN:
5312         case SIOCSMIIREG:
5313         case SIOCBONDENSLAVE:
5314         case SIOCBONDRELEASE:
5315         case SIOCBONDSETHWADDR:
5316         case SIOCBONDCHANGEACTIVE:
5317         case SIOCBRADDIF:
5318         case SIOCBRDELIF:
5319         case SIOCSHWTSTAMP:
5320                 if (!capable(CAP_NET_ADMIN))
5321                         return -EPERM;
5322                 /* fall through */
5323         case SIOCBONDSLAVEINFOQUERY:
5324         case SIOCBONDINFOQUERY:
5325                 dev_load(net, ifr.ifr_name);
5326                 rtnl_lock();
5327                 ret = dev_ifsioc(net, &ifr, cmd);
5328                 rtnl_unlock();
5329                 return ret;
5330
5331         case SIOCGIFMEM:
5332                 /* Get the per device memory space. We can add this but
5333                  * currently do not support it */
5334         case SIOCSIFMEM:
5335                 /* Set the per device memory buffer space.
5336                  * Not applicable in our case */
5337         case SIOCSIFLINK:
5338                 return -ENOTTY;
5339
5340         /*
5341          *      Unknown or private ioctl.
5342          */
5343         default:
5344                 if (cmd == SIOCWANDEV ||
5345                     (cmd >= SIOCDEVPRIVATE &&
5346                      cmd <= SIOCDEVPRIVATE + 15)) {
5347                         dev_load(net, ifr.ifr_name);
5348                         rtnl_lock();
5349                         ret = dev_ifsioc(net, &ifr, cmd);
5350                         rtnl_unlock();
5351                         if (!ret && copy_to_user(arg, &ifr,
5352                                                  sizeof(struct ifreq)))
5353                                 ret = -EFAULT;
5354                         return ret;
5355                 }
5356                 /* Take care of Wireless Extensions */
5357                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5358                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5359                 return -ENOTTY;
5360         }
5361 }
5362
5363
5364 /**
5365  *      dev_new_index   -       allocate an ifindex
5366  *      @net: the applicable net namespace
5367  *
5368  *      Returns a suitable unique value for a new device interface
5369  *      number.  The caller must hold the rtnl semaphore or the
5370  *      dev_base_lock to be sure it remains unique.
5371  */
5372 static int dev_new_index(struct net *net)
5373 {
5374         int ifindex = net->ifindex;
5375         for (;;) {
5376                 if (++ifindex <= 0)
5377                         ifindex = 1;
5378                 if (!__dev_get_by_index(net, ifindex))
5379                         return net->ifindex = ifindex;
5380         }
5381 }
5382
5383 /* Delayed registration/unregisteration */
5384 static LIST_HEAD(net_todo_list);
5385
5386 static void net_set_todo(struct net_device *dev)
5387 {
5388         list_add_tail(&dev->todo_list, &net_todo_list);
5389 }
5390
5391 static void rollback_registered_many(struct list_head *head)
5392 {
5393         struct net_device *dev, *tmp;
5394
5395         BUG_ON(dev_boot_phase);
5396         ASSERT_RTNL();
5397
5398         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5399                 /* Some devices call without registering
5400                  * for initialization unwind. Remove those
5401                  * devices and proceed with the remaining.
5402                  */
5403                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5404                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5405                                  dev->name, dev);
5406
5407                         WARN_ON(1);
5408                         list_del(&dev->unreg_list);
5409                         continue;
5410                 }
5411                 dev->dismantle = true;
5412                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5413         }
5414
5415         /* If device is running, close it first. */
5416         dev_close_many(head);
5417
5418         list_for_each_entry(dev, head, unreg_list) {
5419                 /* And unlink it from device chain. */
5420                 unlist_netdevice(dev);
5421
5422                 dev->reg_state = NETREG_UNREGISTERING;
5423         }
5424
5425         synchronize_net();
5426
5427         list_for_each_entry(dev, head, unreg_list) {
5428                 /* Shutdown queueing discipline. */
5429                 dev_shutdown(dev);
5430
5431
5432                 /* Notify protocols, that we are about to destroy
5433                    this device. They should clean all the things.
5434                 */
5435                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5436
5437                 if (!dev->rtnl_link_ops ||
5438                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5439                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5440
5441                 /*
5442                  *      Flush the unicast and multicast chains
5443                  */
5444                 dev_uc_flush(dev);
5445                 dev_mc_flush(dev);
5446
5447                 if (dev->netdev_ops->ndo_uninit)
5448                         dev->netdev_ops->ndo_uninit(dev);
5449
5450                 /* Notifier chain MUST detach us from master device. */
5451                 WARN_ON(dev->master);
5452
5453                 /* Remove entries from kobject tree */
5454                 netdev_unregister_kobject(dev);
5455         }
5456
5457         synchronize_net();
5458
5459         list_for_each_entry(dev, head, unreg_list)
5460                 dev_put(dev);
5461 }
5462
5463 static void rollback_registered(struct net_device *dev)
5464 {
5465         LIST_HEAD(single);
5466
5467         list_add(&dev->unreg_list, &single);
5468         rollback_registered_many(&single);
5469         list_del(&single);
5470 }
5471
5472 static netdev_features_t netdev_fix_features(struct net_device *dev,
5473         netdev_features_t features)
5474 {
5475         /* Fix illegal checksum combinations */
5476         if ((features & NETIF_F_HW_CSUM) &&
5477             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5478                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5479                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5480         }
5481
5482         /* Fix illegal SG+CSUM combinations. */
5483         if ((features & NETIF_F_SG) &&
5484             !(features & NETIF_F_ALL_CSUM)) {
5485                 netdev_dbg(dev,
5486                         "Dropping NETIF_F_SG since no checksum feature.\n");
5487                 features &= ~NETIF_F_SG;
5488         }
5489
5490         /* TSO requires that SG is present as well. */
5491         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5492                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5493                 features &= ~NETIF_F_ALL_TSO;
5494         }
5495
5496         /* TSO ECN requires that TSO is present as well. */
5497         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5498                 features &= ~NETIF_F_TSO_ECN;
5499
5500         /* Software GSO depends on SG. */
5501         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5502                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5503                 features &= ~NETIF_F_GSO;
5504         }
5505
5506         /* UFO needs SG and checksumming */
5507         if (features & NETIF_F_UFO) {
5508                 /* maybe split UFO into V4 and V6? */
5509                 if (!((features & NETIF_F_GEN_CSUM) ||
5510                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5511                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5512                         netdev_dbg(dev,
5513                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5514                         features &= ~NETIF_F_UFO;
5515                 }
5516
5517                 if (!(features & NETIF_F_SG)) {
5518                         netdev_dbg(dev,
5519                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5520                         features &= ~NETIF_F_UFO;
5521                 }
5522         }
5523
5524         return features;
5525 }
5526
5527 int __netdev_update_features(struct net_device *dev)
5528 {
5529         netdev_features_t features;
5530         int err = 0;
5531
5532         ASSERT_RTNL();
5533
5534         features = netdev_get_wanted_features(dev);
5535
5536         if (dev->netdev_ops->ndo_fix_features)
5537                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5538
5539         /* driver might be less strict about feature dependencies */
5540         features = netdev_fix_features(dev, features);
5541
5542         if (dev->features == features)
5543                 return 0;
5544
5545         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5546                 &dev->features, &features);
5547
5548         if (dev->netdev_ops->ndo_set_features)
5549                 err = dev->netdev_ops->ndo_set_features(dev, features);
5550
5551         if (unlikely(err < 0)) {
5552                 netdev_err(dev,
5553                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5554                         err, &features, &dev->features);
5555                 return -1;
5556         }
5557
5558         if (!err)
5559                 dev->features = features;
5560
5561         return 1;
5562 }
5563
5564 /**
5565  *      netdev_update_features - recalculate device features
5566  *      @dev: the device to check
5567  *
5568  *      Recalculate dev->features set and send notifications if it
5569  *      has changed. Should be called after driver or hardware dependent
5570  *      conditions might have changed that influence the features.
5571  */
5572 void netdev_update_features(struct net_device *dev)
5573 {
5574         if (__netdev_update_features(dev))
5575                 netdev_features_change(dev);
5576 }
5577 EXPORT_SYMBOL(netdev_update_features);
5578
5579 /**
5580  *      netdev_change_features - recalculate device features
5581  *      @dev: the device to check
5582  *
5583  *      Recalculate dev->features set and send notifications even
5584  *      if they have not changed. Should be called instead of
5585  *      netdev_update_features() if also dev->vlan_features might
5586  *      have changed to allow the changes to be propagated to stacked
5587  *      VLAN devices.
5588  */
5589 void netdev_change_features(struct net_device *dev)
5590 {
5591         __netdev_update_features(dev);
5592         netdev_features_change(dev);
5593 }
5594 EXPORT_SYMBOL(netdev_change_features);
5595
5596 /**
5597  *      netif_stacked_transfer_operstate -      transfer operstate
5598  *      @rootdev: the root or lower level device to transfer state from
5599  *      @dev: the device to transfer operstate to
5600  *
5601  *      Transfer operational state from root to device. This is normally
5602  *      called when a stacking relationship exists between the root
5603  *      device and the device(a leaf device).
5604  */
5605 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5606                                         struct net_device *dev)
5607 {
5608         if (rootdev->operstate == IF_OPER_DORMANT)
5609                 netif_dormant_on(dev);
5610         else
5611                 netif_dormant_off(dev);
5612
5613         if (netif_carrier_ok(rootdev)) {
5614                 if (!netif_carrier_ok(dev))
5615                         netif_carrier_on(dev);
5616         } else {
5617                 if (netif_carrier_ok(dev))
5618                         netif_carrier_off(dev);
5619         }
5620 }
5621 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5622
5623 #ifdef CONFIG_RPS
5624 static int netif_alloc_rx_queues(struct net_device *dev)
5625 {
5626         unsigned int i, count = dev->num_rx_queues;
5627         struct netdev_rx_queue *rx;
5628
5629         BUG_ON(count < 1);
5630
5631         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5632         if (!rx) {
5633                 pr_err("netdev: Unable to allocate %u rx queues\n", count);
5634                 return -ENOMEM;
5635         }
5636         dev->_rx = rx;
5637
5638         for (i = 0; i < count; i++)
5639                 rx[i].dev = dev;
5640         return 0;
5641 }
5642 #endif
5643
5644 static void netdev_init_one_queue(struct net_device *dev,
5645                                   struct netdev_queue *queue, void *_unused)
5646 {
5647         /* Initialize queue lock */
5648         spin_lock_init(&queue->_xmit_lock);
5649         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5650         queue->xmit_lock_owner = -1;
5651         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5652         queue->dev = dev;
5653 #ifdef CONFIG_BQL
5654         dql_init(&queue->dql, HZ);
5655 #endif
5656 }
5657
5658 static int netif_alloc_netdev_queues(struct net_device *dev)
5659 {
5660         unsigned int count = dev->num_tx_queues;
5661         struct netdev_queue *tx;
5662
5663         BUG_ON(count < 1);
5664
5665         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5666         if (!tx) {
5667                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5668                 return -ENOMEM;
5669         }
5670         dev->_tx = tx;
5671
5672         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5673         spin_lock_init(&dev->tx_global_lock);
5674
5675         return 0;
5676 }
5677
5678 /**
5679  *      register_netdevice      - register a network device
5680  *      @dev: device to register
5681  *
5682  *      Take a completed network device structure and add it to the kernel
5683  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5684  *      chain. 0 is returned on success. A negative errno code is returned
5685  *      on a failure to set up the device, or if the name is a duplicate.
5686  *
5687  *      Callers must hold the rtnl semaphore. You may want
5688  *      register_netdev() instead of this.
5689  *
5690  *      BUGS:
5691  *      The locking appears insufficient to guarantee two parallel registers
5692  *      will not get the same name.
5693  */
5694
5695 int register_netdevice(struct net_device *dev)
5696 {
5697         int ret;
5698         struct net *net = dev_net(dev);
5699
5700         BUG_ON(dev_boot_phase);
5701         ASSERT_RTNL();
5702
5703         might_sleep();
5704
5705         /* When net_device's are persistent, this will be fatal. */
5706         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5707         BUG_ON(!net);
5708
5709         spin_lock_init(&dev->addr_list_lock);
5710         netdev_set_addr_lockdep_class(dev);
5711
5712         dev->iflink = -1;
5713
5714         ret = dev_get_valid_name(net, dev, dev->name);
5715         if (ret < 0)
5716                 goto out;
5717
5718         /* Init, if this function is available */
5719         if (dev->netdev_ops->ndo_init) {
5720                 ret = dev->netdev_ops->ndo_init(dev);
5721                 if (ret) {
5722                         if (ret > 0)
5723                                 ret = -EIO;
5724                         goto out;
5725                 }
5726         }
5727
5728         ret = -EBUSY;
5729         if (!dev->ifindex)
5730                 dev->ifindex = dev_new_index(net);
5731         else if (__dev_get_by_index(net, dev->ifindex))
5732                 goto err_uninit;
5733
5734         if (dev->iflink == -1)
5735                 dev->iflink = dev->ifindex;
5736
5737         /* Transfer changeable features to wanted_features and enable
5738          * software offloads (GSO and GRO).
5739          */
5740         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5741         dev->features |= NETIF_F_SOFT_FEATURES;
5742         dev->wanted_features = dev->features & dev->hw_features;
5743
5744         /* Turn on no cache copy if HW is doing checksum */
5745         if (!(dev->flags & IFF_LOOPBACK)) {
5746                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5747                 if (dev->features & NETIF_F_ALL_CSUM) {
5748                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5749                         dev->features |= NETIF_F_NOCACHE_COPY;
5750                 }
5751         }
5752
5753         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5754          */
5755         dev->vlan_features |= NETIF_F_HIGHDMA;
5756
5757         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5758         ret = notifier_to_errno(ret);
5759         if (ret)
5760                 goto err_uninit;
5761
5762         ret = netdev_register_kobject(dev);
5763         if (ret)
5764                 goto err_uninit;
5765         dev->reg_state = NETREG_REGISTERED;
5766
5767         __netdev_update_features(dev);
5768
5769         /*
5770          *      Default initial state at registry is that the
5771          *      device is present.
5772          */
5773
5774         set_bit(__LINK_STATE_PRESENT, &dev->state);
5775
5776         linkwatch_init_dev(dev);
5777
5778         dev_init_scheduler(dev);
5779         dev_hold(dev);
5780         list_netdevice(dev);
5781         add_device_randomness(dev->dev_addr, dev->addr_len);
5782
5783         /* Notify protocols, that a new device appeared. */
5784         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5785         ret = notifier_to_errno(ret);
5786         if (ret) {
5787                 rollback_registered(dev);
5788                 dev->reg_state = NETREG_UNREGISTERED;
5789         }
5790         /*
5791          *      Prevent userspace races by waiting until the network
5792          *      device is fully setup before sending notifications.
5793          */
5794         if (!dev->rtnl_link_ops ||
5795             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5796                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5797
5798 out:
5799         return ret;
5800
5801 err_uninit:
5802         if (dev->netdev_ops->ndo_uninit)
5803                 dev->netdev_ops->ndo_uninit(dev);
5804         goto out;
5805 }
5806 EXPORT_SYMBOL(register_netdevice);
5807
5808 /**
5809  *      init_dummy_netdev       - init a dummy network device for NAPI
5810  *      @dev: device to init
5811  *
5812  *      This takes a network device structure and initialize the minimum
5813  *      amount of fields so it can be used to schedule NAPI polls without
5814  *      registering a full blown interface. This is to be used by drivers
5815  *      that need to tie several hardware interfaces to a single NAPI
5816  *      poll scheduler due to HW limitations.
5817  */
5818 int init_dummy_netdev(struct net_device *dev)
5819 {
5820         /* Clear everything. Note we don't initialize spinlocks
5821          * are they aren't supposed to be taken by any of the
5822          * NAPI code and this dummy netdev is supposed to be
5823          * only ever used for NAPI polls
5824          */
5825         memset(dev, 0, sizeof(struct net_device));
5826
5827         /* make sure we BUG if trying to hit standard
5828          * register/unregister code path
5829          */
5830         dev->reg_state = NETREG_DUMMY;
5831
5832         /* NAPI wants this */
5833         INIT_LIST_HEAD(&dev->napi_list);
5834
5835         /* a dummy interface is started by default */
5836         set_bit(__LINK_STATE_PRESENT, &dev->state);
5837         set_bit(__LINK_STATE_START, &dev->state);
5838
5839         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5840          * because users of this 'device' dont need to change
5841          * its refcount.
5842          */
5843
5844         return 0;
5845 }
5846 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5847
5848
5849 /**
5850  *      register_netdev - register a network device
5851  *      @dev: device to register
5852  *
5853  *      Take a completed network device structure and add it to the kernel
5854  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5855  *      chain. 0 is returned on success. A negative errno code is returned
5856  *      on a failure to set up the device, or if the name is a duplicate.
5857  *
5858  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5859  *      and expands the device name if you passed a format string to
5860  *      alloc_netdev.
5861  */
5862 int register_netdev(struct net_device *dev)
5863 {
5864         int err;
5865
5866         rtnl_lock();
5867         err = register_netdevice(dev);
5868         rtnl_unlock();
5869         return err;
5870 }
5871 EXPORT_SYMBOL(register_netdev);
5872
5873 int netdev_refcnt_read(const struct net_device *dev)
5874 {
5875         int i, refcnt = 0;
5876
5877         for_each_possible_cpu(i)
5878                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5879         return refcnt;
5880 }
5881 EXPORT_SYMBOL(netdev_refcnt_read);
5882
5883 /**
5884  * netdev_wait_allrefs - wait until all references are gone.
5885  * @dev: target net_device
5886  *
5887  * This is called when unregistering network devices.
5888  *
5889  * Any protocol or device that holds a reference should register
5890  * for netdevice notification, and cleanup and put back the
5891  * reference if they receive an UNREGISTER event.
5892  * We can get stuck here if buggy protocols don't correctly
5893  * call dev_put.
5894  */
5895 static void netdev_wait_allrefs(struct net_device *dev)
5896 {
5897         unsigned long rebroadcast_time, warning_time;
5898         int refcnt;
5899
5900         linkwatch_forget_dev(dev);
5901
5902         rebroadcast_time = warning_time = jiffies;
5903         refcnt = netdev_refcnt_read(dev);
5904
5905         while (refcnt != 0) {
5906                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5907                         rtnl_lock();
5908
5909                         /* Rebroadcast unregister notification */
5910                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5911
5912                         __rtnl_unlock();
5913                         rcu_barrier();
5914                         rtnl_lock();
5915
5916                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5917                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5918                                      &dev->state)) {
5919                                 /* We must not have linkwatch events
5920                                  * pending on unregister. If this
5921                                  * happens, we simply run the queue
5922                                  * unscheduled, resulting in a noop
5923                                  * for this device.
5924                                  */
5925                                 linkwatch_run_queue();
5926                         }
5927
5928                         __rtnl_unlock();
5929
5930                         rebroadcast_time = jiffies;
5931                 }
5932
5933                 msleep(250);
5934
5935                 refcnt = netdev_refcnt_read(dev);
5936
5937                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5938                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5939                                  dev->name, refcnt);
5940                         warning_time = jiffies;
5941                 }
5942         }
5943 }
5944
5945 /* The sequence is:
5946  *
5947  *      rtnl_lock();
5948  *      ...
5949  *      register_netdevice(x1);
5950  *      register_netdevice(x2);
5951  *      ...
5952  *      unregister_netdevice(y1);
5953  *      unregister_netdevice(y2);
5954  *      ...
5955  *      rtnl_unlock();
5956  *      free_netdev(y1);
5957  *      free_netdev(y2);
5958  *
5959  * We are invoked by rtnl_unlock().
5960  * This allows us to deal with problems:
5961  * 1) We can delete sysfs objects which invoke hotplug
5962  *    without deadlocking with linkwatch via keventd.
5963  * 2) Since we run with the RTNL semaphore not held, we can sleep
5964  *    safely in order to wait for the netdev refcnt to drop to zero.
5965  *
5966  * We must not return until all unregister events added during
5967  * the interval the lock was held have been completed.
5968  */
5969 void netdev_run_todo(void)
5970 {
5971         struct list_head list;
5972
5973         /* Snapshot list, allow later requests */
5974         list_replace_init(&net_todo_list, &list);
5975
5976         __rtnl_unlock();
5977
5978
5979         /* Wait for rcu callbacks to finish before next phase */
5980         if (!list_empty(&list))
5981                 rcu_barrier();
5982
5983         while (!list_empty(&list)) {
5984                 struct net_device *dev
5985                         = list_first_entry(&list, struct net_device, todo_list);
5986                 list_del(&dev->todo_list);
5987
5988                 rtnl_lock();
5989                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5990                 __rtnl_unlock();
5991
5992                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5993                         pr_err("network todo '%s' but state %d\n",
5994                                dev->name, dev->reg_state);
5995                         dump_stack();
5996                         continue;
5997                 }
5998
5999                 dev->reg_state = NETREG_UNREGISTERED;
6000
6001                 on_each_cpu(flush_backlog, dev, 1);
6002
6003                 netdev_wait_allrefs(dev);
6004
6005                 /* paranoia */
6006                 BUG_ON(netdev_refcnt_read(dev));
6007                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6008                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6009                 WARN_ON(dev->dn_ptr);
6010
6011                 if (dev->destructor)
6012                         dev->destructor(dev);
6013
6014                 /* Free network device */
6015                 kobject_put(&dev->dev.kobj);
6016         }
6017 }
6018
6019 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6020  * fields in the same order, with only the type differing.
6021  */
6022 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6023                              const struct net_device_stats *netdev_stats)
6024 {
6025 #if BITS_PER_LONG == 64
6026         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6027         memcpy(stats64, netdev_stats, sizeof(*stats64));
6028 #else
6029         size_t i, n = sizeof(*stats64) / sizeof(u64);
6030         const unsigned long *src = (const unsigned long *)netdev_stats;
6031         u64 *dst = (u64 *)stats64;
6032
6033         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6034                      sizeof(*stats64) / sizeof(u64));
6035         for (i = 0; i < n; i++)
6036                 dst[i] = src[i];
6037 #endif
6038 }
6039 EXPORT_SYMBOL(netdev_stats_to_stats64);
6040
6041 /**
6042  *      dev_get_stats   - get network device statistics
6043  *      @dev: device to get statistics from
6044  *      @storage: place to store stats
6045  *
6046  *      Get network statistics from device. Return @storage.
6047  *      The device driver may provide its own method by setting
6048  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6049  *      otherwise the internal statistics structure is used.
6050  */
6051 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6052                                         struct rtnl_link_stats64 *storage)
6053 {
6054         const struct net_device_ops *ops = dev->netdev_ops;
6055
6056         if (ops->ndo_get_stats64) {
6057                 memset(storage, 0, sizeof(*storage));
6058                 ops->ndo_get_stats64(dev, storage);
6059         } else if (ops->ndo_get_stats) {
6060                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6061         } else {
6062                 netdev_stats_to_stats64(storage, &dev->stats);
6063         }
6064         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6065         return storage;
6066 }
6067 EXPORT_SYMBOL(dev_get_stats);
6068
6069 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6070 {
6071         struct netdev_queue *queue = dev_ingress_queue(dev);
6072
6073 #ifdef CONFIG_NET_CLS_ACT
6074         if (queue)
6075                 return queue;
6076         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6077         if (!queue)
6078                 return NULL;
6079         netdev_init_one_queue(dev, queue, NULL);
6080         queue->qdisc = &noop_qdisc;
6081         queue->qdisc_sleeping = &noop_qdisc;
6082         rcu_assign_pointer(dev->ingress_queue, queue);
6083 #endif
6084         return queue;
6085 }
6086
6087 static const struct ethtool_ops default_ethtool_ops;
6088
6089 /**
6090  *      alloc_netdev_mqs - allocate network device
6091  *      @sizeof_priv:   size of private data to allocate space for
6092  *      @name:          device name format string
6093  *      @setup:         callback to initialize device
6094  *      @txqs:          the number of TX subqueues to allocate
6095  *      @rxqs:          the number of RX subqueues to allocate
6096  *
6097  *      Allocates a struct net_device with private data area for driver use
6098  *      and performs basic initialization.  Also allocates subquue structs
6099  *      for each queue on the device.
6100  */
6101 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6102                 void (*setup)(struct net_device *),
6103                 unsigned int txqs, unsigned int rxqs)
6104 {
6105         struct net_device *dev;
6106         size_t alloc_size;
6107         struct net_device *p;
6108
6109         BUG_ON(strlen(name) >= sizeof(dev->name));
6110
6111         if (txqs < 1) {
6112                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6113                 return NULL;
6114         }
6115
6116 #ifdef CONFIG_RPS
6117         if (rxqs < 1) {
6118                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6119                 return NULL;
6120         }
6121 #endif
6122
6123         alloc_size = sizeof(struct net_device);
6124         if (sizeof_priv) {
6125                 /* ensure 32-byte alignment of private area */
6126                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6127                 alloc_size += sizeof_priv;
6128         }
6129         /* ensure 32-byte alignment of whole construct */
6130         alloc_size += NETDEV_ALIGN - 1;
6131
6132         p = kzalloc(alloc_size, GFP_KERNEL);
6133         if (!p) {
6134                 pr_err("alloc_netdev: Unable to allocate device\n");
6135                 return NULL;
6136         }
6137
6138         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6139         dev->padded = (char *)dev - (char *)p;
6140
6141         dev->pcpu_refcnt = alloc_percpu(int);
6142         if (!dev->pcpu_refcnt)
6143                 goto free_p;
6144
6145         if (dev_addr_init(dev))
6146                 goto free_pcpu;
6147
6148         dev_mc_init(dev);
6149         dev_uc_init(dev);
6150
6151         dev_net_set(dev, &init_net);
6152
6153         dev->gso_max_size = GSO_MAX_SIZE;
6154         dev->gso_max_segs = GSO_MAX_SEGS;
6155
6156         INIT_LIST_HEAD(&dev->napi_list);
6157         INIT_LIST_HEAD(&dev->unreg_list);
6158         INIT_LIST_HEAD(&dev->link_watch_list);
6159         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6160         setup(dev);
6161
6162         dev->num_tx_queues = txqs;
6163         dev->real_num_tx_queues = txqs;
6164         if (netif_alloc_netdev_queues(dev))
6165                 goto free_all;
6166
6167 #ifdef CONFIG_RPS
6168         dev->num_rx_queues = rxqs;
6169         dev->real_num_rx_queues = rxqs;
6170         if (netif_alloc_rx_queues(dev))
6171                 goto free_all;
6172 #endif
6173
6174         strcpy(dev->name, name);
6175         dev->group = INIT_NETDEV_GROUP;
6176         if (!dev->ethtool_ops)
6177                 dev->ethtool_ops = &default_ethtool_ops;
6178         return dev;
6179
6180 free_all:
6181         free_netdev(dev);
6182         return NULL;
6183
6184 free_pcpu:
6185         free_percpu(dev->pcpu_refcnt);
6186         kfree(dev->_tx);
6187 #ifdef CONFIG_RPS
6188         kfree(dev->_rx);
6189 #endif
6190
6191 free_p:
6192         kfree(p);
6193         return NULL;
6194 }
6195 EXPORT_SYMBOL(alloc_netdev_mqs);
6196
6197 /**
6198  *      free_netdev - free network device
6199  *      @dev: device
6200  *
6201  *      This function does the last stage of destroying an allocated device
6202  *      interface. The reference to the device object is released.
6203  *      If this is the last reference then it will be freed.
6204  */
6205 void free_netdev(struct net_device *dev)
6206 {
6207         struct napi_struct *p, *n;
6208
6209         release_net(dev_net(dev));
6210
6211         kfree(dev->_tx);
6212 #ifdef CONFIG_RPS
6213         kfree(dev->_rx);
6214 #endif
6215
6216         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6217
6218         /* Flush device addresses */
6219         dev_addr_flush(dev);
6220
6221         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6222                 netif_napi_del(p);
6223
6224         free_percpu(dev->pcpu_refcnt);
6225         dev->pcpu_refcnt = NULL;
6226
6227         /*  Compatibility with error handling in drivers */
6228         if (dev->reg_state == NETREG_UNINITIALIZED) {
6229                 kfree((char *)dev - dev->padded);
6230                 return;
6231         }
6232
6233         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6234         dev->reg_state = NETREG_RELEASED;
6235
6236         /* will free via device release */
6237         put_device(&dev->dev);
6238 }
6239 EXPORT_SYMBOL(free_netdev);
6240
6241 /**
6242  *      synchronize_net -  Synchronize with packet receive processing
6243  *
6244  *      Wait for packets currently being received to be done.
6245  *      Does not block later packets from starting.
6246  */
6247 void synchronize_net(void)
6248 {
6249         might_sleep();
6250         if (rtnl_is_locked())
6251                 synchronize_rcu_expedited();
6252         else
6253                 synchronize_rcu();
6254 }
6255 EXPORT_SYMBOL(synchronize_net);
6256
6257 /**
6258  *      unregister_netdevice_queue - remove device from the kernel
6259  *      @dev: device
6260  *      @head: list
6261  *
6262  *      This function shuts down a device interface and removes it
6263  *      from the kernel tables.
6264  *      If head not NULL, device is queued to be unregistered later.
6265  *
6266  *      Callers must hold the rtnl semaphore.  You may want
6267  *      unregister_netdev() instead of this.
6268  */
6269
6270 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6271 {
6272         ASSERT_RTNL();
6273
6274         if (head) {
6275                 list_move_tail(&dev->unreg_list, head);
6276         } else {
6277                 rollback_registered(dev);
6278                 /* Finish processing unregister after unlock */
6279                 net_set_todo(dev);
6280         }
6281 }
6282 EXPORT_SYMBOL(unregister_netdevice_queue);
6283
6284 /**
6285  *      unregister_netdevice_many - unregister many devices
6286  *      @head: list of devices
6287  */
6288 void unregister_netdevice_many(struct list_head *head)
6289 {
6290         struct net_device *dev;
6291
6292         if (!list_empty(head)) {
6293                 rollback_registered_many(head);
6294                 list_for_each_entry(dev, head, unreg_list)
6295                         net_set_todo(dev);
6296         }
6297 }
6298 EXPORT_SYMBOL(unregister_netdevice_many);
6299
6300 /**
6301  *      unregister_netdev - remove device from the kernel
6302  *      @dev: device
6303  *
6304  *      This function shuts down a device interface and removes it
6305  *      from the kernel tables.
6306  *
6307  *      This is just a wrapper for unregister_netdevice that takes
6308  *      the rtnl semaphore.  In general you want to use this and not
6309  *      unregister_netdevice.
6310  */
6311 void unregister_netdev(struct net_device *dev)
6312 {
6313         rtnl_lock();
6314         unregister_netdevice(dev);
6315         rtnl_unlock();
6316 }
6317 EXPORT_SYMBOL(unregister_netdev);
6318
6319 /**
6320  *      dev_change_net_namespace - move device to different nethost namespace
6321  *      @dev: device
6322  *      @net: network namespace
6323  *      @pat: If not NULL name pattern to try if the current device name
6324  *            is already taken in the destination network namespace.
6325  *
6326  *      This function shuts down a device interface and moves it
6327  *      to a new network namespace. On success 0 is returned, on
6328  *      a failure a netagive errno code is returned.
6329  *
6330  *      Callers must hold the rtnl semaphore.
6331  */
6332
6333 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6334 {
6335         int err;
6336
6337         ASSERT_RTNL();
6338
6339         /* Don't allow namespace local devices to be moved. */
6340         err = -EINVAL;
6341         if (dev->features & NETIF_F_NETNS_LOCAL)
6342                 goto out;
6343
6344         /* Ensure the device has been registrered */
6345         if (dev->reg_state != NETREG_REGISTERED)
6346                 goto out;
6347
6348         /* Get out if there is nothing todo */
6349         err = 0;
6350         if (net_eq(dev_net(dev), net))
6351                 goto out;
6352
6353         /* Pick the destination device name, and ensure
6354          * we can use it in the destination network namespace.
6355          */
6356         err = -EEXIST;
6357         if (__dev_get_by_name(net, dev->name)) {
6358                 /* We get here if we can't use the current device name */
6359                 if (!pat)
6360                         goto out;
6361                 if (dev_get_valid_name(net, dev, pat) < 0)
6362                         goto out;
6363         }
6364
6365         /*
6366          * And now a mini version of register_netdevice unregister_netdevice.
6367          */
6368
6369         /* If device is running close it first. */
6370         dev_close(dev);
6371
6372         /* And unlink it from device chain */
6373         err = -ENODEV;
6374         unlist_netdevice(dev);
6375
6376         synchronize_net();
6377
6378         /* Shutdown queueing discipline. */
6379         dev_shutdown(dev);
6380
6381         /* Notify protocols, that we are about to destroy
6382            this device. They should clean all the things.
6383
6384            Note that dev->reg_state stays at NETREG_REGISTERED.
6385            This is wanted because this way 8021q and macvlan know
6386            the device is just moving and can keep their slaves up.
6387         */
6388         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6389         rcu_barrier();
6390         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6391         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6392
6393         /*
6394          *      Flush the unicast and multicast chains
6395          */
6396         dev_uc_flush(dev);
6397         dev_mc_flush(dev);
6398
6399         /* Actually switch the network namespace */
6400         dev_net_set(dev, net);
6401
6402         /* If there is an ifindex conflict assign a new one */
6403         if (__dev_get_by_index(net, dev->ifindex)) {
6404                 int iflink = (dev->iflink == dev->ifindex);
6405                 dev->ifindex = dev_new_index(net);
6406                 if (iflink)
6407                         dev->iflink = dev->ifindex;
6408         }
6409
6410         /* Fixup kobjects */
6411         err = device_rename(&dev->dev, dev->name);
6412         WARN_ON(err);
6413
6414         /* Add the device back in the hashes */
6415         list_netdevice(dev);
6416
6417         /* Notify protocols, that a new device appeared. */
6418         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6419
6420         /*
6421          *      Prevent userspace races by waiting until the network
6422          *      device is fully setup before sending notifications.
6423          */
6424         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6425
6426         synchronize_net();
6427         err = 0;
6428 out:
6429         return err;
6430 }
6431 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6432
6433 static int dev_cpu_callback(struct notifier_block *nfb,
6434                             unsigned long action,
6435                             void *ocpu)
6436 {
6437         struct sk_buff **list_skb;
6438         struct sk_buff *skb;
6439         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6440         struct softnet_data *sd, *oldsd;
6441
6442         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6443                 return NOTIFY_OK;
6444
6445         local_irq_disable();
6446         cpu = smp_processor_id();
6447         sd = &per_cpu(softnet_data, cpu);
6448         oldsd = &per_cpu(softnet_data, oldcpu);
6449
6450         /* Find end of our completion_queue. */
6451         list_skb = &sd->completion_queue;
6452         while (*list_skb)
6453                 list_skb = &(*list_skb)->next;
6454         /* Append completion queue from offline CPU. */
6455         *list_skb = oldsd->completion_queue;
6456         oldsd->completion_queue = NULL;
6457
6458         /* Append output queue from offline CPU. */
6459         if (oldsd->output_queue) {
6460                 *sd->output_queue_tailp = oldsd->output_queue;
6461                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6462                 oldsd->output_queue = NULL;
6463                 oldsd->output_queue_tailp = &oldsd->output_queue;
6464         }
6465         /* Append NAPI poll list from offline CPU. */
6466         if (!list_empty(&oldsd->poll_list)) {
6467                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6468                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6469         }
6470
6471         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6472         local_irq_enable();
6473
6474         /* Process offline CPU's input_pkt_queue */
6475         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6476                 netif_rx(skb);
6477                 input_queue_head_incr(oldsd);
6478         }
6479         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6480                 netif_rx(skb);
6481                 input_queue_head_incr(oldsd);
6482         }
6483
6484         return NOTIFY_OK;
6485 }
6486
6487
6488 /**
6489  *      netdev_increment_features - increment feature set by one
6490  *      @all: current feature set
6491  *      @one: new feature set
6492  *      @mask: mask feature set
6493  *
6494  *      Computes a new feature set after adding a device with feature set
6495  *      @one to the master device with current feature set @all.  Will not
6496  *      enable anything that is off in @mask. Returns the new feature set.
6497  */
6498 netdev_features_t netdev_increment_features(netdev_features_t all,
6499         netdev_features_t one, netdev_features_t mask)
6500 {
6501         if (mask & NETIF_F_GEN_CSUM)
6502                 mask |= NETIF_F_ALL_CSUM;
6503         mask |= NETIF_F_VLAN_CHALLENGED;
6504
6505         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6506         all &= one | ~NETIF_F_ALL_FOR_ALL;
6507
6508         /* If one device supports hw checksumming, set for all. */
6509         if (all & NETIF_F_GEN_CSUM)
6510                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6511
6512         return all;
6513 }
6514 EXPORT_SYMBOL(netdev_increment_features);
6515
6516 static struct hlist_head *netdev_create_hash(void)
6517 {
6518         int i;
6519         struct hlist_head *hash;
6520
6521         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6522         if (hash != NULL)
6523                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6524                         INIT_HLIST_HEAD(&hash[i]);
6525
6526         return hash;
6527 }
6528
6529 /* Initialize per network namespace state */
6530 static int __net_init netdev_init(struct net *net)
6531 {
6532         if (net != &init_net)
6533                 INIT_LIST_HEAD(&net->dev_base_head);
6534
6535         net->dev_name_head = netdev_create_hash();
6536         if (net->dev_name_head == NULL)
6537                 goto err_name;
6538
6539         net->dev_index_head = netdev_create_hash();
6540         if (net->dev_index_head == NULL)
6541                 goto err_idx;
6542
6543         return 0;
6544
6545 err_idx:
6546         kfree(net->dev_name_head);
6547 err_name:
6548         return -ENOMEM;
6549 }
6550
6551 /**
6552  *      netdev_drivername - network driver for the device
6553  *      @dev: network device
6554  *
6555  *      Determine network driver for device.
6556  */
6557 const char *netdev_drivername(const struct net_device *dev)
6558 {
6559         const struct device_driver *driver;
6560         const struct device *parent;
6561         const char *empty = "";
6562
6563         parent = dev->dev.parent;
6564         if (!parent)
6565                 return empty;
6566
6567         driver = parent->driver;
6568         if (driver && driver->name)
6569                 return driver->name;
6570         return empty;
6571 }
6572
6573 static int __netdev_printk(const char *level, const struct net_device *dev,
6574                            struct va_format *vaf)
6575 {
6576         int r;
6577
6578         if (dev && dev->dev.parent) {
6579                 r = dev_printk_emit(level[1] - '0',
6580                                     dev->dev.parent,
6581                                     "%s %s %s: %pV",
6582                                     dev_driver_string(dev->dev.parent),
6583                                     dev_name(dev->dev.parent),
6584                                     netdev_name(dev), vaf);
6585         } else if (dev) {
6586                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6587         } else {
6588                 r = printk("%s(NULL net_device): %pV", level, vaf);
6589         }
6590
6591         return r;
6592 }
6593
6594 int netdev_printk(const char *level, const struct net_device *dev,
6595                   const char *format, ...)
6596 {
6597         struct va_format vaf;
6598         va_list args;
6599         int r;
6600
6601         va_start(args, format);
6602
6603         vaf.fmt = format;
6604         vaf.va = &args;
6605
6606         r = __netdev_printk(level, dev, &vaf);
6607
6608         va_end(args);
6609
6610         return r;
6611 }
6612 EXPORT_SYMBOL(netdev_printk);
6613
6614 #define define_netdev_printk_level(func, level)                 \
6615 int func(const struct net_device *dev, const char *fmt, ...)    \
6616 {                                                               \
6617         int r;                                                  \
6618         struct va_format vaf;                                   \
6619         va_list args;                                           \
6620                                                                 \
6621         va_start(args, fmt);                                    \
6622                                                                 \
6623         vaf.fmt = fmt;                                          \
6624         vaf.va = &args;                                         \
6625                                                                 \
6626         r = __netdev_printk(level, dev, &vaf);                  \
6627                                                                 \
6628         va_end(args);                                           \
6629                                                                 \
6630         return r;                                               \
6631 }                                                               \
6632 EXPORT_SYMBOL(func);
6633
6634 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6635 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6636 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6637 define_netdev_printk_level(netdev_err, KERN_ERR);
6638 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6639 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6640 define_netdev_printk_level(netdev_info, KERN_INFO);
6641
6642 static void __net_exit netdev_exit(struct net *net)
6643 {
6644         kfree(net->dev_name_head);
6645         kfree(net->dev_index_head);
6646 }
6647
6648 static struct pernet_operations __net_initdata netdev_net_ops = {
6649         .init = netdev_init,
6650         .exit = netdev_exit,
6651 };
6652
6653 static void __net_exit default_device_exit(struct net *net)
6654 {
6655         struct net_device *dev, *aux;
6656         /*
6657          * Push all migratable network devices back to the
6658          * initial network namespace
6659          */
6660         rtnl_lock();
6661         for_each_netdev_safe(net, dev, aux) {
6662                 int err;
6663                 char fb_name[IFNAMSIZ];
6664
6665                 /* Ignore unmoveable devices (i.e. loopback) */
6666                 if (dev->features & NETIF_F_NETNS_LOCAL)
6667                         continue;
6668
6669                 /* Leave virtual devices for the generic cleanup */
6670                 if (dev->rtnl_link_ops)
6671                         continue;
6672
6673                 /* Push remaining network devices to init_net */
6674                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6675                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6676                 if (err) {
6677                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6678                                  __func__, dev->name, err);
6679                         BUG();
6680                 }
6681         }
6682         rtnl_unlock();
6683 }
6684
6685 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6686 {
6687         /* At exit all network devices most be removed from a network
6688          * namespace.  Do this in the reverse order of registration.
6689          * Do this across as many network namespaces as possible to
6690          * improve batching efficiency.
6691          */
6692         struct net_device *dev;
6693         struct net *net;
6694         LIST_HEAD(dev_kill_list);
6695
6696         rtnl_lock();
6697         list_for_each_entry(net, net_list, exit_list) {
6698                 for_each_netdev_reverse(net, dev) {
6699                         if (dev->rtnl_link_ops)
6700                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6701                         else
6702                                 unregister_netdevice_queue(dev, &dev_kill_list);
6703                 }
6704         }
6705         unregister_netdevice_many(&dev_kill_list);
6706         list_del(&dev_kill_list);
6707         rtnl_unlock();
6708 }
6709
6710 static struct pernet_operations __net_initdata default_device_ops = {
6711         .exit = default_device_exit,
6712         .exit_batch = default_device_exit_batch,
6713 };
6714
6715 /*
6716  *      Initialize the DEV module. At boot time this walks the device list and
6717  *      unhooks any devices that fail to initialise (normally hardware not
6718  *      present) and leaves us with a valid list of present and active devices.
6719  *
6720  */
6721
6722 /*
6723  *       This is called single threaded during boot, so no need
6724  *       to take the rtnl semaphore.
6725  */
6726 static int __init net_dev_init(void)
6727 {
6728         int i, rc = -ENOMEM;
6729
6730         BUG_ON(!dev_boot_phase);
6731
6732         if (dev_proc_init())
6733                 goto out;
6734
6735         if (netdev_kobject_init())
6736                 goto out;
6737
6738         INIT_LIST_HEAD(&ptype_all);
6739         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6740                 INIT_LIST_HEAD(&ptype_base[i]);
6741
6742         INIT_LIST_HEAD(&offload_base);
6743
6744         if (register_pernet_subsys(&netdev_net_ops))
6745                 goto out;
6746
6747         /*
6748          *      Initialise the packet receive queues.
6749          */
6750
6751         for_each_possible_cpu(i) {
6752                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6753
6754                 memset(sd, 0, sizeof(*sd));
6755                 skb_queue_head_init(&sd->input_pkt_queue);
6756                 skb_queue_head_init(&sd->process_queue);
6757                 sd->completion_queue = NULL;
6758                 INIT_LIST_HEAD(&sd->poll_list);
6759                 sd->output_queue = NULL;
6760                 sd->output_queue_tailp = &sd->output_queue;
6761 #ifdef CONFIG_RPS
6762                 sd->csd.func = rps_trigger_softirq;
6763                 sd->csd.info = sd;
6764                 sd->csd.flags = 0;
6765                 sd->cpu = i;
6766 #endif
6767
6768                 sd->backlog.poll = process_backlog;
6769                 sd->backlog.weight = weight_p;
6770                 sd->backlog.gro_list = NULL;
6771                 sd->backlog.gro_count = 0;
6772         }
6773
6774         dev_boot_phase = 0;
6775
6776         /* The loopback device is special if any other network devices
6777          * is present in a network namespace the loopback device must
6778          * be present. Since we now dynamically allocate and free the
6779          * loopback device ensure this invariant is maintained by
6780          * keeping the loopback device as the first device on the
6781          * list of network devices.  Ensuring the loopback devices
6782          * is the first device that appears and the last network device
6783          * that disappears.
6784          */
6785         if (register_pernet_device(&loopback_net_ops))
6786                 goto out;
6787
6788         if (register_pernet_device(&default_device_ops))
6789                 goto out;
6790
6791         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6792         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6793
6794         hotcpu_notifier(dev_cpu_callback, 0);
6795         dst_init();
6796         dev_mcast_init();
6797         rc = 0;
6798 out:
6799         return rc;
6800 }
6801
6802 subsys_initcall(net_dev_init);
6803
6804 static int __init initialize_hashrnd(void)
6805 {
6806         get_random_bytes(&hashrnd, sizeof(hashrnd));
6807         return 0;
6808 }
6809
6810 late_initcall_sync(initialize_hashrnd);
6811