Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <net/xfrm.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/net_tstamp.h>
 136 #include <linux/static_key.h>
 137 #include <net/flow_keys.h>
 138
 139 #include "net-sysfs.h"
 140
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/*
 * MAX_HEADER plus 128 bytes.
 * This should be increased if a protocol with a bigger head is added.
 */
#define GRO_MAX_HEAD (MAX_HEADER + 128)
 146
 147 /*
 148  *      The list of packet types we will receive (as opposed to discard)
 149  *      and the routines to invoke.
 150  *
 151  *      Why 16. Because with 16 the only overlap we get on a hash of the
 152  *      low nibble of the protocol value is RARP/SNAP/X.25.
 153  *
 154  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 155  *             sure which should go first, but I bet it won't make much
 156  *             difference if we are running VLANs.  The good news is that
 157  *             this protocol won't be in the list unless compiled in, so
 158  *             the average user (w/out VLANs) will not be adversely affected.
 159  *             --BLG
 160  *
 161  *              0800    IP
 162  *              8100    802.1Q VLAN
 163  *              0001    802.3
 164  *              0002    AX.25
 165  *              0004    802.2
 166  *              8035    RARP
 167  *              0005    SNAP
 168  *              0805    X.25
 169  *              0806    ARP
 170  *              8137    IPX
 171  *              0009    Localtalk
 172  *              86DD    IPv6
 173  */
 174
/* Bucket count of ptype_base[] and the mask ptype_head() applies to the protocol id. */
#define PTYPE_HASH_SIZE (16)
#define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)

/* Taken by dev_add_pack()/__dev_remove_pack() around updates of the ptype lists. */
static DEFINE_SPINLOCK(ptype_lock);
/* Taken by dev_add_offload() while linking a handler into offload_base. */
static DEFINE_SPINLOCK(offload_lock);
/* Handlers for a specific protocol, bucketed by ptype_head(). */
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;        /* Taps: ETH_P_ALL handlers */
/* Registered packet_offload handlers. */
static struct list_head offload_base __read_mostly;
 183
 184 /*
 185  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 186  * semaphore.
 187  *
 188  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 189  *
 190  * Writers must hold the rtnl semaphore while they loop through the
 191  * dev_base_head list, and hold dev_base_lock for writing when they do the
 192  * actual updates.  This allows pure readers to access the list even
 193  * while a writer is preparing to update it.
 194  *
 195  * To put it another way, dev_base_lock is held for writing only to
 196  * protect against pure readers; the rtnl semaphore provides the
 197  * protection against other writers.
 198  *
 199  * See, for example usages, register_netdevice() and
 200  * unregister_netdevice(), which must be called with the rtnl
 201  * semaphore held.
 202  */
/* Write-locked (with BHs disabled) around the device list/hash updates in this file. */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);
 205
 206 static inline void dev_base_seq_inc(struct net *net)
 207 {
 208         while (++net->dev_base_seq == 0);
 209 }
 210
 211 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 212 {
 213         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 214
 215         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 216 }
 217
 218 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 219 {
 220         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 221 }
 222
 223 static inline void rps_lock(struct softnet_data *sd)
 224 {
 225 #ifdef CONFIG_RPS
 226         spin_lock(&sd->input_pkt_queue.lock);
 227 #endif
 228 }
 229
 230 static inline void rps_unlock(struct softnet_data *sd)
 231 {
 232 #ifdef CONFIG_RPS
 233         spin_unlock(&sd->input_pkt_queue.lock);
 234 #endif
 235 }
 236
 237 /* Device list insertion */
 238 static int list_netdevice(struct net_device *dev)
 239 {
 240         struct net *net = dev_net(dev);
 241
 242         ASSERT_RTNL();
 243
 244         write_lock_bh(&dev_base_lock);
 245         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 246         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 247         hlist_add_head_rcu(&dev->index_hlist,
 248                            dev_index_hash(net, dev->ifindex));
 249         write_unlock_bh(&dev_base_lock);
 250
 251         dev_base_seq_inc(net);
 252
 253         return 0;
 254 }
 255
 256 /* Device list removal
 257  * caller must respect a RCU grace period before freeing/reusing dev
 258  */
 259 static void unlist_netdevice(struct net_device *dev)
 260 {
 261         ASSERT_RTNL();
 262
 263         /* Unlink dev from the device chain */
 264         write_lock_bh(&dev_base_lock);
 265         list_del_rcu(&dev->dev_list);
 266         hlist_del_rcu(&dev->name_hlist);
 267         hlist_del_rcu(&dev->index_hlist);
 268         write_unlock_bh(&dev_base_lock);
 269
 270         dev_base_seq_inc(dev_net(dev));
 271 }
 272
/*
 *      Our notifier list.
 *
 *      Declared as a raw notifier head.
 *      NOTE(review): presumably serialised by the RTNL semaphore - confirm
 *      against the register/call sites, which are outside this excerpt.
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *      Per-CPU softnet_data.
 *
 *      Device drivers call our routines to queue packets here. We empty the
 *      queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
 286
 287 #ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 *
 * netdev_lock_type[] and netdev_lock_name[] are parallel arrays: entry i
 * of the name table labels entry i of the type table, and the two key
 * arrays below are sized to match.  The final entry (ARPHRD_NONE) is also
 * the fallback netdev_lock_pos() returns for a type that is not listed.
 */
static const unsigned short netdev_lock_type[] =
        {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
         ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
         ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
         ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
         ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
         ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
         ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
         ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
         ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
         ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
         ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
         ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
         ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
         ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
         ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

/* Lockdep class names, index-aligned with netdev_lock_type[]. */
static const char *const netdev_lock_name[] =
        {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
         "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
         "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
         "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
         "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
         "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
         "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
         "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
         "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
         "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
         "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
         "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
         "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
         "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
         "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

/* One lockdep key per listed device type, for the xmit lock and the address-list lock. */
static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 328
 329 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 330 {
 331         int i;
 332
 333         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 334                 if (netdev_lock_type[i] == dev_type)
 335                         return i;
 336         /* the last key is used by default */
 337         return ARRAY_SIZE(netdev_lock_type) - 1;
 338 }
 339
 340 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 341                                                  unsigned short dev_type)
 342 {
 343         int i;
 344
 345         i = netdev_lock_pos(dev_type);
 346         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 347                                    netdev_lock_name[i]);
 348 }
 349
 350 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 351 {
 352         int i;
 353
 354         i = netdev_lock_pos(dev->type);
 355         lockdep_set_class_and_name(&dev->addr_list_lock,
 356                                    &netdev_addr_lock_key[i],
 357                                    netdev_lock_name[i]);
 358 }
 359 #else
/* No-op variant used when CONFIG_LOCKDEP is not set. */
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
                                                 unsigned short dev_type)
{
}
/* No-op variant used when CONFIG_LOCKDEP is not set. */
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
 367 #endif
 368
 369 /*******************************************************************************
 370
 371                 Protocol management and registration routines
 372
 373 *******************************************************************************/
 374
 375 /*
 376  *      Add a protocol ID to the list. Now that the input handler is
 377  *      smarter we can dispense with all the messy stuff that used to be
 378  *      here.
 379  *
 380  *      BEWARE!!! Protocol handlers, mangling input packets,
 381  *      MUST BE last in hash buckets and checking protocol handlers
 382  *      MUST start from promiscuous ptype_all chain in net_bh.
 383  *      It is true now, do not change it.
 384  *      Explanation follows: if protocol handler, mangling packet, will
 385  *      be the first on list, it is not able to sense, that packet
 386  *      is cloned and should be copied-on-write, so that it will
 387  *      change it and subsequent readers will get broken packet.
 388  *                                                      --ANK (980803)
 389  */
 390
 391 static inline struct list_head *ptype_head(const struct packet_type *pt)
 392 {
 393         if (pt->type == htons(ETH_P_ALL))
 394                 return &ptype_all;
 395         else
 396                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 397 }
 398
 399 /**
 400  *      dev_add_pack - add packet handler
 401  *      @pt: packet type declaration
 402  *
 403  *      Add a protocol handler to the networking stack. The passed &packet_type
 404  *      is linked into kernel lists and may not be freed until it has been
 405  *      removed from the kernel lists.
 406  *
 407  *      This call does not sleep therefore it can not
 408  *      guarantee all CPU's that are in middle of receiving packets
 409  *      will see the new packet type (until the next received packet).
 410  */
 411
 412 void dev_add_pack(struct packet_type *pt)
 413 {
 414         struct list_head *head = ptype_head(pt);
 415
 416         spin_lock(&ptype_lock);
 417         list_add_rcu(&pt->list, head);
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(dev_add_pack);
 421
 422 /**
 423  *      __dev_remove_pack        - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      The packet type might still be in use by receivers
 432  *      and must not be freed until after all the CPU's have gone
 433  *      through a quiescent state.
 434  */
 435 void __dev_remove_pack(struct packet_type *pt)
 436 {
 437         struct list_head *head = ptype_head(pt);
 438         struct packet_type *pt1;
 439
 440         spin_lock(&ptype_lock);
 441
 442         list_for_each_entry(pt1, head, list) {
 443                 if (pt == pt1) {
 444                         list_del_rcu(&pt->list);
 445                         goto out;
 446                 }
 447         }
 448
 449         pr_warn("dev_remove_pack: %p not found\n", pt);
 450 out:
 451         spin_unlock(&ptype_lock);
 452 }
 453 EXPORT_SYMBOL(__dev_remove_pack);
 454
 455 /**
 456  *      dev_remove_pack  - remove packet handler
 457  *      @pt: packet type declaration
 458  *
 459  *      Remove a protocol handler that was previously added to the kernel
 460  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 461  *      from the kernel lists and can be freed or reused once this function
 462  *      returns.
 463  *
 464  *      This call sleeps to guarantee that no CPU is looking at the packet
 465  *      type after return.
 466  */
 467 void dev_remove_pack(struct packet_type *pt)
 468 {
 469         __dev_remove_pack(pt);
 470
 471         synchronize_net();
 472 }
 473 EXPORT_SYMBOL(dev_remove_pack);
 474
 475
 476 /**
 477  *      dev_add_offload - register offload handlers
 478  *      @po: protocol offload declaration
 479  *
 480  *      Add protocol offload handlers to the networking stack. The passed
 481  *      &proto_offload is linked into kernel lists and may not be freed until
 482  *      it has been removed from the kernel lists.
 483  *
 484  *      This call does not sleep therefore it can not
 485  *      guarantee all CPU's that are in middle of receiving packets
 486  *      will see the new offload handlers (until the next received packet).
 487  */
 488 void dev_add_offload(struct packet_offload *po)
 489 {
 490         struct list_head *head = &offload_base;
 491
 492         spin_lock(&offload_lock);
 493         list_add_rcu(&po->list, head);
 494         spin_unlock(&offload_lock);
 495 }
 496 EXPORT_SYMBOL(dev_add_offload);
 497
 498 /**
 499  *      __dev_remove_offload     - remove offload handler
 500  *      @po: packet offload declaration
 501  *
 502  *      Remove a protocol offload handler that was previously added to the
 503  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 504  *      is removed from the kernel lists and can be freed or reused once this
 505  *      function returns.
 506  *
 507  *      The packet type might still be in use by receivers
 508  *      and must not be freed until after all the CPU's have gone
 509  *      through a quiescent state.
 510  */
 511 void __dev_remove_offload(struct packet_offload *po)
 512 {
 513         struct list_head *head = &offload_base;
 514         struct packet_offload *po1;
 515
 516         spin_lock(&ptype_lock);
 517
 518         list_for_each_entry(po1, head, list) {
 519                 if (po == po1) {
 520                         list_del_rcu(&po->list);
 521                         goto out;
 522                 }
 523         }
 524
 525         pr_warn("dev_remove_offload: %p not found\n", po);
 526 out:
 527         spin_unlock(&ptype_lock);
 528 }
 529 EXPORT_SYMBOL(__dev_remove_offload);
 530
 531 /**
 532  *      dev_remove_offload       - remove packet offload handler
 533  *      @po: packet offload declaration
 534  *
 535  *      Remove a packet offload handler that was previously added to the kernel
 536  *      offload handlers by dev_add_offload(). The passed &offload_type is
 537  *      removed from the kernel lists and can be freed or reused once this
 538  *      function returns.
 539  *
 540  *      This call sleeps to guarantee that no CPU is looking at the packet
 541  *      type after return.
 542  */
 543 void dev_remove_offload(struct packet_offload *po)
 544 {
 545         __dev_remove_offload(po);
 546
 547         synchronize_net();
 548 }
 549 EXPORT_SYMBOL(dev_remove_offload);
 550
 551 /******************************************************************************
 552
 553                       Device Boot-time Settings Routines
 554
 555 *******************************************************************************/
 556
/*
 * Boot time configuration table.
 * A slot whose name starts with '\0' or ' ' is treated as free.
 */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 559
 560 /**
 561  *      netdev_boot_setup_add   - add new setup entry
 562  *      @name: name of the device
 563  *      @map: configured settings for the device
 564  *
 565  *      Adds new setup entry to the dev_boot_setup list.  The function
 566  *      returns 0 on error and 1 on success.  This is a generic routine to
 567  *      all netdevices.
 568  */
 569 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 570 {
 571         struct netdev_boot_setup *s;
 572         int i;
 573
 574         s = dev_boot_setup;
 575         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 576                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 577                         memset(s[i].name, 0, sizeof(s[i].name));
 578                         strlcpy(s[i].name, name, IFNAMSIZ);
 579                         memcpy(&s[i].map, map, sizeof(s[i].map));
 580                         break;
 581                 }
 582         }
 583
 584         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 585 }
 586
 587 /**
 588  *      netdev_boot_setup_check - check boot time settings
 589  *      @dev: the netdevice
 590  *
 591  *      Check boot time settings for the device.
 592  *      The found settings are set for the device to be used
 593  *      later in the device probing.
 594  *      Returns 0 if no settings found, 1 if they are.
 595  */
 596 int netdev_boot_setup_check(struct net_device *dev)
 597 {
 598         struct netdev_boot_setup *s = dev_boot_setup;
 599         int i;
 600
 601         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 602                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 603                     !strcmp(dev->name, s[i].name)) {
 604                         dev->irq        = s[i].map.irq;
 605                         dev->base_addr  = s[i].map.base_addr;
 606                         dev->mem_start  = s[i].map.mem_start;
 607                         dev->mem_end    = s[i].map.mem_end;
 608                         return 1;
 609                 }
 610         }
 611         return 0;
 612 }
 613 EXPORT_SYMBOL(netdev_boot_setup_check);
 614
 615
 616 /**
 617  *      netdev_boot_base        - get address from boot time settings
 618  *      @prefix: prefix for network device
 619  *      @unit: id for network device
 620  *
 621  *      Check boot time settings for the base address of device.
 622  *      The found settings are set for the device to be used
 623  *      later in the device probing.
 624  *      Returns 0 if no settings found.
 625  */
 626 unsigned long netdev_boot_base(const char *prefix, int unit)
 627 {
 628         const struct netdev_boot_setup *s = dev_boot_setup;
 629         char name[IFNAMSIZ];
 630         int i;
 631
 632         sprintf(name, "%s%d", prefix, unit);
 633
 634         /*
 635          * If device already registered then return base of 1
 636          * to indicate not to probe for this interface
 637          */
 638         if (__dev_get_by_name(&init_net, name))
 639                 return 1;
 640
 641         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 642                 if (!strcmp(name, s[i].name))
 643                         return s[i].map.base_addr;
 644         return 0;
 645 }
 646
 647 /*
 648  * Saves at boot time configured settings for any netdevice.
 649  */
 650 int __init netdev_boot_setup(char *str)
 651 {
 652         int ints[5];
 653         struct ifmap map;
 654
 655         str = get_options(str, ARRAY_SIZE(ints), ints);
 656         if (!str || !*str)
 657                 return 0;
 658
 659         /* Save settings */
 660         memset(&map, 0, sizeof(map));
 661         if (ints[0] > 0)
 662                 map.irq = ints[1];
 663         if (ints[0] > 1)
 664                 map.base_addr = ints[2];
 665         if (ints[0] > 2)
 666                 map.mem_start = ints[3];
 667         if (ints[0] > 3)
 668                 map.mem_end = ints[4];
 669
 670         /* Add new entry to the list */
 671         return netdev_boot_setup_add(str, &map);
 672 }
 673
 674 __setup("netdev=", netdev_boot_setup);
 675
 676 /*******************************************************************************
 677
 678                             Device Interface Subroutines
 679
 680 *******************************************************************************/
 681
 682 /**
 683  *      __dev_get_by_name       - find a device by its name
 684  *      @net: the applicable net namespace
 685  *      @name: name to find
 686  *
 687  *      Find an interface by name. Must be called under RTNL semaphore
 688  *      or @dev_base_lock. If the name is found a pointer to the device
 689  *      is returned. If the name is not found then %NULL is returned. The
 690  *      reference counters are not incremented so the caller must be
 691  *      careful with locks.
 692  */
 693
 694 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 695 {
 696         struct hlist_node *p;
 697         struct net_device *dev;
 698         struct hlist_head *head = dev_name_hash(net, name);
 699
 700         hlist_for_each_entry(dev, p, head, name_hlist)
 701                 if (!strncmp(dev->name, name, IFNAMSIZ))
 702                         return dev;
 703
 704         return NULL;
 705 }
 706 EXPORT_SYMBOL(__dev_get_by_name);
 707
 708 /**
 709  *      dev_get_by_name_rcu     - find a device by its name
 710  *      @net: the applicable net namespace
 711  *      @name: name to find
 712  *
 713  *      Find an interface by name.
 714  *      If the name is found a pointer to the device is returned.
 715  *      If the name is not found then %NULL is returned.
 716  *      The reference counters are not incremented so the caller must be
 717  *      careful with locks. The caller must hold RCU lock.
 718  */
 719
 720 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 721 {
 722         struct hlist_node *p;
 723         struct net_device *dev;
 724         struct hlist_head *head = dev_name_hash(net, name);
 725
 726         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 727                 if (!strncmp(dev->name, name, IFNAMSIZ))
 728                         return dev;
 729
 730         return NULL;
 731 }
 732 EXPORT_SYMBOL(dev_get_by_name_rcu);
 733
 734 /**
 735  *      dev_get_by_name         - find a device by its name
 736  *      @net: the applicable net namespace
 737  *      @name: name to find
 738  *
 739  *      Find an interface by name. This can be called from any
 740  *      context and does its own locking. The returned handle has
 741  *      the usage count incremented and the caller must use dev_put() to
 742  *      release it when it is no longer needed. %NULL is returned if no
 743  *      matching device is found.
 744  */
 745
 746 struct net_device *dev_get_by_name(struct net *net, const char *name)
 747 {
 748         struct net_device *dev;
 749
 750         rcu_read_lock();
 751         dev = dev_get_by_name_rcu(net, name);
 752         if (dev)
 753                 dev_hold(dev);
 754         rcu_read_unlock();
 755         return dev;
 756 }
 757 EXPORT_SYMBOL(dev_get_by_name);
 758
 759 /**
 760  *      __dev_get_by_index - find a device by its ifindex
 761  *      @net: the applicable net namespace
 762  *      @ifindex: index of device
 763  *
 764  *      Search for an interface by index. Returns %NULL if the device
 765  *      is not found or a pointer to the device. The device has not
 766  *      had its reference counter increased so the caller must be careful
 767  *      about locking. The caller must hold either the RTNL semaphore
 768  *      or @dev_base_lock.
 769  */
 770
 771 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 772 {
 773         struct hlist_node *p;
 774         struct net_device *dev;
 775         struct hlist_head *head = dev_index_hash(net, ifindex);
 776
 777         hlist_for_each_entry(dev, p, head, index_hlist)
 778                 if (dev->ifindex == ifindex)
 779                         return dev;
 780
 781         return NULL;
 782 }
 783 EXPORT_SYMBOL(__dev_get_by_index);
 784
 785 /**
 786  *      dev_get_by_index_rcu - find a device by its ifindex
 787  *      @net: the applicable net namespace
 788  *      @ifindex: index of device
 789  *
 790  *      Search for an interface by index. Returns %NULL if the device
 791  *      is not found or a pointer to the device. The device has not
 792  *      had its reference counter increased so the caller must be careful
 793  *      about locking. The caller must hold RCU lock.
 794  */
 795
 796 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 797 {
 798         struct hlist_node *p;
 799         struct net_device *dev;
 800         struct hlist_head *head = dev_index_hash(net, ifindex);
 801
 802         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 803                 if (dev->ifindex == ifindex)
 804                         return dev;
 805
 806         return NULL;
 807 }
 808 EXPORT_SYMBOL(dev_get_by_index_rcu);
 809
 810
 811 /**
 812  *      dev_get_by_index - find a device by its ifindex
 813  *      @net: the applicable net namespace
 814  *      @ifindex: index of device
 815  *
 816  *      Search for an interface by index. Returns NULL if the device
 817  *      is not found or a pointer to the device. The device returned has
 818  *      had a reference added and the pointer is safe until the user calls
 819  *      dev_put to indicate they have finished with it.
 820  */
 821
 822 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 823 {
 824         struct net_device *dev;
 825
 826         rcu_read_lock();
 827         dev = dev_get_by_index_rcu(net, ifindex);
 828         if (dev)
 829                 dev_hold(dev);
 830         rcu_read_unlock();
 831         return dev;
 832 }
 833 EXPORT_SYMBOL(dev_get_by_index);
 834
 835 /**
 836  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 837  *      @net: the applicable net namespace
 838  *      @type: media type of device
 839  *      @ha: hardware address
 840  *
 841  *      Search for an interface by MAC address. Returns NULL if the device
 842  *      is not found or a pointer to the device.
 843  *      The caller must hold RCU or RTNL.
 844  *      The returned device has not had its ref count increased
 845  *      and the caller must therefore be careful about locking
 846  *
 847  */
 848
 849 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 850                                        const char *ha)
 851 {
 852         struct net_device *dev;
 853
 854         for_each_netdev_rcu(net, dev)
 855                 if (dev->type == type &&
 856                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 857                         return dev;
 858
 859         return NULL;
 860 }
 861 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 862
 863 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 864 {
 865         struct net_device *dev;
 866
 867         ASSERT_RTNL();
 868         for_each_netdev(net, dev)
 869                 if (dev->type == type)
 870                         return dev;
 871
 872         return NULL;
 873 }
 874 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 875
 876 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 877 {
 878         struct net_device *dev, *ret = NULL;
 879
 880         rcu_read_lock();
 881         for_each_netdev_rcu(net, dev)
 882                 if (dev->type == type) {
 883                         dev_hold(dev);
 884                         ret = dev;
 885                         break;
 886                 }
 887         rcu_read_unlock();
 888         return ret;
 889 }
 890 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 891
 892 /**
 893  *      dev_get_by_flags_rcu - find any device with given flags
 894  *      @net: the applicable net namespace
 895  *      @if_flags: IFF_* values
 896  *      @mask: bitmask of bits in if_flags to check
 897  *
 898  *      Search for any interface with the given flags. Returns NULL if a device
 899  *      is not found or a pointer to the device. Must be called inside
 900  *      rcu_read_lock(), and result refcount is unchanged.
 901  */
 902
 903 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 904                                     unsigned short mask)
 905 {
 906         struct net_device *dev, *ret;
 907
 908         ret = NULL;
 909         for_each_netdev_rcu(net, dev) {
 910                 if (((dev->flags ^ if_flags) & mask) == 0) {
 911                         ret = dev;
 912                         break;
 913                 }
 914         }
 915         return ret;
 916 }
 917 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 918
 919 /**
 920  *      dev_valid_name - check if name is okay for network device
 921  *      @name: name string
 922  *
 923  *      Network device names need to be valid file names to
 924  *      to allow sysfs to work.  We also disallow any kind of
 925  *      whitespace.
 926  */
 927 bool dev_valid_name(const char *name)
 928 {
 929         if (*name == '\0')
 930                 return false;
 931         if (strlen(name) >= IFNAMSIZ)
 932                 return false;
 933         if (!strcmp(name, ".") || !strcmp(name, ".."))
 934                 return false;
 935
 936         while (*name) {
 937                 if (*name == '/' || isspace(*name))
 938                         return false;
 939                 name++;
 940         }
 941         return true;
 942 }
 943 EXPORT_SYMBOL(dev_valid_name);
 944
 945 /**
 946  *      __dev_alloc_name - allocate a name for a device
 947  *      @net: network namespace to allocate the device name in
 948  *      @name: name format string
 949  *      @buf:  scratch buffer and result name string
 950  *
 951  *      Passed a format string - eg "lt%d" it will try and find a suitable
 952  *      id. It scans list of devices to build up a free map, then chooses
 953  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 954  *      while allocating the name and adding the device in order to avoid
 955  *      duplicates.
 956  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 957  *      Returns the number of the unit assigned or a negative errno code.
 958  */
 959
 960 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 961 {
 962         int i = 0;
 963         const char *p;
 964         const int max_netdevices = 8*PAGE_SIZE;
 965         unsigned long *inuse;
 966         struct net_device *d;
 967
 968         p = strnchr(name, IFNAMSIZ-1, '%');
 969         if (p) {
 970                 /*
 971                  * Verify the string as this thing may have come from
 972                  * the user.  There must be either one "%d" and no other "%"
 973                  * characters.
 974                  */
 975                 if (p[1] != 'd' || strchr(p + 2, '%'))
 976                         return -EINVAL;
 977
 978                 /* Use one page as a bit array of possible slots */
 979                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 980                 if (!inuse)
 981                         return -ENOMEM;
 982
 983                 for_each_netdev(net, d) {
 984                         if (!sscanf(d->name, name, &i))
 985                                 continue;
 986                         if (i < 0 || i >= max_netdevices)
 987                                 continue;
 988
 989                         /*  avoid cases where sscanf is not exact inverse of printf */
 990                         snprintf(buf, IFNAMSIZ, name, i);
 991                         if (!strncmp(buf, d->name, IFNAMSIZ))
 992                                 set_bit(i, inuse);
 993                 }
 994
 995                 i = find_first_zero_bit(inuse, max_netdevices);
 996                 free_page((unsigned long) inuse);
 997         }
 998
 999         if (buf != name)
1000                 snprintf(buf, IFNAMSIZ, name, i);
1001         if (!__dev_get_by_name(net, buf))
1002                 return i;
1003
1004         /* It is possible to run out of possible slots
1005          * when the name is long and there isn't enough space left
1006          * for the digits, or if all bits are used.
1007          */
1008         return -ENFILE;
1009 }
1010
1011 /**
1012  *      dev_alloc_name - allocate a name for a device
1013  *      @dev: device
1014  *      @name: name format string
1015  *
1016  *      Passed a format string - eg "lt%d" it will try and find a suitable
1017  *      id. It scans list of devices to build up a free map, then chooses
1018  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1019  *      while allocating the name and adding the device in order to avoid
1020  *      duplicates.
1021  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1022  *      Returns the number of the unit assigned or a negative errno code.
1023  */
1024
1025 int dev_alloc_name(struct net_device *dev, const char *name)
1026 {
1027         char buf[IFNAMSIZ];
1028         struct net *net;
1029         int ret;
1030
1031         BUG_ON(!dev_net(dev));
1032         net = dev_net(dev);
1033         ret = __dev_alloc_name(net, name, buf);
1034         if (ret >= 0)
1035                 strlcpy(dev->name, buf, IFNAMSIZ);
1036         return ret;
1037 }
1038 EXPORT_SYMBOL(dev_alloc_name);
1039
1040 static int dev_alloc_name_ns(struct net *net,
1041                              struct net_device *dev,
1042                              const char *name)
1043 {
1044         char buf[IFNAMSIZ];
1045         int ret;
1046
1047         ret = __dev_alloc_name(net, name, buf);
1048         if (ret >= 0)
1049                 strlcpy(dev->name, buf, IFNAMSIZ);
1050         return ret;
1051 }
1052
1053 static int dev_get_valid_name(struct net *net,
1054                               struct net_device *dev,
1055                               const char *name)
1056 {
1057         BUG_ON(!net);
1058
1059         if (!dev_valid_name(name))
1060                 return -EINVAL;
1061
1062         if (strchr(name, '%'))
1063                 return dev_alloc_name_ns(net, dev, name);
1064         else if (__dev_get_by_name(net, name))
1065                 return -EEXIST;
1066         else if (dev->name != name)
1067                 strlcpy(dev->name, name, IFNAMSIZ);
1068
1069         return 0;
1070 }
1071
1072 /**
1073  *      dev_change_name - change name of a device
1074  *      @dev: device
1075  *      @newname: name (or format string) must be at least IFNAMSIZ
1076  *
1077  *      Change name of a device, can pass format strings "eth%d".
1078  *      for wildcarding.
1079  */
1080 int dev_change_name(struct net_device *dev, const char *newname)
1081 {
1082         char oldname[IFNAMSIZ];
1083         int err = 0;
1084         int ret;
1085         struct net *net;
1086
1087         ASSERT_RTNL();
1088         BUG_ON(!dev_net(dev));
1089
1090         net = dev_net(dev);
1091         if (dev->flags & IFF_UP)
1092                 return -EBUSY;
1093
1094         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1095                 return 0;
1096
1097         memcpy(oldname, dev->name, IFNAMSIZ);
1098
1099         err = dev_get_valid_name(net, dev, newname);
1100         if (err < 0)
1101                 return err;
1102
1103 rollback:
1104         ret = device_rename(&dev->dev, dev->name);
1105         if (ret) {
1106                 memcpy(dev->name, oldname, IFNAMSIZ);
1107                 return ret;
1108         }
1109
1110         write_lock_bh(&dev_base_lock);
1111         hlist_del_rcu(&dev->name_hlist);
1112         write_unlock_bh(&dev_base_lock);
1113
1114         synchronize_rcu();
1115
1116         write_lock_bh(&dev_base_lock);
1117         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1118         write_unlock_bh(&dev_base_lock);
1119
1120         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1121         ret = notifier_to_errno(ret);
1122
1123         if (ret) {
1124                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1125                 if (err >= 0) {
1126                         err = ret;
1127                         memcpy(dev->name, oldname, IFNAMSIZ);
1128                         goto rollback;
1129                 } else {
1130                         pr_err("%s: name change rollback failed: %d\n",
1131                                dev->name, ret);
1132                 }
1133         }
1134
1135         return err;
1136 }
1137
1138 /**
1139  *      dev_set_alias - change ifalias of a device
1140  *      @dev: device
1141  *      @alias: name up to IFALIASZ
1142  *      @len: limit of bytes to copy from info
1143  *
1144  *      Set ifalias for a device,
1145  */
1146 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1147 {
1148         char *new_ifalias;
1149
1150         ASSERT_RTNL();
1151
1152         if (len >= IFALIASZ)
1153                 return -EINVAL;
1154
1155         if (!len) {
1156                 if (dev->ifalias) {
1157                         kfree(dev->ifalias);
1158                         dev->ifalias = NULL;
1159                 }
1160                 return 0;
1161         }
1162
1163         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1164         if (!new_ifalias)
1165                 return -ENOMEM;
1166         dev->ifalias = new_ifalias;
1167
1168         strlcpy(dev->ifalias, alias, len+1);
1169         return len;
1170 }
1171
1172
1173 /**
1174  *      netdev_features_change - device changes features
1175  *      @dev: device to cause notification
1176  *
1177  *      Called to indicate a device has changed features.
1178  */
1179 void netdev_features_change(struct net_device *dev)
1180 {
1181         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1182 }
1183 EXPORT_SYMBOL(netdev_features_change);
1184
1185 /**
1186  *      netdev_state_change - device changes state
1187  *      @dev: device to cause notification
1188  *
1189  *      Called to indicate a device has changed state. This function calls
1190  *      the notifier chains for netdev_chain and sends a NEWLINK message
1191  *      to the routing socket.
1192  */
1193 void netdev_state_change(struct net_device *dev)
1194 {
1195         if (dev->flags & IFF_UP) {
1196                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1197                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1198         }
1199 }
1200 EXPORT_SYMBOL(netdev_state_change);
1201
1202 /**
1203  *      netdev_notify_peers - notify network peers about existence of @dev
1204  *      @dev: network device
1205  *
1206  * Generate traffic such that interested network peers are aware of
1207  * @dev, such as by generating a gratuitous ARP. This may be used when
1208  * a device wants to inform the rest of the network about some sort of
1209  * reconfiguration such as a failover event or virtual machine
1210  * migration.
1211  */
1212 void netdev_notify_peers(struct net_device *dev)
1213 {
1214         rtnl_lock();
1215         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1216         rtnl_unlock();
1217 }
1218 EXPORT_SYMBOL(netdev_notify_peers);
1219
1220 /**
1221  *      dev_load        - load a network module
1222  *      @net: the applicable net namespace
1223  *      @name: name of interface
1224  *
1225  *      If a network interface is not present and the process has suitable
1226  *      privileges this function loads the module. If module loading is not
1227  *      available in this kernel then it becomes a nop.
1228  */
1229
1230 void dev_load(struct net *net, const char *name)
1231 {
1232         struct net_device *dev;
1233         int no_module;
1234
1235         rcu_read_lock();
1236         dev = dev_get_by_name_rcu(net, name);
1237         rcu_read_unlock();
1238
1239         no_module = !dev;
1240         if (no_module && capable(CAP_NET_ADMIN))
1241                 no_module = request_module("netdev-%s", name);
1242         if (no_module && capable(CAP_SYS_MODULE)) {
1243                 if (!request_module("%s", name))
1244                         pr_warn("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated).  Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1245                                 name);
1246         }
1247 }
1248 EXPORT_SYMBOL(dev_load);
1249
1250 static int __dev_open(struct net_device *dev)
1251 {
1252         const struct net_device_ops *ops = dev->netdev_ops;
1253         int ret;
1254
1255         ASSERT_RTNL();
1256
1257         if (!netif_device_present(dev))
1258                 return -ENODEV;
1259
1260         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1261         ret = notifier_to_errno(ret);
1262         if (ret)
1263                 return ret;
1264
1265         set_bit(__LINK_STATE_START, &dev->state);
1266
1267         if (ops->ndo_validate_addr)
1268                 ret = ops->ndo_validate_addr(dev);
1269
1270         if (!ret && ops->ndo_open)
1271                 ret = ops->ndo_open(dev);
1272
1273         if (ret)
1274                 clear_bit(__LINK_STATE_START, &dev->state);
1275         else {
1276                 dev->flags |= IFF_UP;
1277                 net_dmaengine_get();
1278                 dev_set_rx_mode(dev);
1279                 dev_activate(dev);
1280                 add_device_randomness(dev->dev_addr, dev->addr_len);
1281         }
1282
1283         return ret;
1284 }
1285
1286 /**
1287  *      dev_open        - prepare an interface for use.
1288  *      @dev:   device to open
1289  *
1290  *      Takes a device from down to up state. The device's private open
1291  *      function is invoked and then the multicast lists are loaded. Finally
1292  *      the device is moved into the up state and a %NETDEV_UP message is
1293  *      sent to the netdev notifier chain.
1294  *
1295  *      Calling this function on an active interface is a nop. On a failure
1296  *      a negative errno code is returned.
1297  */
1298 int dev_open(struct net_device *dev)
1299 {
1300         int ret;
1301
1302         if (dev->flags & IFF_UP)
1303                 return 0;
1304
1305         ret = __dev_open(dev);
1306         if (ret < 0)
1307                 return ret;
1308
1309         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1310         call_netdevice_notifiers(NETDEV_UP, dev);
1311
1312         return ret;
1313 }
1314 EXPORT_SYMBOL(dev_open);
1315
1316 static int __dev_close_many(struct list_head *head)
1317 {
1318         struct net_device *dev;
1319
1320         ASSERT_RTNL();
1321         might_sleep();
1322
1323         list_for_each_entry(dev, head, unreg_list) {
1324                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1325
1326                 clear_bit(__LINK_STATE_START, &dev->state);
1327
1328                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1329                  * can be even on different cpu. So just clear netif_running().
1330                  *
1331                  * dev->stop() will invoke napi_disable() on all of it's
1332                  * napi_struct instances on this device.
1333                  */
1334                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1335         }
1336
1337         dev_deactivate_many(head);
1338
1339         list_for_each_entry(dev, head, unreg_list) {
1340                 const struct net_device_ops *ops = dev->netdev_ops;
1341
1342                 /*
1343                  *      Call the device specific close. This cannot fail.
1344                  *      Only if device is UP
1345                  *
1346                  *      We allow it to be called even after a DETACH hot-plug
1347                  *      event.
1348                  */
1349                 if (ops->ndo_stop)
1350                         ops->ndo_stop(dev);
1351
1352                 dev->flags &= ~IFF_UP;
1353                 net_dmaengine_put();
1354         }
1355
1356         return 0;
1357 }
1358
1359 static int __dev_close(struct net_device *dev)
1360 {
1361         int retval;
1362         LIST_HEAD(single);
1363
1364         list_add(&dev->unreg_list, &single);
1365         retval = __dev_close_many(&single);
1366         list_del(&single);
1367         return retval;
1368 }
1369
1370 static int dev_close_many(struct list_head *head)
1371 {
1372         struct net_device *dev, *tmp;
1373         LIST_HEAD(tmp_list);
1374
1375         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1376                 if (!(dev->flags & IFF_UP))
1377                         list_move(&dev->unreg_list, &tmp_list);
1378
1379         __dev_close_many(head);
1380
1381         list_for_each_entry(dev, head, unreg_list) {
1382                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1383                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1384         }
1385
1386         /* rollback_registered_many needs the complete original list */
1387         list_splice(&tmp_list, head);
1388         return 0;
1389 }
1390
1391 /**
1392  *      dev_close - shutdown an interface.
1393  *      @dev: device to shutdown
1394  *
1395  *      This function moves an active device into down state. A
1396  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1397  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1398  *      chain.
1399  */
1400 int dev_close(struct net_device *dev)
1401 {
1402         if (dev->flags & IFF_UP) {
1403                 LIST_HEAD(single);
1404
1405                 list_add(&dev->unreg_list, &single);
1406                 dev_close_many(&single);
1407                 list_del(&single);
1408         }
1409         return 0;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415  *      dev_disable_lro - disable Large Receive Offload on a device
1416  *      @dev: device
1417  *
1418  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1419  *      called under RTNL.  This is needed if received packets may be
1420  *      forwarded to another interface.
1421  */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424         /*
1425          * If we're trying to disable lro on a vlan device
1426          * use the underlying physical device instead
1427          */
1428         if (is_vlan_dev(dev))
1429                 dev = vlan_dev_real_dev(dev);
1430
1431         dev->wanted_features &= ~NETIF_F_LRO;
1432         netdev_update_features(dev);
1433
1434         if (unlikely(dev->features & NETIF_F_LRO))
1435                 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
1440 static int dev_boot_phase = 1;
1441
1442 /**
1443  *      register_netdevice_notifier - register a network notifier block
1444  *      @nb: notifier
1445  *
1446  *      Register a notifier to be called when network device events occur.
1447  *      The notifier passed is linked into the kernel structures and must
1448  *      not be reused until it has been unregistered. A negative errno code
1449  *      is returned on a failure.
1450  *
1451  *      When registered all registration and up events are replayed
1452  *      to the new notifier to allow device to have a race free
1453  *      view of the network device list.
1454  */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458         struct net_device *dev;
1459         struct net_device *last;
1460         struct net *net;
1461         int err;
1462
1463         rtnl_lock();
1464         err = raw_notifier_chain_register(&netdev_chain, nb);
1465         if (err)
1466                 goto unlock;
1467         if (dev_boot_phase)
1468                 goto unlock;
1469         for_each_net(net) {
1470                 for_each_netdev(net, dev) {
1471                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472                         err = notifier_to_errno(err);
1473                         if (err)
1474                                 goto rollback;
1475
1476                         if (!(dev->flags & IFF_UP))
1477                                 continue;
1478
1479                         nb->notifier_call(nb, NETDEV_UP, dev);
1480                 }
1481         }
1482
1483 unlock:
1484         rtnl_unlock();
1485         return err;
1486
1487 rollback:
1488         last = dev;
1489         for_each_net(net) {
1490                 for_each_netdev(net, dev) {
1491                         if (dev == last)
1492                                 goto outroll;
1493
1494                         if (dev->flags & IFF_UP) {
1495                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497                         }
1498                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499                 }
1500         }
1501
1502 outroll:
1503         raw_notifier_chain_unregister(&netdev_chain, nb);
1504         goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
1508 /**
1509  *      unregister_netdevice_notifier - unregister a network notifier block
1510  *      @nb: notifier
1511  *
1512  *      Unregister a notifier previously registered by
1513  *      register_netdevice_notifier(). The notifier is unlinked into the
1514  *      kernel structures and may then be reused. A negative errno code
1515  *      is returned on a failure.
1516  *
1517  *      After unregistering unregister and down device events are synthesized
1518  *      for all devices on the device list to the removed notifier to remove
1519  *      the need for special case cleanup code.
1520  */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524         struct net_device *dev;
1525         struct net *net;
1526         int err;
1527
1528         rtnl_lock();
1529         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530         if (err)
1531                 goto unlock;
1532
1533         for_each_net(net) {
1534                 for_each_netdev(net, dev) {
1535                         if (dev->flags & IFF_UP) {
1536                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538                         }
1539                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540                 }
1541         }
1542 unlock:
1543         rtnl_unlock();
1544         return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549  *      call_netdevice_notifiers - call all network notifier blocks
1550  *      @val: value passed unmodified to notifier function
1551  *      @dev: net_device pointer passed unmodified to notifier function
1552  *
1553  *      Call all network notifier blocks.  Parameters and return value
1554  *      are as for raw_notifier_call_chain().
1555  */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559         ASSERT_RTNL();
1560         return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
1564 static struct static_key netstamp_needed __read_mostly;
1565 #ifdef HAVE_JUMP_LABEL
1566 /* We are not allowed to call static_key_slow_dec() from irq context
1567  * If net_disable_timestamp() is called from irq context, defer the
1568  * static_key_slow_dec() calls.
1569  */
1570 static atomic_t netstamp_needed_deferred;
1571 #endif
1572
1573 void net_enable_timestamp(void)
1574 {
1575 #ifdef HAVE_JUMP_LABEL
1576         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1577
1578         if (deferred) {
1579                 while (--deferred)
1580                         static_key_slow_dec(&netstamp_needed);
1581                 return;
1582         }
1583 #endif
1584         WARN_ON(in_interrupt());
1585         static_key_slow_inc(&netstamp_needed);
1586 }
1587 EXPORT_SYMBOL(net_enable_timestamp);
1588
1589 void net_disable_timestamp(void)
1590 {
1591 #ifdef HAVE_JUMP_LABEL
1592         if (in_interrupt()) {
1593                 atomic_inc(&netstamp_needed_deferred);
1594                 return;
1595         }
1596 #endif
1597         static_key_slow_dec(&netstamp_needed);
1598 }
1599 EXPORT_SYMBOL(net_disable_timestamp);
1600
1601 static inline void net_timestamp_set(struct sk_buff *skb)
1602 {
1603         skb->tstamp.tv64 = 0;
1604         if (static_key_false(&netstamp_needed))
1605                 __net_timestamp(skb);
1606 }
1607
/*
 * net_timestamp_check - stamp SKB if timestamping is on and it has no stamp
 * @COND: extra condition that must hold for the stamp to be taken
 * @SKB: buffer to stamp
 *
 * Calls __net_timestamp(SKB) when the netstamp_needed key is enabled, COND
 * is true and SKB->tstamp.tv64 is still zero.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used as the body of an if/else without
 * breaking the else, or silently capturing it.
 */
#define net_timestamp_check(COND, SKB)				\
	do {							\
		if (static_key_false(&netstamp_needed)) {	\
			if ((COND) && !(SKB)->tstamp.tv64)	\
				__net_timestamp(SKB);		\
		}						\
	} while (0)
1613
1614 static int net_hwtstamp_validate(struct ifreq *ifr)
1615 {
1616         struct hwtstamp_config cfg;
1617         enum hwtstamp_tx_types tx_type;
1618         enum hwtstamp_rx_filters rx_filter;
1619         int tx_type_valid = 0;
1620         int rx_filter_valid = 0;
1621
1622         if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1623                 return -EFAULT;
1624
1625         if (cfg.flags) /* reserved for future extensions */
1626                 return -EINVAL;
1627
1628         tx_type = cfg.tx_type;
1629         rx_filter = cfg.rx_filter;
1630
1631         switch (tx_type) {
1632         case HWTSTAMP_TX_OFF:
1633         case HWTSTAMP_TX_ON:
1634         case HWTSTAMP_TX_ONESTEP_SYNC:
1635                 tx_type_valid = 1;
1636                 break;
1637         }
1638
1639         switch (rx_filter) {
1640         case HWTSTAMP_FILTER_NONE:
1641         case HWTSTAMP_FILTER_ALL:
1642         case HWTSTAMP_FILTER_SOME:
1643         case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1644         case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1645         case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1646         case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1647         case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1648         case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1649         case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1650         case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1651         case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1652         case HWTSTAMP_FILTER_PTP_V2_EVENT:
1653         case HWTSTAMP_FILTER_PTP_V2_SYNC:
1654         case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1655                 rx_filter_valid = 1;
1656                 break;
1657         }
1658
1659         if (!tx_type_valid || !rx_filter_valid)
1660                 return -ERANGE;
1661
1662         return 0;
1663 }
1664
1665 static inline bool is_skb_forwardable(struct net_device *dev,
1666                                       struct sk_buff *skb)
1667 {
1668         unsigned int len;
1669
1670         if (!(dev->flags & IFF_UP))
1671                 return false;
1672
1673         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1674         if (skb->len <= len)
1675                 return true;
1676
1677         /* if TSO is enabled, we don't care about the length as the packet
1678          * could be forwarded without being segmented before
1679          */
1680         if (skb_is_gso(skb))
1681                 return true;
1682
1683         return false;
1684 }
1685
1686 /**
1687  * dev_forward_skb - loopback an skb to another netif
1688  *
1689  * @dev: destination network device
1690  * @skb: buffer to forward
1691  *
1692  * return values:
1693  *      NET_RX_SUCCESS  (no congestion)
1694  *      NET_RX_DROP     (packet was dropped, but freed)
1695  *
1696  * dev_forward_skb can be used for injecting an skb from the
1697  * start_xmit function of one device into the receive queue
1698  * of another device.
1699  *
1700  * The receiving device may be in another namespace, so
1701  * we have to clear all information in the skb that could
1702  * impact namespace isolation.
1703  */
1704 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1705 {
1706         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1707                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1708                         atomic_long_inc(&dev->rx_dropped);
1709                         kfree_skb(skb);
1710                         return NET_RX_DROP;
1711                 }
1712         }
1713
1714         skb_orphan(skb);
1715         nf_reset(skb);
1716
1717         if (unlikely(!is_skb_forwardable(dev, skb))) {
1718                 atomic_long_inc(&dev->rx_dropped);
1719                 kfree_skb(skb);
1720                 return NET_RX_DROP;
1721         }
1722         skb->skb_iif = 0;
1723         skb->dev = dev;
1724         skb_dst_drop(skb);
1725         skb->tstamp.tv64 = 0;
1726         skb->pkt_type = PACKET_HOST;
1727         skb->protocol = eth_type_trans(skb, dev);
1728         skb->mark = 0;
1729         secpath_reset(skb);
1730         nf_reset(skb);
1731         return netif_rx(skb);
1732 }
1733 EXPORT_SYMBOL_GPL(dev_forward_skb);
1734
1735 static inline int deliver_skb(struct sk_buff *skb,
1736                               struct packet_type *pt_prev,
1737                               struct net_device *orig_dev)
1738 {
1739         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1740                 return -ENOMEM;
1741         atomic_inc(&skb->users);
1742         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1743 }
1744
1745 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1746 {
1747         if (!ptype->af_packet_priv || !skb->sk)
1748                 return false;
1749
1750         if (ptype->id_match)
1751                 return ptype->id_match(ptype, skb->sk);
1752         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1753                 return true;
1754
1755         return false;
1756 }
1757
1758 /*
1759  *      Support routine. Sends outgoing frames to any network
1760  *      taps currently in use.
1761  */
1762
1763 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1764 {
1765         struct packet_type *ptype;
1766         struct sk_buff *skb2 = NULL;
1767         struct packet_type *pt_prev = NULL;
1768
1769         rcu_read_lock();
1770         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1771                 /* Never send packets back to the socket
1772                  * they originated from - MvS (miquels@drinkel.ow.org)
1773                  */
1774                 if ((ptype->dev == dev || !ptype->dev) &&
1775                     (!skb_loop_sk(ptype, skb))) {
1776                         if (pt_prev) {
1777                                 deliver_skb(skb2, pt_prev, skb->dev);
1778                                 pt_prev = ptype;
1779                                 continue;
1780                         }
1781
1782                         skb2 = skb_clone(skb, GFP_ATOMIC);
1783                         if (!skb2)
1784                                 break;
1785
1786                         net_timestamp_set(skb2);
1787
1788                         /* skb->nh should be correctly
1789                            set by sender, so that the second statement is
1790                            just protection against buggy protocols.
1791                          */
1792                         skb_reset_mac_header(skb2);
1793
1794                         if (skb_network_header(skb2) < skb2->data ||
1795                             skb2->network_header > skb2->tail) {
1796                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1797                                                      ntohs(skb2->protocol),
1798                                                      dev->name);
1799                                 skb_reset_network_header(skb2);
1800                         }
1801
1802                         skb2->transport_header = skb2->network_header;
1803                         skb2->pkt_type = PACKET_OUTGOING;
1804                         pt_prev = ptype;
1805                 }
1806         }
1807         if (pt_prev)
1808                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1809         rcu_read_unlock();
1810 }
1811
1812 /**
1813  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1814  * @dev: Network device
1815  * @txq: number of queues available
1816  *
1817  * If real_num_tx_queues is changed the tc mappings may no longer be
1818  * valid. To resolve this verify the tc mapping remains valid and if
1819  * not NULL the mapping. With no priorities mapping to this
1820  * offset/count pair it will no longer be used. In the worst case TC0
1821  * is invalid nothing can be done so disable priority mappings. If is
1822  * expected that drivers will fix this mapping if they can before
1823  * calling netif_set_real_num_tx_queues.
1824  */
1825 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1826 {
1827         int i;
1828         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1829
1830         /* If TC0 is invalidated disable TC mapping */
1831         if (tc->offset + tc->count > txq) {
1832                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1833                 dev->num_tc = 0;
1834                 return;
1835         }
1836
1837         /* Invalidated prio to tc mappings set to TC0 */
1838         for (i = 1; i < TC_BITMASK + 1; i++) {
1839                 int q = netdev_get_prio_tc_map(dev, i);
1840
1841                 tc = &dev->tc_to_txq[q];
1842                 if (tc->offset + tc->count > txq) {
1843                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1844                                 i, q);
1845                         netdev_set_prio_tc_map(dev, i, 0);
1846                 }
1847         }
1848 }
1849
1850 /*
1851  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1852  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1853  */
1854 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1855 {
1856         int rc;
1857
1858         if (txq < 1 || txq > dev->num_tx_queues)
1859                 return -EINVAL;
1860
1861         if (dev->reg_state == NETREG_REGISTERED ||
1862             dev->reg_state == NETREG_UNREGISTERING) {
1863                 ASSERT_RTNL();
1864
1865                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1866                                                   txq);
1867                 if (rc)
1868                         return rc;
1869
1870                 if (dev->num_tc)
1871                         netif_setup_tc(dev, txq);
1872
1873                 if (txq < dev->real_num_tx_queues)
1874                         qdisc_reset_all_tx_gt(dev, txq);
1875         }
1876
1877         dev->real_num_tx_queues = txq;
1878         return 0;
1879 }
1880 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
1881
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		int err;

		ASSERT_RTNL();

		err = net_rx_queue_update_kobjects(dev,
						   dev->real_num_rx_queues,
						   rxq);
		if (err)
			return err;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1914
1915 /**
1916  * netif_get_num_default_rss_queues - default number of RSS queues
1917  *
1918  * This routine should set an upper limit on the number of RSS queues
1919  * used by default by multiqueue devices.
1920  */
1921 int netif_get_num_default_rss_queues(void)
1922 {
1923         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
1924 }
1925 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
1926
1927 static inline void __netif_reschedule(struct Qdisc *q)
1928 {
1929         struct softnet_data *sd;
1930         unsigned long flags;
1931
1932         local_irq_save(flags);
1933         sd = &__get_cpu_var(softnet_data);
1934         q->next_sched = NULL;
1935         *sd->output_queue_tailp = q;
1936         sd->output_queue_tailp = &q->next_sched;
1937         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1938         local_irq_restore(flags);
1939 }
1940
1941 void __netif_schedule(struct Qdisc *q)
1942 {
1943         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1944                 __netif_reschedule(q);
1945 }
1946 EXPORT_SYMBOL(__netif_schedule);
1947
1948 void dev_kfree_skb_irq(struct sk_buff *skb)
1949 {
1950         if (atomic_dec_and_test(&skb->users)) {
1951                 struct softnet_data *sd;
1952                 unsigned long flags;
1953
1954                 local_irq_save(flags);
1955                 sd = &__get_cpu_var(softnet_data);
1956                 skb->next = sd->completion_queue;
1957                 sd->completion_queue = skb;
1958                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1959                 local_irq_restore(flags);
1960         }
1961 }
1962 EXPORT_SYMBOL(dev_kfree_skb_irq);
1963
/*
 * Free @skb from any context: dev_kfree_skb_irq() is used in hard-irq
 * context or with interrupts disabled, dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1972
1973
1974 /**
1975  * netif_device_detach - mark device as removed
1976  * @dev: network device
1977  *
1978  * Mark device as removed from system and therefore no longer available.
1979  */
1980 void netif_device_detach(struct net_device *dev)
1981 {
1982         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1983             netif_running(dev)) {
1984                 netif_tx_stop_all_queues(dev);
1985         }
1986 }
1987 EXPORT_SYMBOL(netif_device_detach);
1988
1989 /**
1990  * netif_device_attach - mark device as attached
1991  * @dev: network device
1992  *
1993  * Mark device as attached from system and restart if needed.
1994  */
1995 void netif_device_attach(struct net_device *dev)
1996 {
1997         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1998             netif_running(dev)) {
1999                 netif_tx_wake_all_queues(dev);
2000                 __netdev_watchdog_up(dev);
2001         }
2002 }
2003 EXPORT_SYMBOL(netif_device_attach);
2004
2005 static void skb_warn_bad_offload(const struct sk_buff *skb)
2006 {
2007         static const netdev_features_t null_features = 0;
2008         struct net_device *dev = skb->dev;
2009         const char *driver = "";
2010
2011         if (dev && dev->dev.parent)
2012                 driver = dev_driver_string(dev->dev.parent);
2013
2014         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2015              "gso_type=%d ip_summed=%d\n",
2016              driver, dev ? &dev->features : &null_features,
2017              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2018              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2019              skb_shinfo(skb)->gso_type, skb->ip_summed);
2020 }
2021
2022 /*
2023  * Invalidate hardware checksum when packet is to be mangled, and
2024  * complete checksum manually on outgoing path.
2025  */
2026 int skb_checksum_help(struct sk_buff *skb)
2027 {
2028         __wsum csum;
2029         int ret = 0, offset;
2030
2031         if (skb->ip_summed == CHECKSUM_COMPLETE)
2032                 goto out_set_summed;
2033
2034         if (unlikely(skb_shinfo(skb)->gso_size)) {
2035                 skb_warn_bad_offload(skb);
2036                 return -EINVAL;
2037         }
2038
2039         offset = skb_checksum_start_offset(skb);
2040         BUG_ON(offset >= skb_headlen(skb));
2041         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2042
2043         offset += skb->csum_offset;
2044         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2045
2046         if (skb_cloned(skb) &&
2047             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2048                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2049                 if (ret)
2050                         goto out;
2051         }
2052
2053         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2054 out_set_summed:
2055         skb->ip_summed = CHECKSUM_NONE;
2056 out:
2057         return ret;
2058 }
2059 EXPORT_SYMBOL(skb_checksum_help);
2060
2061 /**
2062  *      skb_gso_segment - Perform segmentation on skb.
2063  *      @skb: buffer to segment
2064  *      @features: features for the output path (see dev->features)
2065  *
2066  *      This function segments the given skb and returns a list of segments.
2067  *
2068  *      It may return NULL if the skb requires no segmentation.  This is
2069  *      only possible when GSO is used for verifying header integrity.
2070  */
2071 struct sk_buff *skb_gso_segment(struct sk_buff *skb,
2072         netdev_features_t features)
2073 {
2074         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2075         struct packet_offload *ptype;
2076         __be16 type = skb->protocol;
2077         int vlan_depth = ETH_HLEN;
2078         int err;
2079
2080         while (type == htons(ETH_P_8021Q)) {
2081                 struct vlan_hdr *vh;
2082
2083                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2084                         return ERR_PTR(-EINVAL);
2085
2086                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2087                 type = vh->h_vlan_encapsulated_proto;
2088                 vlan_depth += VLAN_HLEN;
2089         }
2090
2091         skb_reset_mac_header(skb);
2092         skb->mac_len = skb->network_header - skb->mac_header;
2093         __skb_pull(skb, skb->mac_len);
2094
2095         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2096                 skb_warn_bad_offload(skb);
2097
2098                 if (skb_header_cloned(skb) &&
2099                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2100                         return ERR_PTR(err);
2101         }
2102
2103         rcu_read_lock();
2104         list_for_each_entry_rcu(ptype, &offload_base, list) {
2105                 if (ptype->type == type && ptype->gso_segment) {
2106                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2107                                 err = ptype->gso_send_check(skb);
2108                                 segs = ERR_PTR(err);
2109                                 if (err || skb_gso_ok(skb, features))
2110                                         break;
2111                                 __skb_push(skb, (skb->data -
2112                                                  skb_network_header(skb)));
2113                         }
2114                         segs = ptype->gso_segment(skb, features);
2115                         break;
2116                 }
2117         }
2118         rcu_read_unlock();
2119
2120         __skb_push(skb, skb->data - skb_mac_header(skb));
2121
2122         return segs;
2123 }
2124 EXPORT_SYMBOL(skb_gso_segment);
2125
/* Take action when hardware reception checksum errors are detected:
 * log a rate-limited error naming the device (or "<unknown>" when @dev
 * is NULL) and dump the stack.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *name;

	if (!net_ratelimit())
		return;

	name = dev ? dev->name : "<unknown>";
	pr_err("%s: hw csum failure\n", name);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
2137
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 when some page fragment of @skb is in high memory while @dev
 * lacks NETIF_F_HIGHDMA, or (with PCI_DMA_BUS_IS_PHYS) lies beyond the
 * parent device's DMA mask; 0 otherwise.  Always 0 without CONFIG_HIGHMEM.
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct skb_shared_info *shinfo = skb_shinfo(skb);
	int n;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (n = 0; n < shinfo->nr_frags; n++) {
			if (PageHighMem(skb_frag_page(&shinfo->frags[n])))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *parent = dev->dev.parent;

		if (!parent)
			return 0;

		for (n = 0; n < shinfo->nr_frags; n++) {
			dma_addr_t addr =
				page_to_phys(skb_frag_page(&shinfo->frags[n]));

			if (!parent->dma_mask ||
			    addr + PAGE_SIZE - 1 > *parent->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
2170
2171 struct dev_gso_cb {
2172         void (*destructor)(struct sk_buff *skb);
2173 };
2174
2175 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2176
2177 static void dev_gso_skb_destructor(struct sk_buff *skb)
2178 {
2179         struct dev_gso_cb *cb;
2180
2181         do {
2182                 struct sk_buff *nskb = skb->next;
2183
2184                 skb->next = nskb->next;
2185                 nskb->next = NULL;
2186                 kfree_skb(nskb);
2187         } while (skb->next);
2188
2189         cb = DEV_GSO_CB(skb);
2190         if (cb->destructor)
2191                 cb->destructor(skb);
2192 }
2193
2194 /**
2195  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2196  *      @skb: buffer to segment
2197  *      @features: device features as applicable to this skb
2198  *
2199  *      This function segments the given skb and stores the list of segments
2200  *      in skb->next.
2201  */
2202 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2203 {
2204         struct sk_buff *segs;
2205
2206         segs = skb_gso_segment(skb, features);
2207
2208         /* Verifying header integrity only. */
2209         if (!segs)
2210                 return 0;
2211
2212         if (IS_ERR(segs))
2213                 return PTR_ERR(segs);
2214
2215         skb->next = segs;
2216         DEV_GSO_CB(skb)->destructor = skb->destructor;
2217         skb->destructor = dev_gso_skb_destructor;
2218
2219         return 0;
2220 }
2221
2222 static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
2223 {
2224         return ((features & NETIF_F_GEN_CSUM) ||
2225                 ((features & NETIF_F_V4_CSUM) &&
2226                  protocol == htons(ETH_P_IP)) ||
2227                 ((features & NETIF_F_V6_CSUM) &&
2228                  protocol == htons(ETH_P_IPV6)) ||
2229                 ((features & NETIF_F_FCOE_CRC) &&
2230                  protocol == htons(ETH_P_FCOE)));
2231 }
2232
2233 static netdev_features_t harmonize_features(struct sk_buff *skb,
2234         __be16 protocol, netdev_features_t features)
2235 {
2236         if (skb->ip_summed != CHECKSUM_NONE &&
2237             !can_checksum_protocol(features, protocol)) {
2238                 features &= ~NETIF_F_ALL_CSUM;
2239                 features &= ~NETIF_F_SG;
2240         } else if (illegal_highdma(skb->dev, skb)) {
2241                 features &= ~NETIF_F_SG;
2242         }
2243
2244         return features;
2245 }
2246
2247 netdev_features_t netif_skb_features(struct sk_buff *skb)
2248 {
2249         __be16 protocol = skb->protocol;
2250         netdev_features_t features = skb->dev->features;
2251
2252         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2253                 features &= ~NETIF_F_GSO_MASK;
2254
2255         if (protocol == htons(ETH_P_8021Q)) {
2256                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2257                 protocol = veh->h_vlan_encapsulated_proto;
2258         } else if (!vlan_tx_tag_present(skb)) {
2259                 return harmonize_features(skb, protocol, features);
2260         }
2261
2262         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
2263
2264         if (protocol != htons(ETH_P_8021Q)) {
2265                 return harmonize_features(skb, protocol, features);
2266         } else {
2267                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2268                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
2269                 return harmonize_features(skb, protocol, features);
2270         }
2271 }
2272 EXPORT_SYMBOL(netif_skb_features);
2273
2274 /*
2275  * Returns true if either:
2276  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2277  *      2. skb is fragmented and the device does not support SG.
2278  */
2279 static inline int skb_needs_linearize(struct sk_buff *skb,
2280                                       int features)
2281 {
2282         return skb_is_nonlinear(skb) &&
2283                         ((skb_has_frag_list(skb) &&
2284                                 !(features & NETIF_F_FRAGLIST)) ||
2285                         (skb_shinfo(skb)->nr_frags &&
2286                                 !(features & NETIF_F_SG)));
2287 }
2288
2289 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2290                         struct netdev_queue *txq)
2291 {
2292         const struct net_device_ops *ops = dev->netdev_ops;
2293         int rc = NETDEV_TX_OK;
2294         unsigned int skb_len;
2295
2296         if (likely(!skb->next)) {
2297                 netdev_features_t features;
2298
2299                 /*
2300                  * If device doesn't need skb->dst, release it right now while
2301                  * its hot in this cpu cache
2302                  */
2303                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2304                         skb_dst_drop(skb);
2305
2306                 features = netif_skb_features(skb);
2307
2308                 if (vlan_tx_tag_present(skb) &&
2309                     !(features & NETIF_F_HW_VLAN_TX)) {
2310                         skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2311                         if (unlikely(!skb))
2312                                 goto out;
2313
2314                         skb->vlan_tci = 0;
2315                 }
2316
2317                 if (netif_needs_gso(skb, features)) {
2318                         if (unlikely(dev_gso_segment(skb, features)))
2319                                 goto out_kfree_skb;
2320                         if (skb->next)
2321                                 goto gso;
2322                 } else {
2323                         if (skb_needs_linearize(skb, features) &&
2324                             __skb_linearize(skb))
2325                                 goto out_kfree_skb;
2326
2327                         /* If packet is not checksummed and device does not
2328                          * support checksumming for this protocol, complete
2329                          * checksumming here.
2330                          */
2331                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2332                                 skb_set_transport_header(skb,
2333                                         skb_checksum_start_offset(skb));
2334                                 if (!(features & NETIF_F_ALL_CSUM) &&
2335                                      skb_checksum_help(skb))
2336                                         goto out_kfree_skb;
2337                         }
2338                 }
2339
2340                 if (!list_empty(&ptype_all))
2341                         dev_queue_xmit_nit(skb, dev);
2342
2343                 skb_len = skb->len;
2344                 rc = ops->ndo_start_xmit(skb, dev);
2345                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2346                 if (rc == NETDEV_TX_OK)
2347                         txq_trans_update(txq);
2348                 return rc;
2349         }
2350
2351 gso:
2352         do {
2353                 struct sk_buff *nskb = skb->next;
2354
2355                 skb->next = nskb->next;
2356                 nskb->next = NULL;
2357
2358                 /*
2359                  * If device doesn't need nskb->dst, release it right now while
2360                  * its hot in this cpu cache
2361                  */
2362                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2363                         skb_dst_drop(nskb);
2364
2365                 if (!list_empty(&ptype_all))
2366                         dev_queue_xmit_nit(nskb, dev);
2367
2368                 skb_len = nskb->len;
2369                 rc = ops->ndo_start_xmit(nskb, dev);
2370                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2371                 if (unlikely(rc != NETDEV_TX_OK)) {
2372                         if (rc & ~NETDEV_TX_MASK)
2373                                 goto out_kfree_gso_skb;
2374                         nskb->next = skb->next;
2375                         skb->next = nskb;
2376                         return rc;
2377                 }
2378                 txq_trans_update(txq);
2379                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2380                         return NETDEV_TX_BUSY;
2381         } while (skb->next);
2382
2383 out_kfree_gso_skb:
2384         if (likely(skb->next == NULL))
2385                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2386 out_kfree_skb:
2387         kfree_skb(skb);
2388 out:
2389         return rc;
2390 }
2391
2392 static u32 hashrnd __read_mostly;
2393
2394 /*
2395  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2396  * to be used as a distribution range.
2397  */
2398 u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2399                   unsigned int num_tx_queues)
2400 {
2401         u32 hash;
2402         u16 qoffset = 0;
2403         u16 qcount = num_tx_queues;
2404
2405         if (skb_rx_queue_recorded(skb)) {
2406                 hash = skb_get_rx_queue(skb);
2407                 while (unlikely(hash >= num_tx_queues))
2408                         hash -= num_tx_queues;
2409                 return hash;
2410         }
2411
2412         if (dev->num_tc) {
2413                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2414                 qoffset = dev->tc_to_txq[tc].offset;
2415                 qcount = dev->tc_to_txq[tc].count;
2416         }
2417
2418         if (skb->sk && skb->sk->sk_hash)
2419                 hash = skb->sk->sk_hash;
2420         else
2421                 hash = (__force u16) skb->protocol;
2422         hash = jhash_1word(hash, hashrnd);
2423
2424         return (u16) (((u64) hash * qcount) >> 32) + qoffset;
2425 }
2426 EXPORT_SYMBOL(__skb_tx_hash);
2427
2428 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2429 {
2430         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2431                 net_warn_ratelimited("%s selects TX queue %d, but real number of TX queues is %d\n",
2432                                      dev->name, queue_index,
2433                                      dev->real_num_tx_queues);
2434                 return 0;
2435         }
2436         return queue_index;
2437 }
2438
/*
 * Pick a TX queue from the device's XPS map for the current CPU.
 * Returns the queue index, or -1 when XPS is not built in, no map exists
 * for this CPU, or the selected queue is not below real_num_tx_queues.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();

	dev_maps = rcu_dereference(dev->xps_maps);
	if (!dev_maps)
		goto unlock;

	map = rcu_dereference(dev_maps->cpu_map[raw_smp_processor_id()]);
	if (!map)
		goto unlock;

	if (map->len == 1) {
		queue_index = map->queues[0];
	} else {
		u32 hash;

		if (skb->sk && skb->sk->sk_hash)
			hash = skb->sk->sk_hash;
		else
			hash = (__force u16) skb->protocol ^
			    skb->rxhash;
		hash = jhash_1word(hash, hashrnd);

		/* Scale the hash onto the map's queue list. */
		queue_index = map->queues[((u64)hash * map->len) >> 32];
	}

	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = -1;

unlock:
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2476
2477 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
2478                                     struct sk_buff *skb)
2479 {
2480         int queue_index;
2481         const struct net_device_ops *ops = dev->netdev_ops;
2482
2483         if (dev->real_num_tx_queues == 1)
2484                 queue_index = 0;
2485         else if (ops->ndo_select_queue) {
2486                 queue_index = ops->ndo_select_queue(dev, skb);
2487                 queue_index = dev_cap_txqueue(dev, queue_index);
2488         } else {
2489                 struct sock *sk = skb->sk;
2490                 queue_index = sk_tx_queue_get(sk);
2491
2492                 if (queue_index < 0 || skb->ooo_okay ||
2493                     queue_index >= dev->real_num_tx_queues) {
2494                         int old_index = queue_index;
2495
2496                         queue_index = get_xps_queue(dev, skb);
2497                         if (queue_index < 0)
2498                                 queue_index = skb_tx_hash(dev, skb);
2499
2500                         if (queue_index != old_index && sk) {
2501                                 struct dst_entry *dst =
2502                                     rcu_dereference_check(sk->sk_dst_cache, 1);
2503
2504                                 if (dst && skb_dst(skb) == dst)
2505                                         sk_tx_queue_set(sk, queue_index);
2506                         }
2507                 }
2508         }
2509
2510         skb_set_queue_mapping(skb, queue_index);
2511         return netdev_get_tx_queue(dev, queue_index);
2512 }
2513
2514 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2515                                  struct net_device *dev,
2516                                  struct netdev_queue *txq)
2517 {
2518         spinlock_t *root_lock = qdisc_lock(q);
2519         bool contended;
2520         int rc;
2521
2522         qdisc_skb_cb(skb)->pkt_len = skb->len;
2523         qdisc_calculate_pkt_len(skb, q);
2524         /*
2525          * Heuristic to force contended enqueues to serialize on a
2526          * separate lock before trying to get qdisc main lock.
2527          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2528          * and dequeue packets faster.
2529          */
2530         contended = qdisc_is_running(q);
2531         if (unlikely(contended))
2532                 spin_lock(&q->busylock);
2533
2534         spin_lock(root_lock);
2535         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2536                 kfree_skb(skb);
2537                 rc = NET_XMIT_DROP;
2538         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2539                    qdisc_run_begin(q)) {
2540                 /*
2541                  * This is a work-conserving queue; there are no old skbs
2542                  * waiting to be sent out; and the qdisc is not running -
2543                  * xmit the skb directly.
2544                  */
2545                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2546                         skb_dst_force(skb);
2547
2548                 qdisc_bstats_update(q, skb);
2549
2550                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2551                         if (unlikely(contended)) {
2552                                 spin_unlock(&q->busylock);
2553                                 contended = false;
2554                         }
2555                         __qdisc_run(q);
2556                 } else
2557                         qdisc_run_end(q);
2558
2559                 rc = NET_XMIT_SUCCESS;
2560         } else {
2561                 skb_dst_force(skb);
2562                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2563                 if (qdisc_run_begin(q)) {
2564                         if (unlikely(contended)) {
2565                                 spin_unlock(&q->busylock);
2566                                 contended = false;
2567                         }
2568                         __qdisc_run(q);
2569                 }
2570         }
2571         spin_unlock(root_lock);
2572         if (unlikely(contended))
2573                 spin_unlock(&q->busylock);
2574         return rc;
2575 }
2576
2577 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2578 static void skb_update_prio(struct sk_buff *skb)
2579 {
2580         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2581
2582         if (!skb->priority && skb->sk && map) {
2583                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2584
2585                 if (prioidx < map->priomap_len)
2586                         skb->priority = map->priomap[prioidx];
2587         }
2588 }
2589 #else
2590 #define skb_update_prio(skb)
2591 #endif
2592
2593 static DEFINE_PER_CPU(int, xmit_recursion);
2594 #define RECURSION_LIMIT 10
2595
2596 /**
2597  *      dev_loopback_xmit - loop back @skb
2598  *      @skb: buffer to transmit
2599  */
2600 int dev_loopback_xmit(struct sk_buff *skb)
2601 {
2602         skb_reset_mac_header(skb);
2603         __skb_pull(skb, skb_network_offset(skb));
2604         skb->pkt_type = PACKET_LOOPBACK;
2605         skb->ip_summed = CHECKSUM_UNNECESSARY;
2606         WARN_ON(!skb_dst(skb));
2607         skb_dst_force(skb);
2608         netif_rx_ni(skb);
2609         return 0;
2610 }
2611 EXPORT_SYMBOL(dev_loopback_xmit);
2612
2613 /**
2614  *      dev_queue_xmit - transmit a buffer
2615  *      @skb: buffer to transmit
2616  *
2617  *      Queue a buffer for transmission to a network device. The caller must
2618  *      have set the device and priority and built the buffer before calling
2619  *      this function. The function can be called from an interrupt.
2620  *
2621  *      A negative errno code is returned on a failure. A success does not
2622  *      guarantee the frame will be transmitted as it may be dropped due
2623  *      to congestion or traffic shaping.
2624  *
2625  * -----------------------------------------------------------------------------------
2626  *      I notice this method can also return errors from the queue disciplines,
2627  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2628  *      be positive.
2629  *
2630  *      Regardless of the return value, the skb is consumed, so it is currently
2631  *      difficult to retry a send to this method.  (You can bump the ref count
2632  *      before sending to hold a reference for retry if you are careful.)
2633  *
2634  *      When calling this method, interrupts MUST be enabled.  This is because
2635  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2636  *          --BLG
2637  */
2638 int dev_queue_xmit(struct sk_buff *skb)
2639 {
2640         struct net_device *dev = skb->dev;
2641         struct netdev_queue *txq;
2642         struct Qdisc *q;
2643         int rc = -ENOMEM;
2644
2645         /* Disable soft irqs for various locks below. Also
2646          * stops preemption for RCU.
2647          */
2648         rcu_read_lock_bh();
2649
2650         skb_update_prio(skb);
2651
2652         txq = netdev_pick_tx(dev, skb);
2653         q = rcu_dereference_bh(txq->qdisc);
2654
2655 #ifdef CONFIG_NET_CLS_ACT
2656         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2657 #endif
2658         trace_net_dev_queue(skb);
2659         if (q->enqueue) {
2660                 rc = __dev_xmit_skb(skb, q, dev, txq);
2661                 goto out;
2662         }
2663
2664         /* The device has no queue. Common case for software devices:
2665            loopback, all the sorts of tunnels...
2666
2667            Really, it is unlikely that netif_tx_lock protection is necessary
2668            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2669            counters.)
2670            However, it is possible, that they rely on protection
2671            made by us here.
2672
2673            Check this and shot the lock. It is not prone from deadlocks.
2674            Either shot noqueue qdisc, it is even simpler 8)
2675          */
2676         if (dev->flags & IFF_UP) {
2677                 int cpu = smp_processor_id(); /* ok because BHs are off */
2678
2679                 if (txq->xmit_lock_owner != cpu) {
2680
2681                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2682                                 goto recursion_alert;
2683
2684                         HARD_TX_LOCK(dev, txq, cpu);
2685
2686                         if (!netif_xmit_stopped(txq)) {
2687                                 __this_cpu_inc(xmit_recursion);
2688                                 rc = dev_hard_start_xmit(skb, dev, txq);
2689                                 __this_cpu_dec(xmit_recursion);
2690                                 if (dev_xmit_complete(rc)) {
2691                                         HARD_TX_UNLOCK(dev, txq);
2692                                         goto out;
2693                                 }
2694                         }
2695                         HARD_TX_UNLOCK(dev, txq);
2696                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2697                                              dev->name);
2698                 } else {
2699                         /* Recursion is detected! It is possible,
2700                          * unfortunately
2701                          */
2702 recursion_alert:
2703                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2704                                              dev->name);
2705                 }
2706         }
2707
2708         rc = -ENETDOWN;
2709         rcu_read_unlock_bh();
2710
2711         kfree_skb(skb);
2712         return rc;
2713 out:
2714         rcu_read_unlock_bh();
2715         return rc;
2716 }
2717 EXPORT_SYMBOL(dev_queue_xmit);
2718
2719
2720 /*=======================================================================
2721                         Receiver routines
2722   =======================================================================*/
2723
2724 int netdev_max_backlog __read_mostly = 1000;
2725 EXPORT_SYMBOL(netdev_max_backlog);
2726
2727 int netdev_tstamp_prequeue __read_mostly = 1;
2728 int netdev_budget __read_mostly = 300;
2729 int weight_p __read_mostly = 64;            /* old backlog weight */
2730
2731 /* Called with irq disabled */
2732 static inline void ____napi_schedule(struct softnet_data *sd,
2733                                      struct napi_struct *napi)
2734 {
2735         list_add_tail(&napi->poll_list, &sd->poll_list);
2736         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2737 }
2738
2739 /*
2740  * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2741  * and src/dst port numbers.  Sets rxhash in skb to non-zero hash value
2742  * on success, zero indicates no valid hash.  Also, sets l4_rxhash in skb
2743  * if hash is a canonical 4-tuple hash over transport ports.
2744  */
2745 void __skb_get_rxhash(struct sk_buff *skb)
2746 {
2747         struct flow_keys keys;
2748         u32 hash;
2749
2750         if (!skb_flow_dissect(skb, &keys))
2751                 return;
2752
2753         if (keys.ports)
2754                 skb->l4_rxhash = 1;
2755
2756         /* get a consistent hash (same value on both flow directions) */
2757         if (((__force u32)keys.dst < (__force u32)keys.src) ||
2758             (((__force u32)keys.dst == (__force u32)keys.src) &&
2759              ((__force u16)keys.port16[1] < (__force u16)keys.port16[0]))) {
2760                 swap(keys.dst, keys.src);
2761                 swap(keys.port16[0], keys.port16[1]);
2762         }
2763
2764         hash = jhash_3words((__force u32)keys.dst,
2765                             (__force u32)keys.src,
2766                             (__force u32)keys.ports, hashrnd);
2767         if (!hash)
2768                 hash = 1;
2769
2770         skb->rxhash = hash;
2771 }
2772 EXPORT_SYMBOL(__skb_get_rxhash);
2773
2774 #ifdef CONFIG_RPS
2775
2776 /* One global table that all flow-based protocols share. */
2777 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2778 EXPORT_SYMBOL(rps_sock_flow_table);
2779
2780 struct static_key rps_needed __read_mostly;
2781
2782 static struct rps_dev_flow *
2783 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2784             struct rps_dev_flow *rflow, u16 next_cpu)
2785 {
2786         if (next_cpu != RPS_NO_CPU) {
2787 #ifdef CONFIG_RFS_ACCEL
2788                 struct netdev_rx_queue *rxqueue;
2789                 struct rps_dev_flow_table *flow_table;
2790                 struct rps_dev_flow *old_rflow;
2791                 u32 flow_id;
2792                 u16 rxq_index;
2793                 int rc;
2794
2795                 /* Should we steer this flow to a different hardware queue? */
2796                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2797                     !(dev->features & NETIF_F_NTUPLE))
2798                         goto out;
2799                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2800                 if (rxq_index == skb_get_rx_queue(skb))
2801                         goto out;
2802
2803                 rxqueue = dev->_rx + rxq_index;
2804                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2805                 if (!flow_table)
2806                         goto out;
2807                 flow_id = skb->rxhash & flow_table->mask;
2808                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2809                                                         rxq_index, flow_id);
2810                 if (rc < 0)
2811                         goto out;
2812                 old_rflow = rflow;
2813                 rflow = &flow_table->flows[flow_id];
2814                 rflow->filter = rc;
2815                 if (old_rflow->filter == rflow->filter)
2816                         old_rflow->filter = RPS_NO_FILTER;
2817         out:
2818 #endif
2819                 rflow->last_qtail =
2820                         per_cpu(softnet_data, next_cpu).input_queue_head;
2821         }
2822
2823         rflow->cpu = next_cpu;
2824         return rflow;
2825 }
2826
2827 /*
2828  * get_rps_cpu is called from netif_receive_skb and returns the target
2829  * CPU from the RPS map of the receiving queue for a given skb.
2830  * rcu_read_lock must be held on entry.
2831  */
2832 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2833                        struct rps_dev_flow **rflowp)
2834 {
2835         struct netdev_rx_queue *rxqueue;
2836         struct rps_map *map;
2837         struct rps_dev_flow_table *flow_table;
2838         struct rps_sock_flow_table *sock_flow_table;
2839         int cpu = -1;
2840         u16 tcpu;
2841
2842         if (skb_rx_queue_recorded(skb)) {
2843                 u16 index = skb_get_rx_queue(skb);
2844                 if (unlikely(index >= dev->real_num_rx_queues)) {
2845                         WARN_ONCE(dev->real_num_rx_queues > 1,
2846                                   "%s received packet on queue %u, but number "
2847                                   "of RX queues is %u\n",
2848                                   dev->name, index, dev->real_num_rx_queues);
2849                         goto done;
2850                 }
2851                 rxqueue = dev->_rx + index;
2852         } else
2853                 rxqueue = dev->_rx;
2854
2855         map = rcu_dereference(rxqueue->rps_map);
2856         if (map) {
2857                 if (map->len == 1 &&
2858                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2859                         tcpu = map->cpus[0];
2860                         if (cpu_online(tcpu))
2861                                 cpu = tcpu;
2862                         goto done;
2863                 }
2864         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2865                 goto done;
2866         }
2867
2868         skb_reset_network_header(skb);
2869         if (!skb_get_rxhash(skb))
2870                 goto done;
2871
2872         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2873         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2874         if (flow_table && sock_flow_table) {
2875                 u16 next_cpu;
2876                 struct rps_dev_flow *rflow;
2877
2878                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2879                 tcpu = rflow->cpu;
2880
2881                 next_cpu = sock_flow_table->ents[skb->rxhash &
2882                     sock_flow_table->mask];
2883
2884                 /*
2885                  * If the desired CPU (where last recvmsg was done) is
2886                  * different from current CPU (one in the rx-queue flow
2887                  * table entry), switch if one of the following holds:
2888                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2889                  *   - Current CPU is offline.
2890                  *   - The current CPU's queue tail has advanced beyond the
2891                  *     last packet that was enqueued using this table entry.
2892                  *     This guarantees that all previous packets for the flow
2893                  *     have been dequeued, thus preserving in order delivery.
2894                  */
2895                 if (unlikely(tcpu != next_cpu) &&
2896                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2897                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2898                       rflow->last_qtail)) >= 0))
2899                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2900
2901                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2902                         *rflowp = rflow;
2903                         cpu = tcpu;
2904                         goto done;
2905                 }
2906         }
2907
2908         if (map) {
2909                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2910
2911                 if (cpu_online(tcpu)) {
2912                         cpu = tcpu;
2913                         goto done;
2914                 }
2915         }
2916
2917 done:
2918         return cpu;
2919 }
2920
2921 #ifdef CONFIG_RFS_ACCEL
2922
2923 /**
2924  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2925  * @dev: Device on which the filter was set
2926  * @rxq_index: RX queue index
2927  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2928  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2929  *
2930  * Drivers that implement ndo_rx_flow_steer() should periodically call
2931  * this function for each installed filter and remove the filters for
2932  * which it returns %true.
2933  */
2934 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2935                          u32 flow_id, u16 filter_id)
2936 {
2937         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2938         struct rps_dev_flow_table *flow_table;
2939         struct rps_dev_flow *rflow;
2940         bool expire = true;
2941         int cpu;
2942
2943         rcu_read_lock();
2944         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2945         if (flow_table && flow_id <= flow_table->mask) {
2946                 rflow = &flow_table->flows[flow_id];
2947                 cpu = ACCESS_ONCE(rflow->cpu);
2948                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2949                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2950                            rflow->last_qtail) <
2951                      (int)(10 * flow_table->mask)))
2952                         expire = false;
2953         }
2954         rcu_read_unlock();
2955         return expire;
2956 }
2957 EXPORT_SYMBOL(rps_may_expire_flow);
2958
2959 #endif /* CONFIG_RFS_ACCEL */
2960
2961 /* Called from hardirq (IPI) context */
2962 static void rps_trigger_softirq(void *data)
2963 {
2964         struct softnet_data *sd = data;
2965
2966         ____napi_schedule(sd, &sd->backlog);
2967         sd->received_rps++;
2968 }
2969
2970 #endif /* CONFIG_RPS */
2971
/*
 * Check whether @sd belongs to another CPU.
 * If it does, push it onto this CPU's rps_ipi_list, raise NET_RX_SOFTIRQ
 * and return 1.  Otherwise (or without CONFIG_RPS) return 0.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

	if (sd == local_sd)
		return 0;

	sd->rps_ipi_next = local_sd->rps_ipi_list;
	local_sd->rps_ipi_list = sd;

	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	return 1;
#else
	return 0;
#endif /* CONFIG_RPS */
}
2992
2993 /*
2994  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2995  * queue (may be a remote CPU queue).
2996  */
2997 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2998                               unsigned int *qtail)
2999 {
3000         struct softnet_data *sd;
3001         unsigned long flags;
3002
3003         sd = &per_cpu(softnet_data, cpu);
3004
3005         local_irq_save(flags);
3006
3007         rps_lock(sd);
3008         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3009                 if (skb_queue_len(&sd->input_pkt_queue)) {
3010 enqueue:
3011                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3012                         input_queue_tail_incr_save(sd, qtail);
3013                         rps_unlock(sd);
3014                         local_irq_restore(flags);
3015                         return NET_RX_SUCCESS;
3016                 }
3017
3018                 /* Schedule NAPI for backlog device
3019                  * We can use non atomic operation since we own the queue lock
3020                  */
3021                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3022                         if (!rps_ipi_queued(sd))
3023                                 ____napi_schedule(sd, &sd->backlog);
3024                 }
3025                 goto enqueue;
3026         }
3027
3028         sd->dropped++;
3029         rps_unlock(sd);
3030
3031         local_irq_restore(flags);
3032
3033         atomic_long_inc(&skb->dev->rx_dropped);
3034         kfree_skb(skb);
3035         return NET_RX_DROP;
3036 }
3037
3038 /**
3039  *      netif_rx        -       post buffer to the network code
3040  *      @skb: buffer to post
3041  *
3042  *      This function receives a packet from a device driver and queues it for
3043  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3044  *      may be dropped during processing for congestion control or by the
3045  *      protocol layers.
3046  *
3047  *      return values:
3048  *      NET_RX_SUCCESS  (no congestion)
3049  *      NET_RX_DROP     (packet was dropped)
3050  *
3051  */
3052
3053 int netif_rx(struct sk_buff *skb)
3054 {
3055         int ret;
3056
3057         /* if netpoll wants it, pretend we never saw it */
3058         if (netpoll_rx(skb))
3059                 return NET_RX_DROP;
3060
3061         net_timestamp_check(netdev_tstamp_prequeue, skb);
3062
3063         trace_netif_rx(skb);
3064 #ifdef CONFIG_RPS
3065         if (static_key_false(&rps_needed)) {
3066                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3067                 int cpu;
3068
3069                 preempt_disable();
3070                 rcu_read_lock();
3071
3072                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3073                 if (cpu < 0)
3074                         cpu = smp_processor_id();
3075
3076                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3077
3078                 rcu_read_unlock();
3079                 preempt_enable();
3080         } else
3081 #endif
3082         {
3083                 unsigned int qtail;
3084                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3085                 put_cpu();
3086         }
3087         return ret;
3088 }
3089 EXPORT_SYMBOL(netif_rx);
3090
/*
 * Same as netif_rx(), but additionally runs any softirq that is pending
 * afterwards, with preemption disabled around both steps.
 * Returns the value returned by netif_rx().
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3104
3105 static void net_tx_action(struct softirq_action *h)
3106 {
3107         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3108
3109         if (sd->completion_queue) {
3110                 struct sk_buff *clist;
3111
3112                 local_irq_disable();
3113                 clist = sd->completion_queue;
3114                 sd->completion_queue = NULL;
3115                 local_irq_enable();
3116
3117                 while (clist) {
3118                         struct sk_buff *skb = clist;
3119                         clist = clist->next;
3120
3121                         WARN_ON(atomic_read(&skb->users));
3122                         trace_kfree_skb(skb, net_tx_action);
3123                         __kfree_skb(skb);
3124                 }
3125         }
3126
3127         if (sd->output_queue) {
3128                 struct Qdisc *head;
3129
3130                 local_irq_disable();
3131                 head = sd->output_queue;
3132                 sd->output_queue = NULL;
3133                 sd->output_queue_tailp = &sd->output_queue;
3134                 local_irq_enable();
3135
3136                 while (head) {
3137                         struct Qdisc *q = head;
3138                         spinlock_t *root_lock;
3139
3140                         head = head->next_sched;
3141
3142                         root_lock = qdisc_lock(q);
3143                         if (spin_trylock(root_lock)) {
3144                                 smp_mb__before_clear_bit();
3145                                 clear_bit(__QDISC_STATE_SCHED,
3146                                           &q->state);
3147                                 qdisc_run(q);
3148                                 spin_unlock(root_lock);
3149                         } else {
3150                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3151                                               &q->state)) {
3152                                         __netif_reschedule(q);
3153                                 } else {
3154                                         smp_mb__before_clear_bit();
3155                                         clear_bit(__QDISC_STATE_SCHED,
3156                                                   &q->state);
3157                                 }
3158                         }
3159                 }
3160         }
3161 }
3162
3163 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3164     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3165 /* This hook is defined here for ATM LANE */
3166 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3167                              unsigned char *addr) __read_mostly;
3168 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3169 #endif
3170
3171 #ifdef CONFIG_NET_CLS_ACT
3172 /* TODO: Maybe we should just force sch_ingress to be compiled in
3173  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3174  * a compare and 2 stores extra right now if we dont have it on
3175  * but have CONFIG_NET_CLS_ACT
3176  * NOTE: This doesn't stop any functionality; if you dont have
3177  * the ingress scheduler, you just can't add policies on ingress.
3178  *
3179  */
3180 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3181 {
3182         struct net_device *dev = skb->dev;
3183         u32 ttl = G_TC_RTTL(skb->tc_verd);
3184         int result = TC_ACT_OK;
3185         struct Qdisc *q;
3186
3187         if (unlikely(MAX_RED_LOOP < ttl++)) {
3188                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3189                                      skb->skb_iif, dev->ifindex);
3190                 return TC_ACT_SHOT;
3191         }
3192
3193         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3194         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3195
3196         q = rxq->qdisc;
3197         if (q != &noop_qdisc) {
3198                 spin_lock(qdisc_lock(q));
3199                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3200                         result = qdisc_enqueue_root(skb, q);
3201                 spin_unlock(qdisc_lock(q));
3202         }
3203
3204         return result;
3205 }
3206
3207 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3208                                          struct packet_type **pt_prev,
3209                                          int *ret, struct net_device *orig_dev)
3210 {
3211         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3212
3213         if (!rxq || rxq->qdisc == &noop_qdisc)
3214                 goto out;
3215
3216         if (*pt_prev) {
3217                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3218                 *pt_prev = NULL;
3219         }
3220
3221         switch (ing_filter(skb, rxq)) {
3222         case TC_ACT_SHOT:
3223         case TC_ACT_STOLEN:
3224                 kfree_skb(skb);
3225                 return NULL;
3226         }
3227
3228 out:
3229         skb->tc_verd = 0;
3230         return skb;
3231 }
3232 #endif
3233
3234 /**
3235  *      netdev_rx_handler_register - register receive handler
3236  *      @dev: device to register a handler for
3237  *      @rx_handler: receive handler to register
3238  *      @rx_handler_data: data pointer that is used by rx handler
3239  *
3240  *      Register a receive hander for a device. This handler will then be
3241  *      called from __netif_receive_skb. A negative errno code is returned
3242  *      on a failure.
3243  *
3244  *      The caller must hold the rtnl_mutex.
3245  *
3246  *      For a general description of rx_handler, see enum rx_handler_result.
3247  */
3248 int netdev_rx_handler_register(struct net_device *dev,
3249                                rx_handler_func_t *rx_handler,
3250                                void *rx_handler_data)
3251 {
3252         ASSERT_RTNL();
3253
3254         if (dev->rx_handler)
3255                 return -EBUSY;
3256
3257         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3258         rcu_assign_pointer(dev->rx_handler, rx_handler);
3259
3260         return 0;
3261 }
3262 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3263
3264 /**
3265  *      netdev_rx_handler_unregister - unregister receive handler
3266  *      @dev: device to unregister a handler from
3267  *
3268  *      Unregister a receive hander from a device.
3269  *
3270  *      The caller must hold the rtnl_mutex.
3271  */
3272 void netdev_rx_handler_unregister(struct net_device *dev)
3273 {
3274
3275         ASSERT_RTNL();
3276         RCU_INIT_POINTER(dev->rx_handler, NULL);
3277         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3278 }
3279 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3280
3281 /*
3282  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3283  * the special handling of PFMEMALLOC skbs.
3284  */
3285 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3286 {
3287         switch (skb->protocol) {
3288         case __constant_htons(ETH_P_ARP):
3289         case __constant_htons(ETH_P_IP):
3290         case __constant_htons(ETH_P_IPV6):
3291         case __constant_htons(ETH_P_8021Q):
3292                 return true;
3293         default:
3294                 return false;
3295         }
3296 }
3297
3298 static int __netif_receive_skb(struct sk_buff *skb)
3299 {
3300         struct packet_type *ptype, *pt_prev;
3301         rx_handler_func_t *rx_handler;
3302         struct net_device *orig_dev;
3303         struct net_device *null_or_dev;
3304         bool deliver_exact = false;
3305         int ret = NET_RX_DROP;
3306         __be16 type;
3307         unsigned long pflags = current->flags;
3308
3309         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3310
3311         trace_netif_receive_skb(skb);
3312
3313         /*
3314          * PFMEMALLOC skbs are special, they should
3315          * - be delivered to SOCK_MEMALLOC sockets only
3316          * - stay away from userspace
3317          * - have bounded memory usage
3318          *
3319          * Use PF_MEMALLOC as this saves us from propagating the allocation
3320          * context down to all allocation sites.
3321          */
3322         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3323                 current->flags |= PF_MEMALLOC;
3324
3325         /* if we've gotten here through NAPI, check netpoll */
3326         if (netpoll_receive_skb(skb))
3327                 goto out;
3328
3329         orig_dev = skb->dev;
3330
3331         skb_reset_network_header(skb);
3332         skb_reset_transport_header(skb);
3333         skb_reset_mac_len(skb);
3334
3335         pt_prev = NULL;
3336
3337         rcu_read_lock();
3338
3339 another_round:
3340         skb->skb_iif = skb->dev->ifindex;
3341
3342         __this_cpu_inc(softnet_data.processed);
3343
3344         if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3345                 skb = vlan_untag(skb);
3346                 if (unlikely(!skb))
3347                         goto unlock;
3348         }
3349
3350 #ifdef CONFIG_NET_CLS_ACT
3351         if (skb->tc_verd & TC_NCLS) {
3352                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3353                 goto ncls;
3354         }
3355 #endif
3356
3357         if (sk_memalloc_socks() && skb_pfmemalloc(skb))
3358                 goto skip_taps;
3359
3360         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3361                 if (!ptype->dev || ptype->dev == skb->dev) {
3362                         if (pt_prev)
3363                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3364                         pt_prev = ptype;
3365                 }
3366         }
3367
3368 skip_taps:
3369 #ifdef CONFIG_NET_CLS_ACT
3370         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3371         if (!skb)
3372                 goto unlock;
3373 ncls:
3374 #endif
3375
3376         if (sk_memalloc_socks() && skb_pfmemalloc(skb)
3377                                 && !skb_pfmemalloc_protocol(skb))
3378                 goto drop;
3379
3380         if (vlan_tx_tag_present(skb)) {
3381                 if (pt_prev) {
3382                         ret = deliver_skb(skb, pt_prev, orig_dev);
3383                         pt_prev = NULL;
3384                 }
3385                 if (vlan_do_receive(&skb))
3386                         goto another_round;
3387                 else if (unlikely(!skb))
3388                         goto unlock;
3389         }
3390
3391         rx_handler = rcu_dereference(skb->dev->rx_handler);
3392         if (rx_handler) {
3393                 if (pt_prev) {
3394                         ret = deliver_skb(skb, pt_prev, orig_dev);
3395                         pt_prev = NULL;
3396                 }
3397                 switch (rx_handler(&skb)) {
3398                 case RX_HANDLER_CONSUMED:
3399                         goto unlock;
3400                 case RX_HANDLER_ANOTHER:
3401                         goto another_round;
3402                 case RX_HANDLER_EXACT:
3403                         deliver_exact = true;
3404                 case RX_HANDLER_PASS:
3405                         break;
3406                 default:
3407                         BUG();
3408                 }
3409         }
3410
3411         if (vlan_tx_nonzero_tag_present(skb))
3412                 skb->pkt_type = PACKET_OTHERHOST;
3413
3414         /* deliver only exact match when indicated */
3415         null_or_dev = deliver_exact ? skb->dev : NULL;
3416
3417         type = skb->protocol;
3418         list_for_each_entry_rcu(ptype,
3419                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3420                 if (ptype->type == type &&
3421                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3422                      ptype->dev == orig_dev)) {
3423                         if (pt_prev)
3424                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3425                         pt_prev = ptype;
3426                 }
3427         }
3428
3429         if (pt_prev) {
3430                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3431                         goto drop;
3432                 else
3433                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3434         } else {
3435 drop:
3436                 atomic_long_inc(&skb->dev->rx_dropped);
3437                 kfree_skb(skb);
3438                 /* Jamal, now you will not able to escape explaining
3439                  * me how you were going to use this. :-)
3440                  */
3441                 ret = NET_RX_DROP;
3442         }
3443
3444 unlock:
3445         rcu_read_unlock();
3446 out:
3447         tsk_restore_flags(current, pflags, PF_MEMALLOC);
3448         return ret;
3449 }
3450
3451 /**
3452  *      netif_receive_skb - process receive buffer from network
3453  *      @skb: buffer to process
3454  *
3455  *      netif_receive_skb() is the main receive data processing function.
3456  *      It always succeeds. The buffer may be dropped during processing
3457  *      for congestion control or by the protocol layers.
3458  *
3459  *      This function may only be called from softirq context and interrupts
3460  *      should be enabled.
3461  *
3462  *      Return values (usually ignored):
3463  *      NET_RX_SUCCESS: no congestion
3464  *      NET_RX_DROP: packet was dropped
3465  */
3466 int netif_receive_skb(struct sk_buff *skb)
3467 {
3468         net_timestamp_check(netdev_tstamp_prequeue, skb);
3469
3470         if (skb_defer_rx_timestamp(skb))
3471                 return NET_RX_SUCCESS;
3472
3473 #ifdef CONFIG_RPS
3474         if (static_key_false(&rps_needed)) {
3475                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3476                 int cpu, ret;
3477
3478                 rcu_read_lock();
3479
3480                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3481
3482                 if (cpu >= 0) {
3483                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3484                         rcu_read_unlock();
3485                         return ret;
3486                 }
3487                 rcu_read_unlock();
3488         }
3489 #endif
3490         return __netif_receive_skb(skb);
3491 }
3492 EXPORT_SYMBOL(netif_receive_skb);
3493
3494 /* Network device is going away, flush any packets still pending
3495  * Called with irqs disabled.
3496  */
3497 static void flush_backlog(void *arg)
3498 {
3499         struct net_device *dev = arg;
3500         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3501         struct sk_buff *skb, *tmp;
3502
3503         rps_lock(sd);
3504         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3505                 if (skb->dev == dev) {
3506                         __skb_unlink(skb, &sd->input_pkt_queue);
3507                         kfree_skb(skb);
3508                         input_queue_head_incr(sd);
3509                 }
3510         }
3511         rps_unlock(sd);
3512
3513         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3514                 if (skb->dev == dev) {
3515                         __skb_unlink(skb, &sd->process_queue);
3516                         kfree_skb(skb);
3517                         input_queue_head_incr(sd);
3518                 }
3519         }
3520 }
3521
3522 static int napi_gro_complete(struct sk_buff *skb)
3523 {
3524         struct packet_offload *ptype;
3525         __be16 type = skb->protocol;
3526         struct list_head *head = &offload_base;
3527         int err = -ENOENT;
3528
3529         if (NAPI_GRO_CB(skb)->count == 1) {
3530                 skb_shinfo(skb)->gso_size = 0;
3531                 goto out;
3532         }
3533
3534         rcu_read_lock();
3535         list_for_each_entry_rcu(ptype, head, list) {
3536                 if (ptype->type != type || !ptype->gro_complete)
3537                         continue;
3538
3539                 err = ptype->gro_complete(skb);
3540                 break;
3541         }
3542         rcu_read_unlock();
3543
3544         if (err) {
3545                 WARN_ON(&ptype->list == head);
3546                 kfree_skb(skb);
3547                 return NET_RX_SUCCESS;
3548         }
3549
3550 out:
3551         return netif_receive_skb(skb);
3552 }
3553
3554 /* napi->gro_list contains packets ordered by age.
3555  * youngest packets at the head of it.
3556  * Complete skbs in reverse order to reduce latencies.
3557  */
3558 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3559 {
3560         struct sk_buff *skb, *prev = NULL;
3561
3562         /* scan list and build reverse chain */
3563         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3564                 skb->prev = prev;
3565                 prev = skb;
3566         }
3567
3568         for (skb = prev; skb; skb = prev) {
3569                 skb->next = NULL;
3570
3571                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3572                         return;
3573
3574                 prev = skb->prev;
3575                 napi_gro_complete(skb);
3576                 napi->gro_count--;
3577         }
3578
3579         napi->gro_list = NULL;
3580 }
3581 EXPORT_SYMBOL(napi_gro_flush);
3582
/*
 * dev_gro_receive - offer @skb to the GRO handler registered for its protocol
 * @napi: NAPI context whose gro_list holds the packets currently held
 * @skb: incoming buffer
 *
 * GRO is bypassed (GRO_NORMAL) when the device lacks NETIF_F_GRO, when
 * netpoll_rx_on() is true, when the skb is already GSO or has a frag list,
 * or when no offload_base entry has a matching type and a gro_receive
 * callback.  Otherwise the callback's verdict is read back from
 * NAPI_GRO_CB(skb) and the result is GRO_MERGED, GRO_MERGED_FREE, GRO_HELD
 * or GRO_NORMAL.
 */
3583 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3584 {
3585         struct sk_buff **pp = NULL;
3586         struct packet_offload *ptype;
3587         __be16 type = skb->protocol;
3588         struct list_head *head = &offload_base;
3589         int same_flow;
3590         int mac_len;
3591         enum gro_result ret;
3592
3593         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3594                 goto normal;
3595
3596         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3597                 goto normal;
3598
             /* Run the first handler that matches the protocol and implements
              * gro_receive; the per-skb GRO control fields are reset first.
              */
3599         rcu_read_lock();
3600         list_for_each_entry_rcu(ptype, head, list) {
3601                 if (ptype->type != type || !ptype->gro_receive)
3602                         continue;
3603
3604                 skb_set_network_header(skb, skb_gro_offset(skb));
3605                 mac_len = skb->network_header - skb->mac_header;
3606                 skb->mac_len = mac_len;
3607                 NAPI_GRO_CB(skb)->same_flow = 0;
3608                 NAPI_GRO_CB(skb)->flush = 0;
3609                 NAPI_GRO_CB(skb)->free = 0;
3610
3611                 pp = ptype->gro_receive(&napi->gro_list, skb);
3612                 break;
3613         }
3614         rcu_read_unlock();
3615
             /* The walk ran off the end of the list: no handler matched. */
3616         if (&ptype->list == head)
3617                 goto normal;
3618
3619         same_flow = NAPI_GRO_CB(skb)->same_flow;
3620         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3621
             /* A non-NULL pp points at a held packet that must be completed
              * now: unlink it from the list and hand it to napi_gro_complete().
              */
3622         if (pp) {
3623                 struct sk_buff *nskb = *pp;
3624
3625                 *pp = nskb->next;
3626                 nskb->next = NULL;
3627                 napi_gro_complete(nskb);
3628                 napi->gro_count--;
3629         }
3630
3631         if (same_flow)
3632                 goto ok;
3633
3634         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3635                 goto normal;
3636
             /* Start holding this skb: it becomes the new head of gro_list. */
3637         napi->gro_count++;
3638         NAPI_GRO_CB(skb)->count = 1;
3639         NAPI_GRO_CB(skb)->age = jiffies;
3640         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3641         skb->next = napi->gro_list;
3642         napi->gro_list = skb;
3643         ret = GRO_HELD;
3644
             /* If the GRO offset lies beyond the linear area, copy the missing
              * bytes from frag0 to the tail and shrink frags[0] by the same
              * amount; frags[0] is released and removed once it is empty.
              */
3645 pull:
3646         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3647                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3648
3649                 BUG_ON(skb->end - skb->tail < grow);
3650
3651                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3652
3653                 skb->tail += grow;
3654                 skb->data_len -= grow;
3655
3656                 skb_shinfo(skb)->frags[0].page_offset += grow;
3657                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3658
3659                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3660                         skb_frag_unref(skb, 0);
3661                         memmove(skb_shinfo(skb)->frags,
3662                                 skb_shinfo(skb)->frags + 1,
3663                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3664                 }
3665         }
3666
3667 ok:
3668         return ret;
3669
             /* The GRO_NORMAL path still runs the pull step before returning. */
3670 normal:
3671         ret = GRO_NORMAL;
3672         goto pull;
3673 }
3674 EXPORT_SYMBOL(dev_gro_receive);
3675
3676 static inline gro_result_t
3677 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3678 {
3679         struct sk_buff *p;
3680         unsigned int maclen = skb->dev->hard_header_len;
3681
3682         for (p = napi->gro_list; p; p = p->next) {
3683                 unsigned long diffs;
3684
3685                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3686                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3687                 if (maclen == ETH_HLEN)
3688                         diffs |= compare_ether_header(skb_mac_header(p),
3689                                                       skb_gro_mac_header(skb));
3690                 else if (!diffs)
3691                         diffs = memcmp(skb_mac_header(p),
3692                                        skb_gro_mac_header(skb),
3693                                        maclen);
3694                 NAPI_GRO_CB(p)->same_flow = !diffs;
3695                 NAPI_GRO_CB(p)->flush = 0;
3696         }
3697
3698         return dev_gro_receive(napi, skb);
3699 }
3700
3701 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3702 {
3703         switch (ret) {
3704         case GRO_NORMAL:
3705                 if (netif_receive_skb(skb))
3706                         ret = GRO_DROP;
3707                 break;
3708
3709         case GRO_DROP:
3710                 kfree_skb(skb);
3711                 break;
3712
3713         case GRO_MERGED_FREE:
3714                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3715                         kmem_cache_free(skbuff_head_cache, skb);
3716                 else
3717                         __kfree_skb(skb);
3718                 break;
3719
3720         case GRO_HELD:
3721         case GRO_MERGED:
3722                 break;
3723         }
3724
3725         return ret;
3726 }
3727 EXPORT_SYMBOL(napi_skb_finish);
3728
3729 static void skb_gro_reset_offset(struct sk_buff *skb)
3730 {
3731         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3732         const skb_frag_t *frag0 = &pinfo->frags[0];
3733
3734         NAPI_GRO_CB(skb)->data_offset = 0;
3735         NAPI_GRO_CB(skb)->frag0 = NULL;
3736         NAPI_GRO_CB(skb)->frag0_len = 0;
3737
3738         if (skb->mac_header == skb->tail &&
3739             pinfo->nr_frags &&
3740             !PageHighMem(skb_frag_page(frag0))) {
3741                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3742                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3743         }
3744 }
3745
3746 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3747 {
3748         skb_gro_reset_offset(skb);
3749
3750         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3751 }
3752 EXPORT_SYMBOL(napi_gro_receive);
3753
3754 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3755 {
3756         __skb_pull(skb, skb_headlen(skb));
3757         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3758         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3759         skb->vlan_tci = 0;
3760         skb->dev = napi->dev;
3761         skb->skb_iif = 0;
3762
3763         napi->skb = skb;
3764 }
3765
3766 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3767 {
3768         struct sk_buff *skb = napi->skb;
3769
3770         if (!skb) {
3771                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3772                 if (skb)
3773                         napi->skb = skb;
3774         }
3775         return skb;
3776 }
3777 EXPORT_SYMBOL(napi_get_frags);
3778
3779 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3780                                gro_result_t ret)
3781 {
3782         switch (ret) {
3783         case GRO_NORMAL:
3784         case GRO_HELD:
3785                 skb->protocol = eth_type_trans(skb, skb->dev);
3786
3787                 if (ret == GRO_HELD)
3788                         skb_gro_pull(skb, -ETH_HLEN);
3789                 else if (netif_receive_skb(skb))
3790                         ret = GRO_DROP;
3791                 break;
3792
3793         case GRO_DROP:
3794         case GRO_MERGED_FREE:
3795                 napi_reuse_skb(napi, skb);
3796                 break;
3797
3798         case GRO_MERGED:
3799                 break;
3800         }
3801
3802         return ret;
3803 }
3804 EXPORT_SYMBOL(napi_frags_finish);
3805
3806 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3807 {
3808         struct sk_buff *skb = napi->skb;
3809         struct ethhdr *eth;
3810         unsigned int hlen;
3811         unsigned int off;
3812
3813         napi->skb = NULL;
3814
3815         skb_reset_mac_header(skb);
3816         skb_gro_reset_offset(skb);
3817
3818         off = skb_gro_offset(skb);
3819         hlen = off + sizeof(*eth);
3820         eth = skb_gro_header_fast(skb, off);
3821         if (skb_gro_header_hard(skb, hlen)) {
3822                 eth = skb_gro_header_slow(skb, hlen, off);
3823                 if (unlikely(!eth)) {
3824                         napi_reuse_skb(napi, skb);
3825                         skb = NULL;
3826                         goto out;
3827                 }
3828         }
3829
3830         skb_gro_pull(skb, sizeof(*eth));
3831
3832         /*
3833          * This works because the only protocols we care about don't require
3834          * special handling.  We'll fix it up properly at the end.
3835          */
3836         skb->protocol = eth->h_proto;
3837
3838 out:
3839         return skb;
3840 }
3841
3842 gro_result_t napi_gro_frags(struct napi_struct *napi)
3843 {
3844         struct sk_buff *skb = napi_frags_skb(napi);
3845
3846         if (!skb)
3847                 return GRO_DROP;
3848
3849         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3850 }
3851 EXPORT_SYMBOL(napi_gro_frags);
3852
/*
 * net_rps_action_and_irq_enable - send any pending IPIs for RPS.
 *
 * Note: called with local irq disabled, but exits with local irq enabled
 * on every path.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *pending = sd->rps_ipi_list;

	if (pending) {
		/* Detach the list before interrupts are turned back on. */
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		do {
			struct softnet_data *target = pending;

			/* Read the link before the IPI is issued. */
			pending = target->rps_ipi_next;

			if (cpu_online(target->cpu))
				__smp_call_function_single(target->cpu,
							   &target->csd, 0);
		} while (pending);

		return;
	}
#endif
	local_irq_enable();
}
3880
/*
 * process_backlog - poll callback that drains this softnet_data's backlog
 * @napi: the backlog napi_struct embedded in a struct softnet_data
 * @quota: maximum number of packets to process in this call
 *
 * Packets are taken from sd->process_queue and passed to
 * __netif_receive_skb() with interrupts enabled.  Once that queue is empty,
 * sd->input_pkt_queue is spliced onto it under rps_lock().  If fewer packets
 * were waiting than the remaining quota, the napi is removed from the poll
 * list, its state is cleared, and quota is lowered so the loop ends once
 * those packets are processed.
 *
 * Returns the number of packets processed.
 */
3881 static int process_backlog(struct napi_struct *napi, int quota)
3882 {
3883         int work = 0;
3884         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
3885
3886 #ifdef CONFIG_RPS
3887         /* Check if we have pending ipi, its better to send them now,
3888          * not waiting net_rx_action() end.
3889          */
3890         if (sd->rps_ipi_list) {
3891                 local_irq_disable();
3892                 net_rps_action_and_irq_enable(sd);
3893         }
3894 #endif
             /* The weight is refreshed from weight_p on every poll. */
3895         napi->weight = weight_p;
3896         local_irq_disable();
3897         while (work < quota) {
3898                 struct sk_buff *skb;
3899                 unsigned int qlen;
3900
                     /* Interrupts are off while dequeuing and on while the
                      * packet is being processed.
                      */
3901                 while ((skb = __skb_dequeue(&sd->process_queue))) {
3902                         local_irq_enable();
3903                         __netif_receive_skb(skb);
3904                         local_irq_disable();
3905                         input_queue_head_incr(sd);
3906                         if (++work >= quota) {
3907                                 local_irq_enable();
3908                                 return work;
3909                         }
3910                 }
3911
                     /* process_queue is empty: refill it from input_pkt_queue. */
3912                 rps_lock(sd);
3913                 qlen = skb_queue_len(&sd->input_pkt_queue);
3914                 if (qlen)
3915                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
3916                                                    &sd->process_queue);
3917
3918                 if (qlen < quota - work) {
3919                         /*
3920                          * Inline a custom version of __napi_complete().
3921                          * only current cpu owns and manipulates this napi,
3922                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3923                          * we can use a plain write instead of clear_bit(),
3924                          * and we dont need an smp_mb() memory barrier.
3925                          */
3926                         list_del(&napi->poll_list);
3927                         napi->state = 0;
3928
                             /* Stop after the packets just spliced over. */
3929                         quota = work + qlen;
3930                 }
3931                 rps_unlock(sd);
3932         }
3933         local_irq_enable();
3934
3935         return work;
3936 }
3937
3938 /**
3939  * __napi_schedule - schedule for receive
3940  * @n: entry to schedule
3941  *
3942  * The entry's receive function will be scheduled to run
3943  */
3944 void __napi_schedule(struct napi_struct *n)
3945 {
3946         unsigned long flags;
3947
3948         local_irq_save(flags);
3949         ____napi_schedule(&__get_cpu_var(softnet_data), n);
3950         local_irq_restore(flags);
3951 }
3952 EXPORT_SYMBOL(__napi_schedule);
3953
3954 void __napi_complete(struct napi_struct *n)
3955 {
3956         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3957         BUG_ON(n->gro_list);
3958
3959         list_del(&n->poll_list);
3960         smp_mb__before_clear_bit();
3961         clear_bit(NAPI_STATE_SCHED, &n->state);
3962 }
3963 EXPORT_SYMBOL(__napi_complete);
3964
3965 void napi_complete(struct napi_struct *n)
3966 {
3967         unsigned long flags;
3968
3969         /*
3970          * don't let napi dequeue from the cpu poll list
3971          * just in case its running on a different cpu
3972          */
3973         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3974                 return;
3975
3976         napi_gro_flush(n, false);
3977         local_irq_save(flags);
3978         __napi_complete(n);
3979         local_irq_restore(flags);
3980 }
3981 EXPORT_SYMBOL(napi_complete);
3982
3983 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3984                     int (*poll)(struct napi_struct *, int), int weight)
3985 {
3986         INIT_LIST_HEAD(&napi->poll_list);
3987         napi->gro_count = 0;
3988         napi->gro_list = NULL;
3989         napi->skb = NULL;
3990         napi->poll = poll;
3991         napi->weight = weight;
3992         list_add(&napi->dev_list, &dev->napi_list);
3993         napi->dev = dev;
3994 #ifdef CONFIG_NETPOLL
3995         spin_lock_init(&napi->poll_lock);
3996         napi->poll_owner = -1;
3997 #endif
3998         set_bit(NAPI_STATE_SCHED, &napi->state);
3999 }
4000 EXPORT_SYMBOL(netif_napi_add);
4001
4002 void netif_napi_del(struct napi_struct *napi)
4003 {
4004         struct sk_buff *skb, *next;
4005
4006         list_del_init(&napi->dev_list);
4007         napi_free_frags(napi);
4008
4009         for (skb = napi->gro_list; skb; skb = next) {
4010                 next = skb->next;
4011                 skb->next = NULL;
4012                 kfree_skb(skb);
4013         }
4014
4015         napi->gro_list = NULL;
4016         napi->gro_count = 0;
4017 }
4018 EXPORT_SYMBOL(netif_napi_del);
4019
/*
 * net_rx_action - poll the NAPI instances queued on this CPU's poll_list
 * @h: softirq action descriptor (not referenced in the body)
 *
 * Entries are polled from the head of sd->poll_list until the list is
 * empty, the budget (initialised from netdev_budget) is used up, or jiffies
 * has passed time_limit.  In the last two cases time_squeeze is incremented
 * and NET_RX_SOFTIRQ is raised again before leaving.  The function always
 * exits through net_rps_action_and_irq_enable() and, with CONFIG_NET_DMA,
 * pushes pending DMA copies afterwards.
 */
4020 static void net_rx_action(struct softirq_action *h)
4021 {
4022         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4023         unsigned long time_limit = jiffies + 2;
4024         int budget = netdev_budget;
4025         void *have;
4026
4027         local_irq_disable();
4028
4029         while (!list_empty(&sd->poll_list)) {
4030                 struct napi_struct *n;
4031                 int work, weight;
4032
4033                 /* If softirq window is exhausted then punt.
4034                  * Allow this to run for 2 jiffies, which will allow
4035                  * an average latency of 1.5/HZ.
4036                  */
4037                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
4038                         goto softnet_break;
4039
4040                 local_irq_enable();
4041
4042                 /* Even though interrupts have been re-enabled, this
4043                  * access is safe because interrupts can only add new
4044                  * entries to the tail of this list, and only ->poll()
4045                  * calls can remove this head entry from the list.
4046                  */
4047                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4048
4049                 have = netpoll_poll_lock(n);
4050
4051                 weight = n->weight;
4052
4053                 /* This NAPI_STATE_SCHED test is for avoiding a race
4054                  * with netpoll's poll_napi().  Only the entity which
4055                  * obtains the lock and sees NAPI_STATE_SCHED set will
4056                  * actually make the ->poll() call.  Therefore we avoid
4057                  * accidentally calling ->poll() when NAPI is not scheduled.
4058                  */
4059                 work = 0;
4060                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4061                         work = n->poll(n, weight);
4062                         trace_napi_poll(n);
4063                 }
4064
                     /* A poll callback must not report more work than its weight. */
4065                 WARN_ON_ONCE(work > weight);
4066
4067                 budget -= work;
4068
4069                 local_irq_disable();
4070
4071                 /* Drivers must not modify the NAPI state if they
4072                  * consume the entire weight.  In such cases this code
4073                  * still "owns" the NAPI instance and therefore can
4074                  * move the instance around on the list at-will.
4075                  */
4076                 if (unlikely(work == weight)) {
4077                         if (unlikely(napi_disable_pending(n))) {
4078                                 local_irq_enable();
4079                                 napi_complete(n);
4080                                 local_irq_disable();
4081                         } else {
4082                                 if (n->gro_list) {
4083                                         /* flush too old packets
4084                                          * If HZ < 1000, flush all packets.
4085                                          */
4086                                         local_irq_enable();
4087                                         napi_gro_flush(n, HZ >= 1000);
4088                                         local_irq_disable();
4089                                 }
                                     /* Requeue at the tail so other entries get a turn. */
4090                                 list_move_tail(&n->poll_list, &sd->poll_list);
4091                         }
4092                 }
4093
4094                 netpoll_poll_unlock(have);
4095         }
4096 out:
4097         net_rps_action_and_irq_enable(sd);
4098
4099 #ifdef CONFIG_NET_DMA
4100         /*
4101          * There may not be any more sk_buffs coming right now, so push
4102          * any pending DMA copies to hardware
4103          */
4104         dma_issue_pending_all();
4105 #endif
4106
4107         return;
4108
             /* Out of budget or time: record the squeeze and re-raise the softirq. */
4109 softnet_break:
4110         sd->time_squeeze++;
4111         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4112         goto out;
4113 }
4114
4115 static gifconf_func_t *gifconf_list[NPROTO];
4116
4117 /**
4118  *      register_gifconf        -       register a SIOCGIF handler
4119  *      @family: Address family
4120  *      @gifconf: Function handler
4121  *
4122  *      Register protocol dependent address dumping routines. The handler
4123  *      that is passed must not be freed or reused until it has been replaced
4124  *      by another handler.
4125  */
4126 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
4127 {
4128         if (family >= NPROTO)
4129                 return -EINVAL;
4130         gifconf_list[family] = gifconf;
4131         return 0;
4132 }
4133 EXPORT_SYMBOL(register_gifconf);
4134
4135
4136 /*
4137  *      Map an interface index to its name (SIOCGIFNAME)
4138  */
4139
4140 /*
4141  *      We need this ioctl for efficient implementation of the
4142  *      if_indextoname() function required by the IPv6 API.  Without
4143  *      it, we would have to search all the interfaces to find a
4144  *      match.  --pb
4145  */
4146
4147 static int dev_ifname(struct net *net, struct ifreq __user *arg)
4148 {
4149         struct net_device *dev;
4150         struct ifreq ifr;
4151
4152         /*
4153          *      Fetch the caller's info block.
4154          */
4155
4156         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4157                 return -EFAULT;
4158
4159         rcu_read_lock();
4160         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
4161         if (!dev) {
4162                 rcu_read_unlock();
4163                 return -ENODEV;
4164         }
4165
4166         strcpy(ifr.ifr_name, dev->name);
4167         rcu_read_unlock();
4168
4169         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
4170                 return -EFAULT;
4171         return 0;
4172 }
4173
4174 /*
4175  *      Perform a SIOCGIFCONF call. This structure will change
4176  *      size eventually, and there is nothing I can do about it.
4177  *      Thus we will need a 'compatibility mode'.
4178  */
4179
4180 static int dev_ifconf(struct net *net, char __user *arg)
4181 {
4182         struct ifconf ifc;
4183         struct net_device *dev;
4184         char __user *pos;
4185         int len;
4186         int total;
4187         int i;
4188
4189         /*
4190          *      Fetch the caller's info block.
4191          */
4192
4193         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4194                 return -EFAULT;
4195
4196         pos = ifc.ifc_buf;
4197         len = ifc.ifc_len;
4198
4199         /*
4200          *      Loop over the interfaces, and write an info block for each.
4201          */
4202
4203         total = 0;
4204         for_each_netdev(net, dev) {
4205                 for (i = 0; i < NPROTO; i++) {
4206                         if (gifconf_list[i]) {
4207                                 int done;
4208                                 if (!pos)
4209                                         done = gifconf_list[i](dev, NULL, 0);
4210                                 else
4211                                         done = gifconf_list[i](dev, pos + total,
4212                                                                len - total);
4213                                 if (done < 0)
4214                                         return -EFAULT;
4215                                 total += done;
4216                         }
4217                 }
4218         }
4219
4220         /*
4221          *      All done.  Write the updated control block back to the caller.
4222          */
4223         ifc.ifc_len = total;
4224
4225         /*
4226          *      Both BSD and Solaris return 0 here, so we do too.
4227          */
4228         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4229 }
4230
4231 #ifdef CONFIG_PROC_FS
4232
/*
 * A position value packs a hash bucket index in its upper bits and an
 * offset within that bucket in its lower BUCKET_SPACE bits.
 */
#define BUCKET_SPACE		(32 - NETDEV_HASHBITS - 1)

#define get_bucket(x)		((x) >> BUCKET_SPACE)
#define get_offset(x)		((x) & ((1 << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o)	(((b) << BUCKET_SPACE) | (o))
4238
4239 static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
4240 {
4241         struct net *net = seq_file_net(seq);
4242         struct net_device *dev;
4243         struct hlist_node *p;
4244         struct hlist_head *h;
4245         unsigned int count = 0, offset = get_offset(*pos);
4246
4247         h = &net->dev_name_head[get_bucket(*pos)];
4248         hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
4249                 if (++count == offset)
4250                         return dev;
4251         }
4252
4253         return NULL;
4254 }
4255
4256 static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
4257 {
4258         struct net_device *dev;
4259         unsigned int bucket;
4260
4261         do {
4262                 dev = dev_from_same_bucket(seq, pos);
4263                 if (dev)
4264                         return dev;
4265
4266                 bucket = get_bucket(*pos) + 1;
4267                 *pos = set_bucket_offset(bucket, 1);
4268         } while (bucket < NETDEV_HASHENTRIES);
4269
4270         return NULL;
4271 }
4272
4273 /*
4274  *      This is invoked by the /proc filesystem handler to display a device
4275  *      in detail.
4276  */
4277 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
4278         __acquires(RCU)
4279 {
4280         rcu_read_lock();
4281         if (!*pos)
4282                 return SEQ_START_TOKEN;
4283
4284         if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
4285                 return NULL;
4286
4287         return dev_from_bucket(seq, pos);
4288 }
4289
4290 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4291 {
4292         ++*pos;
4293         return dev_from_bucket(seq, pos);
4294 }
4295
4296 void dev_seq_stop(struct seq_file *seq, void *v)
4297         __releases(RCU)
4298 {
4299         rcu_read_unlock();
4300 }
4301
4302 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4303 {
4304         struct rtnl_link_stats64 temp;
4305         const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
4306
4307         seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4308                    "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
4309                    dev->name, stats->rx_bytes, stats->rx_packets,
4310                    stats->rx_errors,
4311                    stats->rx_dropped + stats->rx_missed_errors,
4312                    stats->rx_fifo_errors,
4313                    stats->rx_length_errors + stats->rx_over_errors +
4314                     stats->rx_crc_errors + stats->rx_frame_errors,
4315                    stats->rx_compressed, stats->multicast,
4316                    stats->tx_bytes, stats->tx_packets,
4317                    stats->tx_errors, stats->tx_dropped,
4318                    stats->tx_fifo_errors, stats->collisions,
4319                    stats->tx_carrier_errors +
4320                     stats->tx_aborted_errors +
4321                     stats->tx_window_errors +
4322                     stats->tx_heartbeat_errors,
4323                    stats->tx_compressed);
4324 }
4325
4326 /*
4327  *      Called from the PROCfs module. This now uses the new arbitrary sized
4328  *      /proc/net interface to create /proc/net/dev
4329  */
4330 static int dev_seq_show(struct seq_file *seq, void *v)
4331 {
4332         if (v == SEQ_START_TOKEN)
4333                 seq_puts(seq, "Inter-|   Receive                            "
4334                               "                    |  Transmit\n"
4335                               " face |bytes    packets errs drop fifo frame "
4336                               "compressed multicast|bytes    packets errs "
4337                               "drop fifo colls carrier compressed\n");
4338         else
4339                 dev_seq_printf_stats(seq, v);
4340         return 0;
4341 }
4342
4343 static struct softnet_data *softnet_get_online(loff_t *pos)
4344 {
4345         struct softnet_data *sd = NULL;
4346
4347         while (*pos < nr_cpu_ids)
4348                 if (cpu_online(*pos)) {
4349                         sd = &per_cpu(softnet_data, *pos);
4350                         break;
4351                 } else
4352                         ++*pos;
4353         return sd;
4354 }
4355
4356 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4357 {
4358         return softnet_get_online(pos);
4359 }
4360
4361 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4362 {
4363         ++*pos;
4364         return softnet_get_online(pos);
4365 }
4366
/* seq_file ->stop: nothing was acquired in ->start, so nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
4370
4371 static int softnet_seq_show(struct seq_file *seq, void *v)
4372 {
4373         struct softnet_data *sd = v;
4374
4375         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4376                    sd->processed, sd->dropped, sd->time_squeeze, 0,
4377                    0, 0, 0, 0, /* was fastroute */
4378                    sd->cpu_collision, sd->received_rps);
4379         return 0;
4380 }
4381
4382 static const struct seq_operations dev_seq_ops = {
4383         .start = dev_seq_start,
4384         .next  = dev_seq_next,
4385         .stop  = dev_seq_stop,
4386         .show  = dev_seq_show,
4387 };
4388
4389 static int dev_seq_open(struct inode *inode, struct file *file)
4390 {
4391         return seq_open_net(inode, file, &dev_seq_ops,
4392                             sizeof(struct seq_net_private));
4393 }
4394
4395 static const struct file_operations dev_seq_fops = {
4396         .owner   = THIS_MODULE,
4397         .open    = dev_seq_open,
4398         .read    = seq_read,
4399         .llseek  = seq_lseek,
4400         .release = seq_release_net,
4401 };
4402
4403 static const struct seq_operations softnet_seq_ops = {
4404         .start = softnet_seq_start,
4405         .next  = softnet_seq_next,
4406         .stop  = softnet_seq_stop,
4407         .show  = softnet_seq_show,
4408 };
4409
4410 static int softnet_seq_open(struct inode *inode, struct file *file)
4411 {
4412         return seq_open(file, &softnet_seq_ops);
4413 }
4414
4415 static const struct file_operations softnet_seq_fops = {
4416         .owner   = THIS_MODULE,
4417         .open    = softnet_seq_open,
4418         .read    = seq_read,
4419         .llseek  = seq_lseek,
4420         .release = seq_release,
4421 };
4422
4423 static void *ptype_get_idx(loff_t pos)
4424 {
4425         struct packet_type *pt = NULL;
4426         loff_t i = 0;
4427         int t;
4428
4429         list_for_each_entry_rcu(pt, &ptype_all, list) {
4430                 if (i == pos)
4431                         return pt;
4432                 ++i;
4433         }
4434
4435         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
4436                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4437                         if (i == pos)
4438                                 return pt;
4439                         ++i;
4440                 }
4441         }
4442         return NULL;
4443 }
4444
4445 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
4446         __acquires(RCU)
4447 {
4448         rcu_read_lock();
4449         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4450 }
4451
4452 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4453 {
4454         struct packet_type *pt;
4455         struct list_head *nxt;
4456         int hash;
4457
4458         ++*pos;
4459         if (v == SEQ_START_TOKEN)
4460                 return ptype_get_idx(0);
4461
4462         pt = v;
4463         nxt = pt->list.next;
4464         if (pt->type == htons(ETH_P_ALL)) {
4465                 if (nxt != &ptype_all)
4466                         goto found;
4467                 hash = 0;
4468                 nxt = ptype_base[0].next;
4469         } else
4470                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
4471
4472         while (nxt == &ptype_base[hash]) {
4473                 if (++hash >= PTYPE_HASH_SIZE)
4474                         return NULL;
4475                 nxt = ptype_base[hash].next;
4476         }
4477 found:
4478         return list_entry(nxt, struct packet_type, list);
4479 }
4480
4481 static void ptype_seq_stop(struct seq_file *seq, void *v)
4482         __releases(RCU)
4483 {
4484         rcu_read_unlock();
4485 }
4486
4487 static int ptype_seq_show(struct seq_file *seq, void *v)
4488 {
4489         struct packet_type *pt = v;
4490
4491         if (v == SEQ_START_TOKEN)
4492                 seq_puts(seq, "Type Device      Function\n");
4493         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
4494                 if (pt->type == htons(ETH_P_ALL))
4495                         seq_puts(seq, "ALL ");
4496                 else
4497                         seq_printf(seq, "%04x", ntohs(pt->type));
4498
4499                 seq_printf(seq, " %-8s %pF\n",
4500                            pt->dev ? pt->dev->name : "", pt->func);
4501         }
4502
4503         return 0;
4504 }
4505
4506 static const struct seq_operations ptype_seq_ops = {
4507         .start = ptype_seq_start,
4508         .next  = ptype_seq_next,
4509         .stop  = ptype_seq_stop,
4510         .show  = ptype_seq_show,
4511 };
4512
4513 static int ptype_seq_open(struct inode *inode, struct file *file)
4514 {
4515         return seq_open_net(inode, file, &ptype_seq_ops,
4516                         sizeof(struct seq_net_private));
4517 }
4518
4519 static const struct file_operations ptype_seq_fops = {
4520         .owner   = THIS_MODULE,
4521         .open    = ptype_seq_open,
4522         .read    = seq_read,
4523         .llseek  = seq_lseek,
4524         .release = seq_release_net,
4525 };
4526
4527
4528 static int __net_init dev_proc_net_init(struct net *net)
4529 {
4530         int rc = -ENOMEM;
4531
4532         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
4533                 goto out;
4534         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
4535                 goto out_dev;
4536         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
4537                 goto out_softnet;
4538
4539         if (wext_proc_init(net))
4540                 goto out_ptype;
4541         rc = 0;
4542 out:
4543         return rc;
4544 out_ptype:
4545         proc_net_remove(net, "ptype");
4546 out_softnet:
4547         proc_net_remove(net, "softnet_stat");
4548 out_dev:
4549         proc_net_remove(net, "dev");
4550         goto out;
4551 }
4552
4553 static void __net_exit dev_proc_net_exit(struct net *net)
4554 {
4555         wext_proc_exit(net);
4556
4557         proc_net_remove(net, "ptype");
4558         proc_net_remove(net, "softnet_stat");
4559         proc_net_remove(net, "dev");
4560 }
4561
4562 static struct pernet_operations __net_initdata dev_proc_ops = {
4563         .init = dev_proc_net_init,
4564         .exit = dev_proc_net_exit,
4565 };
4566
4567 static int __init dev_proc_init(void)
4568 {
4569         return register_pernet_subsys(&dev_proc_ops);
4570 }
4571 #else
4572 #define dev_proc_init() 0
4573 #endif  /* CONFIG_PROC_FS */
4574
4575
4576 /**
4577  *      netdev_set_master       -       set up master pointer
4578  *      @slave: slave device
4579  *      @master: new master device
4580  *
4581  *      Changes the master device of the slave. Pass %NULL to break the
4582  *      bonding. The caller must hold the RTNL semaphore. On a failure
4583  *      a negative errno code is returned. On success the reference counts
4584  *      are adjusted and the function returns zero.
4585  */
4586 int netdev_set_master(struct net_device *slave, struct net_device *master)
4587 {
4588         struct net_device *old = slave->master;
4589
4590         ASSERT_RTNL();
4591
4592         if (master) {
4593                 if (old)
4594                         return -EBUSY;
4595                 dev_hold(master);
4596         }
4597
4598         slave->master = master;
4599
4600         if (old)
4601                 dev_put(old);
4602         return 0;
4603 }
4604 EXPORT_SYMBOL(netdev_set_master);
4605
4606 /**
4607  *      netdev_set_bond_master  -       set up bonding master/slave pair
4608  *      @slave: slave device
4609  *      @master: new master device
4610  *
4611  *      Changes the master device of the slave. Pass %NULL to break the
4612  *      bonding. The caller must hold the RTNL semaphore. On a failure
4613  *      a negative errno code is returned. On success %RTM_NEWLINK is sent
4614  *      to the routing socket and the function returns zero.
4615  */
4616 int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4617 {
4618         int err;
4619
4620         ASSERT_RTNL();
4621
4622         err = netdev_set_master(slave, master);
4623         if (err)
4624                 return err;
4625         if (master)
4626                 slave->flags |= IFF_SLAVE;
4627         else
4628                 slave->flags &= ~IFF_SLAVE;
4629
4630         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4631         return 0;
4632 }
4633 EXPORT_SYMBOL(netdev_set_bond_master);
4634
4635 static void dev_change_rx_flags(struct net_device *dev, int flags)
4636 {
4637         const struct net_device_ops *ops = dev->netdev_ops;
4638
4639         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4640                 ops->ndo_change_rx_flags(dev, flags);
4641 }
4642
4643 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4644 {
4645         unsigned int old_flags = dev->flags;
4646         kuid_t uid;
4647         kgid_t gid;
4648
4649         ASSERT_RTNL();
4650
4651         dev->flags |= IFF_PROMISC;
4652         dev->promiscuity += inc;
4653         if (dev->promiscuity == 0) {
4654                 /*
4655                  * Avoid overflow.
4656                  * If inc causes overflow, untouch promisc and return error.
4657                  */
4658                 if (inc < 0)
4659                         dev->flags &= ~IFF_PROMISC;
4660                 else {
4661                         dev->promiscuity -= inc;
4662                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4663                                 dev->name);
4664                         return -EOVERFLOW;
4665                 }
4666         }
4667         if (dev->flags != old_flags) {
4668                 pr_info("device %s %s promiscuous mode\n",
4669                         dev->name,
4670                         dev->flags & IFF_PROMISC ? "entered" : "left");
4671                 if (audit_enabled) {
4672                         current_uid_gid(&uid, &gid);
4673                         audit_log(current->audit_context, GFP_ATOMIC,
4674                                 AUDIT_ANOM_PROMISCUOUS,
4675                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4676                                 dev->name, (dev->flags & IFF_PROMISC),
4677                                 (old_flags & IFF_PROMISC),
4678                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4679                                 from_kuid(&init_user_ns, uid),
4680                                 from_kgid(&init_user_ns, gid),
4681                                 audit_get_sessionid(current));
4682                 }
4683
4684                 dev_change_rx_flags(dev, IFF_PROMISC);
4685         }
4686         return 0;
4687 }
4688
4689 /**
4690  *      dev_set_promiscuity     - update promiscuity count on a device
4691  *      @dev: device
4692  *      @inc: modifier
4693  *
4694  *      Add or remove promiscuity from a device. While the count in the device
4695  *      remains above zero the interface remains promiscuous. Once it hits zero
4696  *      the device reverts back to normal filtering operation. A negative inc
4697  *      value is used to drop promiscuity on the device.
4698  *      Return 0 if successful or a negative errno code on error.
4699  */
4700 int dev_set_promiscuity(struct net_device *dev, int inc)
4701 {
4702         unsigned int old_flags = dev->flags;
4703         int err;
4704
4705         err = __dev_set_promiscuity(dev, inc);
4706         if (err < 0)
4707                 return err;
4708         if (dev->flags != old_flags)
4709                 dev_set_rx_mode(dev);
4710         return err;
4711 }
4712 EXPORT_SYMBOL(dev_set_promiscuity);
4713
4714 /**
4715  *      dev_set_allmulti        - update allmulti count on a device
4716  *      @dev: device
4717  *      @inc: modifier
4718  *
4719  *      Add or remove reception of all multicast frames to a device. While the
4720  *      count in the device remains above zero the interface remains listening
4721  *      to all interfaces. Once it hits zero the device reverts back to normal
4722  *      filtering operation. A negative @inc value is used to drop the counter
4723  *      when releasing a resource needing all multicasts.
4724  *      Return 0 if successful or a negative errno code on error.
4725  */
4726
4727 int dev_set_allmulti(struct net_device *dev, int inc)
4728 {
4729         unsigned int old_flags = dev->flags;
4730
4731         ASSERT_RTNL();
4732
4733         dev->flags |= IFF_ALLMULTI;
4734         dev->allmulti += inc;
4735         if (dev->allmulti == 0) {
4736                 /*
4737                  * Avoid overflow.
4738                  * If inc causes overflow, untouch allmulti and return error.
4739                  */
4740                 if (inc < 0)
4741                         dev->flags &= ~IFF_ALLMULTI;
4742                 else {
4743                         dev->allmulti -= inc;
4744                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4745                                 dev->name);
4746                         return -EOVERFLOW;
4747                 }
4748         }
4749         if (dev->flags ^ old_flags) {
4750                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4751                 dev_set_rx_mode(dev);
4752         }
4753         return 0;
4754 }
4755 EXPORT_SYMBOL(dev_set_allmulti);
4756
4757 /*
4758  *      Upload unicast and multicast address lists to device and
4759  *      configure RX filtering. When the device doesn't support unicast
4760  *      filtering it is put in promiscuous mode while unicast addresses
4761  *      are present.
4762  */
4763 void __dev_set_rx_mode(struct net_device *dev)
4764 {
4765         const struct net_device_ops *ops = dev->netdev_ops;
4766
4767         /* dev_open will call this function so the list will stay sane. */
4768         if (!(dev->flags&IFF_UP))
4769                 return;
4770
4771         if (!netif_device_present(dev))
4772                 return;
4773
4774         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4775                 /* Unicast addresses changes may only happen under the rtnl,
4776                  * therefore calling __dev_set_promiscuity here is safe.
4777                  */
4778                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4779                         __dev_set_promiscuity(dev, 1);
4780                         dev->uc_promisc = true;
4781                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4782                         __dev_set_promiscuity(dev, -1);
4783                         dev->uc_promisc = false;
4784                 }
4785         }
4786
4787         if (ops->ndo_set_rx_mode)
4788                 ops->ndo_set_rx_mode(dev);
4789 }
4790
/*
 *	Locked wrapper: run __dev_set_rx_mode() with the device address
 *	lock held (bottom halves disabled).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
4797
4798 /**
4799  *      dev_get_flags - get flags reported to userspace
4800  *      @dev: device
4801  *
4802  *      Get the combination of flag bits exported through APIs to userspace.
4803  */
4804 unsigned int dev_get_flags(const struct net_device *dev)
4805 {
4806         unsigned int flags;
4807
4808         flags = (dev->flags & ~(IFF_PROMISC |
4809                                 IFF_ALLMULTI |
4810                                 IFF_RUNNING |
4811                                 IFF_LOWER_UP |
4812                                 IFF_DORMANT)) |
4813                 (dev->gflags & (IFF_PROMISC |
4814                                 IFF_ALLMULTI));
4815
4816         if (netif_running(dev)) {
4817                 if (netif_oper_up(dev))
4818                         flags |= IFF_RUNNING;
4819                 if (netif_carrier_ok(dev))
4820                         flags |= IFF_LOWER_UP;
4821                 if (netif_dormant(dev))
4822                         flags |= IFF_DORMANT;
4823         }
4824
4825         return flags;
4826 }
4827 EXPORT_SYMBOL(dev_get_flags);
4828
4829 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4830 {
4831         unsigned int old_flags = dev->flags;
4832         int ret;
4833
4834         ASSERT_RTNL();
4835
4836         /*
4837          *      Set the flags on our device.
4838          */
4839
4840         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4841                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4842                                IFF_AUTOMEDIA)) |
4843                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4844                                     IFF_ALLMULTI));
4845
4846         /*
4847          *      Load in the correct multicast list now the flags have changed.
4848          */
4849
4850         if ((old_flags ^ flags) & IFF_MULTICAST)
4851                 dev_change_rx_flags(dev, IFF_MULTICAST);
4852
4853         dev_set_rx_mode(dev);
4854
4855         /*
4856          *      Have we downed the interface. We handle IFF_UP ourselves
4857          *      according to user attempts to set it, rather than blindly
4858          *      setting it.
4859          */
4860
4861         ret = 0;
4862         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4863                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4864
4865                 if (!ret)
4866                         dev_set_rx_mode(dev);
4867         }
4868
4869         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4870                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4871
4872                 dev->gflags ^= IFF_PROMISC;
4873                 dev_set_promiscuity(dev, inc);
4874         }
4875
4876         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4877            is important. Some (broken) drivers set IFF_PROMISC, when
4878            IFF_ALLMULTI is requested not asking us and not reporting.
4879          */
4880         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4881                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4882
4883                 dev->gflags ^= IFF_ALLMULTI;
4884                 dev_set_allmulti(dev, inc);
4885         }
4886
4887         return ret;
4888 }
4889
4890 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4891 {
4892         unsigned int changes = dev->flags ^ old_flags;
4893
4894         if (changes & IFF_UP) {
4895                 if (dev->flags & IFF_UP)
4896                         call_netdevice_notifiers(NETDEV_UP, dev);
4897                 else
4898                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4899         }
4900
4901         if (dev->flags & IFF_UP &&
4902             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4903                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4904 }
4905
4906 /**
4907  *      dev_change_flags - change device settings
4908  *      @dev: device
4909  *      @flags: device state flags
4910  *
4911  *      Change settings on device based state flags. The flags are
4912  *      in the userspace exported format.
4913  */
4914 int dev_change_flags(struct net_device *dev, unsigned int flags)
4915 {
4916         int ret;
4917         unsigned int changes, old_flags = dev->flags;
4918
4919         ret = __dev_change_flags(dev, flags);
4920         if (ret < 0)
4921                 return ret;
4922
4923         changes = old_flags ^ dev->flags;
4924         if (changes)
4925                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4926
4927         __dev_notify_flags(dev, old_flags);
4928         return ret;
4929 }
4930 EXPORT_SYMBOL(dev_change_flags);
4931
4932 /**
4933  *      dev_set_mtu - Change maximum transfer unit
4934  *      @dev: device
4935  *      @new_mtu: new transfer unit
4936  *
4937  *      Change the maximum transfer size of the network device.
4938  */
4939 int dev_set_mtu(struct net_device *dev, int new_mtu)
4940 {
4941         const struct net_device_ops *ops = dev->netdev_ops;
4942         int err;
4943
4944         if (new_mtu == dev->mtu)
4945                 return 0;
4946
4947         /*      MTU must be positive.    */
4948         if (new_mtu < 0)
4949                 return -EINVAL;
4950
4951         if (!netif_device_present(dev))
4952                 return -ENODEV;
4953
4954         err = 0;
4955         if (ops->ndo_change_mtu)
4956                 err = ops->ndo_change_mtu(dev, new_mtu);
4957         else
4958                 dev->mtu = new_mtu;
4959
4960         if (!err && dev->flags & IFF_UP)
4961                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4962         return err;
4963 }
4964 EXPORT_SYMBOL(dev_set_mtu);
4965
4966 /**
4967  *      dev_set_group - Change group this device belongs to
4968  *      @dev: device
4969  *      @new_group: group this device should belong to
4970  */
4971 void dev_set_group(struct net_device *dev, int new_group)
4972 {
4973         dev->group = new_group;
4974 }
4975 EXPORT_SYMBOL(dev_set_group);
4976
4977 /**
4978  *      dev_set_mac_address - Change Media Access Control Address
4979  *      @dev: device
4980  *      @sa: new address
4981  *
4982  *      Change the hardware (MAC) address of the device
4983  */
4984 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4985 {
4986         const struct net_device_ops *ops = dev->netdev_ops;
4987         int err;
4988
4989         if (!ops->ndo_set_mac_address)
4990                 return -EOPNOTSUPP;
4991         if (sa->sa_family != dev->type)
4992                 return -EINVAL;
4993         if (!netif_device_present(dev))
4994                 return -ENODEV;
4995         err = ops->ndo_set_mac_address(dev, sa);
4996         if (!err)
4997                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4998         add_device_randomness(dev->dev_addr, dev->addr_len);
4999         return err;
5000 }
5001 EXPORT_SYMBOL(dev_set_mac_address);
5002
5003 /*
5004  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
5005  */
5006 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
5007 {
5008         int err;
5009         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
5010
5011         if (!dev)
5012                 return -ENODEV;
5013
5014         switch (cmd) {
5015         case SIOCGIFFLAGS:      /* Get interface flags */
5016                 ifr->ifr_flags = (short) dev_get_flags(dev);
5017                 return 0;
5018
5019         case SIOCGIFMETRIC:     /* Get the metric on the interface
5020                                    (currently unused) */
5021                 ifr->ifr_metric = 0;
5022                 return 0;
5023
5024         case SIOCGIFMTU:        /* Get the MTU of a device */
5025                 ifr->ifr_mtu = dev->mtu;
5026                 return 0;
5027
5028         case SIOCGIFHWADDR:
5029                 if (!dev->addr_len)
5030                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
5031                 else
5032                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
5033                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5034                 ifr->ifr_hwaddr.sa_family = dev->type;
5035                 return 0;
5036
5037         case SIOCGIFSLAVE:
5038                 err = -EINVAL;
5039                 break;
5040
5041         case SIOCGIFMAP:
5042                 ifr->ifr_map.mem_start = dev->mem_start;
5043                 ifr->ifr_map.mem_end   = dev->mem_end;
5044                 ifr->ifr_map.base_addr = dev->base_addr;
5045                 ifr->ifr_map.irq       = dev->irq;
5046                 ifr->ifr_map.dma       = dev->dma;
5047                 ifr->ifr_map.port      = dev->if_port;
5048                 return 0;
5049
5050         case SIOCGIFINDEX:
5051                 ifr->ifr_ifindex = dev->ifindex;
5052                 return 0;
5053
5054         case SIOCGIFTXQLEN:
5055                 ifr->ifr_qlen = dev->tx_queue_len;
5056                 return 0;
5057
5058         default:
5059                 /* dev_ioctl() should ensure this case
5060                  * is never reached
5061                  */
5062                 WARN_ON(1);
5063                 err = -ENOTTY;
5064                 break;
5065
5066         }
5067         return err;
5068 }
5069
5070 /*
5071  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
5072  */
5073 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
5074 {
5075         int err;
5076         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5077         const struct net_device_ops *ops;
5078
5079         if (!dev)
5080                 return -ENODEV;
5081
5082         ops = dev->netdev_ops;
5083
5084         switch (cmd) {
5085         case SIOCSIFFLAGS:      /* Set interface flags */
5086                 return dev_change_flags(dev, ifr->ifr_flags);
5087
5088         case SIOCSIFMETRIC:     /* Set the metric on the interface
5089                                    (currently unused) */
5090                 return -EOPNOTSUPP;
5091
5092         case SIOCSIFMTU:        /* Set the MTU of a device */
5093                 return dev_set_mtu(dev, ifr->ifr_mtu);
5094
5095         case SIOCSIFHWADDR:
5096                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
5097
5098         case SIOCSIFHWBROADCAST:
5099                 if (ifr->ifr_hwaddr.sa_family != dev->type)
5100                         return -EINVAL;
5101                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
5102                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
5103                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5104                 return 0;
5105
5106         case SIOCSIFMAP:
5107                 if (ops->ndo_set_config) {
5108                         if (!netif_device_present(dev))
5109                                 return -ENODEV;
5110                         return ops->ndo_set_config(dev, &ifr->ifr_map);
5111                 }
5112                 return -EOPNOTSUPP;
5113
5114         case SIOCADDMULTI:
5115                 if (!ops->ndo_set_rx_mode ||
5116                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5117                         return -EINVAL;
5118                 if (!netif_device_present(dev))
5119                         return -ENODEV;
5120                 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
5121
5122         case SIOCDELMULTI:
5123                 if (!ops->ndo_set_rx_mode ||
5124                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
5125                         return -EINVAL;
5126                 if (!netif_device_present(dev))
5127                         return -ENODEV;
5128                 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
5129
5130         case SIOCSIFTXQLEN:
5131                 if (ifr->ifr_qlen < 0)
5132                         return -EINVAL;
5133                 dev->tx_queue_len = ifr->ifr_qlen;
5134                 return 0;
5135
5136         case SIOCSIFNAME:
5137                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
5138                 return dev_change_name(dev, ifr->ifr_newname);
5139
5140         case SIOCSHWTSTAMP:
5141                 err = net_hwtstamp_validate(ifr);
5142                 if (err)
5143                         return err;
5144                 /* fall through */
5145
5146         /*
5147          *      Unknown or private ioctl
5148          */
5149         default:
5150                 if ((cmd >= SIOCDEVPRIVATE &&
5151                     cmd <= SIOCDEVPRIVATE + 15) ||
5152                     cmd == SIOCBONDENSLAVE ||
5153                     cmd == SIOCBONDRELEASE ||
5154                     cmd == SIOCBONDSETHWADDR ||
5155                     cmd == SIOCBONDSLAVEINFOQUERY ||
5156                     cmd == SIOCBONDINFOQUERY ||
5157                     cmd == SIOCBONDCHANGEACTIVE ||
5158                     cmd == SIOCGMIIPHY ||
5159                     cmd == SIOCGMIIREG ||
5160                     cmd == SIOCSMIIREG ||
5161                     cmd == SIOCBRADDIF ||
5162                     cmd == SIOCBRDELIF ||
5163                     cmd == SIOCSHWTSTAMP ||
5164                     cmd == SIOCWANDEV) {
5165                         err = -EOPNOTSUPP;
5166                         if (ops->ndo_do_ioctl) {
5167                                 if (netif_device_present(dev))
5168                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
5169                                 else
5170                                         err = -ENODEV;
5171                         }
5172                 } else
5173                         err = -EINVAL;
5174
5175         }
5176         return err;
5177 }
5178
5179 /*
5180  *      This function handles all "interface"-type I/O control requests. The actual
5181  *      'doing' part of this is dev_ifsioc above.
5182  */
5183
/**
 *	dev_ioctl	-	network device ioctl
 *	@net: the applicable net namespace
 *	@cmd: command to issue
 *	@arg: pointer to a struct ifreq in user space
 *
 *	Issue ioctl functions to devices. This is normally called by the
 *	user space syscall interfaces but can sometimes be useful for
 *	other purposes. The return value is the return from the syscall if
 *	positive or a negative errno code on error.
 *
 *	SIOCGIFCONF and SIOCGIFNAME are dispatched before the ifreq is
 *	copied in, since they interpret @arg themselves.  For every other
 *	command the ifreq is copied from user space, the name is forcibly
 *	NUL-terminated and anything from the first ':' on is cut off
 *	before the command is dispatched.
 *
 *	Returns -EFAULT when the ifreq cannot be copied in or out, -EPERM
 *	when a privileged command is issued without CAP_NET_ADMIN, and
 *	-ENOTTY for commands this function does not recognise.
 */

int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct ifreq ifr;
	int ret;
	char *colon;	/* position of ':' in ifr_name, or NULL if none */

	/* One special case: SIOCGIFCONF takes ifconf argument
	   and requires shared lock, because it sleeps writing
	   to user space.
	 */

	if (cmd == SIOCGIFCONF) {
		rtnl_lock();
		ret = dev_ifconf(net, (char __user *) arg);
		rtnl_unlock();
		return ret;
	}
	/* SIOCGIFNAME is handed the raw user pointer; no lock is taken here. */
	if (cmd == SIOCGIFNAME)
		return dev_ifname(net, (struct ifreq __user *)arg);

	if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
		return -EFAULT;

	/* Never trust user space to have terminated the name. */
	ifr.ifr_name[IFNAMSIZ-1] = 0;

	/*
	 * Temporarily truncate the name at the first ':' so the part
	 * before it is what the handlers below see.  Branches that copy
	 * the ifreq back put the ':' back first.
	 */
	colon = strchr(ifr.ifr_name, ':');
	if (colon)
		*colon = 0;

	/*
	 *	See which interface the caller is talking about.
	 */

	switch (cmd) {
	/*
	 *	These ioctl calls:
	 *	- can be done by all.
	 *	- run under rcu_read_lock() only; rtnl is not taken.
	 *	- return a value
	 */
	case SIOCGIFFLAGS:
	case SIOCGIFMETRIC:
	case SIOCGIFMTU:
	case SIOCGIFHWADDR:
	case SIOCGIFSLAVE:
	case SIOCGIFMAP:
	case SIOCGIFINDEX:
	case SIOCGIFTXQLEN:
		dev_load(net, ifr.ifr_name);
		rcu_read_lock();
		ret = dev_ifsioc_locked(net, &ifr, cmd);
		rcu_read_unlock();
		if (!ret) {
			/* Restore the caller's original name before copy-out. */
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/* ethtool: no capability check here, serialized by rtnl, returns a value. */
	case SIOCETHTOOL:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ethtool(net, &ifr);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power (CAP_NET_ADMIN).
	 *	- require strict serialization (rtnl).
	 *	- return a value
	 */
	case SIOCGMIIPHY:
	case SIOCGMIIREG:
	case SIOCSIFNAME:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		if (!ret) {
			if (colon)
				*colon = ':';
			if (copy_to_user(arg, &ifr,
					 sizeof(struct ifreq)))
				ret = -EFAULT;
		}
		return ret;

	/*
	 *	These ioctl calls:
	 *	- require superuser power (CAP_NET_ADMIN).
	 *	- require strict serialization (rtnl).
	 *	- do not return a value (nothing is copied back)
	 */
	case SIOCSIFFLAGS:
	case SIOCSIFMETRIC:
	case SIOCSIFMTU:
	case SIOCSIFMAP:
	case SIOCSIFHWADDR:
	case SIOCSIFSLAVE:
	case SIOCADDMULTI:
	case SIOCDELMULTI:
	case SIOCSIFHWBROADCAST:
	case SIOCSIFTXQLEN:
	case SIOCSMIIREG:
	case SIOCBONDENSLAVE:
	case SIOCBONDRELEASE:
	case SIOCBONDSETHWADDR:
	case SIOCBONDCHANGEACTIVE:
	case SIOCBRADDIF:
	case SIOCBRDELIF:
	case SIOCSHWTSTAMP:
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		/* fall through */
	/* The two bonding queries share the path above minus the capability check. */
	case SIOCBONDSLAVEINFOQUERY:
	case SIOCBONDINFOQUERY:
		dev_load(net, ifr.ifr_name);
		rtnl_lock();
		ret = dev_ifsioc(net, &ifr, cmd);
		rtnl_unlock();
		return ret;

	case SIOCGIFMEM:
		/* Get the per device memory space. We can add this but
		 * currently do not support it */
	case SIOCSIFMEM:
		/* Set the per device memory buffer space.
		 * Not applicable in our case */
	case SIOCSIFLINK:
		return -ENOTTY;

	/*
	 *	Unknown or private ioctl.
	 */
	default:
		if (cmd == SIOCWANDEV ||
		    (cmd >= SIOCDEVPRIVATE &&
		     cmd <= SIOCDEVPRIVATE + 15)) {
			dev_load(net, ifr.ifr_name);
			rtnl_lock();
			ret = dev_ifsioc(net, &ifr, cmd);
			rtnl_unlock();
			/*
			 * NOTE(review): unlike the value-returning branches
			 * above, the ':' is not restored before the ifreq is
			 * copied back, so a name containing ':' is returned
			 * truncated.  Confirm whether this is intentional.
			 */
			if (!ret && copy_to_user(arg, &ifr,
						 sizeof(struct ifreq)))
				ret = -EFAULT;
			return ret;
		}
		/* Take care of Wireless Extensions */
		if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
			return wext_handle_ioctl(net, &ifr, cmd, arg);
		return -ENOTTY;
	}
}
5361
5362
5363 /**
5364  *      dev_new_index   -       allocate an ifindex
5365  *      @net: the applicable net namespace
5366  *
5367  *      Returns a suitable unique value for a new device interface
5368  *      number.  The caller must hold the rtnl semaphore or the
5369  *      dev_base_lock to be sure it remains unique.
5370  */
5371 static int dev_new_index(struct net *net)
5372 {
5373         int ifindex = net->ifindex;
5374         for (;;) {
5375                 if (++ifindex <= 0)
5376                         ifindex = 1;
5377                 if (!__dev_get_by_index(net, ifindex))
5378                         return net->ifindex = ifindex;
5379         }
5380 }
5381
5382 /* Delayed registration/unregisteration */
5383 static LIST_HEAD(net_todo_list);
5384
5385 static void net_set_todo(struct net_device *dev)
5386 {
5387         list_add_tail(&dev->todo_list, &net_todo_list);
5388 }
5389
/*
 * Tear down every device linked on @head through its unreg_list member.
 *
 * Must run after the boot phase and with the RTNL held (both are
 * asserted).  The work is done in passes over the whole list, with
 * synchronize_net() between them:
 *   1. drop devices that were never registered and mark the rest as
 *      being dismantled;
 *   2. close all of them, unlist each one and set NETREG_UNREGISTERING;
 *   3. shut down qdiscs, send NETDEV_UNREGISTER and (when applicable)
 *      RTM_DELLINK, flush address lists, call ndo_uninit and remove the
 *      kobject;
 *   4. drop one reference per device with dev_put().
 */
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	/* _safe variant: entries may be removed from the list while iterating. */
	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		/* Anything other than a fully registered device here is fatal. */
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	dev_close_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
	}

	/* Barrier between unlisting the devices and tearing them down. */
	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		/* Shutdown queueing discipline. */
		dev_shutdown(dev);


		/* Notify protocols, that we are about to destroy
		   this device. They should clean all the things.
		*/
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		/*
		 * Announce the removal over rtnetlink unless the device has
		 * rtnl_link_ops and its link state is not yet INITIALIZED.
		 */
		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);

		/*
		 *	Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		/* Optional driver hook. */
		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		/* Notifier chain MUST detach us from master device. */
		WARN_ON(dev->master);

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
	}

	synchronize_net();

	/*
	 * Drop one reference per device -- presumably the one taken by
	 * dev_hold() in register_netdevice(); confirm.
	 */
	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}
5461
5462 static void rollback_registered(struct net_device *dev)
5463 {
5464         LIST_HEAD(single);
5465
5466         list_add(&dev->unreg_list, &single);
5467         rollback_registered_many(&single);
5468         list_del(&single);
5469 }
5470
5471 static netdev_features_t netdev_fix_features(struct net_device *dev,
5472         netdev_features_t features)
5473 {
5474         /* Fix illegal checksum combinations */
5475         if ((features & NETIF_F_HW_CSUM) &&
5476             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5477                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5478                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5479         }
5480
5481         /* Fix illegal SG+CSUM combinations. */
5482         if ((features & NETIF_F_SG) &&
5483             !(features & NETIF_F_ALL_CSUM)) {
5484                 netdev_dbg(dev,
5485                         "Dropping NETIF_F_SG since no checksum feature.\n");
5486                 features &= ~NETIF_F_SG;
5487         }
5488
5489         /* TSO requires that SG is present as well. */
5490         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5491                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5492                 features &= ~NETIF_F_ALL_TSO;
5493         }
5494
5495         /* TSO ECN requires that TSO is present as well. */
5496         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5497                 features &= ~NETIF_F_TSO_ECN;
5498
5499         /* Software GSO depends on SG. */
5500         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5501                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5502                 features &= ~NETIF_F_GSO;
5503         }
5504
5505         /* UFO needs SG and checksumming */
5506         if (features & NETIF_F_UFO) {
5507                 /* maybe split UFO into V4 and V6? */
5508                 if (!((features & NETIF_F_GEN_CSUM) ||
5509                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5510                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5511                         netdev_dbg(dev,
5512                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5513                         features &= ~NETIF_F_UFO;
5514                 }
5515
5516                 if (!(features & NETIF_F_SG)) {
5517                         netdev_dbg(dev,
5518                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5519                         features &= ~NETIF_F_UFO;
5520                 }
5521         }
5522
5523         return features;
5524 }
5525
5526 int __netdev_update_features(struct net_device *dev)
5527 {
5528         netdev_features_t features;
5529         int err = 0;
5530
5531         ASSERT_RTNL();
5532
5533         features = netdev_get_wanted_features(dev);
5534
5535         if (dev->netdev_ops->ndo_fix_features)
5536                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5537
5538         /* driver might be less strict about feature dependencies */
5539         features = netdev_fix_features(dev, features);
5540
5541         if (dev->features == features)
5542                 return 0;
5543
5544         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5545                 &dev->features, &features);
5546
5547         if (dev->netdev_ops->ndo_set_features)
5548                 err = dev->netdev_ops->ndo_set_features(dev, features);
5549
5550         if (unlikely(err < 0)) {
5551                 netdev_err(dev,
5552                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5553                         err, &features, &dev->features);
5554                 return -1;
5555         }
5556
5557         if (!err)
5558                 dev->features = features;
5559
5560         return 1;
5561 }
5562
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features through __netdev_update_features() and
 *	send a features-change notification whenever that call returns
 *	non-zero.  Should be called after driver or hardware dependent
 *	conditions that influence the features might have changed.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (!changed)
		return;

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5577
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Re-evaluate dev->features and then send a features-change
 *	notification unconditionally, whether or not anything changed.
 *	Use this rather than netdev_update_features() when
 *	dev->vlan_features might have changed as well, so that the
 *	change can be propagated to stacked VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* Outcome is irrelevant: the notification below is unconditional. */
	(void)__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5594
5595 /**
5596  *      netif_stacked_transfer_operstate -      transfer operstate
5597  *      @rootdev: the root or lower level device to transfer state from
5598  *      @dev: the device to transfer operstate to
5599  *
5600  *      Transfer operational state from root to device. This is normally
5601  *      called when a stacking relationship exists between the root
5602  *      device and the device(a leaf device).
5603  */
5604 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5605                                         struct net_device *dev)
5606 {
5607         if (rootdev->operstate == IF_OPER_DORMANT)
5608                 netif_dormant_on(dev);
5609         else
5610                 netif_dormant_off(dev);
5611
5612         if (netif_carrier_ok(rootdev)) {
5613                 if (!netif_carrier_ok(dev))
5614                         netif_carrier_on(dev);
5615         } else {
5616                 if (netif_carrier_ok(dev))
5617                         netif_carrier_off(dev);
5618         }
5619 }
5620 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5621
#ifdef CONFIG_RPS
/*
 * Allocate the zeroed array of dev->num_rx_queues RX queue structures,
 * store it in dev->_rx and point every entry back at @dev.
 * Returns 0 on success or -ENOMEM.  A queue count of zero is a BUG.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr = dev->num_rx_queues;
	struct netdev_rx_queue *queues, *q;

	BUG_ON(nr < 1);

	queues = kcalloc(nr, sizeof(*queues), GFP_KERNEL);
	if (!queues) {
		pr_err("netdev: Unable to allocate %u rx queues\n", nr);
		return -ENOMEM;
	}
	dev->_rx = queues;

	for (q = queues; q < queues + nr; q++)
		q->dev = dev;

	return 0;
}
#endif
5642
5643 static void netdev_init_one_queue(struct net_device *dev,
5644                                   struct netdev_queue *queue, void *_unused)
5645 {
5646         /* Initialize queue lock */
5647         spin_lock_init(&queue->_xmit_lock);
5648         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5649         queue->xmit_lock_owner = -1;
5650         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5651         queue->dev = dev;
5652 #ifdef CONFIG_BQL
5653         dql_init(&queue->dql, HZ);
5654 #endif
5655 }
5656
5657 static int netif_alloc_netdev_queues(struct net_device *dev)
5658 {
5659         unsigned int count = dev->num_tx_queues;
5660         struct netdev_queue *tx;
5661
5662         BUG_ON(count < 1);
5663
5664         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5665         if (!tx) {
5666                 pr_err("netdev: Unable to allocate %u tx queues\n", count);
5667                 return -ENOMEM;
5668         }
5669         dev->_tx = tx;
5670
5671         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5672         spin_lock_init(&dev->tx_global_lock);
5673
5674         return 0;
5675 }
5676
5677 /**
5678  *      register_netdevice      - register a network device
5679  *      @dev: device to register
5680  *
5681  *      Take a completed network device structure and add it to the kernel
5682  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5683  *      chain. 0 is returned on success. A negative errno code is returned
5684  *      on a failure to set up the device, or if the name is a duplicate.
5685  *
5686  *      Callers must hold the rtnl semaphore. You may want
5687  *      register_netdev() instead of this.
5688  *
5689  *      BUGS:
5690  *      The locking appears insufficient to guarantee two parallel registers
5691  *      will not get the same name.
5692  */
5693
5694 int register_netdevice(struct net_device *dev)
5695 {
5696         int ret;
5697         struct net *net = dev_net(dev);
5698
5699         BUG_ON(dev_boot_phase);
5700         ASSERT_RTNL();
5701
5702         might_sleep();
5703
5704         /* When net_device's are persistent, this will be fatal. */
5705         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5706         BUG_ON(!net);
5707
5708         spin_lock_init(&dev->addr_list_lock);
5709         netdev_set_addr_lockdep_class(dev);
5710
5711         dev->iflink = -1;
5712
5713         ret = dev_get_valid_name(net, dev, dev->name);
5714         if (ret < 0)
5715                 goto out;
5716
5717         /* Init, if this function is available */
5718         if (dev->netdev_ops->ndo_init) {
5719                 ret = dev->netdev_ops->ndo_init(dev);
5720                 if (ret) {
5721                         if (ret > 0)
5722                                 ret = -EIO;
5723                         goto out;
5724                 }
5725         }
5726
5727         ret = -EBUSY;
5728         if (!dev->ifindex)
5729                 dev->ifindex = dev_new_index(net);
5730         else if (__dev_get_by_index(net, dev->ifindex))
5731                 goto err_uninit;
5732
5733         if (dev->iflink == -1)
5734                 dev->iflink = dev->ifindex;
5735
5736         /* Transfer changeable features to wanted_features and enable
5737          * software offloads (GSO and GRO).
5738          */
5739         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5740         dev->features |= NETIF_F_SOFT_FEATURES;
5741         dev->wanted_features = dev->features & dev->hw_features;
5742
5743         /* Turn on no cache copy if HW is doing checksum */
5744         if (!(dev->flags & IFF_LOOPBACK)) {
5745                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5746                 if (dev->features & NETIF_F_ALL_CSUM) {
5747                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5748                         dev->features |= NETIF_F_NOCACHE_COPY;
5749                 }
5750         }
5751
5752         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5753          */
5754         dev->vlan_features |= NETIF_F_HIGHDMA;
5755
5756         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5757         ret = notifier_to_errno(ret);
5758         if (ret)
5759                 goto err_uninit;
5760
5761         ret = netdev_register_kobject(dev);
5762         if (ret)
5763                 goto err_uninit;
5764         dev->reg_state = NETREG_REGISTERED;
5765
5766         __netdev_update_features(dev);
5767
5768         /*
5769          *      Default initial state at registry is that the
5770          *      device is present.
5771          */
5772
5773         set_bit(__LINK_STATE_PRESENT, &dev->state);
5774
5775         linkwatch_init_dev(dev);
5776
5777         dev_init_scheduler(dev);
5778         dev_hold(dev);
5779         list_netdevice(dev);
5780         add_device_randomness(dev->dev_addr, dev->addr_len);
5781
5782         /* Notify protocols, that a new device appeared. */
5783         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5784         ret = notifier_to_errno(ret);
5785         if (ret) {
5786                 rollback_registered(dev);
5787                 dev->reg_state = NETREG_UNREGISTERED;
5788         }
5789         /*
5790          *      Prevent userspace races by waiting until the network
5791          *      device is fully setup before sending notifications.
5792          */
5793         if (!dev->rtnl_link_ops ||
5794             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5795                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5796
5797 out:
5798         return ret;
5799
5800 err_uninit:
5801         if (dev->netdev_ops->ndo_uninit)
5802                 dev->netdev_ops->ndo_uninit(dev);
5803         goto out;
5804 }
5805 EXPORT_SYMBOL(register_netdevice);
5806
5807 /**
5808  *      init_dummy_netdev       - init a dummy network device for NAPI
5809  *      @dev: device to init
5810  *
5811  *      This takes a network device structure and initialize the minimum
5812  *      amount of fields so it can be used to schedule NAPI polls without
5813  *      registering a full blown interface. This is to be used by drivers
5814  *      that need to tie several hardware interfaces to a single NAPI
5815  *      poll scheduler due to HW limitations.
5816  */
5817 int init_dummy_netdev(struct net_device *dev)
5818 {
5819         /* Clear everything. Note we don't initialize spinlocks
5820          * are they aren't supposed to be taken by any of the
5821          * NAPI code and this dummy netdev is supposed to be
5822          * only ever used for NAPI polls
5823          */
5824         memset(dev, 0, sizeof(struct net_device));
5825
5826         /* make sure we BUG if trying to hit standard
5827          * register/unregister code path
5828          */
5829         dev->reg_state = NETREG_DUMMY;
5830
5831         /* NAPI wants this */
5832         INIT_LIST_HEAD(&dev->napi_list);
5833
5834         /* a dummy interface is started by default */
5835         set_bit(__LINK_STATE_PRESENT, &dev->state);
5836         set_bit(__LINK_STATE_START, &dev->state);
5837
5838         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5839          * because users of this 'device' dont need to change
5840          * its refcount.
5841          */
5842
5843         return 0;
5844 }
5845 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5846
5847
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is register_netdevice() wrapped in rtnl_lock()/rtnl_unlock(),
 *	for callers that do not already hold the rtnl semaphore.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5871
5872 int netdev_refcnt_read(const struct net_device *dev)
5873 {
5874         int i, refcnt = 0;
5875
5876         for_each_possible_cpu(i)
5877                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5878         return refcnt;
5879 }
5880 EXPORT_SYMBOL(netdev_refcnt_read);
5881
5882 /**
5883  * netdev_wait_allrefs - wait until all references are gone.
5884  * @dev: target net_device
5885  *
5886  * This is called when unregistering network devices.
5887  *
5888  * Any protocol or device that holds a reference should register
5889  * for netdevice notification, and cleanup and put back the
5890  * reference if they receive an UNREGISTER event.
5891  * We can get stuck here if buggy protocols don't correctly
5892  * call dev_put.
5893  */
5894 static void netdev_wait_allrefs(struct net_device *dev)
5895 {
5896         unsigned long rebroadcast_time, warning_time;
5897         int refcnt;
5898
5899         linkwatch_forget_dev(dev);
5900
5901         rebroadcast_time = warning_time = jiffies;
5902         refcnt = netdev_refcnt_read(dev);
5903
5904         while (refcnt != 0) {
5905                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5906                         rtnl_lock();
5907
5908                         /* Rebroadcast unregister notification */
5909                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5910
5911                         __rtnl_unlock();
5912                         rcu_barrier();
5913                         rtnl_lock();
5914
5915                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5916                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5917                                      &dev->state)) {
5918                                 /* We must not have linkwatch events
5919                                  * pending on unregister. If this
5920                                  * happens, we simply run the queue
5921                                  * unscheduled, resulting in a noop
5922                                  * for this device.
5923                                  */
5924                                 linkwatch_run_queue();
5925                         }
5926
5927                         __rtnl_unlock();
5928
5929                         rebroadcast_time = jiffies;
5930                 }
5931
5932                 msleep(250);
5933
5934                 refcnt = netdev_refcnt_read(dev);
5935
5936                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5937                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5938                                  dev->name, refcnt);
5939                         warning_time = jiffies;
5940                 }
5941         }
5942 }
5943
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *      ...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 *
 * Entered with the RTNL held; it is released here via __rtnl_unlock()
 * right after the pending list has been snapshotted.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();


	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		/* The final notification is sent with the RTNL re-taken. */
		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		/*
		 * Only devices in UNREGISTERING state are expected on the
		 * todo list; anything else is reported and skipped without
		 * being freed.
		 */
		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		/* Run flush_backlog for this device on every CPU and wait for it. */
		on_each_cpu(flush_backlog, dev, 1);

		/* May sleep for a long time; RTNL is not held here. */
		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		/* Optional per-device destructor hook. */
		if (dev->destructor)
			dev->destructor(dev);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
6017
6018 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6019  * fields in the same order, with only the type differing.
6020  */
6021 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6022                              const struct net_device_stats *netdev_stats)
6023 {
6024 #if BITS_PER_LONG == 64
6025         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6026         memcpy(stats64, netdev_stats, sizeof(*stats64));
6027 #else
6028         size_t i, n = sizeof(*stats64) / sizeof(u64);
6029         const unsigned long *src = (const unsigned long *)netdev_stats;
6030         u64 *dst = (u64 *)stats64;
6031
6032         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6033                      sizeof(*stats64) / sizeof(u64));
6034         for (i = 0; i < n; i++)
6035                 dst[i] = src[i];
6036 #endif
6037 }
6038 EXPORT_SYMBOL(netdev_stats_to_stats64);
6039
6040 /**
6041  *      dev_get_stats   - get network device statistics
6042  *      @dev: device to get statistics from
6043  *      @storage: place to store stats
6044  *
6045  *      Get network statistics from device. Return @storage.
6046  *      The device driver may provide its own method by setting
6047  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6048  *      otherwise the internal statistics structure is used.
6049  */
6050 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6051                                         struct rtnl_link_stats64 *storage)
6052 {
6053         const struct net_device_ops *ops = dev->netdev_ops;
6054
6055         if (ops->ndo_get_stats64) {
6056                 memset(storage, 0, sizeof(*storage));
6057                 ops->ndo_get_stats64(dev, storage);
6058         } else if (ops->ndo_get_stats) {
6059                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6060         } else {
6061                 netdev_stats_to_stats64(storage, &dev->stats);
6062         }
6063         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6064         return storage;
6065 }
6066 EXPORT_SYMBOL(dev_get_stats);
6067
/*
 * Return the ingress queue of @dev, creating it on demand.
 *
 * An already existing queue is returned as is.  With CONFIG_NET_CLS_ACT a
 * missing queue is allocated, initialised, attached to noop_qdisc and
 * published through dev->ingress_queue; NULL is returned when that
 * allocation fails.  Without CONFIG_NET_CLS_ACT nothing is created and the
 * result of dev_ingress_queue() is returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *ingress = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!ingress) {
		ingress = kzalloc(sizeof(*ingress), GFP_KERNEL);
		if (ingress) {
			netdev_init_one_queue(dev, ingress, NULL);
			ingress->qdisc = &noop_qdisc;
			ingress->qdisc_sleeping = &noop_qdisc;
			/* Publish only once the queue is fully set up. */
			rcu_assign_pointer(dev->ingress_queue, ingress);
		}
	}
#endif
	return ingress;
}
6085
/* Zero-initialised fallback that alloc_netdev_mqs() installs on devices
 * whose setup callback left dev->ethtool_ops NULL.
 */
6086 static const struct ethtool_ops default_ethtool_ops;
6087
6088 /**
6089  *      alloc_netdev_mqs - allocate network device
6090  *      @sizeof_priv:   size of private data to allocate space for
6091  *      @name:          device name format string
6092  *      @setup:         callback to initialize device
6093  *      @txqs:          the number of TX subqueues to allocate
6094  *      @rxqs:          the number of RX subqueues to allocate
6095  *
6096  *      Allocates a struct net_device with private data area for driver use
6097  *      and performs basic initialization.  Also allocates subquue structs
6098  *      for each queue on the device.
6099  */
6100 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6101                 void (*setup)(struct net_device *),
6102                 unsigned int txqs, unsigned int rxqs)
6103 {
6104         struct net_device *dev;
6105         size_t alloc_size;
6106         struct net_device *p;
6107
6108         BUG_ON(strlen(name) >= sizeof(dev->name));
6109
6110         if (txqs < 1) {
6111                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6112                 return NULL;
6113         }
6114
6115 #ifdef CONFIG_RPS
6116         if (rxqs < 1) {
6117                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6118                 return NULL;
6119         }
6120 #endif
6121
6122         alloc_size = sizeof(struct net_device);
6123         if (sizeof_priv) {
6124                 /* ensure 32-byte alignment of private area */
6125                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6126                 alloc_size += sizeof_priv;
6127         }
6128         /* ensure 32-byte alignment of whole construct */
6129         alloc_size += NETDEV_ALIGN - 1;
6130
6131         p = kzalloc(alloc_size, GFP_KERNEL);
6132         if (!p) {
6133                 pr_err("alloc_netdev: Unable to allocate device\n");
6134                 return NULL;
6135         }
6136
6137         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6138         dev->padded = (char *)dev - (char *)p;
6139
6140         dev->pcpu_refcnt = alloc_percpu(int);
6141         if (!dev->pcpu_refcnt)
6142                 goto free_p;
6143
6144         if (dev_addr_init(dev))
6145                 goto free_pcpu;
6146
6147         dev_mc_init(dev);
6148         dev_uc_init(dev);
6149
6150         dev_net_set(dev, &init_net);
6151
6152         dev->gso_max_size = GSO_MAX_SIZE;
6153         dev->gso_max_segs = GSO_MAX_SEGS;
6154
6155         INIT_LIST_HEAD(&dev->napi_list);
6156         INIT_LIST_HEAD(&dev->unreg_list);
6157         INIT_LIST_HEAD(&dev->link_watch_list);
6158         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6159         setup(dev);
6160
6161         dev->num_tx_queues = txqs;
6162         dev->real_num_tx_queues = txqs;
6163         if (netif_alloc_netdev_queues(dev))
6164                 goto free_all;
6165
6166 #ifdef CONFIG_RPS
6167         dev->num_rx_queues = rxqs;
6168         dev->real_num_rx_queues = rxqs;
6169         if (netif_alloc_rx_queues(dev))
6170                 goto free_all;
6171 #endif
6172
6173         strcpy(dev->name, name);
6174         dev->group = INIT_NETDEV_GROUP;
6175         if (!dev->ethtool_ops)
6176                 dev->ethtool_ops = &default_ethtool_ops;
6177         return dev;
6178
6179 free_all:
6180         free_netdev(dev);
6181         return NULL;
6182
6183 free_pcpu:
6184         free_percpu(dev->pcpu_refcnt);
6185         kfree(dev->_tx);
6186 #ifdef CONFIG_RPS
6187         kfree(dev->_rx);
6188 #endif
6189
6190 free_p:
6191         kfree(p);
6192         return NULL;
6193 }
6194 EXPORT_SYMBOL(alloc_netdev_mqs);
6195
6196 /**
6197  *      free_netdev - free network device
6198  *      @dev: device
6199  *
6200  *      This function does the last stage of destroying an allocated device
6201  *      interface. The reference to the device object is released.
6202  *      If this is the last reference then it will be freed.
6203  */
6204 void free_netdev(struct net_device *dev)
6205 {
6206         struct napi_struct *p, *n;
6207
6208         release_net(dev_net(dev));
6209
6210         kfree(dev->_tx);
6211 #ifdef CONFIG_RPS
6212         kfree(dev->_rx);
6213 #endif
6214
6215         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6216
6217         /* Flush device addresses */
6218         dev_addr_flush(dev);
6219
6220         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6221                 netif_napi_del(p);
6222
6223         free_percpu(dev->pcpu_refcnt);
6224         dev->pcpu_refcnt = NULL;
6225
6226         /*  Compatibility with error handling in drivers */
6227         if (dev->reg_state == NETREG_UNINITIALIZED) {
6228                 kfree((char *)dev - dev->padded);
6229                 return;
6230         }
6231
6232         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6233         dev->reg_state = NETREG_RELEASED;
6234
6235         /* will free via device release */
6236         put_device(&dev->dev);
6237 }
6238 EXPORT_SYMBOL(free_netdev);
6239
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep.  An expedited RCU grace period is used when the RTNL
 *	lock is held, a normal one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
6255
6256 /**
6257  *      unregister_netdevice_queue - remove device from the kernel
6258  *      @dev: device
6259  *      @head: list
6260  *
6261  *      This function shuts down a device interface and removes it
6262  *      from the kernel tables.
6263  *      If head not NULL, device is queued to be unregistered later.
6264  *
6265  *      Callers must hold the rtnl semaphore.  You may want
6266  *      unregister_netdev() instead of this.
6267  */
6268
6269 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6270 {
6271         ASSERT_RTNL();
6272
6273         if (head) {
6274                 list_move_tail(&dev->unreg_list, head);
6275         } else {
6276                 rollback_registered(dev);
6277                 /* Finish processing unregister after unlock */
6278                 net_set_todo(dev);
6279         }
6280 }
6281 EXPORT_SYMBOL(unregister_netdevice_queue);
6282
6283 /**
6284  *      unregister_netdevice_many - unregister many devices
6285  *      @head: list of devices
6286  */
6287 void unregister_netdevice_many(struct list_head *head)
6288 {
6289         struct net_device *dev;
6290
6291         if (!list_empty(head)) {
6292                 rollback_registered_many(head);
6293                 list_for_each_entry(dev, head, unreg_list)
6294                         net_set_todo(dev);
6295         }
6296 }
6297 EXPORT_SYMBOL(unregister_netdevice_many);
6298
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	Shuts down a device interface and removes it from the kernel
 *	tables.
 *
 *	Convenience wrapper that takes the rtnl semaphore around
 *	unregister_netdevice().  In general this is the function to use
 *	rather than unregister_netdevice() itself.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
6317
6318 /**
6319  *      dev_change_net_namespace - move device to different nethost namespace
6320  *      @dev: device
6321  *      @net: network namespace
6322  *      @pat: If not NULL name pattern to try if the current device name
6323  *            is already taken in the destination network namespace.
6324  *
6325  *      This function shuts down a device interface and moves it
6326  *      to a new network namespace. On success 0 is returned, on
6327  *      a failure a netagive errno code is returned.
6328  *
6329  *      Callers must hold the rtnl semaphore.
6330  */
6331
6332 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6333 {
6334         int err;
6335
6336         ASSERT_RTNL();
6337
6338         /* Don't allow namespace local devices to be moved. */
6339         err = -EINVAL;
6340         if (dev->features & NETIF_F_NETNS_LOCAL)
6341                 goto out;
6342
6343         /* Ensure the device has been registrered */
6344         if (dev->reg_state != NETREG_REGISTERED)
6345                 goto out;
6346
6347         /* Get out if there is nothing todo */
6348         err = 0;
6349         if (net_eq(dev_net(dev), net))
6350                 goto out;
6351
6352         /* Pick the destination device name, and ensure
6353          * we can use it in the destination network namespace.
6354          */
6355         err = -EEXIST;
6356         if (__dev_get_by_name(net, dev->name)) {
6357                 /* We get here if we can't use the current device name */
6358                 if (!pat)
6359                         goto out;
6360                 if (dev_get_valid_name(net, dev, pat) < 0)
6361                         goto out;
6362         }
6363
6364         /*
6365          * And now a mini version of register_netdevice unregister_netdevice.
6366          */
6367
6368         /* If device is running close it first. */
6369         dev_close(dev);
6370
6371         /* And unlink it from device chain */
6372         err = -ENODEV;
6373         unlist_netdevice(dev);
6374
6375         synchronize_net();
6376
6377         /* Shutdown queueing discipline. */
6378         dev_shutdown(dev);
6379
6380         /* Notify protocols, that we are about to destroy
6381            this device. They should clean all the things.
6382
6383            Note that dev->reg_state stays at NETREG_REGISTERED.
6384            This is wanted because this way 8021q and macvlan know
6385            the device is just moving and can keep their slaves up.
6386         */
6387         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6388         rcu_barrier();
6389         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6390         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6391
6392         /*
6393          *      Flush the unicast and multicast chains
6394          */
6395         dev_uc_flush(dev);
6396         dev_mc_flush(dev);
6397
6398         /* Actually switch the network namespace */
6399         dev_net_set(dev, net);
6400
6401         /* If there is an ifindex conflict assign a new one */
6402         if (__dev_get_by_index(net, dev->ifindex)) {
6403                 int iflink = (dev->iflink == dev->ifindex);
6404                 dev->ifindex = dev_new_index(net);
6405                 if (iflink)
6406                         dev->iflink = dev->ifindex;
6407         }
6408
6409         /* Fixup kobjects */
6410         err = device_rename(&dev->dev, dev->name);
6411         WARN_ON(err);
6412
6413         /* Add the device back in the hashes */
6414         list_netdevice(dev);
6415
6416         /* Notify protocols, that a new device appeared. */
6417         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6418
6419         /*
6420          *      Prevent userspace races by waiting until the network
6421          *      device is fully setup before sending notifications.
6422          */
6423         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6424
6425         synchronize_net();
6426         err = 0;
6427 out:
6428         return err;
6429 }
6430 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6431
6432 static int dev_cpu_callback(struct notifier_block *nfb,
6433                             unsigned long action,
6434                             void *ocpu)
6435 {
6436         struct sk_buff **list_skb;
6437         struct sk_buff *skb;
6438         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6439         struct softnet_data *sd, *oldsd;
6440
6441         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6442                 return NOTIFY_OK;
6443
6444         local_irq_disable();
6445         cpu = smp_processor_id();
6446         sd = &per_cpu(softnet_data, cpu);
6447         oldsd = &per_cpu(softnet_data, oldcpu);
6448
6449         /* Find end of our completion_queue. */
6450         list_skb = &sd->completion_queue;
6451         while (*list_skb)
6452                 list_skb = &(*list_skb)->next;
6453         /* Append completion queue from offline CPU. */
6454         *list_skb = oldsd->completion_queue;
6455         oldsd->completion_queue = NULL;
6456
6457         /* Append output queue from offline CPU. */
6458         if (oldsd->output_queue) {
6459                 *sd->output_queue_tailp = oldsd->output_queue;
6460                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6461                 oldsd->output_queue = NULL;
6462                 oldsd->output_queue_tailp = &oldsd->output_queue;
6463         }
6464         /* Append NAPI poll list from offline CPU. */
6465         if (!list_empty(&oldsd->poll_list)) {
6466                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6467                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6468         }
6469
6470         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6471         local_irq_enable();
6472
6473         /* Process offline CPU's input_pkt_queue */
6474         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6475                 netif_rx(skb);
6476                 input_queue_head_incr(oldsd);
6477         }
6478         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6479                 netif_rx(skb);
6480                 input_queue_head_incr(oldsd);
6481         }
6482
6483         return NOTIFY_OK;
6484 }
6485
6486
6487 /**
6488  *      netdev_increment_features - increment feature set by one
6489  *      @all: current feature set
6490  *      @one: new feature set
6491  *      @mask: mask feature set
6492  *
6493  *      Computes a new feature set after adding a device with feature set
6494  *      @one to the master device with current feature set @all.  Will not
6495  *      enable anything that is off in @mask. Returns the new feature set.
6496  */
6497 netdev_features_t netdev_increment_features(netdev_features_t all,
6498         netdev_features_t one, netdev_features_t mask)
6499 {
6500         if (mask & NETIF_F_GEN_CSUM)
6501                 mask |= NETIF_F_ALL_CSUM;
6502         mask |= NETIF_F_VLAN_CHALLENGED;
6503
6504         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6505         all &= one | ~NETIF_F_ALL_FOR_ALL;
6506
6507         /* If one device supports hw checksumming, set for all. */
6508         if (all & NETIF_F_GEN_CSUM)
6509                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6510
6511         return all;
6512 }
6513 EXPORT_SYMBOL(netdev_increment_features);
6514
6515 static struct hlist_head *netdev_create_hash(void)
6516 {
6517         int i;
6518         struct hlist_head *hash;
6519
6520         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6521         if (hash != NULL)
6522                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6523                         INIT_HLIST_HEAD(&hash[i]);
6524
6525         return hash;
6526 }
6527
6528 /* Initialize per network namespace state */
6529 static int __net_init netdev_init(struct net *net)
6530 {
6531         if (net != &init_net)
6532                 INIT_LIST_HEAD(&net->dev_base_head);
6533
6534         net->dev_name_head = netdev_create_hash();
6535         if (net->dev_name_head == NULL)
6536                 goto err_name;
6537
6538         net->dev_index_head = netdev_create_hash();
6539         if (net->dev_index_head == NULL)
6540                 goto err_idx;
6541
6542         return 0;
6543
6544 err_idx:
6545         kfree(net->dev_name_head);
6546 err_name:
6547         return -ENOMEM;
6548 }
6549
6550 /**
6551  *      netdev_drivername - network driver for the device
6552  *      @dev: network device
6553  *
6554  *      Determine network driver for device.
6555  */
6556 const char *netdev_drivername(const struct net_device *dev)
6557 {
6558         const struct device_driver *driver;
6559         const struct device *parent;
6560         const char *empty = "";
6561
6562         parent = dev->dev.parent;
6563         if (!parent)
6564                 return empty;
6565
6566         driver = parent->driver;
6567         if (driver && driver->name)
6568                 return driver->name;
6569         return empty;
6570 }
6571
6572 static int __netdev_printk(const char *level, const struct net_device *dev,
6573                            struct va_format *vaf)
6574 {
6575         int r;
6576
6577         if (dev && dev->dev.parent) {
6578                 r = dev_printk_emit(level[1] - '0',
6579                                     dev->dev.parent,
6580                                     "%s %s %s: %pV",
6581                                     dev_driver_string(dev->dev.parent),
6582                                     dev_name(dev->dev.parent),
6583                                     netdev_name(dev), vaf);
6584         } else if (dev) {
6585                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6586         } else {
6587                 r = printk("%s(NULL net_device): %pV", level, vaf);
6588         }
6589
6590         return r;
6591 }
6592
6593 int netdev_printk(const char *level, const struct net_device *dev,
6594                   const char *format, ...)
6595 {
6596         struct va_format vaf;
6597         va_list args;
6598         int r;
6599
6600         va_start(args, format);
6601
6602         vaf.fmt = format;
6603         vaf.va = &args;
6604
6605         r = __netdev_printk(level, dev, &vaf);
6606
6607         va_end(args);
6608
6609         return r;
6610 }
6611 EXPORT_SYMBOL(netdev_printk);
6612
6613 #define define_netdev_printk_level(func, level)                 \
6614 int func(const struct net_device *dev, const char *fmt, ...)    \
6615 {                                                               \
6616         int r;                                                  \
6617         struct va_format vaf;                                   \
6618         va_list args;                                           \
6619                                                                 \
6620         va_start(args, fmt);                                    \
6621                                                                 \
6622         vaf.fmt = fmt;                                          \
6623         vaf.va = &args;                                         \
6624                                                                 \
6625         r = __netdev_printk(level, dev, &vaf);                  \
6626                                                                 \
6627         va_end(args);                                           \
6628                                                                 \
6629         return r;                                               \
6630 }                                                               \
6631 EXPORT_SYMBOL(func);
6632
6633 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6634 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6635 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6636 define_netdev_printk_level(netdev_err, KERN_ERR);
6637 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6638 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6639 define_netdev_printk_level(netdev_info, KERN_INFO);
6640
6641 static void __net_exit netdev_exit(struct net *net)
6642 {
6643         kfree(net->dev_name_head);
6644         kfree(net->dev_index_head);
6645 }
6646
/* Per namespace hooks that create and release the device hash tables;
 * registered from net_dev_init() with register_pernet_subsys().
 */
6647 static struct pernet_operations __net_initdata netdev_net_ops = {
6648         .init = netdev_init,
6649         .exit = netdev_exit,
6650 };
6651
6652 static void __net_exit default_device_exit(struct net *net)
6653 {
6654         struct net_device *dev, *aux;
6655         /*
6656          * Push all migratable network devices back to the
6657          * initial network namespace
6658          */
6659         rtnl_lock();
6660         for_each_netdev_safe(net, dev, aux) {
6661                 int err;
6662                 char fb_name[IFNAMSIZ];
6663
6664                 /* Ignore unmoveable devices (i.e. loopback) */
6665                 if (dev->features & NETIF_F_NETNS_LOCAL)
6666                         continue;
6667
6668                 /* Leave virtual devices for the generic cleanup */
6669                 if (dev->rtnl_link_ops)
6670                         continue;
6671
6672                 /* Push remaining network devices to init_net */
6673                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6674                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6675                 if (err) {
6676                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6677                                  __func__, dev->name, err);
6678                         BUG();
6679                 }
6680         }
6681         rtnl_unlock();
6682 }
6683
6684 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6685 {
6686         /* At exit all network devices most be removed from a network
6687          * namespace.  Do this in the reverse order of registration.
6688          * Do this across as many network namespaces as possible to
6689          * improve batching efficiency.
6690          */
6691         struct net_device *dev;
6692         struct net *net;
6693         LIST_HEAD(dev_kill_list);
6694
6695         rtnl_lock();
6696         list_for_each_entry(net, net_list, exit_list) {
6697                 for_each_netdev_reverse(net, dev) {
6698                         if (dev->rtnl_link_ops)
6699                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6700                         else
6701                                 unregister_netdevice_queue(dev, &dev_kill_list);
6702                 }
6703         }
6704         unregister_netdevice_many(&dev_kill_list);
6705         list_del(&dev_kill_list);
6706         rtnl_unlock();
6707 }
6708
/* Namespace exit hooks that move or unregister the devices left in a
 * namespace; registered from net_dev_init() with register_pernet_device().
 */
6709 static struct pernet_operations __net_initdata default_device_ops = {
6710         .exit = default_device_exit,
6711         .exit_batch = default_device_exit_batch,
6712 };
6713
6714 /*
6715  *      Initialize the DEV module. At boot time this walks the device list and
6716  *      unhooks any devices that fail to initialise (normally hardware not
6717  *      present) and leaves us with a valid list of present and active devices.
6718  *
6719  */
6720
6721 /*
6722  *       This is called single threaded during boot, so no need
6723  *       to take the rtnl semaphore.
6724  */
6725 static int __init net_dev_init(void)
6726 {
6727         int i, rc = -ENOMEM;
6728
6729         BUG_ON(!dev_boot_phase);
6730
6731         if (dev_proc_init())
6732                 goto out;
6733
6734         if (netdev_kobject_init())
6735                 goto out;
6736
6737         INIT_LIST_HEAD(&ptype_all);
6738         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6739                 INIT_LIST_HEAD(&ptype_base[i]);
6740
6741         INIT_LIST_HEAD(&offload_base);
6742
6743         if (register_pernet_subsys(&netdev_net_ops))
6744                 goto out;
6745
6746         /*
6747          *      Initialise the packet receive queues.
6748          */
6749
6750         for_each_possible_cpu(i) {
6751                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6752
6753                 memset(sd, 0, sizeof(*sd));
6754                 skb_queue_head_init(&sd->input_pkt_queue);
6755                 skb_queue_head_init(&sd->process_queue);
6756                 sd->completion_queue = NULL;
6757                 INIT_LIST_HEAD(&sd->poll_list);
6758                 sd->output_queue = NULL;
6759                 sd->output_queue_tailp = &sd->output_queue;
6760 #ifdef CONFIG_RPS
6761                 sd->csd.func = rps_trigger_softirq;
6762                 sd->csd.info = sd;
6763                 sd->csd.flags = 0;
6764                 sd->cpu = i;
6765 #endif
6766
6767                 sd->backlog.poll = process_backlog;
6768                 sd->backlog.weight = weight_p;
6769                 sd->backlog.gro_list = NULL;
6770                 sd->backlog.gro_count = 0;
6771         }
6772
6773         dev_boot_phase = 0;
6774
6775         /* The loopback device is special if any other network devices
6776          * is present in a network namespace the loopback device must
6777          * be present. Since we now dynamically allocate and free the
6778          * loopback device ensure this invariant is maintained by
6779          * keeping the loopback device as the first device on the
6780          * list of network devices.  Ensuring the loopback devices
6781          * is the first device that appears and the last network device
6782          * that disappears.
6783          */
6784         if (register_pernet_device(&loopback_net_ops))
6785                 goto out;
6786
6787         if (register_pernet_device(&default_device_ops))
6788                 goto out;
6789
6790         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6791         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6792
6793         hotcpu_notifier(dev_cpu_callback, 0);
6794         dst_init();
6795         dev_mcast_init();
6796         rc = 0;
6797 out:
6798         return rc;
6799 }
6800
6801 subsys_initcall(net_dev_init);
6802
6803 static int __init initialize_hashrnd(void)
6804 {
6805         get_random_bytes(&hashrnd, sizeof(hashrnd));
6806         return 0;
6807 }
6808
6809 late_initcall_sync(initialize_hashrnd);
6810