Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132
 133 #include "net-sysfs.h"
 134
 135 /* Instead of increasing this, you should create a hash table. */
 136 #define MAX_GRO_SKBS 8
 137
 138 /* This should be increased if a protocol with a bigger head is added. */
 139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 140
 141 static DEFINE_SPINLOCK(ptype_lock);
 142 static DEFINE_SPINLOCK(offload_lock);
 143 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 144 struct list_head ptype_all __read_mostly;       /* Taps */
 145 static struct list_head offload_base __read_mostly;
 146
 147 /*
 148  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 149  * semaphore.
 150  *
 151  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 152  *
 153  * Writers must hold the rtnl semaphore while they loop through the
 154  * dev_base_head list, and hold dev_base_lock for writing when they do the
 155  * actual updates.  This allows pure readers to access the list even
 156  * while a writer is preparing to update it.
 157  *
 158  * To put it another way, dev_base_lock is held for writing only to
 159  * protect against pure readers; the rtnl semaphore provides the
 160  * protection against other writers.
 161  *
 162  * See, for example usages, register_netdevice() and
 163  * unregister_netdevice(), which must be called with the rtnl
 164  * semaphore held.
 165  */
 166 DEFINE_RWLOCK(dev_base_lock);
 167 EXPORT_SYMBOL(dev_base_lock);
 168
 169 seqcount_t devnet_rename_seq;
 170
 171 static inline void dev_base_seq_inc(struct net *net)
 172 {
 173         while (++net->dev_base_seq == 0);
 174 }
 175
 176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 177 {
 178         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 179
 180         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 181 }
 182
 183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 184 {
 185         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 186 }
 187
 188 static inline void rps_lock(struct softnet_data *sd)
 189 {
 190 #ifdef CONFIG_RPS
 191         spin_lock(&sd->input_pkt_queue.lock);
 192 #endif
 193 }
 194
 195 static inline void rps_unlock(struct softnet_data *sd)
 196 {
 197 #ifdef CONFIG_RPS
 198         spin_unlock(&sd->input_pkt_queue.lock);
 199 #endif
 200 }
 201
 202 /* Device list insertion */
 203 static void list_netdevice(struct net_device *dev)
 204 {
 205         struct net *net = dev_net(dev);
 206
 207         ASSERT_RTNL();
 208
 209         write_lock_bh(&dev_base_lock);
 210         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 211         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 212         hlist_add_head_rcu(&dev->index_hlist,
 213                            dev_index_hash(net, dev->ifindex));
 214         write_unlock_bh(&dev_base_lock);
 215
 216         dev_base_seq_inc(net);
 217 }
 218
 219 /* Device list removal
 220  * caller must respect a RCU grace period before freeing/reusing dev
 221  */
 222 static void unlist_netdevice(struct net_device *dev)
 223 {
 224         ASSERT_RTNL();
 225
 226         /* Unlink dev from the device chain */
 227         write_lock_bh(&dev_base_lock);
 228         list_del_rcu(&dev->dev_list);
 229         hlist_del_rcu(&dev->name_hlist);
 230         hlist_del_rcu(&dev->index_hlist);
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(dev_net(dev));
 234 }
 235
 236 /*
 237  *      Our notifier list
 238  */
 239
 240 static RAW_NOTIFIER_HEAD(netdev_chain);
 241
 242 /*
 243  *      Device drivers call our routines to queue packets here. We empty the
 244  *      queue in the local softnet handler.
 245  */
 246
 247 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 248 EXPORT_PER_CPU_SYMBOL(softnet_data);
 249
 250 #ifdef CONFIG_LOCKDEP
 251 /*
 252  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 253  * according to dev->type
 254  */
 255 static const unsigned short netdev_lock_type[] =
 256         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 257          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 258          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 259          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 260          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 261          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 262          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 263          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 264          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 265          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 266          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 267          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 268          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 269          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 270          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 271
 272 static const char *const netdev_lock_name[] =
 273         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 274          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 275          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 276          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 277          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 278          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 279          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 280          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 281          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 282          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 283          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 284          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 285          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 286          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 287          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 288
 289 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 290 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291
 292 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 293 {
 294         int i;
 295
 296         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 297                 if (netdev_lock_type[i] == dev_type)
 298                         return i;
 299         /* the last key is used by default */
 300         return ARRAY_SIZE(netdev_lock_type) - 1;
 301 }
 302
 303 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 304                                                  unsigned short dev_type)
 305 {
 306         int i;
 307
 308         i = netdev_lock_pos(dev_type);
 309         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 310                                    netdev_lock_name[i]);
 311 }
 312
 313 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev->type);
 318         lockdep_set_class_and_name(&dev->addr_list_lock,
 319                                    &netdev_addr_lock_key[i],
 320                                    netdev_lock_name[i]);
 321 }
 322 #else
 323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 324                                                  unsigned short dev_type)
 325 {
 326 }
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329 }
 330 #endif
 331
 332 /*******************************************************************************
 333
 334                 Protocol management and registration routines
 335
 336 *******************************************************************************/
 337
 338 /*
 339  *      Add a protocol ID to the list. Now that the input handler is
 340  *      smarter we can dispense with all the messy stuff that used to be
 341  *      here.
 342  *
 343  *      BEWARE!!! Protocol handlers, mangling input packets,
 344  *      MUST BE last in hash buckets and checking protocol handlers
 345  *      MUST start from promiscuous ptype_all chain in net_bh.
 346  *      It is true now, do not change it.
 347  *      Explanation follows: if protocol handler, mangling packet, will
 348  *      be the first on list, it is not able to sense, that packet
 349  *      is cloned and should be copied-on-write, so that it will
 350  *      change it and subsequent readers will get broken packet.
 351  *                                                      --ANK (980803)
 352  */
 353
 354 static inline struct list_head *ptype_head(const struct packet_type *pt)
 355 {
 356         if (pt->type == htons(ETH_P_ALL))
 357                 return &ptype_all;
 358         else
 359                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 360 }
 361
 362 /**
 363  *      dev_add_pack - add packet handler
 364  *      @pt: packet type declaration
 365  *
 366  *      Add a protocol handler to the networking stack. The passed &packet_type
 367  *      is linked into kernel lists and may not be freed until it has been
 368  *      removed from the kernel lists.
 369  *
 370  *      This call does not sleep therefore it can not
 371  *      guarantee all CPU's that are in middle of receiving packets
 372  *      will see the new packet type (until the next received packet).
 373  */
 374
 375 void dev_add_pack(struct packet_type *pt)
 376 {
 377         struct list_head *head = ptype_head(pt);
 378
 379         spin_lock(&ptype_lock);
 380         list_add_rcu(&pt->list, head);
 381         spin_unlock(&ptype_lock);
 382 }
 383 EXPORT_SYMBOL(dev_add_pack);
 384
 385 /**
 386  *      __dev_remove_pack        - remove packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Remove a protocol handler that was previously added to the kernel
 390  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 391  *      from the kernel lists and can be freed or reused once this function
 392  *      returns.
 393  *
 394  *      The packet type might still be in use by receivers
 395  *      and must not be freed until after all the CPU's have gone
 396  *      through a quiescent state.
 397  */
 398 void __dev_remove_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401         struct packet_type *pt1;
 402
 403         spin_lock(&ptype_lock);
 404
 405         list_for_each_entry(pt1, head, list) {
 406                 if (pt == pt1) {
 407                         list_del_rcu(&pt->list);
 408                         goto out;
 409                 }
 410         }
 411
 412         pr_warn("dev_remove_pack: %p not found\n", pt);
 413 out:
 414         spin_unlock(&ptype_lock);
 415 }
 416 EXPORT_SYMBOL(__dev_remove_pack);
 417
 418 /**
 419  *      dev_remove_pack  - remove packet handler
 420  *      @pt: packet type declaration
 421  *
 422  *      Remove a protocol handler that was previously added to the kernel
 423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424  *      from the kernel lists and can be freed or reused once this function
 425  *      returns.
 426  *
 427  *      This call sleeps to guarantee that no CPU is looking at the packet
 428  *      type after return.
 429  */
 430 void dev_remove_pack(struct packet_type *pt)
 431 {
 432         __dev_remove_pack(pt);
 433
 434         synchronize_net();
 435 }
 436 EXPORT_SYMBOL(dev_remove_pack);
 437
 438
 439 /**
 440  *      dev_add_offload - register offload handlers
 441  *      @po: protocol offload declaration
 442  *
 443  *      Add protocol offload handlers to the networking stack. The passed
 444  *      &proto_offload is linked into kernel lists and may not be freed until
 445  *      it has been removed from the kernel lists.
 446  *
 447  *      This call does not sleep therefore it can not
 448  *      guarantee all CPU's that are in middle of receiving packets
 449  *      will see the new offload handlers (until the next received packet).
 450  */
 451 void dev_add_offload(struct packet_offload *po)
 452 {
 453         struct list_head *head = &offload_base;
 454
 455         spin_lock(&offload_lock);
 456         list_add_rcu(&po->list, head);
 457         spin_unlock(&offload_lock);
 458 }
 459 EXPORT_SYMBOL(dev_add_offload);
 460
 461 /**
 462  *      __dev_remove_offload     - remove offload handler
 463  *      @po: packet offload declaration
 464  *
 465  *      Remove a protocol offload handler that was previously added to the
 466  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 467  *      is removed from the kernel lists and can be freed or reused once this
 468  *      function returns.
 469  *
 470  *      The packet type might still be in use by receivers
 471  *      and must not be freed until after all the CPU's have gone
 472  *      through a quiescent state.
 473  */
 474 void __dev_remove_offload(struct packet_offload *po)
 475 {
 476         struct list_head *head = &offload_base;
 477         struct packet_offload *po1;
 478
 479         spin_lock(&offload_lock);
 480
 481         list_for_each_entry(po1, head, list) {
 482                 if (po == po1) {
 483                         list_del_rcu(&po->list);
 484                         goto out;
 485                 }
 486         }
 487
 488         pr_warn("dev_remove_offload: %p not found\n", po);
 489 out:
 490         spin_unlock(&offload_lock);
 491 }
 492 EXPORT_SYMBOL(__dev_remove_offload);
 493
 494 /**
 495  *      dev_remove_offload       - remove packet offload handler
 496  *      @po: packet offload declaration
 497  *
 498  *      Remove a packet offload handler that was previously added to the kernel
 499  *      offload handlers by dev_add_offload(). The passed &offload_type is
 500  *      removed from the kernel lists and can be freed or reused once this
 501  *      function returns.
 502  *
 503  *      This call sleeps to guarantee that no CPU is looking at the packet
 504  *      type after return.
 505  */
 506 void dev_remove_offload(struct packet_offload *po)
 507 {
 508         __dev_remove_offload(po);
 509
 510         synchronize_net();
 511 }
 512 EXPORT_SYMBOL(dev_remove_offload);
 513
 514 /******************************************************************************
 515
 516                       Device Boot-time Settings Routines
 517
 518 *******************************************************************************/
 519
 520 /* Boot time configuration table */
 521 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 522
 523 /**
 524  *      netdev_boot_setup_add   - add new setup entry
 525  *      @name: name of the device
 526  *      @map: configured settings for the device
 527  *
 528  *      Adds new setup entry to the dev_boot_setup list.  The function
 529  *      returns 0 on error and 1 on success.  This is a generic routine to
 530  *      all netdevices.
 531  */
 532 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 533 {
 534         struct netdev_boot_setup *s;
 535         int i;
 536
 537         s = dev_boot_setup;
 538         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 539                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 540                         memset(s[i].name, 0, sizeof(s[i].name));
 541                         strlcpy(s[i].name, name, IFNAMSIZ);
 542                         memcpy(&s[i].map, map, sizeof(s[i].map));
 543                         break;
 544                 }
 545         }
 546
 547         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 548 }
 549
 550 /**
 551  *      netdev_boot_setup_check - check boot time settings
 552  *      @dev: the netdevice
 553  *
 554  *      Check boot time settings for the device.
 555  *      The found settings are set for the device to be used
 556  *      later in the device probing.
 557  *      Returns 0 if no settings found, 1 if they are.
 558  */
 559 int netdev_boot_setup_check(struct net_device *dev)
 560 {
 561         struct netdev_boot_setup *s = dev_boot_setup;
 562         int i;
 563
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 566                     !strcmp(dev->name, s[i].name)) {
 567                         dev->irq        = s[i].map.irq;
 568                         dev->base_addr  = s[i].map.base_addr;
 569                         dev->mem_start  = s[i].map.mem_start;
 570                         dev->mem_end    = s[i].map.mem_end;
 571                         return 1;
 572                 }
 573         }
 574         return 0;
 575 }
 576 EXPORT_SYMBOL(netdev_boot_setup_check);
 577
 578
 579 /**
 580  *      netdev_boot_base        - get address from boot time settings
 581  *      @prefix: prefix for network device
 582  *      @unit: id for network device
 583  *
 584  *      Check boot time settings for the base address of device.
 585  *      The found settings are set for the device to be used
 586  *      later in the device probing.
 587  *      Returns 0 if no settings found.
 588  */
 589 unsigned long netdev_boot_base(const char *prefix, int unit)
 590 {
 591         const struct netdev_boot_setup *s = dev_boot_setup;
 592         char name[IFNAMSIZ];
 593         int i;
 594
 595         sprintf(name, "%s%d", prefix, unit);
 596
 597         /*
 598          * If device already registered then return base of 1
 599          * to indicate not to probe for this interface
 600          */
 601         if (__dev_get_by_name(&init_net, name))
 602                 return 1;
 603
 604         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 605                 if (!strcmp(name, s[i].name))
 606                         return s[i].map.base_addr;
 607         return 0;
 608 }
 609
 610 /*
 611  * Saves at boot time configured settings for any netdevice.
 612  */
 613 int __init netdev_boot_setup(char *str)
 614 {
 615         int ints[5];
 616         struct ifmap map;
 617
 618         str = get_options(str, ARRAY_SIZE(ints), ints);
 619         if (!str || !*str)
 620                 return 0;
 621
 622         /* Save settings */
 623         memset(&map, 0, sizeof(map));
 624         if (ints[0] > 0)
 625                 map.irq = ints[1];
 626         if (ints[0] > 1)
 627                 map.base_addr = ints[2];
 628         if (ints[0] > 2)
 629                 map.mem_start = ints[3];
 630         if (ints[0] > 3)
 631                 map.mem_end = ints[4];
 632
 633         /* Add new entry to the list */
 634         return netdev_boot_setup_add(str, &map);
 635 }
 636
 637 __setup("netdev=", netdev_boot_setup);
 638
 639 /*******************************************************************************
 640
 641                             Device Interface Subroutines
 642
 643 *******************************************************************************/
 644
 645 /**
 646  *      __dev_get_by_name       - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. Must be called under RTNL semaphore
 651  *      or @dev_base_lock. If the name is found a pointer to the device
 652  *      is returned. If the name is not found then %NULL is returned. The
 653  *      reference counters are not incremented so the caller must be
 654  *      careful with locks.
 655  */
 656
 657 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660         struct hlist_head *head = dev_name_hash(net, name);
 661
 662         hlist_for_each_entry(dev, head, name_hlist)
 663                 if (!strncmp(dev->name, name, IFNAMSIZ))
 664                         return dev;
 665
 666         return NULL;
 667 }
 668 EXPORT_SYMBOL(__dev_get_by_name);
 669
 670 /**
 671  *      dev_get_by_name_rcu     - find a device by its name
 672  *      @net: the applicable net namespace
 673  *      @name: name to find
 674  *
 675  *      Find an interface by name.
 676  *      If the name is found a pointer to the device is returned.
 677  *      If the name is not found then %NULL is returned.
 678  *      The reference counters are not incremented so the caller must be
 679  *      careful with locks. The caller must hold RCU lock.
 680  */
 681
 682 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 683 {
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_name_hash(net, name);
 686
 687         hlist_for_each_entry_rcu(dev, head, name_hlist)
 688                 if (!strncmp(dev->name, name, IFNAMSIZ))
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(dev_get_by_name_rcu);
 694
 695 /**
 696  *      dev_get_by_name         - find a device by its name
 697  *      @net: the applicable net namespace
 698  *      @name: name to find
 699  *
 700  *      Find an interface by name. This can be called from any
 701  *      context and does its own locking. The returned handle has
 702  *      the usage count incremented and the caller must use dev_put() to
 703  *      release it when it is no longer needed. %NULL is returned if no
 704  *      matching device is found.
 705  */
 706
 707 struct net_device *dev_get_by_name(struct net *net, const char *name)
 708 {
 709         struct net_device *dev;
 710
 711         rcu_read_lock();
 712         dev = dev_get_by_name_rcu(net, name);
 713         if (dev)
 714                 dev_hold(dev);
 715         rcu_read_unlock();
 716         return dev;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_name);
 719
 720 /**
 721  *      __dev_get_by_index - find a device by its ifindex
 722  *      @net: the applicable net namespace
 723  *      @ifindex: index of device
 724  *
 725  *      Search for an interface by index. Returns %NULL if the device
 726  *      is not found or a pointer to the device. The device has not
 727  *      had its reference counter increased so the caller must be careful
 728  *      about locking. The caller must hold either the RTNL semaphore
 729  *      or @dev_base_lock.
 730  */
 731
 732 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735         struct hlist_head *head = dev_index_hash(net, ifindex);
 736
 737         hlist_for_each_entry(dev, head, index_hlist)
 738                 if (dev->ifindex == ifindex)
 739                         return dev;
 740
 741         return NULL;
 742 }
 743 EXPORT_SYMBOL(__dev_get_by_index);
 744
 745 /**
 746  *      dev_get_by_index_rcu - find a device by its ifindex
 747  *      @net: the applicable net namespace
 748  *      @ifindex: index of device
 749  *
 750  *      Search for an interface by index. Returns %NULL if the device
 751  *      is not found or a pointer to the device. The device has not
 752  *      had its reference counter increased so the caller must be careful
 753  *      about locking. The caller must hold RCU lock.
 754  */
 755
 756 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 757 {
 758         struct net_device *dev;
 759         struct hlist_head *head = dev_index_hash(net, ifindex);
 760
 761         hlist_for_each_entry_rcu(dev, head, index_hlist)
 762                 if (dev->ifindex == ifindex)
 763                         return dev;
 764
 765         return NULL;
 766 }
 767 EXPORT_SYMBOL(dev_get_by_index_rcu);
 768
 769
 770 /**
 771  *      dev_get_by_index - find a device by its ifindex
 772  *      @net: the applicable net namespace
 773  *      @ifindex: index of device
 774  *
 775  *      Search for an interface by index. Returns NULL if the device
 776  *      is not found or a pointer to the device. The device returned has
 777  *      had a reference added and the pointer is safe until the user calls
 778  *      dev_put to indicate they have finished with it.
 779  */
 780
 781 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 782 {
 783         struct net_device *dev;
 784
 785         rcu_read_lock();
 786         dev = dev_get_by_index_rcu(net, ifindex);
 787         if (dev)
 788                 dev_hold(dev);
 789         rcu_read_unlock();
 790         return dev;
 791 }
 792 EXPORT_SYMBOL(dev_get_by_index);
 793
 794 /**
 795  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 796  *      @net: the applicable net namespace
 797  *      @type: media type of device
 798  *      @ha: hardware address
 799  *
 800  *      Search for an interface by MAC address. Returns NULL if the device
 801  *      is not found or a pointer to the device.
 802  *      The caller must hold RCU or RTNL.
 803  *      The returned device has not had its ref count increased
 804  *      and the caller must therefore be careful about locking
 805  *
 806  */
 807
 808 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 809                                        const char *ha)
 810 {
 811         struct net_device *dev;
 812
 813         for_each_netdev_rcu(net, dev)
 814                 if (dev->type == type &&
 815                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 816                         return dev;
 817
 818         return NULL;
 819 }
 820 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 821
 822 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 823 {
 824         struct net_device *dev;
 825
 826         ASSERT_RTNL();
 827         for_each_netdev(net, dev)
 828                 if (dev->type == type)
 829                         return dev;
 830
 831         return NULL;
 832 }
 833 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 834
 835 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 836 {
 837         struct net_device *dev, *ret = NULL;
 838
 839         rcu_read_lock();
 840         for_each_netdev_rcu(net, dev)
 841                 if (dev->type == type) {
 842                         dev_hold(dev);
 843                         ret = dev;
 844                         break;
 845                 }
 846         rcu_read_unlock();
 847         return ret;
 848 }
 849 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 850
 851 /**
 852  *      dev_get_by_flags_rcu - find any device with given flags
 853  *      @net: the applicable net namespace
 854  *      @if_flags: IFF_* values
 855  *      @mask: bitmask of bits in if_flags to check
 856  *
 857  *      Search for any interface with the given flags. Returns NULL if a device
 858  *      is not found or a pointer to the device. Must be called inside
 859  *      rcu_read_lock(), and result refcount is unchanged.
 860  */
 861
 862 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 863                                     unsigned short mask)
 864 {
 865         struct net_device *dev, *ret;
 866
 867         ret = NULL;
 868         for_each_netdev_rcu(net, dev) {
 869                 if (((dev->flags ^ if_flags) & mask) == 0) {
 870                         ret = dev;
 871                         break;
 872                 }
 873         }
 874         return ret;
 875 }
 876 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 877
 878 /**
 879  *      dev_valid_name - check if name is okay for network device
 880  *      @name: name string
 881  *
 882  *      Network device names need to be valid file names to
 883  *      to allow sysfs to work.  We also disallow any kind of
 884  *      whitespace.
 885  */
 886 bool dev_valid_name(const char *name)
 887 {
 888         if (*name == '\0')
 889                 return false;
 890         if (strlen(name) >= IFNAMSIZ)
 891                 return false;
 892         if (!strcmp(name, ".") || !strcmp(name, ".."))
 893                 return false;
 894
 895         while (*name) {
 896                 if (*name == '/' || isspace(*name))
 897                         return false;
 898                 name++;
 899         }
 900         return true;
 901 }
 902 EXPORT_SYMBOL(dev_valid_name);
 903
 904 /**
 905  *      __dev_alloc_name - allocate a name for a device
 906  *      @net: network namespace to allocate the device name in
 907  *      @name: name format string
 908  *      @buf:  scratch buffer and result name string
 909  *
 910  *      Passed a format string - eg "lt%d" it will try and find a suitable
 911  *      id. It scans list of devices to build up a free map, then chooses
 912  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 913  *      while allocating the name and adding the device in order to avoid
 914  *      duplicates.
 915  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 916  *      Returns the number of the unit assigned or a negative errno code.
 917  */
 918
 919 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 920 {
 921         int i = 0;
 922         const char *p;
 923         const int max_netdevices = 8*PAGE_SIZE;
 924         unsigned long *inuse;
 925         struct net_device *d;
 926
 927         p = strnchr(name, IFNAMSIZ-1, '%');
 928         if (p) {
 929                 /*
 930                  * Verify the string as this thing may have come from
 931                  * the user.  There must be either one "%d" and no other "%"
 932                  * characters.
 933                  */
 934                 if (p[1] != 'd' || strchr(p + 2, '%'))
 935                         return -EINVAL;
 936
 937                 /* Use one page as a bit array of possible slots */
 938                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 939                 if (!inuse)
 940                         return -ENOMEM;
 941
 942                 for_each_netdev(net, d) {
 943                         if (!sscanf(d->name, name, &i))
 944                                 continue;
 945                         if (i < 0 || i >= max_netdevices)
 946                                 continue;
 947
 948                         /*  avoid cases where sscanf is not exact inverse of printf */
 949                         snprintf(buf, IFNAMSIZ, name, i);
 950                         if (!strncmp(buf, d->name, IFNAMSIZ))
 951                                 set_bit(i, inuse);
 952                 }
 953
 954                 i = find_first_zero_bit(inuse, max_netdevices);
 955                 free_page((unsigned long) inuse);
 956         }
 957
 958         if (buf != name)
 959                 snprintf(buf, IFNAMSIZ, name, i);
 960         if (!__dev_get_by_name(net, buf))
 961                 return i;
 962
 963         /* It is possible to run out of possible slots
 964          * when the name is long and there isn't enough space left
 965          * for the digits, or if all bits are used.
 966          */
 967         return -ENFILE;
 968 }
 969
 970 /**
 971  *      dev_alloc_name - allocate a name for a device
 972  *      @dev: device
 973  *      @name: name format string
 974  *
 975  *      Passed a format string - eg "lt%d" it will try and find a suitable
 976  *      id. It scans list of devices to build up a free map, then chooses
 977  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 978  *      while allocating the name and adding the device in order to avoid
 979  *      duplicates.
 980  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 981  *      Returns the number of the unit assigned or a negative errno code.
 982  */
 983
 984 int dev_alloc_name(struct net_device *dev, const char *name)
 985 {
 986         char buf[IFNAMSIZ];
 987         struct net *net;
 988         int ret;
 989
 990         BUG_ON(!dev_net(dev));
 991         net = dev_net(dev);
 992         ret = __dev_alloc_name(net, name, buf);
 993         if (ret >= 0)
 994                 strlcpy(dev->name, buf, IFNAMSIZ);
 995         return ret;
 996 }
 997 EXPORT_SYMBOL(dev_alloc_name);
 998
 999 static int dev_alloc_name_ns(struct net *net,
1000                              struct net_device *dev,
1001                              const char *name)
1002 {
1003         char buf[IFNAMSIZ];
1004         int ret;
1005
1006         ret = __dev_alloc_name(net, name, buf);
1007         if (ret >= 0)
1008                 strlcpy(dev->name, buf, IFNAMSIZ);
1009         return ret;
1010 }
1011
1012 static int dev_get_valid_name(struct net *net,
1013                               struct net_device *dev,
1014                               const char *name)
1015 {
1016         BUG_ON(!net);
1017
1018         if (!dev_valid_name(name))
1019                 return -EINVAL;
1020
1021         if (strchr(name, '%'))
1022                 return dev_alloc_name_ns(net, dev, name);
1023         else if (__dev_get_by_name(net, name))
1024                 return -EEXIST;
1025         else if (dev->name != name)
1026                 strlcpy(dev->name, name, IFNAMSIZ);
1027
1028         return 0;
1029 }
1030
1031 /**
1032  *      dev_change_name - change name of a device
1033  *      @dev: device
1034  *      @newname: name (or format string) must be at least IFNAMSIZ
1035  *
1036  *      Change name of a device, can pass format strings "eth%d".
1037  *      for wildcarding.
1038  */
1039 int dev_change_name(struct net_device *dev, const char *newname)
1040 {
1041         char oldname[IFNAMSIZ];
1042         int err = 0;
1043         int ret;
1044         struct net *net;
1045
1046         ASSERT_RTNL();
1047         BUG_ON(!dev_net(dev));
1048
1049         net = dev_net(dev);
1050         if (dev->flags & IFF_UP)
1051                 return -EBUSY;
1052
1053         write_seqcount_begin(&devnet_rename_seq);
1054
1055         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1056                 write_seqcount_end(&devnet_rename_seq);
1057                 return 0;
1058         }
1059
1060         memcpy(oldname, dev->name, IFNAMSIZ);
1061
1062         err = dev_get_valid_name(net, dev, newname);
1063         if (err < 0) {
1064                 write_seqcount_end(&devnet_rename_seq);
1065                 return err;
1066         }
1067
1068 rollback:
1069         ret = device_rename(&dev->dev, dev->name);
1070         if (ret) {
1071                 memcpy(dev->name, oldname, IFNAMSIZ);
1072                 write_seqcount_end(&devnet_rename_seq);
1073                 return ret;
1074         }
1075
1076         write_seqcount_end(&devnet_rename_seq);
1077
1078         write_lock_bh(&dev_base_lock);
1079         hlist_del_rcu(&dev->name_hlist);
1080         write_unlock_bh(&dev_base_lock);
1081
1082         synchronize_rcu();
1083
1084         write_lock_bh(&dev_base_lock);
1085         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1086         write_unlock_bh(&dev_base_lock);
1087
1088         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1089         ret = notifier_to_errno(ret);
1090
1091         if (ret) {
1092                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1093                 if (err >= 0) {
1094                         err = ret;
1095                         write_seqcount_begin(&devnet_rename_seq);
1096                         memcpy(dev->name, oldname, IFNAMSIZ);
1097                         goto rollback;
1098                 } else {
1099                         pr_err("%s: name change rollback failed: %d\n",
1100                                dev->name, ret);
1101                 }
1102         }
1103
1104         return err;
1105 }
1106
1107 /**
1108  *      dev_set_alias - change ifalias of a device
1109  *      @dev: device
1110  *      @alias: name up to IFALIASZ
1111  *      @len: limit of bytes to copy from info
1112  *
1113  *      Set ifalias for a device,
1114  */
1115 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1116 {
1117         char *new_ifalias;
1118
1119         ASSERT_RTNL();
1120
1121         if (len >= IFALIASZ)
1122                 return -EINVAL;
1123
1124         if (!len) {
1125                 kfree(dev->ifalias);
1126                 dev->ifalias = NULL;
1127                 return 0;
1128         }
1129
1130         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1131         if (!new_ifalias)
1132                 return -ENOMEM;
1133         dev->ifalias = new_ifalias;
1134
1135         strlcpy(dev->ifalias, alias, len+1);
1136         return len;
1137 }
1138
1139
1140 /**
1141  *      netdev_features_change - device changes features
1142  *      @dev: device to cause notification
1143  *
1144  *      Called to indicate a device has changed features.
1145  */
1146 void netdev_features_change(struct net_device *dev)
1147 {
1148         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1149 }
1150 EXPORT_SYMBOL(netdev_features_change);
1151
1152 /**
1153  *      netdev_state_change - device changes state
1154  *      @dev: device to cause notification
1155  *
1156  *      Called to indicate a device has changed state. This function calls
1157  *      the notifier chains for netdev_chain and sends a NEWLINK message
1158  *      to the routing socket.
1159  */
1160 void netdev_state_change(struct net_device *dev)
1161 {
1162         if (dev->flags & IFF_UP) {
1163                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1164                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1165         }
1166 }
1167 EXPORT_SYMBOL(netdev_state_change);
1168
1169 /**
1170  *      netdev_notify_peers - notify network peers about existence of @dev
1171  *      @dev: network device
1172  *
1173  * Generate traffic such that interested network peers are aware of
1174  * @dev, such as by generating a gratuitous ARP. This may be used when
1175  * a device wants to inform the rest of the network about some sort of
1176  * reconfiguration such as a failover event or virtual machine
1177  * migration.
1178  */
1179 void netdev_notify_peers(struct net_device *dev)
1180 {
1181         rtnl_lock();
1182         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1183         rtnl_unlock();
1184 }
1185 EXPORT_SYMBOL(netdev_notify_peers);
1186
1187 static int __dev_open(struct net_device *dev)
1188 {
1189         const struct net_device_ops *ops = dev->netdev_ops;
1190         int ret;
1191
1192         ASSERT_RTNL();
1193
1194         if (!netif_device_present(dev))
1195                 return -ENODEV;
1196
1197         /* Block netpoll from trying to do any rx path servicing.
1198          * If we don't do this there is a chance ndo_poll_controller
1199          * or ndo_poll may be running while we open the device
1200          */
1201         netpoll_rx_disable(dev);
1202
1203         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1204         ret = notifier_to_errno(ret);
1205         if (ret)
1206                 return ret;
1207
1208         set_bit(__LINK_STATE_START, &dev->state);
1209
1210         if (ops->ndo_validate_addr)
1211                 ret = ops->ndo_validate_addr(dev);
1212
1213         if (!ret && ops->ndo_open)
1214                 ret = ops->ndo_open(dev);
1215
1216         netpoll_rx_enable(dev);
1217
1218         if (ret)
1219                 clear_bit(__LINK_STATE_START, &dev->state);
1220         else {
1221                 dev->flags |= IFF_UP;
1222                 net_dmaengine_get();
1223                 dev_set_rx_mode(dev);
1224                 dev_activate(dev);
1225                 add_device_randomness(dev->dev_addr, dev->addr_len);
1226         }
1227
1228         return ret;
1229 }
1230
1231 /**
1232  *      dev_open        - prepare an interface for use.
1233  *      @dev:   device to open
1234  *
1235  *      Takes a device from down to up state. The device's private open
1236  *      function is invoked and then the multicast lists are loaded. Finally
1237  *      the device is moved into the up state and a %NETDEV_UP message is
1238  *      sent to the netdev notifier chain.
1239  *
1240  *      Calling this function on an active interface is a nop. On a failure
1241  *      a negative errno code is returned.
1242  */
1243 int dev_open(struct net_device *dev)
1244 {
1245         int ret;
1246
1247         if (dev->flags & IFF_UP)
1248                 return 0;
1249
1250         ret = __dev_open(dev);
1251         if (ret < 0)
1252                 return ret;
1253
1254         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1255         call_netdevice_notifiers(NETDEV_UP, dev);
1256
1257         return ret;
1258 }
1259 EXPORT_SYMBOL(dev_open);
1260
1261 static int __dev_close_many(struct list_head *head)
1262 {
1263         struct net_device *dev;
1264
1265         ASSERT_RTNL();
1266         might_sleep();
1267
1268         list_for_each_entry(dev, head, unreg_list) {
1269                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1270
1271                 clear_bit(__LINK_STATE_START, &dev->state);
1272
1273                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1274                  * can be even on different cpu. So just clear netif_running().
1275                  *
1276                  * dev->stop() will invoke napi_disable() on all of it's
1277                  * napi_struct instances on this device.
1278                  */
1279                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1280         }
1281
1282         dev_deactivate_many(head);
1283
1284         list_for_each_entry(dev, head, unreg_list) {
1285                 const struct net_device_ops *ops = dev->netdev_ops;
1286
1287                 /*
1288                  *      Call the device specific close. This cannot fail.
1289                  *      Only if device is UP
1290                  *
1291                  *      We allow it to be called even after a DETACH hot-plug
1292                  *      event.
1293                  */
1294                 if (ops->ndo_stop)
1295                         ops->ndo_stop(dev);
1296
1297                 dev->flags &= ~IFF_UP;
1298                 net_dmaengine_put();
1299         }
1300
1301         return 0;
1302 }
1303
1304 static int __dev_close(struct net_device *dev)
1305 {
1306         int retval;
1307         LIST_HEAD(single);
1308
1309         /* Temporarily disable netpoll until the interface is down */
1310         netpoll_rx_disable(dev);
1311
1312         list_add(&dev->unreg_list, &single);
1313         retval = __dev_close_many(&single);
1314         list_del(&single);
1315
1316         netpoll_rx_enable(dev);
1317         return retval;
1318 }
1319
1320 static int dev_close_many(struct list_head *head)
1321 {
1322         struct net_device *dev, *tmp;
1323         LIST_HEAD(tmp_list);
1324
1325         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1326                 if (!(dev->flags & IFF_UP))
1327                         list_move(&dev->unreg_list, &tmp_list);
1328
1329         __dev_close_many(head);
1330
1331         list_for_each_entry(dev, head, unreg_list) {
1332                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1333                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1334         }
1335
1336         /* rollback_registered_many needs the complete original list */
1337         list_splice(&tmp_list, head);
1338         return 0;
1339 }
1340
1341 /**
1342  *      dev_close - shutdown an interface.
1343  *      @dev: device to shutdown
1344  *
1345  *      This function moves an active device into down state. A
1346  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1347  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1348  *      chain.
1349  */
1350 int dev_close(struct net_device *dev)
1351 {
1352         if (dev->flags & IFF_UP) {
1353                 LIST_HEAD(single);
1354
1355                 /* Block netpoll rx while the interface is going down */
1356                 netpoll_rx_disable(dev);
1357
1358                 list_add(&dev->unreg_list, &single);
1359                 dev_close_many(&single);
1360                 list_del(&single);
1361
1362                 netpoll_rx_enable(dev);
1363         }
1364         return 0;
1365 }
1366 EXPORT_SYMBOL(dev_close);
1367
1368
1369 /**
1370  *      dev_disable_lro - disable Large Receive Offload on a device
1371  *      @dev: device
1372  *
1373  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1374  *      called under RTNL.  This is needed if received packets may be
1375  *      forwarded to another interface.
1376  */
1377 void dev_disable_lro(struct net_device *dev)
1378 {
1379         /*
1380          * If we're trying to disable lro on a vlan device
1381          * use the underlying physical device instead
1382          */
1383         if (is_vlan_dev(dev))
1384                 dev = vlan_dev_real_dev(dev);
1385
1386         dev->wanted_features &= ~NETIF_F_LRO;
1387         netdev_update_features(dev);
1388
1389         if (unlikely(dev->features & NETIF_F_LRO))
1390                 netdev_WARN(dev, "failed to disable LRO!\n");
1391 }
1392 EXPORT_SYMBOL(dev_disable_lro);
1393
1394 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1395                                    struct net_device *dev)
1396 {
1397         struct netdev_notifier_info info;
1398
1399         netdev_notifier_info_init(&info, dev);
1400         return nb->notifier_call(nb, val, &info);
1401 }
1402
1403 static int dev_boot_phase = 1;
1404
1405 /**
1406  *      register_netdevice_notifier - register a network notifier block
1407  *      @nb: notifier
1408  *
1409  *      Register a notifier to be called when network device events occur.
1410  *      The notifier passed is linked into the kernel structures and must
1411  *      not be reused until it has been unregistered. A negative errno code
1412  *      is returned on a failure.
1413  *
1414  *      When registered all registration and up events are replayed
1415  *      to the new notifier to allow device to have a race free
1416  *      view of the network device list.
1417  */
1418
1419 int register_netdevice_notifier(struct notifier_block *nb)
1420 {
1421         struct net_device *dev;
1422         struct net_device *last;
1423         struct net *net;
1424         int err;
1425
1426         rtnl_lock();
1427         err = raw_notifier_chain_register(&netdev_chain, nb);
1428         if (err)
1429                 goto unlock;
1430         if (dev_boot_phase)
1431                 goto unlock;
1432         for_each_net(net) {
1433                 for_each_netdev(net, dev) {
1434                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1435                         err = notifier_to_errno(err);
1436                         if (err)
1437                                 goto rollback;
1438
1439                         if (!(dev->flags & IFF_UP))
1440                                 continue;
1441
1442                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1443                 }
1444         }
1445
1446 unlock:
1447         rtnl_unlock();
1448         return err;
1449
1450 rollback:
1451         last = dev;
1452         for_each_net(net) {
1453                 for_each_netdev(net, dev) {
1454                         if (dev == last)
1455                                 goto outroll;
1456
1457                         if (dev->flags & IFF_UP) {
1458                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1459                                                         dev);
1460                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1461                         }
1462                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1463                 }
1464         }
1465
1466 outroll:
1467         raw_notifier_chain_unregister(&netdev_chain, nb);
1468         goto unlock;
1469 }
1470 EXPORT_SYMBOL(register_netdevice_notifier);
1471
1472 /**
1473  *      unregister_netdevice_notifier - unregister a network notifier block
1474  *      @nb: notifier
1475  *
1476  *      Unregister a notifier previously registered by
1477  *      register_netdevice_notifier(). The notifier is unlinked into the
1478  *      kernel structures and may then be reused. A negative errno code
1479  *      is returned on a failure.
1480  *
1481  *      After unregistering unregister and down device events are synthesized
1482  *      for all devices on the device list to the removed notifier to remove
1483  *      the need for special case cleanup code.
1484  */
1485
1486 int unregister_netdevice_notifier(struct notifier_block *nb)
1487 {
1488         struct net_device *dev;
1489         struct net *net;
1490         int err;
1491
1492         rtnl_lock();
1493         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1494         if (err)
1495                 goto unlock;
1496
1497         for_each_net(net) {
1498                 for_each_netdev(net, dev) {
1499                         if (dev->flags & IFF_UP) {
1500                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1501                                                         dev);
1502                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1503                         }
1504                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1505                 }
1506         }
1507 unlock:
1508         rtnl_unlock();
1509         return err;
1510 }
1511 EXPORT_SYMBOL(unregister_netdevice_notifier);
1512
1513 /**
1514  *      call_netdevice_notifiers_info - call all network notifier blocks
1515  *      @val: value passed unmodified to notifier function
1516  *      @dev: net_device pointer passed unmodified to notifier function
1517  *      @info: notifier information data
1518  *
1519  *      Call all network notifier blocks.  Parameters and return value
1520  *      are as for raw_notifier_call_chain().
1521  */
1522
1523 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1524                                   struct netdev_notifier_info *info)
1525 {
1526         ASSERT_RTNL();
1527         netdev_notifier_info_init(info, dev);
1528         return raw_notifier_call_chain(&netdev_chain, val, info);
1529 }
1530 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1531
1532 /**
1533  *      call_netdevice_notifiers - call all network notifier blocks
1534  *      @val: value passed unmodified to notifier function
1535  *      @dev: net_device pointer passed unmodified to notifier function
1536  *
1537  *      Call all network notifier blocks.  Parameters and return value
1538  *      are as for raw_notifier_call_chain().
1539  */
1540
1541 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1542 {
1543         struct netdev_notifier_info info;
1544
1545         return call_netdevice_notifiers_info(val, dev, &info);
1546 }
1547 EXPORT_SYMBOL(call_netdevice_notifiers);
1548
1549 static struct static_key netstamp_needed __read_mostly;
1550 #ifdef HAVE_JUMP_LABEL
1551 /* We are not allowed to call static_key_slow_dec() from irq context
1552  * If net_disable_timestamp() is called from irq context, defer the
1553  * static_key_slow_dec() calls.
1554  */
1555 static atomic_t netstamp_needed_deferred;
1556 #endif
1557
1558 void net_enable_timestamp(void)
1559 {
1560 #ifdef HAVE_JUMP_LABEL
1561         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1562
1563         if (deferred) {
1564                 while (--deferred)
1565                         static_key_slow_dec(&netstamp_needed);
1566                 return;
1567         }
1568 #endif
1569         static_key_slow_inc(&netstamp_needed);
1570 }
1571 EXPORT_SYMBOL(net_enable_timestamp);
1572
1573 void net_disable_timestamp(void)
1574 {
1575 #ifdef HAVE_JUMP_LABEL
1576         if (in_interrupt()) {
1577                 atomic_inc(&netstamp_needed_deferred);
1578                 return;
1579         }
1580 #endif
1581         static_key_slow_dec(&netstamp_needed);
1582 }
1583 EXPORT_SYMBOL(net_disable_timestamp);
1584
1585 static inline void net_timestamp_set(struct sk_buff *skb)
1586 {
1587         skb->tstamp.tv64 = 0;
1588         if (static_key_false(&netstamp_needed))
1589                 __net_timestamp(skb);
1590 }
1591
1592 #define net_timestamp_check(COND, SKB)                  \
1593         if (static_key_false(&netstamp_needed)) {               \
1594                 if ((COND) && !(SKB)->tstamp.tv64)      \
1595                         __net_timestamp(SKB);           \
1596         }                                               \
1597
1598 static inline bool is_skb_forwardable(struct net_device *dev,
1599                                       struct sk_buff *skb)
1600 {
1601         unsigned int len;
1602
1603         if (!(dev->flags & IFF_UP))
1604                 return false;
1605
1606         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1607         if (skb->len <= len)
1608                 return true;
1609
1610         /* if TSO is enabled, we don't care about the length as the packet
1611          * could be forwarded without being segmented before
1612          */
1613         if (skb_is_gso(skb))
1614                 return true;
1615
1616         return false;
1617 }
1618
1619 /**
1620  * dev_forward_skb - loopback an skb to another netif
1621  *
1622  * @dev: destination network device
1623  * @skb: buffer to forward
1624  *
1625  * return values:
1626  *      NET_RX_SUCCESS  (no congestion)
1627  *      NET_RX_DROP     (packet was dropped, but freed)
1628  *
1629  * dev_forward_skb can be used for injecting an skb from the
1630  * start_xmit function of one device into the receive queue
1631  * of another device.
1632  *
1633  * The receiving device may be in another namespace, so
1634  * we have to clear all information in the skb that could
1635  * impact namespace isolation.
1636  */
1637 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1638 {
1639         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1640                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1641                         atomic_long_inc(&dev->rx_dropped);
1642                         kfree_skb(skb);
1643                         return NET_RX_DROP;
1644                 }
1645         }
1646
1647         skb_orphan(skb);
1648
1649         if (unlikely(!is_skb_forwardable(dev, skb))) {
1650                 atomic_long_inc(&dev->rx_dropped);
1651                 kfree_skb(skb);
1652                 return NET_RX_DROP;
1653         }
1654         skb->skb_iif = 0;
1655         skb_dst_drop(skb);
1656         skb->tstamp.tv64 = 0;
1657         skb->pkt_type = PACKET_HOST;
1658         skb->protocol = eth_type_trans(skb, dev);
1659         skb->mark = 0;
1660         secpath_reset(skb);
1661         nf_reset(skb);
1662         nf_reset_trace(skb);
1663         return netif_rx(skb);
1664 }
1665 EXPORT_SYMBOL_GPL(dev_forward_skb);
1666
1667 static inline int deliver_skb(struct sk_buff *skb,
1668                               struct packet_type *pt_prev,
1669                               struct net_device *orig_dev)
1670 {
1671         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1672                 return -ENOMEM;
1673         atomic_inc(&skb->users);
1674         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1675 }
1676
1677 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1678 {
1679         if (!ptype->af_packet_priv || !skb->sk)
1680                 return false;
1681
1682         if (ptype->id_match)
1683                 return ptype->id_match(ptype, skb->sk);
1684         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1685                 return true;
1686
1687         return false;
1688 }
1689
1690 /*
1691  *      Support routine. Sends outgoing frames to any network
1692  *      taps currently in use.
1693  */
1694
1695 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1696 {
1697         struct packet_type *ptype;
1698         struct sk_buff *skb2 = NULL;
1699         struct packet_type *pt_prev = NULL;
1700
1701         rcu_read_lock();
1702         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1703                 /* Never send packets back to the socket
1704                  * they originated from - MvS (miquels@drinkel.ow.org)
1705                  */
1706                 if ((ptype->dev == dev || !ptype->dev) &&
1707                     (!skb_loop_sk(ptype, skb))) {
1708                         if (pt_prev) {
1709                                 deliver_skb(skb2, pt_prev, skb->dev);
1710                                 pt_prev = ptype;
1711                                 continue;
1712                         }
1713
1714                         skb2 = skb_clone(skb, GFP_ATOMIC);
1715                         if (!skb2)
1716                                 break;
1717
1718                         net_timestamp_set(skb2);
1719
1720                         /* skb->nh should be correctly
1721                            set by sender, so that the second statement is
1722                            just protection against buggy protocols.
1723                          */
1724                         skb_reset_mac_header(skb2);
1725
1726                         if (skb_network_header(skb2) < skb2->data ||
1727                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1728                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1729                                                      ntohs(skb2->protocol),
1730                                                      dev->name);
1731                                 skb_reset_network_header(skb2);
1732                         }
1733
1734                         skb2->transport_header = skb2->network_header;
1735                         skb2->pkt_type = PACKET_OUTGOING;
1736                         pt_prev = ptype;
1737                 }
1738         }
1739         if (pt_prev)
1740                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1741         rcu_read_unlock();
1742 }
1743
1744 /**
1745  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1746  * @dev: Network device
1747  * @txq: number of queues available
1748  *
1749  * If real_num_tx_queues is changed the tc mappings may no longer be
1750  * valid. To resolve this verify the tc mapping remains valid and if
1751  * not NULL the mapping. With no priorities mapping to this
1752  * offset/count pair it will no longer be used. In the worst case TC0
1753  * is invalid nothing can be done so disable priority mappings. If is
1754  * expected that drivers will fix this mapping if they can before
1755  * calling netif_set_real_num_tx_queues.
1756  */
1757 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1758 {
1759         int i;
1760         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1761
1762         /* If TC0 is invalidated disable TC mapping */
1763         if (tc->offset + tc->count > txq) {
1764                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1765                 dev->num_tc = 0;
1766                 return;
1767         }
1768
1769         /* Invalidated prio to tc mappings set to TC0 */
1770         for (i = 1; i < TC_BITMASK + 1; i++) {
1771                 int q = netdev_get_prio_tc_map(dev, i);
1772
1773                 tc = &dev->tc_to_txq[q];
1774                 if (tc->offset + tc->count > txq) {
1775                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1776                                 i, q);
1777                         netdev_set_prio_tc_map(dev, i, 0);
1778                 }
1779         }
1780 }
1781
1782 #ifdef CONFIG_XPS
1783 static DEFINE_MUTEX(xps_map_mutex);
1784 #define xmap_dereference(P)             \
1785         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1786
1787 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1788                                         int cpu, u16 index)
1789 {
1790         struct xps_map *map = NULL;
1791         int pos;
1792
1793         if (dev_maps)
1794                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1795
1796         for (pos = 0; map && pos < map->len; pos++) {
1797                 if (map->queues[pos] == index) {
1798                         if (map->len > 1) {
1799                                 map->queues[pos] = map->queues[--map->len];
1800                         } else {
1801                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1802                                 kfree_rcu(map, rcu);
1803                                 map = NULL;
1804                         }
1805                         break;
1806                 }
1807         }
1808
1809         return map;
1810 }
1811
1812 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1813 {
1814         struct xps_dev_maps *dev_maps;
1815         int cpu, i;
1816         bool active = false;
1817
1818         mutex_lock(&xps_map_mutex);
1819         dev_maps = xmap_dereference(dev->xps_maps);
1820
1821         if (!dev_maps)
1822                 goto out_no_maps;
1823
1824         for_each_possible_cpu(cpu) {
1825                 for (i = index; i < dev->num_tx_queues; i++) {
1826                         if (!remove_xps_queue(dev_maps, cpu, i))
1827                                 break;
1828                 }
1829                 if (i == dev->num_tx_queues)
1830                         active = true;
1831         }
1832
1833         if (!active) {
1834                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1835                 kfree_rcu(dev_maps, rcu);
1836         }
1837
1838         for (i = index; i < dev->num_tx_queues; i++)
1839                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1840                                              NUMA_NO_NODE);
1841
1842 out_no_maps:
1843         mutex_unlock(&xps_map_mutex);
1844 }
1845
1846 static struct xps_map *expand_xps_map(struct xps_map *map,
1847                                       int cpu, u16 index)
1848 {
1849         struct xps_map *new_map;
1850         int alloc_len = XPS_MIN_MAP_ALLOC;
1851         int i, pos;
1852
1853         for (pos = 0; map && pos < map->len; pos++) {
1854                 if (map->queues[pos] != index)
1855                         continue;
1856                 return map;
1857         }
1858
1859         /* Need to add queue to this CPU's existing map */
1860         if (map) {
1861                 if (pos < map->alloc_len)
1862                         return map;
1863
1864                 alloc_len = map->alloc_len * 2;
1865         }
1866
1867         /* Need to allocate new map to store queue on this CPU's map */
1868         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1869                                cpu_to_node(cpu));
1870         if (!new_map)
1871                 return NULL;
1872
1873         for (i = 0; i < pos; i++)
1874                 new_map->queues[i] = map->queues[i];
1875         new_map->alloc_len = alloc_len;
1876         new_map->len = pos;
1877
1878         return new_map;
1879 }
1880
1881 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1882 {
1883         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1884         struct xps_map *map, *new_map;
1885         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1886         int cpu, numa_node_id = -2;
1887         bool active = false;
1888
1889         mutex_lock(&xps_map_mutex);
1890
1891         dev_maps = xmap_dereference(dev->xps_maps);
1892
1893         /* allocate memory for queue storage */
1894         for_each_online_cpu(cpu) {
1895                 if (!cpumask_test_cpu(cpu, mask))
1896                         continue;
1897
1898                 if (!new_dev_maps)
1899                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1900                 if (!new_dev_maps) {
1901                         mutex_unlock(&xps_map_mutex);
1902                         return -ENOMEM;
1903                 }
1904
1905                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1906                                  NULL;
1907
1908                 map = expand_xps_map(map, cpu, index);
1909                 if (!map)
1910                         goto error;
1911
1912                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1913         }
1914
1915         if (!new_dev_maps)
1916                 goto out_no_new_maps;
1917
1918         for_each_possible_cpu(cpu) {
1919                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1920                         /* add queue to CPU maps */
1921                         int pos = 0;
1922
1923                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1924                         while ((pos < map->len) && (map->queues[pos] != index))
1925                                 pos++;
1926
1927                         if (pos == map->len)
1928                                 map->queues[map->len++] = index;
1929 #ifdef CONFIG_NUMA
1930                         if (numa_node_id == -2)
1931                                 numa_node_id = cpu_to_node(cpu);
1932                         else if (numa_node_id != cpu_to_node(cpu))
1933                                 numa_node_id = -1;
1934 #endif
1935                 } else if (dev_maps) {
1936                         /* fill in the new device map from the old device map */
1937                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1938                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1939                 }
1940
1941         }
1942
1943         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1944
1945         /* Cleanup old maps */
1946         if (dev_maps) {
1947                 for_each_possible_cpu(cpu) {
1948                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1949                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1950                         if (map && map != new_map)
1951                                 kfree_rcu(map, rcu);
1952                 }
1953
1954                 kfree_rcu(dev_maps, rcu);
1955         }
1956
1957         dev_maps = new_dev_maps;
1958         active = true;
1959
1960 out_no_new_maps:
1961         /* update Tx queue numa node */
1962         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1963                                      (numa_node_id >= 0) ? numa_node_id :
1964                                      NUMA_NO_NODE);
1965
1966         if (!dev_maps)
1967                 goto out_no_maps;
1968
1969         /* removes queue from unused CPUs */
1970         for_each_possible_cpu(cpu) {
1971                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1972                         continue;
1973
1974                 if (remove_xps_queue(dev_maps, cpu, index))
1975                         active = true;
1976         }
1977
1978         /* free map if not active */
1979         if (!active) {
1980                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1981                 kfree_rcu(dev_maps, rcu);
1982         }
1983
1984 out_no_maps:
1985         mutex_unlock(&xps_map_mutex);
1986
1987         return 0;
1988 error:
1989         /* remove any maps that we added */
1990         for_each_possible_cpu(cpu) {
1991                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1992                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1993                                  NULL;
1994                 if (new_map && new_map != map)
1995                         kfree(new_map);
1996         }
1997
1998         mutex_unlock(&xps_map_mutex);
1999
2000         kfree(new_dev_maps);
2001         return -ENOMEM;
2002 }
2003 EXPORT_SYMBOL(netif_set_xps_queue);
2004
2005 #endif
2006 /*
2007  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2008  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2009  */
2010 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2011 {
2012         int rc;
2013
2014         if (txq < 1 || txq > dev->num_tx_queues)
2015                 return -EINVAL;
2016
2017         if (dev->reg_state == NETREG_REGISTERED ||
2018             dev->reg_state == NETREG_UNREGISTERING) {
2019                 ASSERT_RTNL();
2020
2021                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2022                                                   txq);
2023                 if (rc)
2024                         return rc;
2025
2026                 if (dev->num_tc)
2027                         netif_setup_tc(dev, txq);
2028
2029                 if (txq < dev->real_num_tx_queues) {
2030                         qdisc_reset_all_tx_gt(dev, txq);
2031 #ifdef CONFIG_XPS
2032                         netif_reset_xps_queues_gt(dev, txq);
2033 #endif
2034                 }
2035         }
2036
2037         dev->real_num_tx_queues = txq;
2038         return 0;
2039 }
2040 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2041
2042 #ifdef CONFIG_RPS
2043 /**
2044  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2045  *      @dev: Network device
2046  *      @rxq: Actual number of RX queues
2047  *
2048  *      This must be called either with the rtnl_lock held or before
2049  *      registration of the net device.  Returns 0 on success, or a
2050  *      negative error code.  If called before registration, it always
2051  *      succeeds.
2052  */
2053 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2054 {
2055         int rc;
2056
2057         if (rxq < 1 || rxq > dev->num_rx_queues)
2058                 return -EINVAL;
2059
2060         if (dev->reg_state == NETREG_REGISTERED) {
2061                 ASSERT_RTNL();
2062
2063                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2064                                                   rxq);
2065                 if (rc)
2066                         return rc;
2067         }
2068
2069         dev->real_num_rx_queues = rxq;
2070         return 0;
2071 }
2072 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2073 #endif
2074
2075 /**
2076  * netif_get_num_default_rss_queues - default number of RSS queues
2077  *
2078  * This routine should set an upper limit on the number of RSS queues
2079  * used by default by multiqueue devices.
2080  */
2081 int netif_get_num_default_rss_queues(void)
2082 {
2083         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2084 }
2085 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2086
2087 static inline void __netif_reschedule(struct Qdisc *q)
2088 {
2089         struct softnet_data *sd;
2090         unsigned long flags;
2091
2092         local_irq_save(flags);
2093         sd = &__get_cpu_var(softnet_data);
2094         q->next_sched = NULL;
2095         *sd->output_queue_tailp = q;
2096         sd->output_queue_tailp = &q->next_sched;
2097         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2098         local_irq_restore(flags);
2099 }
2100
2101 void __netif_schedule(struct Qdisc *q)
2102 {
2103         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2104                 __netif_reschedule(q);
2105 }
2106 EXPORT_SYMBOL(__netif_schedule);
2107
2108 void dev_kfree_skb_irq(struct sk_buff *skb)
2109 {
2110         if (atomic_dec_and_test(&skb->users)) {
2111                 struct softnet_data *sd;
2112                 unsigned long flags;
2113
2114                 local_irq_save(flags);
2115                 sd = &__get_cpu_var(softnet_data);
2116                 skb->next = sd->completion_queue;
2117                 sd->completion_queue = skb;
2118                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2119                 local_irq_restore(flags);
2120         }
2121 }
2122 EXPORT_SYMBOL(dev_kfree_skb_irq);
2123
2124 void dev_kfree_skb_any(struct sk_buff *skb)
2125 {
2126         if (in_irq() || irqs_disabled())
2127                 dev_kfree_skb_irq(skb);
2128         else
2129                 dev_kfree_skb(skb);
2130 }
2131 EXPORT_SYMBOL(dev_kfree_skb_any);
2132
2133
2134 /**
2135  * netif_device_detach - mark device as removed
2136  * @dev: network device
2137  *
2138  * Mark device as removed from system and therefore no longer available.
2139  */
2140 void netif_device_detach(struct net_device *dev)
2141 {
2142         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2143             netif_running(dev)) {
2144                 netif_tx_stop_all_queues(dev);
2145         }
2146 }
2147 EXPORT_SYMBOL(netif_device_detach);
2148
2149 /**
2150  * netif_device_attach - mark device as attached
2151  * @dev: network device
2152  *
2153  * Mark device as attached from system and restart if needed.
2154  */
2155 void netif_device_attach(struct net_device *dev)
2156 {
2157         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2158             netif_running(dev)) {
2159                 netif_tx_wake_all_queues(dev);
2160                 __netdev_watchdog_up(dev);
2161         }
2162 }
2163 EXPORT_SYMBOL(netif_device_attach);
2164
2165 static void skb_warn_bad_offload(const struct sk_buff *skb)
2166 {
2167         static const netdev_features_t null_features = 0;
2168         struct net_device *dev = skb->dev;
2169         const char *driver = "";
2170
2171         if (!net_ratelimit())
2172                 return;
2173
2174         if (dev && dev->dev.parent)
2175                 driver = dev_driver_string(dev->dev.parent);
2176
2177         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2178              "gso_type=%d ip_summed=%d\n",
2179              driver, dev ? &dev->features : &null_features,
2180              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2181              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2182              skb_shinfo(skb)->gso_type, skb->ip_summed);
2183 }
2184
2185 /*
2186  * Invalidate hardware checksum when packet is to be mangled, and
2187  * complete checksum manually on outgoing path.
2188  */
2189 int skb_checksum_help(struct sk_buff *skb)
2190 {
2191         __wsum csum;
2192         int ret = 0, offset;
2193
2194         if (skb->ip_summed == CHECKSUM_COMPLETE)
2195                 goto out_set_summed;
2196
2197         if (unlikely(skb_shinfo(skb)->gso_size)) {
2198                 skb_warn_bad_offload(skb);
2199                 return -EINVAL;
2200         }
2201
2202         /* Before computing a checksum, we should make sure no frag could
2203          * be modified by an external entity : checksum could be wrong.
2204          */
2205         if (skb_has_shared_frag(skb)) {
2206                 ret = __skb_linearize(skb);
2207                 if (ret)
2208                         goto out;
2209         }
2210
2211         offset = skb_checksum_start_offset(skb);
2212         BUG_ON(offset >= skb_headlen(skb));
2213         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2214
2215         offset += skb->csum_offset;
2216         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2217
2218         if (skb_cloned(skb) &&
2219             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2220                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2221                 if (ret)
2222                         goto out;
2223         }
2224
2225         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2226 out_set_summed:
2227         skb->ip_summed = CHECKSUM_NONE;
2228 out:
2229         return ret;
2230 }
2231 EXPORT_SYMBOL(skb_checksum_help);
2232
2233 __be16 skb_network_protocol(struct sk_buff *skb)
2234 {
2235         __be16 type = skb->protocol;
2236         int vlan_depth = ETH_HLEN;
2237
2238         /* Tunnel gso handlers can set protocol to ethernet. */
2239         if (type == htons(ETH_P_TEB)) {
2240                 struct ethhdr *eth;
2241
2242                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2243                         return 0;
2244
2245                 eth = (struct ethhdr *)skb_mac_header(skb);
2246                 type = eth->h_proto;
2247         }
2248
2249         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2250                 struct vlan_hdr *vh;
2251
2252                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2253                         return 0;
2254
2255                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2256                 type = vh->h_vlan_encapsulated_proto;
2257                 vlan_depth += VLAN_HLEN;
2258         }
2259
2260         return type;
2261 }
2262
2263 /**
2264  *      skb_mac_gso_segment - mac layer segmentation handler.
2265  *      @skb: buffer to segment
2266  *      @features: features for the output path (see dev->features)
2267  */
2268 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2269                                     netdev_features_t features)
2270 {
2271         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2272         struct packet_offload *ptype;
2273         __be16 type = skb_network_protocol(skb);
2274
2275         if (unlikely(!type))
2276                 return ERR_PTR(-EINVAL);
2277
2278         __skb_pull(skb, skb->mac_len);
2279
2280         rcu_read_lock();
2281         list_for_each_entry_rcu(ptype, &offload_base, list) {
2282                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2283                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2284                                 int err;
2285
2286                                 err = ptype->callbacks.gso_send_check(skb);
2287                                 segs = ERR_PTR(err);
2288                                 if (err || skb_gso_ok(skb, features))
2289                                         break;
2290                                 __skb_push(skb, (skb->data -
2291                                                  skb_network_header(skb)));
2292                         }
2293                         segs = ptype->callbacks.gso_segment(skb, features);
2294                         break;
2295                 }
2296         }
2297         rcu_read_unlock();
2298
2299         __skb_push(skb, skb->data - skb_mac_header(skb));
2300
2301         return segs;
2302 }
2303 EXPORT_SYMBOL(skb_mac_gso_segment);
2304
2305
2306 /* openvswitch calls this on rx path, so we need a different check.
2307  */
2308 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2309 {
2310         if (tx_path)
2311                 return skb->ip_summed != CHECKSUM_PARTIAL;
2312         else
2313                 return skb->ip_summed == CHECKSUM_NONE;
2314 }
2315
2316 /**
2317  *      __skb_gso_segment - Perform segmentation on skb.
2318  *      @skb: buffer to segment
2319  *      @features: features for the output path (see dev->features)
2320  *      @tx_path: whether it is called in TX path
2321  *
2322  *      This function segments the given skb and returns a list of segments.
2323  *
2324  *      It may return NULL if the skb requires no segmentation.  This is
2325  *      only possible when GSO is used for verifying header integrity.
2326  */
2327 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2328                                   netdev_features_t features, bool tx_path)
2329 {
2330         if (unlikely(skb_needs_check(skb, tx_path))) {
2331                 int err;
2332
2333                 skb_warn_bad_offload(skb);
2334
2335                 if (skb_header_cloned(skb) &&
2336                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2337                         return ERR_PTR(err);
2338         }
2339
2340         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2341         skb_reset_mac_header(skb);
2342         skb_reset_mac_len(skb);
2343
2344         return skb_mac_gso_segment(skb, features);
2345 }
2346 EXPORT_SYMBOL(__skb_gso_segment);
2347
2348 /* Take action when hardware reception checksum errors are detected. */
2349 #ifdef CONFIG_BUG
2350 void netdev_rx_csum_fault(struct net_device *dev)
2351 {
2352         if (net_ratelimit()) {
2353                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2354                 dump_stack();
2355         }
2356 }
2357 EXPORT_SYMBOL(netdev_rx_csum_fault);
2358 #endif
2359
2360 /* Actually, we should eliminate this check as soon as we know, that:
2361  * 1. IOMMU is present and allows to map all the memory.
2362  * 2. No high memory really exists on this machine.
2363  */
2364
2365 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2366 {
2367 #ifdef CONFIG_HIGHMEM
2368         int i;
2369         if (!(dev->features & NETIF_F_HIGHDMA)) {
2370                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2371                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2372                         if (PageHighMem(skb_frag_page(frag)))
2373                                 return 1;
2374                 }
2375         }
2376
2377         if (PCI_DMA_BUS_IS_PHYS) {
2378                 struct device *pdev = dev->dev.parent;
2379
2380                 if (!pdev)
2381                         return 0;
2382                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2383                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2384                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2385                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2386                                 return 1;
2387                 }
2388         }
2389 #endif
2390         return 0;
2391 }
2392
2393 struct dev_gso_cb {
2394         void (*destructor)(struct sk_buff *skb);
2395 };
2396
2397 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2398
2399 static void dev_gso_skb_destructor(struct sk_buff *skb)
2400 {
2401         struct dev_gso_cb *cb;
2402
2403         do {
2404                 struct sk_buff *nskb = skb->next;
2405
2406                 skb->next = nskb->next;
2407                 nskb->next = NULL;
2408                 kfree_skb(nskb);
2409         } while (skb->next);
2410
2411         cb = DEV_GSO_CB(skb);
2412         if (cb->destructor)
2413                 cb->destructor(skb);
2414 }
2415
2416 /**
2417  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2418  *      @skb: buffer to segment
2419  *      @features: device features as applicable to this skb
2420  *
2421  *      This function segments the given skb and stores the list of segments
2422  *      in skb->next.
2423  */
2424 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2425 {
2426         struct sk_buff *segs;
2427
2428         segs = skb_gso_segment(skb, features);
2429
2430         /* Verifying header integrity only. */
2431         if (!segs)
2432                 return 0;
2433
2434         if (IS_ERR(segs))
2435                 return PTR_ERR(segs);
2436
2437         skb->next = segs;
2438         DEV_GSO_CB(skb)->destructor = skb->destructor;
2439         skb->destructor = dev_gso_skb_destructor;
2440
2441         return 0;
2442 }
2443
2444 static netdev_features_t harmonize_features(struct sk_buff *skb,
2445         __be16 protocol, netdev_features_t features)
2446 {
2447         if (skb->ip_summed != CHECKSUM_NONE &&
2448             !can_checksum_protocol(features, protocol)) {
2449                 features &= ~NETIF_F_ALL_CSUM;
2450         } else if (illegal_highdma(skb->dev, skb)) {
2451                 features &= ~NETIF_F_SG;
2452         }
2453
2454         return features;
2455 }
2456
2457 netdev_features_t netif_skb_features(struct sk_buff *skb)
2458 {
2459         __be16 protocol = skb->protocol;
2460         netdev_features_t features = skb->dev->features;
2461
2462         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2463                 features &= ~NETIF_F_GSO_MASK;
2464
2465         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2466                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2467                 protocol = veh->h_vlan_encapsulated_proto;
2468         } else if (!vlan_tx_tag_present(skb)) {
2469                 return harmonize_features(skb, protocol, features);
2470         }
2471
2472         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2473                                                NETIF_F_HW_VLAN_STAG_TX);
2474
2475         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2476                 return harmonize_features(skb, protocol, features);
2477         } else {
2478                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2479                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2480                                 NETIF_F_HW_VLAN_STAG_TX;
2481                 return harmonize_features(skb, protocol, features);
2482         }
2483 }
2484 EXPORT_SYMBOL(netif_skb_features);
2485
2486 /*
2487  * Returns true if either:
2488  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2489  *      2. skb is fragmented and the device does not support SG.
2490  */
2491 static inline int skb_needs_linearize(struct sk_buff *skb,
2492                                       netdev_features_t features)
2493 {
2494         return skb_is_nonlinear(skb) &&
2495                         ((skb_has_frag_list(skb) &&
2496                                 !(features & NETIF_F_FRAGLIST)) ||
2497                         (skb_shinfo(skb)->nr_frags &&
2498                                 !(features & NETIF_F_SG)));
2499 }
2500
2501 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2502                         struct netdev_queue *txq)
2503 {
2504         const struct net_device_ops *ops = dev->netdev_ops;
2505         int rc = NETDEV_TX_OK;
2506         unsigned int skb_len;
2507
2508         if (likely(!skb->next)) {
2509                 netdev_features_t features;
2510
2511                 /*
2512                  * If device doesn't need skb->dst, release it right now while
2513                  * its hot in this cpu cache
2514                  */
2515                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2516                         skb_dst_drop(skb);
2517
2518                 features = netif_skb_features(skb);
2519
2520                 if (vlan_tx_tag_present(skb) &&
2521                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2522                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2523                                              vlan_tx_tag_get(skb));
2524                         if (unlikely(!skb))
2525                                 goto out;
2526
2527                         skb->vlan_tci = 0;
2528                 }
2529
2530                 /* If encapsulation offload request, verify we are testing
2531                  * hardware encapsulation features instead of standard
2532                  * features for the netdev
2533                  */
2534                 if (skb->encapsulation)
2535                         features &= dev->hw_enc_features;
2536
2537                 if (netif_needs_gso(skb, features)) {
2538                         if (unlikely(dev_gso_segment(skb, features)))
2539                                 goto out_kfree_skb;
2540                         if (skb->next)
2541                                 goto gso;
2542                 } else {
2543                         if (skb_needs_linearize(skb, features) &&
2544                             __skb_linearize(skb))
2545                                 goto out_kfree_skb;
2546
2547                         /* If packet is not checksummed and device does not
2548                          * support checksumming for this protocol, complete
2549                          * checksumming here.
2550                          */
2551                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2552                                 if (skb->encapsulation)
2553                                         skb_set_inner_transport_header(skb,
2554                                                 skb_checksum_start_offset(skb));
2555                                 else
2556                                         skb_set_transport_header(skb,
2557                                                 skb_checksum_start_offset(skb));
2558                                 if (!(features & NETIF_F_ALL_CSUM) &&
2559                                      skb_checksum_help(skb))
2560                                         goto out_kfree_skb;
2561                         }
2562                 }
2563
2564                 if (!list_empty(&ptype_all))
2565                         dev_queue_xmit_nit(skb, dev);
2566
2567                 skb_len = skb->len;
2568                 rc = ops->ndo_start_xmit(skb, dev);
2569                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2570                 if (rc == NETDEV_TX_OK)
2571                         txq_trans_update(txq);
2572                 return rc;
2573         }
2574
2575 gso:
2576         do {
2577                 struct sk_buff *nskb = skb->next;
2578
2579                 skb->next = nskb->next;
2580                 nskb->next = NULL;
2581
2582                 if (!list_empty(&ptype_all))
2583                         dev_queue_xmit_nit(nskb, dev);
2584
2585                 skb_len = nskb->len;
2586                 rc = ops->ndo_start_xmit(nskb, dev);
2587                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2588                 if (unlikely(rc != NETDEV_TX_OK)) {
2589                         if (rc & ~NETDEV_TX_MASK)
2590                                 goto out_kfree_gso_skb;
2591                         nskb->next = skb->next;
2592                         skb->next = nskb;
2593                         return rc;
2594                 }
2595                 txq_trans_update(txq);
2596                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2597                         return NETDEV_TX_BUSY;
2598         } while (skb->next);
2599
2600 out_kfree_gso_skb:
2601         if (likely(skb->next == NULL)) {
2602                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2603                 consume_skb(skb);
2604                 return rc;
2605         }
2606 out_kfree_skb:
2607         kfree_skb(skb);
2608 out:
2609         return rc;
2610 }
2611
2612 static void qdisc_pkt_len_init(struct sk_buff *skb)
2613 {
2614         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2615
2616         qdisc_skb_cb(skb)->pkt_len = skb->len;
2617
2618         /* To get more precise estimation of bytes sent on wire,
2619          * we add to pkt_len the headers size of all segments
2620          */
2621         if (shinfo->gso_size)  {
2622                 unsigned int hdr_len;
2623                 u16 gso_segs = shinfo->gso_segs;
2624
2625                 /* mac layer + network layer */
2626                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2627
2628                 /* + transport layer */
2629                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2630                         hdr_len += tcp_hdrlen(skb);
2631                 else
2632                         hdr_len += sizeof(struct udphdr);
2633
2634                 if (shinfo->gso_type & SKB_GSO_DODGY)
2635                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2636                                                 shinfo->gso_size);
2637
2638                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2639         }
2640 }
2641
2642 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643                                  struct net_device *dev,
2644                                  struct netdev_queue *txq)
2645 {
2646         spinlock_t *root_lock = qdisc_lock(q);
2647         bool contended;
2648         int rc;
2649
2650         qdisc_pkt_len_init(skb);
2651         qdisc_calculate_pkt_len(skb, q);
2652         /*
2653          * Heuristic to force contended enqueues to serialize on a
2654          * separate lock before trying to get qdisc main lock.
2655          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2656          * and dequeue packets faster.
2657          */
2658         contended = qdisc_is_running(q);
2659         if (unlikely(contended))
2660                 spin_lock(&q->busylock);
2661
2662         spin_lock(root_lock);
2663         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2664                 kfree_skb(skb);
2665                 rc = NET_XMIT_DROP;
2666         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2667                    qdisc_run_begin(q)) {
2668                 /*
2669                  * This is a work-conserving queue; there are no old skbs
2670                  * waiting to be sent out; and the qdisc is not running -
2671                  * xmit the skb directly.
2672                  */
2673                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2674                         skb_dst_force(skb);
2675
2676                 qdisc_bstats_update(q, skb);
2677
2678                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679                         if (unlikely(contended)) {
2680                                 spin_unlock(&q->busylock);
2681                                 contended = false;
2682                         }
2683                         __qdisc_run(q);
2684                 } else
2685                         qdisc_run_end(q);
2686
2687                 rc = NET_XMIT_SUCCESS;
2688         } else {
2689                 skb_dst_force(skb);
2690                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2691                 if (qdisc_run_begin(q)) {
2692                         if (unlikely(contended)) {
2693                                 spin_unlock(&q->busylock);
2694                                 contended = false;
2695                         }
2696                         __qdisc_run(q);
2697                 }
2698         }
2699         spin_unlock(root_lock);
2700         if (unlikely(contended))
2701                 spin_unlock(&q->busylock);
2702         return rc;
2703 }
2704
2705 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706 static void skb_update_prio(struct sk_buff *skb)
2707 {
2708         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2709
2710         if (!skb->priority && skb->sk && map) {
2711                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2712
2713                 if (prioidx < map->priomap_len)
2714                         skb->priority = map->priomap[prioidx];
2715         }
2716 }
2717 #else
2718 #define skb_update_prio(skb)
2719 #endif
2720
2721 static DEFINE_PER_CPU(int, xmit_recursion);
2722 #define RECURSION_LIMIT 10
2723
2724 /**
2725  *      dev_loopback_xmit - loop back @skb
2726  *      @skb: buffer to transmit
2727  */
2728 int dev_loopback_xmit(struct sk_buff *skb)
2729 {
2730         skb_reset_mac_header(skb);
2731         __skb_pull(skb, skb_network_offset(skb));
2732         skb->pkt_type = PACKET_LOOPBACK;
2733         skb->ip_summed = CHECKSUM_UNNECESSARY;
2734         WARN_ON(!skb_dst(skb));
2735         skb_dst_force(skb);
2736         netif_rx_ni(skb);
2737         return 0;
2738 }
2739 EXPORT_SYMBOL(dev_loopback_xmit);
2740
2741 /**
2742  *      dev_queue_xmit - transmit a buffer
2743  *      @skb: buffer to transmit
2744  *
2745  *      Queue a buffer for transmission to a network device. The caller must
2746  *      have set the device and priority and built the buffer before calling
2747  *      this function. The function can be called from an interrupt.
2748  *
2749  *      A negative errno code is returned on a failure. A success does not
2750  *      guarantee the frame will be transmitted as it may be dropped due
2751  *      to congestion or traffic shaping.
2752  *
2753  * -----------------------------------------------------------------------------------
2754  *      I notice this method can also return errors from the queue disciplines,
2755  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2756  *      be positive.
2757  *
2758  *      Regardless of the return value, the skb is consumed, so it is currently
2759  *      difficult to retry a send to this method.  (You can bump the ref count
2760  *      before sending to hold a reference for retry if you are careful.)
2761  *
2762  *      When calling this method, interrupts MUST be enabled.  This is because
2763  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2764  *          --BLG
2765  */
2766 int dev_queue_xmit(struct sk_buff *skb)
2767 {
2768         struct net_device *dev = skb->dev;
2769         struct netdev_queue *txq;
2770         struct Qdisc *q;
2771         int rc = -ENOMEM;
2772
2773         skb_reset_mac_header(skb);
2774
2775         /* Disable soft irqs for various locks below. Also
2776          * stops preemption for RCU.
2777          */
2778         rcu_read_lock_bh();
2779
2780         skb_update_prio(skb);
2781
2782         txq = netdev_pick_tx(dev, skb);
2783         q = rcu_dereference_bh(txq->qdisc);
2784
2785 #ifdef CONFIG_NET_CLS_ACT
2786         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2787 #endif
2788         trace_net_dev_queue(skb);
2789         if (q->enqueue) {
2790                 rc = __dev_xmit_skb(skb, q, dev, txq);
2791                 goto out;
2792         }
2793
2794         /* The device has no queue. Common case for software devices:
2795            loopback, all the sorts of tunnels...
2796
2797            Really, it is unlikely that netif_tx_lock protection is necessary
2798            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2799            counters.)
2800            However, it is possible, that they rely on protection
2801            made by us here.
2802
2803            Check this and shot the lock. It is not prone from deadlocks.
2804            Either shot noqueue qdisc, it is even simpler 8)
2805          */
2806         if (dev->flags & IFF_UP) {
2807                 int cpu = smp_processor_id(); /* ok because BHs are off */
2808
2809                 if (txq->xmit_lock_owner != cpu) {
2810
2811                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812                                 goto recursion_alert;
2813
2814                         HARD_TX_LOCK(dev, txq, cpu);
2815
2816                         if (!netif_xmit_stopped(txq)) {
2817                                 __this_cpu_inc(xmit_recursion);
2818                                 rc = dev_hard_start_xmit(skb, dev, txq);
2819                                 __this_cpu_dec(xmit_recursion);
2820                                 if (dev_xmit_complete(rc)) {
2821                                         HARD_TX_UNLOCK(dev, txq);
2822                                         goto out;
2823                                 }
2824                         }
2825                         HARD_TX_UNLOCK(dev, txq);
2826                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2827                                              dev->name);
2828                 } else {
2829                         /* Recursion is detected! It is possible,
2830                          * unfortunately
2831                          */
2832 recursion_alert:
2833                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2834                                              dev->name);
2835                 }
2836         }
2837
2838         rc = -ENETDOWN;
2839         rcu_read_unlock_bh();
2840
2841         kfree_skb(skb);
2842         return rc;
2843 out:
2844         rcu_read_unlock_bh();
2845         return rc;
2846 }
2847 EXPORT_SYMBOL(dev_queue_xmit);
2848
2849
2850 /*=======================================================================
2851                         Receiver routines
2852   =======================================================================*/
2853
2854 int netdev_max_backlog __read_mostly = 1000;
2855 EXPORT_SYMBOL(netdev_max_backlog);
2856
2857 int netdev_tstamp_prequeue __read_mostly = 1;
2858 int netdev_budget __read_mostly = 300;
2859 int weight_p __read_mostly = 64;            /* old backlog weight */
2860
2861 /* Called with irq disabled */
2862 static inline void ____napi_schedule(struct softnet_data *sd,
2863                                      struct napi_struct *napi)
2864 {
2865         list_add_tail(&napi->poll_list, &sd->poll_list);
2866         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2867 }
2868
2869 #ifdef CONFIG_RPS
2870
2871 /* One global table that all flow-based protocols share. */
2872 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2873 EXPORT_SYMBOL(rps_sock_flow_table);
2874
2875 struct static_key rps_needed __read_mostly;
2876
2877 static struct rps_dev_flow *
2878 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879             struct rps_dev_flow *rflow, u16 next_cpu)
2880 {
2881         if (next_cpu != RPS_NO_CPU) {
2882 #ifdef CONFIG_RFS_ACCEL
2883                 struct netdev_rx_queue *rxqueue;
2884                 struct rps_dev_flow_table *flow_table;
2885                 struct rps_dev_flow *old_rflow;
2886                 u32 flow_id;
2887                 u16 rxq_index;
2888                 int rc;
2889
2890                 /* Should we steer this flow to a different hardware queue? */
2891                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892                     !(dev->features & NETIF_F_NTUPLE))
2893                         goto out;
2894                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895                 if (rxq_index == skb_get_rx_queue(skb))
2896                         goto out;
2897
2898                 rxqueue = dev->_rx + rxq_index;
2899                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2900                 if (!flow_table)
2901                         goto out;
2902                 flow_id = skb->rxhash & flow_table->mask;
2903                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904                                                         rxq_index, flow_id);
2905                 if (rc < 0)
2906                         goto out;
2907                 old_rflow = rflow;
2908                 rflow = &flow_table->flows[flow_id];
2909                 rflow->filter = rc;
2910                 if (old_rflow->filter == rflow->filter)
2911                         old_rflow->filter = RPS_NO_FILTER;
2912         out:
2913 #endif
2914                 rflow->last_qtail =
2915                         per_cpu(softnet_data, next_cpu).input_queue_head;
2916         }
2917
2918         rflow->cpu = next_cpu;
2919         return rflow;
2920 }
2921
2922 /*
2923  * get_rps_cpu is called from netif_receive_skb and returns the target
2924  * CPU from the RPS map of the receiving queue for a given skb.
2925  * rcu_read_lock must be held on entry.
2926  */
2927 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928                        struct rps_dev_flow **rflowp)
2929 {
2930         struct netdev_rx_queue *rxqueue;
2931         struct rps_map *map;
2932         struct rps_dev_flow_table *flow_table;
2933         struct rps_sock_flow_table *sock_flow_table;
2934         int cpu = -1;
2935         u16 tcpu;
2936
2937         if (skb_rx_queue_recorded(skb)) {
2938                 u16 index = skb_get_rx_queue(skb);
2939                 if (unlikely(index >= dev->real_num_rx_queues)) {
2940                         WARN_ONCE(dev->real_num_rx_queues > 1,
2941                                   "%s received packet on queue %u, but number "
2942                                   "of RX queues is %u\n",
2943                                   dev->name, index, dev->real_num_rx_queues);
2944                         goto done;
2945                 }
2946                 rxqueue = dev->_rx + index;
2947         } else
2948                 rxqueue = dev->_rx;
2949
2950         map = rcu_dereference(rxqueue->rps_map);
2951         if (map) {
2952                 if (map->len == 1 &&
2953                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2954                         tcpu = map->cpus[0];
2955                         if (cpu_online(tcpu))
2956                                 cpu = tcpu;
2957                         goto done;
2958                 }
2959         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2960                 goto done;
2961         }
2962
2963         skb_reset_network_header(skb);
2964         if (!skb_get_rxhash(skb))
2965                 goto done;
2966
2967         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969         if (flow_table && sock_flow_table) {
2970                 u16 next_cpu;
2971                 struct rps_dev_flow *rflow;
2972
2973                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2974                 tcpu = rflow->cpu;
2975
2976                 next_cpu = sock_flow_table->ents[skb->rxhash &
2977                     sock_flow_table->mask];
2978
2979                 /*
2980                  * If the desired CPU (where last recvmsg was done) is
2981                  * different from current CPU (one in the rx-queue flow
2982                  * table entry), switch if one of the following holds:
2983                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2984                  *   - Current CPU is offline.
2985                  *   - The current CPU's queue tail has advanced beyond the
2986                  *     last packet that was enqueued using this table entry.
2987                  *     This guarantees that all previous packets for the flow
2988                  *     have been dequeued, thus preserving in order delivery.
2989                  */
2990                 if (unlikely(tcpu != next_cpu) &&
2991                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2993                       rflow->last_qtail)) >= 0)) {
2994                         tcpu = next_cpu;
2995                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2996                 }
2997
2998                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2999                         *rflowp = rflow;
3000                         cpu = tcpu;
3001                         goto done;
3002                 }
3003         }
3004
3005         if (map) {
3006                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3007
3008                 if (cpu_online(tcpu)) {
3009                         cpu = tcpu;
3010                         goto done;
3011                 }
3012         }
3013
3014 done:
3015         return cpu;
3016 }
3017
3018 #ifdef CONFIG_RFS_ACCEL
3019
3020 /**
3021  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022  * @dev: Device on which the filter was set
3023  * @rxq_index: RX queue index
3024  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3026  *
3027  * Drivers that implement ndo_rx_flow_steer() should periodically call
3028  * this function for each installed filter and remove the filters for
3029  * which it returns %true.
3030  */
3031 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032                          u32 flow_id, u16 filter_id)
3033 {
3034         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035         struct rps_dev_flow_table *flow_table;
3036         struct rps_dev_flow *rflow;
3037         bool expire = true;
3038         int cpu;
3039
3040         rcu_read_lock();
3041         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042         if (flow_table && flow_id <= flow_table->mask) {
3043                 rflow = &flow_table->flows[flow_id];
3044                 cpu = ACCESS_ONCE(rflow->cpu);
3045                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047                            rflow->last_qtail) <
3048                      (int)(10 * flow_table->mask)))
3049                         expire = false;
3050         }
3051         rcu_read_unlock();
3052         return expire;
3053 }
3054 EXPORT_SYMBOL(rps_may_expire_flow);
3055
3056 #endif /* CONFIG_RFS_ACCEL */
3057
3058 /* Called from hardirq (IPI) context */
3059 static void rps_trigger_softirq(void *data)
3060 {
3061         struct softnet_data *sd = data;
3062
3063         ____napi_schedule(sd, &sd->backlog);
3064         sd->received_rps++;
3065 }
3066
3067 #endif /* CONFIG_RPS */
3068
3069 /*
3070  * Check if this softnet_data structure is another cpu one
3071  * If yes, queue it to our IPI list and return 1
3072  * If no, return 0
3073  */
3074 static int rps_ipi_queued(struct softnet_data *sd)
3075 {
3076 #ifdef CONFIG_RPS
3077         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3078
3079         if (sd != mysd) {
3080                 sd->rps_ipi_next = mysd->rps_ipi_list;
3081                 mysd->rps_ipi_list = sd;
3082
3083                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3084                 return 1;
3085         }
3086 #endif /* CONFIG_RPS */
3087         return 0;
3088 }
3089
3090 #ifdef CONFIG_NET_FLOW_LIMIT
3091 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3092 #endif
3093
3094 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3095 {
3096 #ifdef CONFIG_NET_FLOW_LIMIT
3097         struct sd_flow_limit *fl;
3098         struct softnet_data *sd;
3099         unsigned int old_flow, new_flow;
3100
3101         if (qlen < (netdev_max_backlog >> 1))
3102                 return false;
3103
3104         sd = &__get_cpu_var(softnet_data);
3105
3106         rcu_read_lock();
3107         fl = rcu_dereference(sd->flow_limit);
3108         if (fl) {
3109                 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3110                 old_flow = fl->history[fl->history_head];
3111                 fl->history[fl->history_head] = new_flow;
3112
3113                 fl->history_head++;
3114                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3115
3116                 if (likely(fl->buckets[old_flow]))
3117                         fl->buckets[old_flow]--;
3118
3119                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3120                         fl->count++;
3121                         rcu_read_unlock();
3122                         return true;
3123                 }
3124         }
3125         rcu_read_unlock();
3126 #endif
3127         return false;
3128 }
3129
3130 /*
3131  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3132  * queue (may be a remote CPU queue).
3133  */
3134 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3135                               unsigned int *qtail)
3136 {
3137         struct softnet_data *sd;
3138         unsigned long flags;
3139         unsigned int qlen;
3140
3141         sd = &per_cpu(softnet_data, cpu);
3142
3143         local_irq_save(flags);
3144
3145         rps_lock(sd);
3146         qlen = skb_queue_len(&sd->input_pkt_queue);
3147         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3148                 if (skb_queue_len(&sd->input_pkt_queue)) {
3149 enqueue:
3150                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3151                         input_queue_tail_incr_save(sd, qtail);
3152                         rps_unlock(sd);
3153                         local_irq_restore(flags);
3154                         return NET_RX_SUCCESS;
3155                 }
3156
3157                 /* Schedule NAPI for backlog device
3158                  * We can use non atomic operation since we own the queue lock
3159                  */
3160                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3161                         if (!rps_ipi_queued(sd))
3162                                 ____napi_schedule(sd, &sd->backlog);
3163                 }
3164                 goto enqueue;
3165         }
3166
3167         sd->dropped++;
3168         rps_unlock(sd);
3169
3170         local_irq_restore(flags);
3171
3172         atomic_long_inc(&skb->dev->rx_dropped);
3173         kfree_skb(skb);
3174         return NET_RX_DROP;
3175 }
3176
3177 /**
3178  *      netif_rx        -       post buffer to the network code
3179  *      @skb: buffer to post
3180  *
3181  *      This function receives a packet from a device driver and queues it for
3182  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3183  *      may be dropped during processing for congestion control or by the
3184  *      protocol layers.
3185  *
3186  *      return values:
3187  *      NET_RX_SUCCESS  (no congestion)
3188  *      NET_RX_DROP     (packet was dropped)
3189  *
3190  */
3191
3192 int netif_rx(struct sk_buff *skb)
3193 {
3194         int ret;
3195
3196         /* if netpoll wants it, pretend we never saw it */
3197         if (netpoll_rx(skb))
3198                 return NET_RX_DROP;
3199
3200         net_timestamp_check(netdev_tstamp_prequeue, skb);
3201
3202         trace_netif_rx(skb);
3203 #ifdef CONFIG_RPS
3204         if (static_key_false(&rps_needed)) {
3205                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3206                 int cpu;
3207
3208                 preempt_disable();
3209                 rcu_read_lock();
3210
3211                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3212                 if (cpu < 0)
3213                         cpu = smp_processor_id();
3214
3215                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3216
3217                 rcu_read_unlock();
3218                 preempt_enable();
3219         } else
3220 #endif
3221         {
3222                 unsigned int qtail;
3223                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3224                 put_cpu();
3225         }
3226         return ret;
3227 }
3228 EXPORT_SYMBOL(netif_rx);
3229
3230 int netif_rx_ni(struct sk_buff *skb)
3231 {
3232         int err;
3233
3234         preempt_disable();
3235         err = netif_rx(skb);
3236         if (local_softirq_pending())
3237                 do_softirq();
3238         preempt_enable();
3239
3240         return err;
3241 }
3242 EXPORT_SYMBOL(netif_rx_ni);
3243
3244 static void net_tx_action(struct softirq_action *h)
3245 {
3246         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3247
3248         if (sd->completion_queue) {
3249                 struct sk_buff *clist;
3250
3251                 local_irq_disable();
3252                 clist = sd->completion_queue;
3253                 sd->completion_queue = NULL;
3254                 local_irq_enable();
3255
3256                 while (clist) {
3257                         struct sk_buff *skb = clist;
3258                         clist = clist->next;
3259
3260                         WARN_ON(atomic_read(&skb->users));
3261                         trace_kfree_skb(skb, net_tx_action);
3262                         __kfree_skb(skb);
3263                 }
3264         }
3265
3266         if (sd->output_queue) {
3267                 struct Qdisc *head;
3268
3269                 local_irq_disable();
3270                 head = sd->output_queue;
3271                 sd->output_queue = NULL;
3272                 sd->output_queue_tailp = &sd->output_queue;
3273                 local_irq_enable();
3274
3275                 while (head) {
3276                         struct Qdisc *q = head;
3277                         spinlock_t *root_lock;
3278
3279                         head = head->next_sched;
3280
3281                         root_lock = qdisc_lock(q);
3282                         if (spin_trylock(root_lock)) {
3283                                 smp_mb__before_clear_bit();
3284                                 clear_bit(__QDISC_STATE_SCHED,
3285                                           &q->state);
3286                                 qdisc_run(q);
3287                                 spin_unlock(root_lock);
3288                         } else {
3289                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3290                                               &q->state)) {
3291                                         __netif_reschedule(q);
3292                                 } else {
3293                                         smp_mb__before_clear_bit();
3294                                         clear_bit(__QDISC_STATE_SCHED,
3295                                                   &q->state);
3296                                 }
3297                         }
3298                 }
3299         }
3300 }
3301
3302 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3303     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3304 /* This hook is defined here for ATM LANE */
3305 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3306                              unsigned char *addr) __read_mostly;
3307 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3308 #endif
3309
3310 #ifdef CONFIG_NET_CLS_ACT
3311 /* TODO: Maybe we should just force sch_ingress to be compiled in
3312  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3313  * a compare and 2 stores extra right now if we dont have it on
3314  * but have CONFIG_NET_CLS_ACT
3315  * NOTE: This doesn't stop any functionality; if you dont have
3316  * the ingress scheduler, you just can't add policies on ingress.
3317  *
3318  */
3319 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3320 {
3321         struct net_device *dev = skb->dev;
3322         u32 ttl = G_TC_RTTL(skb->tc_verd);
3323         int result = TC_ACT_OK;
3324         struct Qdisc *q;
3325
3326         if (unlikely(MAX_RED_LOOP < ttl++)) {
3327                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3328                                      skb->skb_iif, dev->ifindex);
3329                 return TC_ACT_SHOT;
3330         }
3331
3332         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3333         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3334
3335         q = rxq->qdisc;
3336         if (q != &noop_qdisc) {
3337                 spin_lock(qdisc_lock(q));
3338                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3339                         result = qdisc_enqueue_root(skb, q);
3340                 spin_unlock(qdisc_lock(q));
3341         }
3342
3343         return result;
3344 }
3345
3346 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3347                                          struct packet_type **pt_prev,
3348                                          int *ret, struct net_device *orig_dev)
3349 {
3350         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3351
3352         if (!rxq || rxq->qdisc == &noop_qdisc)
3353                 goto out;
3354
3355         if (*pt_prev) {
3356                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3357                 *pt_prev = NULL;
3358         }
3359
3360         switch (ing_filter(skb, rxq)) {
3361         case TC_ACT_SHOT:
3362         case TC_ACT_STOLEN:
3363                 kfree_skb(skb);
3364                 return NULL;
3365         }
3366
3367 out:
3368         skb->tc_verd = 0;
3369         return skb;
3370 }
3371 #endif
3372
3373 /**
3374  *      netdev_rx_handler_register - register receive handler
3375  *      @dev: device to register a handler for
3376  *      @rx_handler: receive handler to register
3377  *      @rx_handler_data: data pointer that is used by rx handler
3378  *
3379  *      Register a receive hander for a device. This handler will then be
3380  *      called from __netif_receive_skb. A negative errno code is returned
3381  *      on a failure.
3382  *
3383  *      The caller must hold the rtnl_mutex.
3384  *
3385  *      For a general description of rx_handler, see enum rx_handler_result.
3386  */
3387 int netdev_rx_handler_register(struct net_device *dev,
3388                                rx_handler_func_t *rx_handler,
3389                                void *rx_handler_data)
3390 {
3391         ASSERT_RTNL();
3392
3393         if (dev->rx_handler)
3394                 return -EBUSY;
3395
3396         /* Note: rx_handler_data must be set before rx_handler */
3397         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3398         rcu_assign_pointer(dev->rx_handler, rx_handler);
3399
3400         return 0;
3401 }
3402 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3403
3404 /**
3405  *      netdev_rx_handler_unregister - unregister receive handler
3406  *      @dev: device to unregister a handler from
3407  *
3408  *      Unregister a receive handler from a device.
3409  *
3410  *      The caller must hold the rtnl_mutex.
3411  */
3412 void netdev_rx_handler_unregister(struct net_device *dev)
3413 {
3414
3415         ASSERT_RTNL();
3416         RCU_INIT_POINTER(dev->rx_handler, NULL);
3417         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3418          * section has a guarantee to see a non NULL rx_handler_data
3419          * as well.
3420          */
3421         synchronize_net();
3422         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3423 }
3424 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3425
3426 /*
3427  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3428  * the special handling of PFMEMALLOC skbs.
3429  */
3430 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3431 {
3432         switch (skb->protocol) {
3433         case __constant_htons(ETH_P_ARP):
3434         case __constant_htons(ETH_P_IP):
3435         case __constant_htons(ETH_P_IPV6):
3436         case __constant_htons(ETH_P_8021Q):
3437         case __constant_htons(ETH_P_8021AD):
3438                 return true;
3439         default:
3440                 return false;
3441         }
3442 }
3443
3444 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3445 {
3446         struct packet_type *ptype, *pt_prev;
3447         rx_handler_func_t *rx_handler;
3448         struct net_device *orig_dev;
3449         struct net_device *null_or_dev;
3450         bool deliver_exact = false;
3451         int ret = NET_RX_DROP;
3452         __be16 type;
3453
3454         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3455
3456         trace_netif_receive_skb(skb);
3457
3458         /* if we've gotten here through NAPI, check netpoll */
3459         if (netpoll_receive_skb(skb))
3460                 goto out;
3461
3462         orig_dev = skb->dev;
3463
3464         skb_reset_network_header(skb);
3465         if (!skb_transport_header_was_set(skb))
3466                 skb_reset_transport_header(skb);
3467         skb_reset_mac_len(skb);
3468
3469         pt_prev = NULL;
3470
3471         rcu_read_lock();
3472
3473 another_round:
3474         skb->skb_iif = skb->dev->ifindex;
3475
3476         __this_cpu_inc(softnet_data.processed);
3477
3478         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3479             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3480                 skb = vlan_untag(skb);
3481                 if (unlikely(!skb))
3482                         goto unlock;
3483         }
3484
3485 #ifdef CONFIG_NET_CLS_ACT
3486         if (skb->tc_verd & TC_NCLS) {
3487                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3488                 goto ncls;
3489         }
3490 #endif
3491
3492         if (pfmemalloc)
3493                 goto skip_taps;
3494
3495         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3496                 if (!ptype->dev || ptype->dev == skb->dev) {
3497                         if (pt_prev)
3498                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3499                         pt_prev = ptype;
3500                 }
3501         }
3502
3503 skip_taps:
3504 #ifdef CONFIG_NET_CLS_ACT
3505         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3506         if (!skb)
3507                 goto unlock;
3508 ncls:
3509 #endif
3510
3511         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3512                 goto drop;
3513
3514         if (vlan_tx_tag_present(skb)) {
3515                 if (pt_prev) {
3516                         ret = deliver_skb(skb, pt_prev, orig_dev);
3517                         pt_prev = NULL;
3518                 }
3519                 if (vlan_do_receive(&skb))
3520                         goto another_round;
3521                 else if (unlikely(!skb))
3522                         goto unlock;
3523         }
3524
3525         rx_handler = rcu_dereference(skb->dev->rx_handler);
3526         if (rx_handler) {
3527                 if (pt_prev) {
3528                         ret = deliver_skb(skb, pt_prev, orig_dev);
3529                         pt_prev = NULL;
3530                 }
3531                 switch (rx_handler(&skb)) {
3532                 case RX_HANDLER_CONSUMED:
3533                         ret = NET_RX_SUCCESS;
3534                         goto unlock;
3535                 case RX_HANDLER_ANOTHER:
3536                         goto another_round;
3537                 case RX_HANDLER_EXACT:
3538                         deliver_exact = true;
3539                 case RX_HANDLER_PASS:
3540                         break;
3541                 default:
3542                         BUG();
3543                 }
3544         }
3545
3546         if (vlan_tx_nonzero_tag_present(skb))
3547                 skb->pkt_type = PACKET_OTHERHOST;
3548
3549         /* deliver only exact match when indicated */
3550         null_or_dev = deliver_exact ? skb->dev : NULL;
3551
3552         type = skb->protocol;
3553         list_for_each_entry_rcu(ptype,
3554                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3555                 if (ptype->type == type &&
3556                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3557                      ptype->dev == orig_dev)) {
3558                         if (pt_prev)
3559                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3560                         pt_prev = ptype;
3561                 }
3562         }
3563
3564         if (pt_prev) {
3565                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3566                         goto drop;
3567                 else
3568                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3569         } else {
3570 drop:
3571                 atomic_long_inc(&skb->dev->rx_dropped);
3572                 kfree_skb(skb);
3573                 /* Jamal, now you will not able to escape explaining
3574                  * me how you were going to use this. :-)
3575                  */
3576                 ret = NET_RX_DROP;
3577         }
3578
3579 unlock:
3580         rcu_read_unlock();
3581 out:
3582         return ret;
3583 }
3584
3585 static int __netif_receive_skb(struct sk_buff *skb)
3586 {
3587         int ret;
3588
3589         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3590                 unsigned long pflags = current->flags;
3591
3592                 /*
3593                  * PFMEMALLOC skbs are special, they should
3594                  * - be delivered to SOCK_MEMALLOC sockets only
3595                  * - stay away from userspace
3596                  * - have bounded memory usage
3597                  *
3598                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3599                  * context down to all allocation sites.
3600                  */
3601                 current->flags |= PF_MEMALLOC;
3602                 ret = __netif_receive_skb_core(skb, true);
3603                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3604         } else
3605                 ret = __netif_receive_skb_core(skb, false);
3606
3607         return ret;
3608 }
3609
3610 /**
3611  *      netif_receive_skb - process receive buffer from network
3612  *      @skb: buffer to process
3613  *
3614  *      netif_receive_skb() is the main receive data processing function.
3615  *      It always succeeds. The buffer may be dropped during processing
3616  *      for congestion control or by the protocol layers.
3617  *
3618  *      This function may only be called from softirq context and interrupts
3619  *      should be enabled.
3620  *
3621  *      Return values (usually ignored):
3622  *      NET_RX_SUCCESS: no congestion
3623  *      NET_RX_DROP: packet was dropped
3624  */
3625 int netif_receive_skb(struct sk_buff *skb)
3626 {
3627         net_timestamp_check(netdev_tstamp_prequeue, skb);
3628
3629         if (skb_defer_rx_timestamp(skb))
3630                 return NET_RX_SUCCESS;
3631
3632 #ifdef CONFIG_RPS
3633         if (static_key_false(&rps_needed)) {
3634                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3635                 int cpu, ret;
3636
3637                 rcu_read_lock();
3638
3639                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3640
3641                 if (cpu >= 0) {
3642                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3643                         rcu_read_unlock();
3644                         return ret;
3645                 }
3646                 rcu_read_unlock();
3647         }
3648 #endif
3649         return __netif_receive_skb(skb);
3650 }
3651 EXPORT_SYMBOL(netif_receive_skb);
3652
3653 /* Network device is going away, flush any packets still pending
3654  * Called with irqs disabled.
3655  */
3656 static void flush_backlog(void *arg)
3657 {
3658         struct net_device *dev = arg;
3659         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3660         struct sk_buff *skb, *tmp;
3661
3662         rps_lock(sd);
3663         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3664                 if (skb->dev == dev) {
3665                         __skb_unlink(skb, &sd->input_pkt_queue);
3666                         kfree_skb(skb);
3667                         input_queue_head_incr(sd);
3668                 }
3669         }
3670         rps_unlock(sd);
3671
3672         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3673                 if (skb->dev == dev) {
3674                         __skb_unlink(skb, &sd->process_queue);
3675                         kfree_skb(skb);
3676                         input_queue_head_incr(sd);
3677                 }
3678         }
3679 }
3680
3681 static int napi_gro_complete(struct sk_buff *skb)
3682 {
3683         struct packet_offload *ptype;
3684         __be16 type = skb->protocol;
3685         struct list_head *head = &offload_base;
3686         int err = -ENOENT;
3687
3688         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3689
3690         if (NAPI_GRO_CB(skb)->count == 1) {
3691                 skb_shinfo(skb)->gso_size = 0;
3692                 goto out;
3693         }
3694
3695         rcu_read_lock();
3696         list_for_each_entry_rcu(ptype, head, list) {
3697                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3698                         continue;
3699
3700                 err = ptype->callbacks.gro_complete(skb);
3701                 break;
3702         }
3703         rcu_read_unlock();
3704
3705         if (err) {
3706                 WARN_ON(&ptype->list == head);
3707                 kfree_skb(skb);
3708                 return NET_RX_SUCCESS;
3709         }
3710
3711 out:
3712         return netif_receive_skb(skb);
3713 }
3714
3715 /* napi->gro_list contains packets ordered by age.
3716  * youngest packets at the head of it.
3717  * Complete skbs in reverse order to reduce latencies.
3718  */
3719 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3720 {
3721         struct sk_buff *skb, *prev = NULL;
3722
3723         /* scan list and build reverse chain */
3724         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3725                 skb->prev = prev;
3726                 prev = skb;
3727         }
3728
3729         for (skb = prev; skb; skb = prev) {
3730                 skb->next = NULL;
3731
3732                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3733                         return;
3734
3735                 prev = skb->prev;
3736                 napi_gro_complete(skb);
3737                 napi->gro_count--;
3738         }
3739
3740         napi->gro_list = NULL;
3741 }
3742 EXPORT_SYMBOL(napi_gro_flush);
3743
3744 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3745 {
3746         struct sk_buff *p;
3747         unsigned int maclen = skb->dev->hard_header_len;
3748
3749         for (p = napi->gro_list; p; p = p->next) {
3750                 unsigned long diffs;
3751
3752                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3753                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3754                 if (maclen == ETH_HLEN)
3755                         diffs |= compare_ether_header(skb_mac_header(p),
3756                                                       skb_gro_mac_header(skb));
3757                 else if (!diffs)
3758                         diffs = memcmp(skb_mac_header(p),
3759                                        skb_gro_mac_header(skb),
3760                                        maclen);
3761                 NAPI_GRO_CB(p)->same_flow = !diffs;
3762                 NAPI_GRO_CB(p)->flush = 0;
3763         }
3764 }
3765
3766 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3767 {
3768         struct sk_buff **pp = NULL;
3769         struct packet_offload *ptype;
3770         __be16 type = skb->protocol;
3771         struct list_head *head = &offload_base;
3772         int same_flow;
3773         enum gro_result ret;
3774
3775         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3776                 goto normal;
3777
3778         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3779                 goto normal;
3780
3781         gro_list_prepare(napi, skb);
3782
3783         rcu_read_lock();
3784         list_for_each_entry_rcu(ptype, head, list) {
3785                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3786                         continue;
3787
3788                 skb_set_network_header(skb, skb_gro_offset(skb));
3789                 skb_reset_mac_len(skb);
3790                 NAPI_GRO_CB(skb)->same_flow = 0;
3791                 NAPI_GRO_CB(skb)->flush = 0;
3792                 NAPI_GRO_CB(skb)->free = 0;
3793
3794                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3795                 break;
3796         }
3797         rcu_read_unlock();
3798
3799         if (&ptype->list == head)
3800                 goto normal;
3801
3802         same_flow = NAPI_GRO_CB(skb)->same_flow;
3803         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3804
3805         if (pp) {
3806                 struct sk_buff *nskb = *pp;
3807
3808                 *pp = nskb->next;
3809                 nskb->next = NULL;
3810                 napi_gro_complete(nskb);
3811                 napi->gro_count--;
3812         }
3813
3814         if (same_flow)
3815                 goto ok;
3816
3817         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3818                 goto normal;
3819
3820         napi->gro_count++;
3821         NAPI_GRO_CB(skb)->count = 1;
3822         NAPI_GRO_CB(skb)->age = jiffies;
3823         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3824         skb->next = napi->gro_list;
3825         napi->gro_list = skb;
3826         ret = GRO_HELD;
3827
3828 pull:
3829         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3830                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3831
3832                 BUG_ON(skb->end - skb->tail < grow);
3833
3834                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3835
3836                 skb->tail += grow;
3837                 skb->data_len -= grow;
3838
3839                 skb_shinfo(skb)->frags[0].page_offset += grow;
3840                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3841
3842                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3843                         skb_frag_unref(skb, 0);
3844                         memmove(skb_shinfo(skb)->frags,
3845                                 skb_shinfo(skb)->frags + 1,
3846                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3847                 }
3848         }
3849
3850 ok:
3851         return ret;
3852
3853 normal:
3854         ret = GRO_NORMAL;
3855         goto pull;
3856 }
3857
3858
3859 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3860 {
3861         switch (ret) {
3862         case GRO_NORMAL:
3863                 if (netif_receive_skb(skb))
3864                         ret = GRO_DROP;
3865                 break;
3866
3867         case GRO_DROP:
3868                 kfree_skb(skb);
3869                 break;
3870
3871         case GRO_MERGED_FREE:
3872                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3873                         kmem_cache_free(skbuff_head_cache, skb);
3874                 else
3875                         __kfree_skb(skb);
3876                 break;
3877
3878         case GRO_HELD:
3879         case GRO_MERGED:
3880                 break;
3881         }
3882
3883         return ret;
3884 }
3885
3886 static void skb_gro_reset_offset(struct sk_buff *skb)
3887 {
3888         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3889         const skb_frag_t *frag0 = &pinfo->frags[0];
3890
3891         NAPI_GRO_CB(skb)->data_offset = 0;
3892         NAPI_GRO_CB(skb)->frag0 = NULL;
3893         NAPI_GRO_CB(skb)->frag0_len = 0;
3894
3895         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3896             pinfo->nr_frags &&
3897             !PageHighMem(skb_frag_page(frag0))) {
3898                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3899                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3900         }
3901 }
3902
3903 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3904 {
3905         skb_gro_reset_offset(skb);
3906
3907         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3908 }
3909 EXPORT_SYMBOL(napi_gro_receive);
3910
3911 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3912 {
3913         __skb_pull(skb, skb_headlen(skb));
3914         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3915         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3916         skb->vlan_tci = 0;
3917         skb->dev = napi->dev;
3918         skb->skb_iif = 0;
3919
3920         napi->skb = skb;
3921 }
3922
3923 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3924 {
3925         struct sk_buff *skb = napi->skb;
3926
3927         if (!skb) {
3928                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3929                 if (skb)
3930                         napi->skb = skb;
3931         }
3932         return skb;
3933 }
3934 EXPORT_SYMBOL(napi_get_frags);
3935
3936 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3937                                gro_result_t ret)
3938 {
3939         switch (ret) {
3940         case GRO_NORMAL:
3941         case GRO_HELD:
3942                 skb->protocol = eth_type_trans(skb, skb->dev);
3943
3944                 if (ret == GRO_HELD)
3945                         skb_gro_pull(skb, -ETH_HLEN);
3946                 else if (netif_receive_skb(skb))
3947                         ret = GRO_DROP;
3948                 break;
3949
3950         case GRO_DROP:
3951         case GRO_MERGED_FREE:
3952                 napi_reuse_skb(napi, skb);
3953                 break;
3954
3955         case GRO_MERGED:
3956                 break;
3957         }
3958
3959         return ret;
3960 }
3961
3962 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3963 {
3964         struct sk_buff *skb = napi->skb;
3965         struct ethhdr *eth;
3966         unsigned int hlen;
3967         unsigned int off;
3968
3969         napi->skb = NULL;
3970
3971         skb_reset_mac_header(skb);
3972         skb_gro_reset_offset(skb);
3973
3974         off = skb_gro_offset(skb);
3975         hlen = off + sizeof(*eth);
3976         eth = skb_gro_header_fast(skb, off);
3977         if (skb_gro_header_hard(skb, hlen)) {
3978                 eth = skb_gro_header_slow(skb, hlen, off);
3979                 if (unlikely(!eth)) {
3980                         napi_reuse_skb(napi, skb);
3981                         skb = NULL;
3982                         goto out;
3983                 }
3984         }
3985
3986         skb_gro_pull(skb, sizeof(*eth));
3987
3988         /*
3989          * This works because the only protocols we care about don't require
3990          * special handling.  We'll fix it up properly at the end.
3991          */
3992         skb->protocol = eth->h_proto;
3993
3994 out:
3995         return skb;
3996 }
3997
3998 gro_result_t napi_gro_frags(struct napi_struct *napi)
3999 {
4000         struct sk_buff *skb = napi_frags_skb(napi);
4001
4002         if (!skb)
4003                 return GRO_DROP;
4004
4005         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4006 }
4007 EXPORT_SYMBOL(napi_gro_frags);
4008
4009 /*
4010  * net_rps_action sends any pending IPI's for rps.
4011  * Note: called with local irq disabled, but exits with local irq enabled.
4012  */
4013 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4014 {
4015 #ifdef CONFIG_RPS
4016         struct softnet_data *remsd = sd->rps_ipi_list;
4017
4018         if (remsd) {
4019                 sd->rps_ipi_list = NULL;
4020
4021                 local_irq_enable();
4022
4023                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4024                 while (remsd) {
4025                         struct softnet_data *next = remsd->rps_ipi_next;
4026
4027                         if (cpu_online(remsd->cpu))
4028                                 __smp_call_function_single(remsd->cpu,
4029                                                            &remsd->csd, 0);
4030                         remsd = next;
4031                 }
4032         } else
4033 #endif
4034                 local_irq_enable();
4035 }
4036
4037 static int process_backlog(struct napi_struct *napi, int quota)
4038 {
4039         int work = 0;
4040         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4041
4042 #ifdef CONFIG_RPS
4043         /* Check if we have pending ipi, its better to send them now,
4044          * not waiting net_rx_action() end.
4045          */
4046         if (sd->rps_ipi_list) {
4047                 local_irq_disable();
4048                 net_rps_action_and_irq_enable(sd);
4049         }
4050 #endif
4051         napi->weight = weight_p;
4052         local_irq_disable();
4053         while (work < quota) {
4054                 struct sk_buff *skb;
4055                 unsigned int qlen;
4056
4057                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4058                         local_irq_enable();
4059                         __netif_receive_skb(skb);
4060                         local_irq_disable();
4061                         input_queue_head_incr(sd);
4062                         if (++work >= quota) {
4063                                 local_irq_enable();
4064                                 return work;
4065                         }
4066                 }
4067
4068                 rps_lock(sd);
4069                 qlen = skb_queue_len(&sd->input_pkt_queue);
4070                 if (qlen)
4071                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4072                                                    &sd->process_queue);
4073
4074                 if (qlen < quota - work) {
4075                         /*
4076                          * Inline a custom version of __napi_complete().
4077                          * only current cpu owns and manipulates this napi,
4078                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4079                          * we can use a plain write instead of clear_bit(),
4080                          * and we dont need an smp_mb() memory barrier.
4081                          */
4082                         list_del(&napi->poll_list);
4083                         napi->state = 0;
4084
4085                         quota = work + qlen;
4086                 }
4087                 rps_unlock(sd);
4088         }
4089         local_irq_enable();
4090
4091         return work;
4092 }
4093
4094 /**
4095  * __napi_schedule - schedule for receive
4096  * @n: entry to schedule
4097  *
4098  * The entry's receive function will be scheduled to run
4099  */
4100 void __napi_schedule(struct napi_struct *n)
4101 {
4102         unsigned long flags;
4103
4104         local_irq_save(flags);
4105         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4106         local_irq_restore(flags);
4107 }
4108 EXPORT_SYMBOL(__napi_schedule);
4109
4110 void __napi_complete(struct napi_struct *n)
4111 {
4112         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4113         BUG_ON(n->gro_list);
4114
4115         list_del(&n->poll_list);
4116         smp_mb__before_clear_bit();
4117         clear_bit(NAPI_STATE_SCHED, &n->state);
4118 }
4119 EXPORT_SYMBOL(__napi_complete);
4120
4121 void napi_complete(struct napi_struct *n)
4122 {
4123         unsigned long flags;
4124
4125         /*
4126          * don't let napi dequeue from the cpu poll list
4127          * just in case its running on a different cpu
4128          */
4129         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4130                 return;
4131
4132         napi_gro_flush(n, false);
4133         local_irq_save(flags);
4134         __napi_complete(n);
4135         local_irq_restore(flags);
4136 }
4137 EXPORT_SYMBOL(napi_complete);
4138
4139 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4140                     int (*poll)(struct napi_struct *, int), int weight)
4141 {
4142         INIT_LIST_HEAD(&napi->poll_list);
4143         napi->gro_count = 0;
4144         napi->gro_list = NULL;
4145         napi->skb = NULL;
4146         napi->poll = poll;
4147         if (weight > NAPI_POLL_WEIGHT)
4148                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4149                             weight, dev->name);
4150         napi->weight = weight;
4151         list_add(&napi->dev_list, &dev->napi_list);
4152         napi->dev = dev;
4153 #ifdef CONFIG_NETPOLL
4154         spin_lock_init(&napi->poll_lock);
4155         napi->poll_owner = -1;
4156 #endif
4157         set_bit(NAPI_STATE_SCHED, &napi->state);
4158 }
4159 EXPORT_SYMBOL(netif_napi_add);
4160
4161 void netif_napi_del(struct napi_struct *napi)
4162 {
4163         struct sk_buff *skb, *next;
4164
4165         list_del_init(&napi->dev_list);
4166         napi_free_frags(napi);
4167
4168         for (skb = napi->gro_list; skb; skb = next) {
4169                 next = skb->next;
4170                 skb->next = NULL;
4171                 kfree_skb(skb);
4172         }
4173
4174         napi->gro_list = NULL;
4175         napi->gro_count = 0;
4176 }
4177 EXPORT_SYMBOL(netif_napi_del);
4178
4179 static void net_rx_action(struct softirq_action *h)
4180 {
4181         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4182         unsigned long time_limit = jiffies + 2;
4183         int budget = netdev_budget;
4184         void *have;
4185
4186         local_irq_disable();
4187
4188         while (!list_empty(&sd->poll_list)) {
4189                 struct napi_struct *n;
4190                 int work, weight;
4191
4192                 /* If softirq window is exhuasted then punt.
4193                  * Allow this to run for 2 jiffies since which will allow
4194                  * an average latency of 1.5/HZ.
4195                  */
4196                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4197                         goto softnet_break;
4198
4199                 local_irq_enable();
4200
4201                 /* Even though interrupts have been re-enabled, this
4202                  * access is safe because interrupts can only add new
4203                  * entries to the tail of this list, and only ->poll()
4204                  * calls can remove this head entry from the list.
4205                  */
4206                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4207
4208                 have = netpoll_poll_lock(n);
4209
4210                 weight = n->weight;
4211
4212                 /* This NAPI_STATE_SCHED test is for avoiding a race
4213                  * with netpoll's poll_napi().  Only the entity which
4214                  * obtains the lock and sees NAPI_STATE_SCHED set will
4215                  * actually make the ->poll() call.  Therefore we avoid
4216                  * accidentally calling ->poll() when NAPI is not scheduled.
4217                  */
4218                 work = 0;
4219                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4220                         work = n->poll(n, weight);
4221                         trace_napi_poll(n);
4222                 }
4223
4224                 WARN_ON_ONCE(work > weight);
4225
4226                 budget -= work;
4227
4228                 local_irq_disable();
4229
4230                 /* Drivers must not modify the NAPI state if they
4231                  * consume the entire weight.  In such cases this code
4232                  * still "owns" the NAPI instance and therefore can
4233                  * move the instance around on the list at-will.
4234                  */
4235                 if (unlikely(work == weight)) {
4236                         if (unlikely(napi_disable_pending(n))) {
4237                                 local_irq_enable();
4238                                 napi_complete(n);
4239                                 local_irq_disable();
4240                         } else {
4241                                 if (n->gro_list) {
4242                                         /* flush too old packets
4243                                          * If HZ < 1000, flush all packets.
4244                                          */
4245                                         local_irq_enable();
4246                                         napi_gro_flush(n, HZ >= 1000);
4247                                         local_irq_disable();
4248                                 }
4249                                 list_move_tail(&n->poll_list, &sd->poll_list);
4250                         }
4251                 }
4252
4253                 netpoll_poll_unlock(have);
4254         }
4255 out:
4256         net_rps_action_and_irq_enable(sd);
4257
4258 #ifdef CONFIG_NET_DMA
4259         /*
4260          * There may not be any more sk_buffs coming right now, so push
4261          * any pending DMA copies to hardware
4262          */
4263         dma_issue_pending_all();
4264 #endif
4265
4266         return;
4267
4268 softnet_break:
4269         sd->time_squeeze++;
4270         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4271         goto out;
4272 }
4273
4274 struct netdev_upper {
4275         struct net_device *dev;
4276         bool master;
4277         struct list_head list;
4278         struct rcu_head rcu;
4279         struct list_head search_list;
4280 };
4281
4282 static void __append_search_uppers(struct list_head *search_list,
4283                                    struct net_device *dev)
4284 {
4285         struct netdev_upper *upper;
4286
4287         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4288                 /* check if this upper is not already in search list */
4289                 if (list_empty(&upper->search_list))
4290                         list_add_tail(&upper->search_list, search_list);
4291         }
4292 }
4293
4294 static bool __netdev_search_upper_dev(struct net_device *dev,
4295                                       struct net_device *upper_dev)
4296 {
4297         LIST_HEAD(search_list);
4298         struct netdev_upper *upper;
4299         struct netdev_upper *tmp;
4300         bool ret = false;
4301
4302         __append_search_uppers(&search_list, dev);
4303         list_for_each_entry(upper, &search_list, search_list) {
4304                 if (upper->dev == upper_dev) {
4305                         ret = true;
4306                         break;
4307                 }
4308                 __append_search_uppers(&search_list, upper->dev);
4309         }
4310         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4311                 INIT_LIST_HEAD(&upper->search_list);
4312         return ret;
4313 }
4314
4315 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4316                                                 struct net_device *upper_dev)
4317 {
4318         struct netdev_upper *upper;
4319
4320         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4321                 if (upper->dev == upper_dev)
4322                         return upper;
4323         }
4324         return NULL;
4325 }
4326
4327 /**
4328  * netdev_has_upper_dev - Check if device is linked to an upper device
4329  * @dev: device
4330  * @upper_dev: upper device to check
4331  *
4332  * Find out if a device is linked to specified upper device and return true
4333  * in case it is. Note that this checks only immediate upper device,
4334  * not through a complete stack of devices. The caller must hold the RTNL lock.
4335  */
4336 bool netdev_has_upper_dev(struct net_device *dev,
4337                           struct net_device *upper_dev)
4338 {
4339         ASSERT_RTNL();
4340
4341         return __netdev_find_upper(dev, upper_dev);
4342 }
4343 EXPORT_SYMBOL(netdev_has_upper_dev);
4344
4345 /**
4346  * netdev_has_any_upper_dev - Check if device is linked to some device
4347  * @dev: device
4348  *
4349  * Find out if a device is linked to an upper device and return true in case
4350  * it is. The caller must hold the RTNL lock.
4351  */
4352 bool netdev_has_any_upper_dev(struct net_device *dev)
4353 {
4354         ASSERT_RTNL();
4355
4356         return !list_empty(&dev->upper_dev_list);
4357 }
4358 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4359
4360 /**
4361  * netdev_master_upper_dev_get - Get master upper device
4362  * @dev: device
4363  *
4364  * Find a master upper device and return pointer to it or NULL in case
4365  * it's not there. The caller must hold the RTNL lock.
4366  */
4367 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4368 {
4369         struct netdev_upper *upper;
4370
4371         ASSERT_RTNL();
4372
4373         if (list_empty(&dev->upper_dev_list))
4374                 return NULL;
4375
4376         upper = list_first_entry(&dev->upper_dev_list,
4377                                  struct netdev_upper, list);
4378         if (likely(upper->master))
4379                 return upper->dev;
4380         return NULL;
4381 }
4382 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4383
4384 /**
4385  * netdev_master_upper_dev_get_rcu - Get master upper device
4386  * @dev: device
4387  *
4388  * Find a master upper device and return pointer to it or NULL in case
4389  * it's not there. The caller must hold the RCU read lock.
4390  */
4391 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4392 {
4393         struct netdev_upper *upper;
4394
4395         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4396                                        struct netdev_upper, list);
4397         if (upper && likely(upper->master))
4398                 return upper->dev;
4399         return NULL;
4400 }
4401 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4402
4403 static int __netdev_upper_dev_link(struct net_device *dev,
4404                                    struct net_device *upper_dev, bool master)
4405 {
4406         struct netdev_upper *upper;
4407
4408         ASSERT_RTNL();
4409
4410         if (dev == upper_dev)
4411                 return -EBUSY;
4412
4413         /* To prevent loops, check if dev is not upper device to upper_dev. */
4414         if (__netdev_search_upper_dev(upper_dev, dev))
4415                 return -EBUSY;
4416
4417         if (__netdev_find_upper(dev, upper_dev))
4418                 return -EEXIST;
4419
4420         if (master && netdev_master_upper_dev_get(dev))
4421                 return -EBUSY;
4422
4423         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4424         if (!upper)
4425                 return -ENOMEM;
4426
4427         upper->dev = upper_dev;
4428         upper->master = master;
4429         INIT_LIST_HEAD(&upper->search_list);
4430
4431         /* Ensure that master upper link is always the first item in list. */
4432         if (master)
4433                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4434         else
4435                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4436         dev_hold(upper_dev);
4437         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4438         return 0;
4439 }
4440
4441 /**
4442  * netdev_upper_dev_link - Add a link to the upper device
4443  * @dev: device
4444  * @upper_dev: new upper device
4445  *
4446  * Adds a link to device which is upper to this one. The caller must hold
4447  * the RTNL lock. On a failure a negative errno code is returned.
4448  * On success the reference counts are adjusted and the function
4449  * returns zero.
4450  */
4451 int netdev_upper_dev_link(struct net_device *dev,
4452                           struct net_device *upper_dev)
4453 {
4454         return __netdev_upper_dev_link(dev, upper_dev, false);
4455 }
4456 EXPORT_SYMBOL(netdev_upper_dev_link);
4457
4458 /**
4459  * netdev_master_upper_dev_link - Add a master link to the upper device
4460  * @dev: device
4461  * @upper_dev: new upper device
4462  *
4463  * Adds a link to device which is upper to this one. In this case, only
4464  * one master upper device can be linked, although other non-master devices
4465  * might be linked as well. The caller must hold the RTNL lock.
4466  * On a failure a negative errno code is returned. On success the reference
4467  * counts are adjusted and the function returns zero.
4468  */
4469 int netdev_master_upper_dev_link(struct net_device *dev,
4470                                  struct net_device *upper_dev)
4471 {
4472         return __netdev_upper_dev_link(dev, upper_dev, true);
4473 }
4474 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4475
4476 /**
4477  * netdev_upper_dev_unlink - Removes a link to upper device
4478  * @dev: device
4479  * @upper_dev: new upper device
4480  *
4481  * Removes a link to device which is upper to this one. The caller must hold
4482  * the RTNL lock.
4483  */
4484 void netdev_upper_dev_unlink(struct net_device *dev,
4485                              struct net_device *upper_dev)
4486 {
4487         struct netdev_upper *upper;
4488
4489         ASSERT_RTNL();
4490
4491         upper = __netdev_find_upper(dev, upper_dev);
4492         if (!upper)
4493                 return;
4494         list_del_rcu(&upper->list);
4495         dev_put(upper_dev);
4496         kfree_rcu(upper, rcu);
4497         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4498 }
4499 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4500
4501 static void dev_change_rx_flags(struct net_device *dev, int flags)
4502 {
4503         const struct net_device_ops *ops = dev->netdev_ops;
4504
4505         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4506                 ops->ndo_change_rx_flags(dev, flags);
4507 }
4508
4509 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4510 {
4511         unsigned int old_flags = dev->flags;
4512         kuid_t uid;
4513         kgid_t gid;
4514
4515         ASSERT_RTNL();
4516
4517         dev->flags |= IFF_PROMISC;
4518         dev->promiscuity += inc;
4519         if (dev->promiscuity == 0) {
4520                 /*
4521                  * Avoid overflow.
4522                  * If inc causes overflow, untouch promisc and return error.
4523                  */
4524                 if (inc < 0)
4525                         dev->flags &= ~IFF_PROMISC;
4526                 else {
4527                         dev->promiscuity -= inc;
4528                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4529                                 dev->name);
4530                         return -EOVERFLOW;
4531                 }
4532         }
4533         if (dev->flags != old_flags) {
4534                 pr_info("device %s %s promiscuous mode\n",
4535                         dev->name,
4536                         dev->flags & IFF_PROMISC ? "entered" : "left");
4537                 if (audit_enabled) {
4538                         current_uid_gid(&uid, &gid);
4539                         audit_log(current->audit_context, GFP_ATOMIC,
4540                                 AUDIT_ANOM_PROMISCUOUS,
4541                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4542                                 dev->name, (dev->flags & IFF_PROMISC),
4543                                 (old_flags & IFF_PROMISC),
4544                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4545                                 from_kuid(&init_user_ns, uid),
4546                                 from_kgid(&init_user_ns, gid),
4547                                 audit_get_sessionid(current));
4548                 }
4549
4550                 dev_change_rx_flags(dev, IFF_PROMISC);
4551         }
4552         return 0;
4553 }
4554
4555 /**
4556  *      dev_set_promiscuity     - update promiscuity count on a device
4557  *      @dev: device
4558  *      @inc: modifier
4559  *
4560  *      Add or remove promiscuity from a device. While the count in the device
4561  *      remains above zero the interface remains promiscuous. Once it hits zero
4562  *      the device reverts back to normal filtering operation. A negative inc
4563  *      value is used to drop promiscuity on the device.
4564  *      Return 0 if successful or a negative errno code on error.
4565  */
4566 int dev_set_promiscuity(struct net_device *dev, int inc)
4567 {
4568         unsigned int old_flags = dev->flags;
4569         int err;
4570
4571         err = __dev_set_promiscuity(dev, inc);
4572         if (err < 0)
4573                 return err;
4574         if (dev->flags != old_flags)
4575                 dev_set_rx_mode(dev);
4576         return err;
4577 }
4578 EXPORT_SYMBOL(dev_set_promiscuity);
4579
4580 /**
4581  *      dev_set_allmulti        - update allmulti count on a device
4582  *      @dev: device
4583  *      @inc: modifier
4584  *
4585  *      Add or remove reception of all multicast frames to a device. While the
4586  *      count in the device remains above zero the interface remains listening
4587  *      to all interfaces. Once it hits zero the device reverts back to normal
4588  *      filtering operation. A negative @inc value is used to drop the counter
4589  *      when releasing a resource needing all multicasts.
4590  *      Return 0 if successful or a negative errno code on error.
4591  */
4592
4593 int dev_set_allmulti(struct net_device *dev, int inc)
4594 {
4595         unsigned int old_flags = dev->flags;
4596
4597         ASSERT_RTNL();
4598
4599         dev->flags |= IFF_ALLMULTI;
4600         dev->allmulti += inc;
4601         if (dev->allmulti == 0) {
4602                 /*
4603                  * Avoid overflow.
4604                  * If inc causes overflow, untouch allmulti and return error.
4605                  */
4606                 if (inc < 0)
4607                         dev->flags &= ~IFF_ALLMULTI;
4608                 else {
4609                         dev->allmulti -= inc;
4610                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4611                                 dev->name);
4612                         return -EOVERFLOW;
4613                 }
4614         }
4615         if (dev->flags ^ old_flags) {
4616                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4617                 dev_set_rx_mode(dev);
4618         }
4619         return 0;
4620 }
4621 EXPORT_SYMBOL(dev_set_allmulti);
4622
4623 /*
4624  *      Upload unicast and multicast address lists to device and
4625  *      configure RX filtering. When the device doesn't support unicast
4626  *      filtering it is put in promiscuous mode while unicast addresses
4627  *      are present.
4628  */
4629 void __dev_set_rx_mode(struct net_device *dev)
4630 {
4631         const struct net_device_ops *ops = dev->netdev_ops;
4632
4633         /* dev_open will call this function so the list will stay sane. */
4634         if (!(dev->flags&IFF_UP))
4635                 return;
4636
4637         if (!netif_device_present(dev))
4638                 return;
4639
4640         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4641                 /* Unicast addresses changes may only happen under the rtnl,
4642                  * therefore calling __dev_set_promiscuity here is safe.
4643                  */
4644                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4645                         __dev_set_promiscuity(dev, 1);
4646                         dev->uc_promisc = true;
4647                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4648                         __dev_set_promiscuity(dev, -1);
4649                         dev->uc_promisc = false;
4650                 }
4651         }
4652
4653         if (ops->ndo_set_rx_mode)
4654                 ops->ndo_set_rx_mode(dev);
4655 }
4656
4657 void dev_set_rx_mode(struct net_device *dev)
4658 {
4659         netif_addr_lock_bh(dev);
4660         __dev_set_rx_mode(dev);
4661         netif_addr_unlock_bh(dev);
4662 }
4663
4664 /**
4665  *      dev_get_flags - get flags reported to userspace
4666  *      @dev: device
4667  *
4668  *      Get the combination of flag bits exported through APIs to userspace.
4669  */
4670 unsigned int dev_get_flags(const struct net_device *dev)
4671 {
4672         unsigned int flags;
4673
4674         flags = (dev->flags & ~(IFF_PROMISC |
4675                                 IFF_ALLMULTI |
4676                                 IFF_RUNNING |
4677                                 IFF_LOWER_UP |
4678                                 IFF_DORMANT)) |
4679                 (dev->gflags & (IFF_PROMISC |
4680                                 IFF_ALLMULTI));
4681
4682         if (netif_running(dev)) {
4683                 if (netif_oper_up(dev))
4684                         flags |= IFF_RUNNING;
4685                 if (netif_carrier_ok(dev))
4686                         flags |= IFF_LOWER_UP;
4687                 if (netif_dormant(dev))
4688                         flags |= IFF_DORMANT;
4689         }
4690
4691         return flags;
4692 }
4693 EXPORT_SYMBOL(dev_get_flags);
4694
4695 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4696 {
4697         unsigned int old_flags = dev->flags;
4698         int ret;
4699
4700         ASSERT_RTNL();
4701
4702         /*
4703          *      Set the flags on our device.
4704          */
4705
4706         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4707                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4708                                IFF_AUTOMEDIA)) |
4709                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4710                                     IFF_ALLMULTI));
4711
4712         /*
4713          *      Load in the correct multicast list now the flags have changed.
4714          */
4715
4716         if ((old_flags ^ flags) & IFF_MULTICAST)
4717                 dev_change_rx_flags(dev, IFF_MULTICAST);
4718
4719         dev_set_rx_mode(dev);
4720
4721         /*
4722          *      Have we downed the interface. We handle IFF_UP ourselves
4723          *      according to user attempts to set it, rather than blindly
4724          *      setting it.
4725          */
4726
4727         ret = 0;
4728         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4729                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4730
4731                 if (!ret)
4732                         dev_set_rx_mode(dev);
4733         }
4734
4735         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4736                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4737
4738                 dev->gflags ^= IFF_PROMISC;
4739                 dev_set_promiscuity(dev, inc);
4740         }
4741
4742         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4743            is important. Some (broken) drivers set IFF_PROMISC, when
4744            IFF_ALLMULTI is requested not asking us and not reporting.
4745          */
4746         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4747                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4748
4749                 dev->gflags ^= IFF_ALLMULTI;
4750                 dev_set_allmulti(dev, inc);
4751         }
4752
4753         return ret;
4754 }
4755
4756 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4757 {
4758         unsigned int changes = dev->flags ^ old_flags;
4759
4760         if (changes & IFF_UP) {
4761                 if (dev->flags & IFF_UP)
4762                         call_netdevice_notifiers(NETDEV_UP, dev);
4763                 else
4764                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4765         }
4766
4767         if (dev->flags & IFF_UP &&
4768             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4769                 struct netdev_notifier_change_info change_info;
4770
4771                 change_info.flags_changed = changes;
4772                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4773                                               &change_info.info);
4774         }
4775 }
4776
4777 /**
4778  *      dev_change_flags - change device settings
4779  *      @dev: device
4780  *      @flags: device state flags
4781  *
4782  *      Change settings on device based state flags. The flags are
4783  *      in the userspace exported format.
4784  */
4785 int dev_change_flags(struct net_device *dev, unsigned int flags)
4786 {
4787         int ret;
4788         unsigned int changes, old_flags = dev->flags;
4789
4790         ret = __dev_change_flags(dev, flags);
4791         if (ret < 0)
4792                 return ret;
4793
4794         changes = old_flags ^ dev->flags;
4795         if (changes)
4796                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4797
4798         __dev_notify_flags(dev, old_flags);
4799         return ret;
4800 }
4801 EXPORT_SYMBOL(dev_change_flags);
4802
4803 /**
4804  *      dev_set_mtu - Change maximum transfer unit
4805  *      @dev: device
4806  *      @new_mtu: new transfer unit
4807  *
4808  *      Change the maximum transfer size of the network device.
4809  */
4810 int dev_set_mtu(struct net_device *dev, int new_mtu)
4811 {
4812         const struct net_device_ops *ops = dev->netdev_ops;
4813         int err;
4814
4815         if (new_mtu == dev->mtu)
4816                 return 0;
4817
4818         /*      MTU must be positive.    */
4819         if (new_mtu < 0)
4820                 return -EINVAL;
4821
4822         if (!netif_device_present(dev))
4823                 return -ENODEV;
4824
4825         err = 0;
4826         if (ops->ndo_change_mtu)
4827                 err = ops->ndo_change_mtu(dev, new_mtu);
4828         else
4829                 dev->mtu = new_mtu;
4830
4831         if (!err)
4832                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4833         return err;
4834 }
4835 EXPORT_SYMBOL(dev_set_mtu);
4836
4837 /**
4838  *      dev_set_group - Change group this device belongs to
4839  *      @dev: device
4840  *      @new_group: group this device should belong to
4841  */
4842 void dev_set_group(struct net_device *dev, int new_group)
4843 {
4844         dev->group = new_group;
4845 }
4846 EXPORT_SYMBOL(dev_set_group);
4847
4848 /**
4849  *      dev_set_mac_address - Change Media Access Control Address
4850  *      @dev: device
4851  *      @sa: new address
4852  *
4853  *      Change the hardware (MAC) address of the device
4854  */
4855 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4856 {
4857         const struct net_device_ops *ops = dev->netdev_ops;
4858         int err;
4859
4860         if (!ops->ndo_set_mac_address)
4861                 return -EOPNOTSUPP;
4862         if (sa->sa_family != dev->type)
4863                 return -EINVAL;
4864         if (!netif_device_present(dev))
4865                 return -ENODEV;
4866         err = ops->ndo_set_mac_address(dev, sa);
4867         if (err)
4868                 return err;
4869         dev->addr_assign_type = NET_ADDR_SET;
4870         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4871         add_device_randomness(dev->dev_addr, dev->addr_len);
4872         return 0;
4873 }
4874 EXPORT_SYMBOL(dev_set_mac_address);
4875
4876 /**
4877  *      dev_change_carrier - Change device carrier
4878  *      @dev: device
4879  *      @new_carrier: new value
4880  *
4881  *      Change device carrier
4882  */
4883 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4884 {
4885         const struct net_device_ops *ops = dev->netdev_ops;
4886
4887         if (!ops->ndo_change_carrier)
4888                 return -EOPNOTSUPP;
4889         if (!netif_device_present(dev))
4890                 return -ENODEV;
4891         return ops->ndo_change_carrier(dev, new_carrier);
4892 }
4893 EXPORT_SYMBOL(dev_change_carrier);
4894
4895 /**
4896  *      dev_new_index   -       allocate an ifindex
4897  *      @net: the applicable net namespace
4898  *
4899  *      Returns a suitable unique value for a new device interface
4900  *      number.  The caller must hold the rtnl semaphore or the
4901  *      dev_base_lock to be sure it remains unique.
4902  */
4903 static int dev_new_index(struct net *net)
4904 {
4905         int ifindex = net->ifindex;
4906         for (;;) {
4907                 if (++ifindex <= 0)
4908                         ifindex = 1;
4909                 if (!__dev_get_by_index(net, ifindex))
4910                         return net->ifindex = ifindex;
4911         }
4912 }
4913
4914 /* Delayed registration/unregisteration */
4915 static LIST_HEAD(net_todo_list);
4916
4917 static void net_set_todo(struct net_device *dev)
4918 {
4919         list_add_tail(&dev->todo_list, &net_todo_list);
4920 }
4921
4922 static void rollback_registered_many(struct list_head *head)
4923 {
4924         struct net_device *dev, *tmp;
4925
4926         BUG_ON(dev_boot_phase);
4927         ASSERT_RTNL();
4928
4929         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4930                 /* Some devices call without registering
4931                  * for initialization unwind. Remove those
4932                  * devices and proceed with the remaining.
4933                  */
4934                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4935                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4936                                  dev->name, dev);
4937
4938                         WARN_ON(1);
4939                         list_del(&dev->unreg_list);
4940                         continue;
4941                 }
4942                 dev->dismantle = true;
4943                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4944         }
4945
4946         /* If device is running, close it first. */
4947         dev_close_many(head);
4948
4949         list_for_each_entry(dev, head, unreg_list) {
4950                 /* And unlink it from device chain. */
4951                 unlist_netdevice(dev);
4952
4953                 dev->reg_state = NETREG_UNREGISTERING;
4954         }
4955
4956         synchronize_net();
4957
4958         list_for_each_entry(dev, head, unreg_list) {
4959                 /* Shutdown queueing discipline. */
4960                 dev_shutdown(dev);
4961
4962
4963                 /* Notify protocols, that we are about to destroy
4964                    this device. They should clean all the things.
4965                 */
4966                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4967
4968                 if (!dev->rtnl_link_ops ||
4969                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4970                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4971
4972                 /*
4973                  *      Flush the unicast and multicast chains
4974                  */
4975                 dev_uc_flush(dev);
4976                 dev_mc_flush(dev);
4977
4978                 if (dev->netdev_ops->ndo_uninit)
4979                         dev->netdev_ops->ndo_uninit(dev);
4980
4981                 /* Notifier chain MUST detach us all upper devices. */
4982                 WARN_ON(netdev_has_any_upper_dev(dev));
4983
4984                 /* Remove entries from kobject tree */
4985                 netdev_unregister_kobject(dev);
4986 #ifdef CONFIG_XPS
4987                 /* Remove XPS queueing entries */
4988                 netif_reset_xps_queues_gt(dev, 0);
4989 #endif
4990         }
4991
4992         synchronize_net();
4993
4994         list_for_each_entry(dev, head, unreg_list)
4995                 dev_put(dev);
4996 }
4997
4998 static void rollback_registered(struct net_device *dev)
4999 {
5000         LIST_HEAD(single);
5001
5002         list_add(&dev->unreg_list, &single);
5003         rollback_registered_many(&single);
5004         list_del(&single);
5005 }
5006
5007 static netdev_features_t netdev_fix_features(struct net_device *dev,
5008         netdev_features_t features)
5009 {
5010         /* Fix illegal checksum combinations */
5011         if ((features & NETIF_F_HW_CSUM) &&
5012             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5013                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5014                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5015         }
5016
5017         /* TSO requires that SG is present as well. */
5018         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5019                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5020                 features &= ~NETIF_F_ALL_TSO;
5021         }
5022
5023         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5024                                         !(features & NETIF_F_IP_CSUM)) {
5025                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5026                 features &= ~NETIF_F_TSO;
5027                 features &= ~NETIF_F_TSO_ECN;
5028         }
5029
5030         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5031                                          !(features & NETIF_F_IPV6_CSUM)) {
5032                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5033                 features &= ~NETIF_F_TSO6;
5034         }
5035
5036         /* TSO ECN requires that TSO is present as well. */
5037         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5038                 features &= ~NETIF_F_TSO_ECN;
5039
5040         /* Software GSO depends on SG. */
5041         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5042                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5043                 features &= ~NETIF_F_GSO;
5044         }
5045
5046         /* UFO needs SG and checksumming */
5047         if (features & NETIF_F_UFO) {
5048                 /* maybe split UFO into V4 and V6? */
5049                 if (!((features & NETIF_F_GEN_CSUM) ||
5050                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5051                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5052                         netdev_dbg(dev,
5053                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5054                         features &= ~NETIF_F_UFO;
5055                 }
5056
5057                 if (!(features & NETIF_F_SG)) {
5058                         netdev_dbg(dev,
5059                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5060                         features &= ~NETIF_F_UFO;
5061                 }
5062         }
5063
5064         return features;
5065 }
5066
5067 int __netdev_update_features(struct net_device *dev)
5068 {
5069         netdev_features_t features;
5070         int err = 0;
5071
5072         ASSERT_RTNL();
5073
5074         features = netdev_get_wanted_features(dev);
5075
5076         if (dev->netdev_ops->ndo_fix_features)
5077                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5078
5079         /* driver might be less strict about feature dependencies */
5080         features = netdev_fix_features(dev, features);
5081
5082         if (dev->features == features)
5083                 return 0;
5084
5085         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5086                 &dev->features, &features);
5087
5088         if (dev->netdev_ops->ndo_set_features)
5089                 err = dev->netdev_ops->ndo_set_features(dev, features);
5090
5091         if (unlikely(err < 0)) {
5092                 netdev_err(dev,
5093                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5094                         err, &features, &dev->features);
5095                 return -1;
5096         }
5097
5098         if (!err)
5099                 dev->features = features;
5100
5101         return 1;
5102 }
5103
5104 /**
5105  *      netdev_update_features - recalculate device features
5106  *      @dev: the device to check
5107  *
5108  *      Recalculate dev->features set and send notifications if it
5109  *      has changed. Should be called after driver or hardware dependent
5110  *      conditions might have changed that influence the features.
5111  */
5112 void netdev_update_features(struct net_device *dev)
5113 {
5114         if (__netdev_update_features(dev))
5115                 netdev_features_change(dev);
5116 }
5117 EXPORT_SYMBOL(netdev_update_features);
5118
5119 /**
5120  *      netdev_change_features - recalculate device features
5121  *      @dev: the device to check
5122  *
5123  *      Recalculate dev->features set and send notifications even
5124  *      if they have not changed. Should be called instead of
5125  *      netdev_update_features() if also dev->vlan_features might
5126  *      have changed to allow the changes to be propagated to stacked
5127  *      VLAN devices.
5128  */
5129 void netdev_change_features(struct net_device *dev)
5130 {
5131         __netdev_update_features(dev);
5132         netdev_features_change(dev);
5133 }
5134 EXPORT_SYMBOL(netdev_change_features);
5135
5136 /**
5137  *      netif_stacked_transfer_operstate -      transfer operstate
5138  *      @rootdev: the root or lower level device to transfer state from
5139  *      @dev: the device to transfer operstate to
5140  *
5141  *      Transfer operational state from root to device. This is normally
5142  *      called when a stacking relationship exists between the root
5143  *      device and the device(a leaf device).
5144  */
5145 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5146                                         struct net_device *dev)
5147 {
5148         if (rootdev->operstate == IF_OPER_DORMANT)
5149                 netif_dormant_on(dev);
5150         else
5151                 netif_dormant_off(dev);
5152
5153         if (netif_carrier_ok(rootdev)) {
5154                 if (!netif_carrier_ok(dev))
5155                         netif_carrier_on(dev);
5156         } else {
5157                 if (netif_carrier_ok(dev))
5158                         netif_carrier_off(dev);
5159         }
5160 }
5161 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5162
5163 #ifdef CONFIG_RPS
5164 static int netif_alloc_rx_queues(struct net_device *dev)
5165 {
5166         unsigned int i, count = dev->num_rx_queues;
5167         struct netdev_rx_queue *rx;
5168
5169         BUG_ON(count < 1);
5170
5171         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5172         if (!rx)
5173                 return -ENOMEM;
5174
5175         dev->_rx = rx;
5176
5177         for (i = 0; i < count; i++)
5178                 rx[i].dev = dev;
5179         return 0;
5180 }
5181 #endif
5182
5183 static void netdev_init_one_queue(struct net_device *dev,
5184                                   struct netdev_queue *queue, void *_unused)
5185 {
5186         /* Initialize queue lock */
5187         spin_lock_init(&queue->_xmit_lock);
5188         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5189         queue->xmit_lock_owner = -1;
5190         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5191         queue->dev = dev;
5192 #ifdef CONFIG_BQL
5193         dql_init(&queue->dql, HZ);
5194 #endif
5195 }
5196
5197 static int netif_alloc_netdev_queues(struct net_device *dev)
5198 {
5199         unsigned int count = dev->num_tx_queues;
5200         struct netdev_queue *tx;
5201
5202         BUG_ON(count < 1);
5203
5204         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5205         if (!tx)
5206                 return -ENOMEM;
5207
5208         dev->_tx = tx;
5209
5210         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5211         spin_lock_init(&dev->tx_global_lock);
5212
5213         return 0;
5214 }
5215
5216 /**
5217  *      register_netdevice      - register a network device
5218  *      @dev: device to register
5219  *
5220  *      Take a completed network device structure and add it to the kernel
5221  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5222  *      chain. 0 is returned on success. A negative errno code is returned
5223  *      on a failure to set up the device, or if the name is a duplicate.
5224  *
5225  *      Callers must hold the rtnl semaphore. You may want
5226  *      register_netdev() instead of this.
5227  *
5228  *      BUGS:
5229  *      The locking appears insufficient to guarantee two parallel registers
5230  *      will not get the same name.
5231  */
5232
5233 int register_netdevice(struct net_device *dev)
5234 {
5235         int ret;
5236         struct net *net = dev_net(dev);
5237
5238         BUG_ON(dev_boot_phase);
5239         ASSERT_RTNL();
5240
5241         might_sleep();
5242
5243         /* When net_device's are persistent, this will be fatal. */
5244         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5245         BUG_ON(!net);
5246
5247         spin_lock_init(&dev->addr_list_lock);
5248         netdev_set_addr_lockdep_class(dev);
5249
5250         dev->iflink = -1;
5251
5252         ret = dev_get_valid_name(net, dev, dev->name);
5253         if (ret < 0)
5254                 goto out;
5255
5256         /* Init, if this function is available */
5257         if (dev->netdev_ops->ndo_init) {
5258                 ret = dev->netdev_ops->ndo_init(dev);
5259                 if (ret) {
5260                         if (ret > 0)
5261                                 ret = -EIO;
5262                         goto out;
5263                 }
5264         }
5265
5266         if (((dev->hw_features | dev->features) &
5267              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5268             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5269              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5270                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5271                 ret = -EINVAL;
5272                 goto err_uninit;
5273         }
5274
5275         ret = -EBUSY;
5276         if (!dev->ifindex)
5277                 dev->ifindex = dev_new_index(net);
5278         else if (__dev_get_by_index(net, dev->ifindex))
5279                 goto err_uninit;
5280
5281         if (dev->iflink == -1)
5282                 dev->iflink = dev->ifindex;
5283
5284         /* Transfer changeable features to wanted_features and enable
5285          * software offloads (GSO and GRO).
5286          */
5287         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5288         dev->features |= NETIF_F_SOFT_FEATURES;
5289         dev->wanted_features = dev->features & dev->hw_features;
5290
5291         /* Turn on no cache copy if HW is doing checksum */
5292         if (!(dev->flags & IFF_LOOPBACK)) {
5293                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5294                 if (dev->features & NETIF_F_ALL_CSUM) {
5295                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5296                         dev->features |= NETIF_F_NOCACHE_COPY;
5297                 }
5298         }
5299
5300         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5301          */
5302         dev->vlan_features |= NETIF_F_HIGHDMA;
5303
5304         /* Make NETIF_F_SG inheritable to tunnel devices.
5305          */
5306         dev->hw_enc_features |= NETIF_F_SG;
5307
5308         /* Make NETIF_F_SG inheritable to MPLS.
5309          */
5310         dev->mpls_features |= NETIF_F_SG;
5311
5312         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5313         ret = notifier_to_errno(ret);
5314         if (ret)
5315                 goto err_uninit;
5316
5317         ret = netdev_register_kobject(dev);
5318         if (ret)
5319                 goto err_uninit;
5320         dev->reg_state = NETREG_REGISTERED;
5321
5322         __netdev_update_features(dev);
5323
5324         /*
5325          *      Default initial state at registry is that the
5326          *      device is present.
5327          */
5328
5329         set_bit(__LINK_STATE_PRESENT, &dev->state);
5330
5331         linkwatch_init_dev(dev);
5332
5333         dev_init_scheduler(dev);
5334         dev_hold(dev);
5335         list_netdevice(dev);
5336         add_device_randomness(dev->dev_addr, dev->addr_len);
5337
5338         /* If the device has permanent device address, driver should
5339          * set dev_addr and also addr_assign_type should be set to
5340          * NET_ADDR_PERM (default value).
5341          */
5342         if (dev->addr_assign_type == NET_ADDR_PERM)
5343                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5344
5345         /* Notify protocols, that a new device appeared. */
5346         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5347         ret = notifier_to_errno(ret);
5348         if (ret) {
5349                 rollback_registered(dev);
5350                 dev->reg_state = NETREG_UNREGISTERED;
5351         }
5352         /*
5353          *      Prevent userspace races by waiting until the network
5354          *      device is fully setup before sending notifications.
5355          */
5356         if (!dev->rtnl_link_ops ||
5357             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5358                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5359
5360 out:
5361         return ret;
5362
5363 err_uninit:
5364         if (dev->netdev_ops->ndo_uninit)
5365                 dev->netdev_ops->ndo_uninit(dev);
5366         goto out;
5367 }
5368 EXPORT_SYMBOL(register_netdevice);
5369
5370 /**
5371  *      init_dummy_netdev       - init a dummy network device for NAPI
5372  *      @dev: device to init
5373  *
5374  *      This takes a network device structure and initialize the minimum
5375  *      amount of fields so it can be used to schedule NAPI polls without
5376  *      registering a full blown interface. This is to be used by drivers
5377  *      that need to tie several hardware interfaces to a single NAPI
5378  *      poll scheduler due to HW limitations.
5379  */
5380 int init_dummy_netdev(struct net_device *dev)
5381 {
5382         /* Clear everything. Note we don't initialize spinlocks
5383          * are they aren't supposed to be taken by any of the
5384          * NAPI code and this dummy netdev is supposed to be
5385          * only ever used for NAPI polls
5386          */
5387         memset(dev, 0, sizeof(struct net_device));
5388
5389         /* make sure we BUG if trying to hit standard
5390          * register/unregister code path
5391          */
5392         dev->reg_state = NETREG_DUMMY;
5393
5394         /* NAPI wants this */
5395         INIT_LIST_HEAD(&dev->napi_list);
5396
5397         /* a dummy interface is started by default */
5398         set_bit(__LINK_STATE_PRESENT, &dev->state);
5399         set_bit(__LINK_STATE_START, &dev->state);
5400
5401         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5402          * because users of this 'device' dont need to change
5403          * its refcount.
5404          */
5405
5406         return 0;
5407 }
5408 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5409
5410
5411 /**
5412  *      register_netdev - register a network device
5413  *      @dev: device to register
5414  *
5415  *      Take a completed network device structure and add it to the kernel
5416  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5417  *      chain. 0 is returned on success. A negative errno code is returned
5418  *      on a failure to set up the device, or if the name is a duplicate.
5419  *
5420  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5421  *      and expands the device name if you passed a format string to
5422  *      alloc_netdev.
5423  */
5424 int register_netdev(struct net_device *dev)
5425 {
5426         int err;
5427
5428         rtnl_lock();
5429         err = register_netdevice(dev);
5430         rtnl_unlock();
5431         return err;
5432 }
5433 EXPORT_SYMBOL(register_netdev);
5434
5435 int netdev_refcnt_read(const struct net_device *dev)
5436 {
5437         int i, refcnt = 0;
5438
5439         for_each_possible_cpu(i)
5440                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5441         return refcnt;
5442 }
5443 EXPORT_SYMBOL(netdev_refcnt_read);
5444
5445 /**
5446  * netdev_wait_allrefs - wait until all references are gone.
5447  * @dev: target net_device
5448  *
5449  * This is called when unregistering network devices.
5450  *
5451  * Any protocol or device that holds a reference should register
5452  * for netdevice notification, and cleanup and put back the
5453  * reference if they receive an UNREGISTER event.
5454  * We can get stuck here if buggy protocols don't correctly
5455  * call dev_put.
5456  */
5457 static void netdev_wait_allrefs(struct net_device *dev)
5458 {
5459         unsigned long rebroadcast_time, warning_time;
5460         int refcnt;
5461
5462         linkwatch_forget_dev(dev);
5463
5464         rebroadcast_time = warning_time = jiffies;
5465         refcnt = netdev_refcnt_read(dev);
5466
5467         while (refcnt != 0) {
5468                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5469                         rtnl_lock();
5470
5471                         /* Rebroadcast unregister notification */
5472                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5473
5474                         __rtnl_unlock();
5475                         rcu_barrier();
5476                         rtnl_lock();
5477
5478                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5479                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5480                                      &dev->state)) {
5481                                 /* We must not have linkwatch events
5482                                  * pending on unregister. If this
5483                                  * happens, we simply run the queue
5484                                  * unscheduled, resulting in a noop
5485                                  * for this device.
5486                                  */
5487                                 linkwatch_run_queue();
5488                         }
5489
5490                         __rtnl_unlock();
5491
5492                         rebroadcast_time = jiffies;
5493                 }
5494
5495                 msleep(250);
5496
5497                 refcnt = netdev_refcnt_read(dev);
5498
5499                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5500                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5501                                  dev->name, refcnt);
5502                         warning_time = jiffies;
5503                 }
5504         }
5505 }
5506
5507 /* The sequence is:
5508  *
5509  *      rtnl_lock();
5510  *      ...
5511  *      register_netdevice(x1);
5512  *      register_netdevice(x2);
5513  *      ...
5514  *      unregister_netdevice(y1);
5515  *      unregister_netdevice(y2);
5516  *      ...
5517  *      rtnl_unlock();
5518  *      free_netdev(y1);
5519  *      free_netdev(y2);
5520  *
5521  * We are invoked by rtnl_unlock().
5522  * This allows us to deal with problems:
5523  * 1) We can delete sysfs objects which invoke hotplug
5524  *    without deadlocking with linkwatch via keventd.
5525  * 2) Since we run with the RTNL semaphore not held, we can sleep
5526  *    safely in order to wait for the netdev refcnt to drop to zero.
5527  *
5528  * We must not return until all unregister events added during
5529  * the interval the lock was held have been completed.
5530  */
5531 void netdev_run_todo(void)
5532 {
5533         struct list_head list;
5534
5535         /* Snapshot list, allow later requests */
5536         list_replace_init(&net_todo_list, &list);
5537
5538         __rtnl_unlock();
5539
5540
5541         /* Wait for rcu callbacks to finish before next phase */
5542         if (!list_empty(&list))
5543                 rcu_barrier();
5544
5545         while (!list_empty(&list)) {
5546                 struct net_device *dev
5547                         = list_first_entry(&list, struct net_device, todo_list);
5548                 list_del(&dev->todo_list);
5549
5550                 rtnl_lock();
5551                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5552                 __rtnl_unlock();
5553
5554                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5555                         pr_err("network todo '%s' but state %d\n",
5556                                dev->name, dev->reg_state);
5557                         dump_stack();
5558                         continue;
5559                 }
5560
5561                 dev->reg_state = NETREG_UNREGISTERED;
5562
5563                 on_each_cpu(flush_backlog, dev, 1);
5564
5565                 netdev_wait_allrefs(dev);
5566
5567                 /* paranoia */
5568                 BUG_ON(netdev_refcnt_read(dev));
5569                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5570                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5571                 WARN_ON(dev->dn_ptr);
5572
5573                 if (dev->destructor)
5574                         dev->destructor(dev);
5575
5576                 /* Free network device */
5577                 kobject_put(&dev->dev.kobj);
5578         }
5579 }
5580
5581 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5582  * fields in the same order, with only the type differing.
5583  */
5584 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5585                              const struct net_device_stats *netdev_stats)
5586 {
5587 #if BITS_PER_LONG == 64
5588         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5589         memcpy(stats64, netdev_stats, sizeof(*stats64));
5590 #else
5591         size_t i, n = sizeof(*stats64) / sizeof(u64);
5592         const unsigned long *src = (const unsigned long *)netdev_stats;
5593         u64 *dst = (u64 *)stats64;
5594
5595         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5596                      sizeof(*stats64) / sizeof(u64));
5597         for (i = 0; i < n; i++)
5598                 dst[i] = src[i];
5599 #endif
5600 }
5601 EXPORT_SYMBOL(netdev_stats_to_stats64);
5602
5603 /**
5604  *      dev_get_stats   - get network device statistics
5605  *      @dev: device to get statistics from
5606  *      @storage: place to store stats
5607  *
5608  *      Get network statistics from device. Return @storage.
5609  *      The device driver may provide its own method by setting
5610  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5611  *      otherwise the internal statistics structure is used.
5612  */
5613 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5614                                         struct rtnl_link_stats64 *storage)
5615 {
5616         const struct net_device_ops *ops = dev->netdev_ops;
5617
5618         if (ops->ndo_get_stats64) {
5619                 memset(storage, 0, sizeof(*storage));
5620                 ops->ndo_get_stats64(dev, storage);
5621         } else if (ops->ndo_get_stats) {
5622                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5623         } else {
5624                 netdev_stats_to_stats64(storage, &dev->stats);
5625         }
5626         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5627         return storage;
5628 }
5629 EXPORT_SYMBOL(dev_get_stats);
5630
5631 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5632 {
5633         struct netdev_queue *queue = dev_ingress_queue(dev);
5634
5635 #ifdef CONFIG_NET_CLS_ACT
5636         if (queue)
5637                 return queue;
5638         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5639         if (!queue)
5640                 return NULL;
5641         netdev_init_one_queue(dev, queue, NULL);
5642         queue->qdisc = &noop_qdisc;
5643         queue->qdisc_sleeping = &noop_qdisc;
5644         rcu_assign_pointer(dev->ingress_queue, queue);
5645 #endif
5646         return queue;
5647 }
5648
5649 static const struct ethtool_ops default_ethtool_ops;
5650
5651 void netdev_set_default_ethtool_ops(struct net_device *dev,
5652                                     const struct ethtool_ops *ops)
5653 {
5654         if (dev->ethtool_ops == &default_ethtool_ops)
5655                 dev->ethtool_ops = ops;
5656 }
5657 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5658
5659 /**
5660  *      alloc_netdev_mqs - allocate network device
5661  *      @sizeof_priv:   size of private data to allocate space for
5662  *      @name:          device name format string
5663  *      @setup:         callback to initialize device
5664  *      @txqs:          the number of TX subqueues to allocate
5665  *      @rxqs:          the number of RX subqueues to allocate
5666  *
5667  *      Allocates a struct net_device with private data area for driver use
5668  *      and performs basic initialization.  Also allocates subquue structs
5669  *      for each queue on the device.
5670  */
5671 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5672                 void (*setup)(struct net_device *),
5673                 unsigned int txqs, unsigned int rxqs)
5674 {
5675         struct net_device *dev;
5676         size_t alloc_size;
5677         struct net_device *p;
5678
5679         BUG_ON(strlen(name) >= sizeof(dev->name));
5680
5681         if (txqs < 1) {
5682                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5683                 return NULL;
5684         }
5685
5686 #ifdef CONFIG_RPS
5687         if (rxqs < 1) {
5688                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5689                 return NULL;
5690         }
5691 #endif
5692
5693         alloc_size = sizeof(struct net_device);
5694         if (sizeof_priv) {
5695                 /* ensure 32-byte alignment of private area */
5696                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5697                 alloc_size += sizeof_priv;
5698         }
5699         /* ensure 32-byte alignment of whole construct */
5700         alloc_size += NETDEV_ALIGN - 1;
5701
5702         p = kzalloc(alloc_size, GFP_KERNEL);
5703         if (!p)
5704                 return NULL;
5705
5706         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5707         dev->padded = (char *)dev - (char *)p;
5708
5709         dev->pcpu_refcnt = alloc_percpu(int);
5710         if (!dev->pcpu_refcnt)
5711                 goto free_p;
5712
5713         if (dev_addr_init(dev))
5714                 goto free_pcpu;
5715
5716         dev_mc_init(dev);
5717         dev_uc_init(dev);
5718
5719         dev_net_set(dev, &init_net);
5720
5721         dev->gso_max_size = GSO_MAX_SIZE;
5722         dev->gso_max_segs = GSO_MAX_SEGS;
5723
5724         INIT_LIST_HEAD(&dev->napi_list);
5725         INIT_LIST_HEAD(&dev->unreg_list);
5726         INIT_LIST_HEAD(&dev->link_watch_list);
5727         INIT_LIST_HEAD(&dev->upper_dev_list);
5728         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5729         setup(dev);
5730
5731         dev->num_tx_queues = txqs;
5732         dev->real_num_tx_queues = txqs;
5733         if (netif_alloc_netdev_queues(dev))
5734                 goto free_all;
5735
5736 #ifdef CONFIG_RPS
5737         dev->num_rx_queues = rxqs;
5738         dev->real_num_rx_queues = rxqs;
5739         if (netif_alloc_rx_queues(dev))
5740                 goto free_all;
5741 #endif
5742
5743         strcpy(dev->name, name);
5744         dev->group = INIT_NETDEV_GROUP;
5745         if (!dev->ethtool_ops)
5746                 dev->ethtool_ops = &default_ethtool_ops;
5747         return dev;
5748
5749 free_all:
5750         free_netdev(dev);
5751         return NULL;
5752
5753 free_pcpu:
5754         free_percpu(dev->pcpu_refcnt);
5755         kfree(dev->_tx);
5756 #ifdef CONFIG_RPS
5757         kfree(dev->_rx);
5758 #endif
5759
5760 free_p:
5761         kfree(p);
5762         return NULL;
5763 }
5764 EXPORT_SYMBOL(alloc_netdev_mqs);
5765
5766 /**
5767  *      free_netdev - free network device
5768  *      @dev: device
5769  *
5770  *      This function does the last stage of destroying an allocated device
5771  *      interface. The reference to the device object is released.
5772  *      If this is the last reference then it will be freed.
5773  */
5774 void free_netdev(struct net_device *dev)
5775 {
5776         struct napi_struct *p, *n;
5777
5778         release_net(dev_net(dev));
5779
5780         kfree(dev->_tx);
5781 #ifdef CONFIG_RPS
5782         kfree(dev->_rx);
5783 #endif
5784
5785         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5786
5787         /* Flush device addresses */
5788         dev_addr_flush(dev);
5789
5790         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5791                 netif_napi_del(p);
5792
5793         free_percpu(dev->pcpu_refcnt);
5794         dev->pcpu_refcnt = NULL;
5795
5796         /*  Compatibility with error handling in drivers */
5797         if (dev->reg_state == NETREG_UNINITIALIZED) {
5798                 kfree((char *)dev - dev->padded);
5799                 return;
5800         }
5801
5802         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5803         dev->reg_state = NETREG_RELEASED;
5804
5805         /* will free via device release */
5806         put_device(&dev->dev);
5807 }
5808 EXPORT_SYMBOL(free_netdev);
5809
5810 /**
5811  *      synchronize_net -  Synchronize with packet receive processing
5812  *
5813  *      Wait for packets currently being received to be done.
5814  *      Does not block later packets from starting.
5815  */
5816 void synchronize_net(void)
5817 {
5818         might_sleep();
5819         if (rtnl_is_locked())
5820                 synchronize_rcu_expedited();
5821         else
5822                 synchronize_rcu();
5823 }
5824 EXPORT_SYMBOL(synchronize_net);
5825
5826 /**
5827  *      unregister_netdevice_queue - remove device from the kernel
5828  *      @dev: device
5829  *      @head: list
5830  *
5831  *      This function shuts down a device interface and removes it
5832  *      from the kernel tables.
5833  *      If head not NULL, device is queued to be unregistered later.
5834  *
5835  *      Callers must hold the rtnl semaphore.  You may want
5836  *      unregister_netdev() instead of this.
5837  */
5838
5839 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5840 {
5841         ASSERT_RTNL();
5842
5843         if (head) {
5844                 list_move_tail(&dev->unreg_list, head);
5845         } else {
5846                 rollback_registered(dev);
5847                 /* Finish processing unregister after unlock */
5848                 net_set_todo(dev);
5849         }
5850 }
5851 EXPORT_SYMBOL(unregister_netdevice_queue);
5852
5853 /**
5854  *      unregister_netdevice_many - unregister many devices
5855  *      @head: list of devices
5856  */
5857 void unregister_netdevice_many(struct list_head *head)
5858 {
5859         struct net_device *dev;
5860
5861         if (!list_empty(head)) {
5862                 rollback_registered_many(head);
5863                 list_for_each_entry(dev, head, unreg_list)
5864                         net_set_todo(dev);
5865         }
5866 }
5867 EXPORT_SYMBOL(unregister_netdevice_many);
5868
5869 /**
5870  *      unregister_netdev - remove device from the kernel
5871  *      @dev: device
5872  *
5873  *      This function shuts down a device interface and removes it
5874  *      from the kernel tables.
5875  *
5876  *      This is just a wrapper for unregister_netdevice that takes
5877  *      the rtnl semaphore.  In general you want to use this and not
5878  *      unregister_netdevice.
5879  */
5880 void unregister_netdev(struct net_device *dev)
5881 {
5882         rtnl_lock();
5883         unregister_netdevice(dev);
5884         rtnl_unlock();
5885 }
5886 EXPORT_SYMBOL(unregister_netdev);
5887
5888 /**
5889  *      dev_change_net_namespace - move device to different nethost namespace
5890  *      @dev: device
5891  *      @net: network namespace
5892  *      @pat: If not NULL name pattern to try if the current device name
5893  *            is already taken in the destination network namespace.
5894  *
5895  *      This function shuts down a device interface and moves it
5896  *      to a new network namespace. On success 0 is returned, on
5897  *      a failure a netagive errno code is returned.
5898  *
5899  *      Callers must hold the rtnl semaphore.
5900  */
5901
5902 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5903 {
5904         int err;
5905
5906         ASSERT_RTNL();
5907
5908         /* Don't allow namespace local devices to be moved. */
5909         err = -EINVAL;
5910         if (dev->features & NETIF_F_NETNS_LOCAL)
5911                 goto out;
5912
5913         /* Ensure the device has been registrered */
5914         if (dev->reg_state != NETREG_REGISTERED)
5915                 goto out;
5916
5917         /* Get out if there is nothing todo */
5918         err = 0;
5919         if (net_eq(dev_net(dev), net))
5920                 goto out;
5921
5922         /* Pick the destination device name, and ensure
5923          * we can use it in the destination network namespace.
5924          */
5925         err = -EEXIST;
5926         if (__dev_get_by_name(net, dev->name)) {
5927                 /* We get here if we can't use the current device name */
5928                 if (!pat)
5929                         goto out;
5930                 if (dev_get_valid_name(net, dev, pat) < 0)
5931                         goto out;
5932         }
5933
5934         /*
5935          * And now a mini version of register_netdevice unregister_netdevice.
5936          */
5937
5938         /* If device is running close it first. */
5939         dev_close(dev);
5940
5941         /* And unlink it from device chain */
5942         err = -ENODEV;
5943         unlist_netdevice(dev);
5944
5945         synchronize_net();
5946
5947         /* Shutdown queueing discipline. */
5948         dev_shutdown(dev);
5949
5950         /* Notify protocols, that we are about to destroy
5951            this device. They should clean all the things.
5952
5953            Note that dev->reg_state stays at NETREG_REGISTERED.
5954            This is wanted because this way 8021q and macvlan know
5955            the device is just moving and can keep their slaves up.
5956         */
5957         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5958         rcu_barrier();
5959         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5960         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5961
5962         /*
5963          *      Flush the unicast and multicast chains
5964          */
5965         dev_uc_flush(dev);
5966         dev_mc_flush(dev);
5967
5968         /* Send a netdev-removed uevent to the old namespace */
5969         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5970
5971         /* Actually switch the network namespace */
5972         dev_net_set(dev, net);
5973
5974         /* If there is an ifindex conflict assign a new one */
5975         if (__dev_get_by_index(net, dev->ifindex)) {
5976                 int iflink = (dev->iflink == dev->ifindex);
5977                 dev->ifindex = dev_new_index(net);
5978                 if (iflink)
5979                         dev->iflink = dev->ifindex;
5980         }
5981
5982         /* Send a netdev-add uevent to the new namespace */
5983         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5984
5985         /* Fixup kobjects */
5986         err = device_rename(&dev->dev, dev->name);
5987         WARN_ON(err);
5988
5989         /* Add the device back in the hashes */
5990         list_netdevice(dev);
5991
5992         /* Notify protocols, that a new device appeared. */
5993         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5994
5995         /*
5996          *      Prevent userspace races by waiting until the network
5997          *      device is fully setup before sending notifications.
5998          */
5999         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6000
6001         synchronize_net();
6002         err = 0;
6003 out:
6004         return err;
6005 }
6006 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6007
6008 static int dev_cpu_callback(struct notifier_block *nfb,
6009                             unsigned long action,
6010                             void *ocpu)
6011 {
6012         struct sk_buff **list_skb;
6013         struct sk_buff *skb;
6014         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6015         struct softnet_data *sd, *oldsd;
6016
6017         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6018                 return NOTIFY_OK;
6019
6020         local_irq_disable();
6021         cpu = smp_processor_id();
6022         sd = &per_cpu(softnet_data, cpu);
6023         oldsd = &per_cpu(softnet_data, oldcpu);
6024
6025         /* Find end of our completion_queue. */
6026         list_skb = &sd->completion_queue;
6027         while (*list_skb)
6028                 list_skb = &(*list_skb)->next;
6029         /* Append completion queue from offline CPU. */
6030         *list_skb = oldsd->completion_queue;
6031         oldsd->completion_queue = NULL;
6032
6033         /* Append output queue from offline CPU. */
6034         if (oldsd->output_queue) {
6035                 *sd->output_queue_tailp = oldsd->output_queue;
6036                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6037                 oldsd->output_queue = NULL;
6038                 oldsd->output_queue_tailp = &oldsd->output_queue;
6039         }
6040         /* Append NAPI poll list from offline CPU. */
6041         if (!list_empty(&oldsd->poll_list)) {
6042                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6043                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6044         }
6045
6046         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6047         local_irq_enable();
6048
6049         /* Process offline CPU's input_pkt_queue */
6050         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6051                 netif_rx(skb);
6052                 input_queue_head_incr(oldsd);
6053         }
6054         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6055                 netif_rx(skb);
6056                 input_queue_head_incr(oldsd);
6057         }
6058
6059         return NOTIFY_OK;
6060 }
6061
6062
6063 /**
6064  *      netdev_increment_features - increment feature set by one
6065  *      @all: current feature set
6066  *      @one: new feature set
6067  *      @mask: mask feature set
6068  *
6069  *      Computes a new feature set after adding a device with feature set
6070  *      @one to the master device with current feature set @all.  Will not
6071  *      enable anything that is off in @mask. Returns the new feature set.
6072  */
6073 netdev_features_t netdev_increment_features(netdev_features_t all,
6074         netdev_features_t one, netdev_features_t mask)
6075 {
6076         if (mask & NETIF_F_GEN_CSUM)
6077                 mask |= NETIF_F_ALL_CSUM;
6078         mask |= NETIF_F_VLAN_CHALLENGED;
6079
6080         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6081         all &= one | ~NETIF_F_ALL_FOR_ALL;
6082
6083         /* If one device supports hw checksumming, set for all. */
6084         if (all & NETIF_F_GEN_CSUM)
6085                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6086
6087         return all;
6088 }
6089 EXPORT_SYMBOL(netdev_increment_features);
6090
6091 static struct hlist_head * __net_init netdev_create_hash(void)
6092 {
6093         int i;
6094         struct hlist_head *hash;
6095
6096         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6097         if (hash != NULL)
6098                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6099                         INIT_HLIST_HEAD(&hash[i]);
6100
6101         return hash;
6102 }
6103
6104 /* Initialize per network namespace state */
6105 static int __net_init netdev_init(struct net *net)
6106 {
6107         if (net != &init_net)
6108                 INIT_LIST_HEAD(&net->dev_base_head);
6109
6110         net->dev_name_head = netdev_create_hash();
6111         if (net->dev_name_head == NULL)
6112                 goto err_name;
6113
6114         net->dev_index_head = netdev_create_hash();
6115         if (net->dev_index_head == NULL)
6116                 goto err_idx;
6117
6118         return 0;
6119
6120 err_idx:
6121         kfree(net->dev_name_head);
6122 err_name:
6123         return -ENOMEM;
6124 }
6125
6126 /**
6127  *      netdev_drivername - network driver for the device
6128  *      @dev: network device
6129  *
6130  *      Determine network driver for device.
6131  */
6132 const char *netdev_drivername(const struct net_device *dev)
6133 {
6134         const struct device_driver *driver;
6135         const struct device *parent;
6136         const char *empty = "";
6137
6138         parent = dev->dev.parent;
6139         if (!parent)
6140                 return empty;
6141
6142         driver = parent->driver;
6143         if (driver && driver->name)
6144                 return driver->name;
6145         return empty;
6146 }
6147
6148 static int __netdev_printk(const char *level, const struct net_device *dev,
6149                            struct va_format *vaf)
6150 {
6151         int r;
6152
6153         if (dev && dev->dev.parent) {
6154                 r = dev_printk_emit(level[1] - '0',
6155                                     dev->dev.parent,
6156                                     "%s %s %s: %pV",
6157                                     dev_driver_string(dev->dev.parent),
6158                                     dev_name(dev->dev.parent),
6159                                     netdev_name(dev), vaf);
6160         } else if (dev) {
6161                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6162         } else {
6163                 r = printk("%s(NULL net_device): %pV", level, vaf);
6164         }
6165
6166         return r;
6167 }
6168
6169 int netdev_printk(const char *level, const struct net_device *dev,
6170                   const char *format, ...)
6171 {
6172         struct va_format vaf;
6173         va_list args;
6174         int r;
6175
6176         va_start(args, format);
6177
6178         vaf.fmt = format;
6179         vaf.va = &args;
6180
6181         r = __netdev_printk(level, dev, &vaf);
6182
6183         va_end(args);
6184
6185         return r;
6186 }
6187 EXPORT_SYMBOL(netdev_printk);
6188
6189 #define define_netdev_printk_level(func, level)                 \
6190 int func(const struct net_device *dev, const char *fmt, ...)    \
6191 {                                                               \
6192         int r;                                                  \
6193         struct va_format vaf;                                   \
6194         va_list args;                                           \
6195                                                                 \
6196         va_start(args, fmt);                                    \
6197                                                                 \
6198         vaf.fmt = fmt;                                          \
6199         vaf.va = &args;                                         \
6200                                                                 \
6201         r = __netdev_printk(level, dev, &vaf);                  \
6202                                                                 \
6203         va_end(args);                                           \
6204                                                                 \
6205         return r;                                               \
6206 }                                                               \
6207 EXPORT_SYMBOL(func);
6208
6209 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6210 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6211 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6212 define_netdev_printk_level(netdev_err, KERN_ERR);
6213 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6214 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6215 define_netdev_printk_level(netdev_info, KERN_INFO);
6216
6217 static void __net_exit netdev_exit(struct net *net)
6218 {
6219         kfree(net->dev_name_head);
6220         kfree(net->dev_index_head);
6221 }
6222
6223 static struct pernet_operations __net_initdata netdev_net_ops = {
6224         .init = netdev_init,
6225         .exit = netdev_exit,
6226 };
6227
6228 static void __net_exit default_device_exit(struct net *net)
6229 {
6230         struct net_device *dev, *aux;
6231         /*
6232          * Push all migratable network devices back to the
6233          * initial network namespace
6234          */
6235         rtnl_lock();
6236         for_each_netdev_safe(net, dev, aux) {
6237                 int err;
6238                 char fb_name[IFNAMSIZ];
6239
6240                 /* Ignore unmoveable devices (i.e. loopback) */
6241                 if (dev->features & NETIF_F_NETNS_LOCAL)
6242                         continue;
6243
6244                 /* Leave virtual devices for the generic cleanup */
6245                 if (dev->rtnl_link_ops)
6246                         continue;
6247
6248                 /* Push remaining network devices to init_net */
6249                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6250                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6251                 if (err) {
6252                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6253                                  __func__, dev->name, err);
6254                         BUG();
6255                 }
6256         }
6257         rtnl_unlock();
6258 }
6259
6260 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6261 {
6262         /* At exit all network devices most be removed from a network
6263          * namespace.  Do this in the reverse order of registration.
6264          * Do this across as many network namespaces as possible to
6265          * improve batching efficiency.
6266          */
6267         struct net_device *dev;
6268         struct net *net;
6269         LIST_HEAD(dev_kill_list);
6270
6271         rtnl_lock();
6272         list_for_each_entry(net, net_list, exit_list) {
6273                 for_each_netdev_reverse(net, dev) {
6274                         if (dev->rtnl_link_ops)
6275                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6276                         else
6277                                 unregister_netdevice_queue(dev, &dev_kill_list);
6278                 }
6279         }
6280         unregister_netdevice_many(&dev_kill_list);
6281         list_del(&dev_kill_list);
6282         rtnl_unlock();
6283 }
6284
6285 static struct pernet_operations __net_initdata default_device_ops = {
6286         .exit = default_device_exit,
6287         .exit_batch = default_device_exit_batch,
6288 };
6289
6290 /*
6291  *      Initialize the DEV module. At boot time this walks the device list and
6292  *      unhooks any devices that fail to initialise (normally hardware not
6293  *      present) and leaves us with a valid list of present and active devices.
6294  *
6295  */
6296
6297 /*
6298  *       This is called single threaded during boot, so no need
6299  *       to take the rtnl semaphore.
6300  */
6301 static int __init net_dev_init(void)
6302 {
6303         int i, rc = -ENOMEM;
6304
6305         BUG_ON(!dev_boot_phase);
6306
6307         if (dev_proc_init())
6308                 goto out;
6309
6310         if (netdev_kobject_init())
6311                 goto out;
6312
6313         INIT_LIST_HEAD(&ptype_all);
6314         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6315                 INIT_LIST_HEAD(&ptype_base[i]);
6316
6317         INIT_LIST_HEAD(&offload_base);
6318
6319         if (register_pernet_subsys(&netdev_net_ops))
6320                 goto out;
6321
6322         /*
6323          *      Initialise the packet receive queues.
6324          */
6325
6326         for_each_possible_cpu(i) {
6327                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6328
6329                 memset(sd, 0, sizeof(*sd));
6330                 skb_queue_head_init(&sd->input_pkt_queue);
6331                 skb_queue_head_init(&sd->process_queue);
6332                 sd->completion_queue = NULL;
6333                 INIT_LIST_HEAD(&sd->poll_list);
6334                 sd->output_queue = NULL;
6335                 sd->output_queue_tailp = &sd->output_queue;
6336 #ifdef CONFIG_RPS
6337                 sd->csd.func = rps_trigger_softirq;
6338                 sd->csd.info = sd;
6339                 sd->csd.flags = 0;
6340                 sd->cpu = i;
6341 #endif
6342
6343                 sd->backlog.poll = process_backlog;
6344                 sd->backlog.weight = weight_p;
6345                 sd->backlog.gro_list = NULL;
6346                 sd->backlog.gro_count = 0;
6347
6348 #ifdef CONFIG_NET_FLOW_LIMIT
6349                 sd->flow_limit = NULL;
6350 #endif
6351         }
6352
6353         dev_boot_phase = 0;
6354
6355         /* The loopback device is special if any other network devices
6356          * is present in a network namespace the loopback device must
6357          * be present. Since we now dynamically allocate and free the
6358          * loopback device ensure this invariant is maintained by
6359          * keeping the loopback device as the first device on the
6360          * list of network devices.  Ensuring the loopback devices
6361          * is the first device that appears and the last network device
6362          * that disappears.
6363          */
6364         if (register_pernet_device(&loopback_net_ops))
6365                 goto out;
6366
6367         if (register_pernet_device(&default_device_ops))
6368                 goto out;
6369
6370         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6371         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6372
6373         hotcpu_notifier(dev_cpu_callback, 0);
6374         dst_init();
6375         rc = 0;
6376 out:
6377         return rc;
6378 }
6379
6380 subsys_initcall(net_dev_init);