Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133
 134 #include "net-sysfs.h"
 135
 136 /* Instead of increasing this, you should create a hash table. */
 137 #define MAX_GRO_SKBS 8
 138
 139 /* This should be increased if a protocol with a bigger head is added. */
 140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142 static DEFINE_SPINLOCK(ptype_lock);
 143 static DEFINE_SPINLOCK(offload_lock);
 144 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 145 struct list_head ptype_all __read_mostly;       /* Taps */
 146 static struct list_head offload_base __read_mostly;
 147
 148 /*
 149  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 150  * semaphore.
 151  *
 152  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 153  *
 154  * Writers must hold the rtnl semaphore while they loop through the
 155  * dev_base_head list, and hold dev_base_lock for writing when they do the
 156  * actual updates.  This allows pure readers to access the list even
 157  * while a writer is preparing to update it.
 158  *
 159  * To put it another way, dev_base_lock is held for writing only to
 160  * protect against pure readers; the rtnl semaphore provides the
 161  * protection against other writers.
 162  *
 163  * See, for example usages, register_netdevice() and
 164  * unregister_netdevice(), which must be called with the rtnl
 165  * semaphore held.
 166  */
 167 DEFINE_RWLOCK(dev_base_lock);
 168 EXPORT_SYMBOL(dev_base_lock);
 169
 170 /* protects napi_hash addition/deletion and napi_gen_id */
 171 static DEFINE_SPINLOCK(napi_hash_lock);
 172
 173 static unsigned int napi_gen_id;
 174 static DEFINE_HASHTABLE(napi_hash, 8);
 175
 176 seqcount_t devnet_rename_seq;
 177
 178 static inline void dev_base_seq_inc(struct net *net)
 179 {
 180         while (++net->dev_base_seq == 0);
 181 }
 182
 183 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 184 {
 185         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 186
 187         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 188 }
 189
 190 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 191 {
 192         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 193 }
 194
 195 static inline void rps_lock(struct softnet_data *sd)
 196 {
 197 #ifdef CONFIG_RPS
 198         spin_lock(&sd->input_pkt_queue.lock);
 199 #endif
 200 }
 201
 202 static inline void rps_unlock(struct softnet_data *sd)
 203 {
 204 #ifdef CONFIG_RPS
 205         spin_unlock(&sd->input_pkt_queue.lock);
 206 #endif
 207 }
 208
 209 /* Device list insertion */
 210 static void list_netdevice(struct net_device *dev)
 211 {
 212         struct net *net = dev_net(dev);
 213
 214         ASSERT_RTNL();
 215
 216         write_lock_bh(&dev_base_lock);
 217         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 218         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 219         hlist_add_head_rcu(&dev->index_hlist,
 220                            dev_index_hash(net, dev->ifindex));
 221         write_unlock_bh(&dev_base_lock);
 222
 223         dev_base_seq_inc(net);
 224 }
 225
 226 /* Device list removal
 227  * caller must respect a RCU grace period before freeing/reusing dev
 228  */
 229 static void unlist_netdevice(struct net_device *dev)
 230 {
 231         ASSERT_RTNL();
 232
 233         /* Unlink dev from the device chain */
 234         write_lock_bh(&dev_base_lock);
 235         list_del_rcu(&dev->dev_list);
 236         hlist_del_rcu(&dev->name_hlist);
 237         hlist_del_rcu(&dev->index_hlist);
 238         write_unlock_bh(&dev_base_lock);
 239
 240         dev_base_seq_inc(dev_net(dev));
 241 }
 242
 243 /*
 244  *      Our notifier list
 245  */
 246
 247 static RAW_NOTIFIER_HEAD(netdev_chain);
 248
 249 /*
 250  *      Device drivers call our routines to queue packets here. We empty the
 251  *      queue in the local softnet handler.
 252  */
 253
 254 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 255 EXPORT_PER_CPU_SYMBOL(softnet_data);
 256
 257 #ifdef CONFIG_LOCKDEP
 258 /*
 259  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 260  * according to dev->type
 261  */
 262 static const unsigned short netdev_lock_type[] =
 263         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 264          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 265          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 266          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 267          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 268          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 269          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 270          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 271          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 272          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 273          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 274          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 275          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 276          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 277          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 278
 279 static const char *const netdev_lock_name[] =
 280         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 281          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 282          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 283          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 284          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 285          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 286          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 287          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 288          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 289          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 290          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 291          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 292          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 293          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 294          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 295
 296 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 297 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 298
 299 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 300 {
 301         int i;
 302
 303         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 304                 if (netdev_lock_type[i] == dev_type)
 305                         return i;
 306         /* the last key is used by default */
 307         return ARRAY_SIZE(netdev_lock_type) - 1;
 308 }
 309
 310 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 311                                                  unsigned short dev_type)
 312 {
 313         int i;
 314
 315         i = netdev_lock_pos(dev_type);
 316         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 317                                    netdev_lock_name[i]);
 318 }
 319
 320 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 321 {
 322         int i;
 323
 324         i = netdev_lock_pos(dev->type);
 325         lockdep_set_class_and_name(&dev->addr_list_lock,
 326                                    &netdev_addr_lock_key[i],
 327                                    netdev_lock_name[i]);
 328 }
 329 #else
 330 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 331                                                  unsigned short dev_type)
 332 {
 333 }
 334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335 {
 336 }
 337 #endif
 338
 339 /*******************************************************************************
 340
 341                 Protocol management and registration routines
 342
 343 *******************************************************************************/
 344
 345 /*
 346  *      Add a protocol ID to the list. Now that the input handler is
 347  *      smarter we can dispense with all the messy stuff that used to be
 348  *      here.
 349  *
 350  *      BEWARE!!! Protocol handlers, mangling input packets,
 351  *      MUST BE last in hash buckets and checking protocol handlers
 352  *      MUST start from promiscuous ptype_all chain in net_bh.
 353  *      It is true now, do not change it.
 354  *      Explanation follows: if protocol handler, mangling packet, will
 355  *      be the first on list, it is not able to sense, that packet
 356  *      is cloned and should be copied-on-write, so that it will
 357  *      change it and subsequent readers will get broken packet.
 358  *                                                      --ANK (980803)
 359  */
 360
 361 static inline struct list_head *ptype_head(const struct packet_type *pt)
 362 {
 363         if (pt->type == htons(ETH_P_ALL))
 364                 return &ptype_all;
 365         else
 366                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 367 }
 368
 369 /**
 370  *      dev_add_pack - add packet handler
 371  *      @pt: packet type declaration
 372  *
 373  *      Add a protocol handler to the networking stack. The passed &packet_type
 374  *      is linked into kernel lists and may not be freed until it has been
 375  *      removed from the kernel lists.
 376  *
 377  *      This call does not sleep therefore it can not
 378  *      guarantee all CPU's that are in middle of receiving packets
 379  *      will see the new packet type (until the next received packet).
 380  */
 381
 382 void dev_add_pack(struct packet_type *pt)
 383 {
 384         struct list_head *head = ptype_head(pt);
 385
 386         spin_lock(&ptype_lock);
 387         list_add_rcu(&pt->list, head);
 388         spin_unlock(&ptype_lock);
 389 }
 390 EXPORT_SYMBOL(dev_add_pack);
 391
 392 /**
 393  *      __dev_remove_pack        - remove packet handler
 394  *      @pt: packet type declaration
 395  *
 396  *      Remove a protocol handler that was previously added to the kernel
 397  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 398  *      from the kernel lists and can be freed or reused once this function
 399  *      returns.
 400  *
 401  *      The packet type might still be in use by receivers
 402  *      and must not be freed until after all the CPU's have gone
 403  *      through a quiescent state.
 404  */
 405 void __dev_remove_pack(struct packet_type *pt)
 406 {
 407         struct list_head *head = ptype_head(pt);
 408         struct packet_type *pt1;
 409
 410         spin_lock(&ptype_lock);
 411
 412         list_for_each_entry(pt1, head, list) {
 413                 if (pt == pt1) {
 414                         list_del_rcu(&pt->list);
 415                         goto out;
 416                 }
 417         }
 418
 419         pr_warn("dev_remove_pack: %p not found\n", pt);
 420 out:
 421         spin_unlock(&ptype_lock);
 422 }
 423 EXPORT_SYMBOL(__dev_remove_pack);
 424
 425 /**
 426  *      dev_remove_pack  - remove packet handler
 427  *      @pt: packet type declaration
 428  *
 429  *      Remove a protocol handler that was previously added to the kernel
 430  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 431  *      from the kernel lists and can be freed or reused once this function
 432  *      returns.
 433  *
 434  *      This call sleeps to guarantee that no CPU is looking at the packet
 435  *      type after return.
 436  */
 437 void dev_remove_pack(struct packet_type *pt)
 438 {
 439         __dev_remove_pack(pt);
 440
 441         synchronize_net();
 442 }
 443 EXPORT_SYMBOL(dev_remove_pack);
 444
 445
 446 /**
 447  *      dev_add_offload - register offload handlers
 448  *      @po: protocol offload declaration
 449  *
 450  *      Add protocol offload handlers to the networking stack. The passed
 451  *      &proto_offload is linked into kernel lists and may not be freed until
 452  *      it has been removed from the kernel lists.
 453  *
 454  *      This call does not sleep therefore it can not
 455  *      guarantee all CPU's that are in middle of receiving packets
 456  *      will see the new offload handlers (until the next received packet).
 457  */
 458 void dev_add_offload(struct packet_offload *po)
 459 {
 460         struct list_head *head = &offload_base;
 461
 462         spin_lock(&offload_lock);
 463         list_add_rcu(&po->list, head);
 464         spin_unlock(&offload_lock);
 465 }
 466 EXPORT_SYMBOL(dev_add_offload);
 467
 468 /**
 469  *      __dev_remove_offload     - remove offload handler
 470  *      @po: packet offload declaration
 471  *
 472  *      Remove a protocol offload handler that was previously added to the
 473  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 474  *      is removed from the kernel lists and can be freed or reused once this
 475  *      function returns.
 476  *
 477  *      The packet type might still be in use by receivers
 478  *      and must not be freed until after all the CPU's have gone
 479  *      through a quiescent state.
 480  */
 481 void __dev_remove_offload(struct packet_offload *po)
 482 {
 483         struct list_head *head = &offload_base;
 484         struct packet_offload *po1;
 485
 486         spin_lock(&offload_lock);
 487
 488         list_for_each_entry(po1, head, list) {
 489                 if (po == po1) {
 490                         list_del_rcu(&po->list);
 491                         goto out;
 492                 }
 493         }
 494
 495         pr_warn("dev_remove_offload: %p not found\n", po);
 496 out:
 497         spin_unlock(&offload_lock);
 498 }
 499 EXPORT_SYMBOL(__dev_remove_offload);
 500
 501 /**
 502  *      dev_remove_offload       - remove packet offload handler
 503  *      @po: packet offload declaration
 504  *
 505  *      Remove a packet offload handler that was previously added to the kernel
 506  *      offload handlers by dev_add_offload(). The passed &offload_type is
 507  *      removed from the kernel lists and can be freed or reused once this
 508  *      function returns.
 509  *
 510  *      This call sleeps to guarantee that no CPU is looking at the packet
 511  *      type after return.
 512  */
 513 void dev_remove_offload(struct packet_offload *po)
 514 {
 515         __dev_remove_offload(po);
 516
 517         synchronize_net();
 518 }
 519 EXPORT_SYMBOL(dev_remove_offload);
 520
 521 /******************************************************************************
 522
 523                       Device Boot-time Settings Routines
 524
 525 *******************************************************************************/
 526
 527 /* Boot time configuration table */
 528 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 529
 530 /**
 531  *      netdev_boot_setup_add   - add new setup entry
 532  *      @name: name of the device
 533  *      @map: configured settings for the device
 534  *
 535  *      Adds new setup entry to the dev_boot_setup list.  The function
 536  *      returns 0 on error and 1 on success.  This is a generic routine to
 537  *      all netdevices.
 538  */
 539 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 540 {
 541         struct netdev_boot_setup *s;
 542         int i;
 543
 544         s = dev_boot_setup;
 545         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 546                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 547                         memset(s[i].name, 0, sizeof(s[i].name));
 548                         strlcpy(s[i].name, name, IFNAMSIZ);
 549                         memcpy(&s[i].map, map, sizeof(s[i].map));
 550                         break;
 551                 }
 552         }
 553
 554         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 555 }
 556
 557 /**
 558  *      netdev_boot_setup_check - check boot time settings
 559  *      @dev: the netdevice
 560  *
 561  *      Check boot time settings for the device.
 562  *      The found settings are set for the device to be used
 563  *      later in the device probing.
 564  *      Returns 0 if no settings found, 1 if they are.
 565  */
 566 int netdev_boot_setup_check(struct net_device *dev)
 567 {
 568         struct netdev_boot_setup *s = dev_boot_setup;
 569         int i;
 570
 571         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 572                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 573                     !strcmp(dev->name, s[i].name)) {
 574                         dev->irq        = s[i].map.irq;
 575                         dev->base_addr  = s[i].map.base_addr;
 576                         dev->mem_start  = s[i].map.mem_start;
 577                         dev->mem_end    = s[i].map.mem_end;
 578                         return 1;
 579                 }
 580         }
 581         return 0;
 582 }
 583 EXPORT_SYMBOL(netdev_boot_setup_check);
 584
 585
 586 /**
 587  *      netdev_boot_base        - get address from boot time settings
 588  *      @prefix: prefix for network device
 589  *      @unit: id for network device
 590  *
 591  *      Check boot time settings for the base address of device.
 592  *      The found settings are set for the device to be used
 593  *      later in the device probing.
 594  *      Returns 0 if no settings found.
 595  */
 596 unsigned long netdev_boot_base(const char *prefix, int unit)
 597 {
 598         const struct netdev_boot_setup *s = dev_boot_setup;
 599         char name[IFNAMSIZ];
 600         int i;
 601
 602         sprintf(name, "%s%d", prefix, unit);
 603
 604         /*
 605          * If device already registered then return base of 1
 606          * to indicate not to probe for this interface
 607          */
 608         if (__dev_get_by_name(&init_net, name))
 609                 return 1;
 610
 611         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 612                 if (!strcmp(name, s[i].name))
 613                         return s[i].map.base_addr;
 614         return 0;
 615 }
 616
 617 /*
 618  * Saves at boot time configured settings for any netdevice.
 619  */
 620 int __init netdev_boot_setup(char *str)
 621 {
 622         int ints[5];
 623         struct ifmap map;
 624
 625         str = get_options(str, ARRAY_SIZE(ints), ints);
 626         if (!str || !*str)
 627                 return 0;
 628
 629         /* Save settings */
 630         memset(&map, 0, sizeof(map));
 631         if (ints[0] > 0)
 632                 map.irq = ints[1];
 633         if (ints[0] > 1)
 634                 map.base_addr = ints[2];
 635         if (ints[0] > 2)
 636                 map.mem_start = ints[3];
 637         if (ints[0] > 3)
 638                 map.mem_end = ints[4];
 639
 640         /* Add new entry to the list */
 641         return netdev_boot_setup_add(str, &map);
 642 }
 643
 644 __setup("netdev=", netdev_boot_setup);
 645
 646 /*******************************************************************************
 647
 648                             Device Interface Subroutines
 649
 650 *******************************************************************************/
 651
 652 /**
 653  *      __dev_get_by_name       - find a device by its name
 654  *      @net: the applicable net namespace
 655  *      @name: name to find
 656  *
 657  *      Find an interface by name. Must be called under RTNL semaphore
 658  *      or @dev_base_lock. If the name is found a pointer to the device
 659  *      is returned. If the name is not found then %NULL is returned. The
 660  *      reference counters are not incremented so the caller must be
 661  *      careful with locks.
 662  */
 663
 664 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 665 {
 666         struct net_device *dev;
 667         struct hlist_head *head = dev_name_hash(net, name);
 668
 669         hlist_for_each_entry(dev, head, name_hlist)
 670                 if (!strncmp(dev->name, name, IFNAMSIZ))
 671                         return dev;
 672
 673         return NULL;
 674 }
 675 EXPORT_SYMBOL(__dev_get_by_name);
 676
 677 /**
 678  *      dev_get_by_name_rcu     - find a device by its name
 679  *      @net: the applicable net namespace
 680  *      @name: name to find
 681  *
 682  *      Find an interface by name.
 683  *      If the name is found a pointer to the device is returned.
 684  *      If the name is not found then %NULL is returned.
 685  *      The reference counters are not incremented so the caller must be
 686  *      careful with locks. The caller must hold RCU lock.
 687  */
 688
 689 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 690 {
 691         struct net_device *dev;
 692         struct hlist_head *head = dev_name_hash(net, name);
 693
 694         hlist_for_each_entry_rcu(dev, head, name_hlist)
 695                 if (!strncmp(dev->name, name, IFNAMSIZ))
 696                         return dev;
 697
 698         return NULL;
 699 }
 700 EXPORT_SYMBOL(dev_get_by_name_rcu);
 701
 702 /**
 703  *      dev_get_by_name         - find a device by its name
 704  *      @net: the applicable net namespace
 705  *      @name: name to find
 706  *
 707  *      Find an interface by name. This can be called from any
 708  *      context and does its own locking. The returned handle has
 709  *      the usage count incremented and the caller must use dev_put() to
 710  *      release it when it is no longer needed. %NULL is returned if no
 711  *      matching device is found.
 712  */
 713
 714 struct net_device *dev_get_by_name(struct net *net, const char *name)
 715 {
 716         struct net_device *dev;
 717
 718         rcu_read_lock();
 719         dev = dev_get_by_name_rcu(net, name);
 720         if (dev)
 721                 dev_hold(dev);
 722         rcu_read_unlock();
 723         return dev;
 724 }
 725 EXPORT_SYMBOL(dev_get_by_name);
 726
 727 /**
 728  *      __dev_get_by_index - find a device by its ifindex
 729  *      @net: the applicable net namespace
 730  *      @ifindex: index of device
 731  *
 732  *      Search for an interface by index. Returns %NULL if the device
 733  *      is not found or a pointer to the device. The device has not
 734  *      had its reference counter increased so the caller must be careful
 735  *      about locking. The caller must hold either the RTNL semaphore
 736  *      or @dev_base_lock.
 737  */
 738
 739 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 740 {
 741         struct net_device *dev;
 742         struct hlist_head *head = dev_index_hash(net, ifindex);
 743
 744         hlist_for_each_entry(dev, head, index_hlist)
 745                 if (dev->ifindex == ifindex)
 746                         return dev;
 747
 748         return NULL;
 749 }
 750 EXPORT_SYMBOL(__dev_get_by_index);
 751
 752 /**
 753  *      dev_get_by_index_rcu - find a device by its ifindex
 754  *      @net: the applicable net namespace
 755  *      @ifindex: index of device
 756  *
 757  *      Search for an interface by index. Returns %NULL if the device
 758  *      is not found or a pointer to the device. The device has not
 759  *      had its reference counter increased so the caller must be careful
 760  *      about locking. The caller must hold RCU lock.
 761  */
 762
 763 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 764 {
 765         struct net_device *dev;
 766         struct hlist_head *head = dev_index_hash(net, ifindex);
 767
 768         hlist_for_each_entry_rcu(dev, head, index_hlist)
 769                 if (dev->ifindex == ifindex)
 770                         return dev;
 771
 772         return NULL;
 773 }
 774 EXPORT_SYMBOL(dev_get_by_index_rcu);
 775
 776
 777 /**
 778  *      dev_get_by_index - find a device by its ifindex
 779  *      @net: the applicable net namespace
 780  *      @ifindex: index of device
 781  *
 782  *      Search for an interface by index. Returns NULL if the device
 783  *      is not found or a pointer to the device. The device returned has
 784  *      had a reference added and the pointer is safe until the user calls
 785  *      dev_put to indicate they have finished with it.
 786  */
 787
 788 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 789 {
 790         struct net_device *dev;
 791
 792         rcu_read_lock();
 793         dev = dev_get_by_index_rcu(net, ifindex);
 794         if (dev)
 795                 dev_hold(dev);
 796         rcu_read_unlock();
 797         return dev;
 798 }
 799 EXPORT_SYMBOL(dev_get_by_index);
 800
 801 /**
 802  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 803  *      @net: the applicable net namespace
 804  *      @type: media type of device
 805  *      @ha: hardware address
 806  *
 807  *      Search for an interface by MAC address. Returns NULL if the device
 808  *      is not found or a pointer to the device.
 809  *      The caller must hold RCU or RTNL.
 810  *      The returned device has not had its ref count increased
 811  *      and the caller must therefore be careful about locking
 812  *
 813  */
 814
 815 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 816                                        const char *ha)
 817 {
 818         struct net_device *dev;
 819
 820         for_each_netdev_rcu(net, dev)
 821                 if (dev->type == type &&
 822                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 823                         return dev;
 824
 825         return NULL;
 826 }
 827 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 828
 829 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 830 {
 831         struct net_device *dev;
 832
 833         ASSERT_RTNL();
 834         for_each_netdev(net, dev)
 835                 if (dev->type == type)
 836                         return dev;
 837
 838         return NULL;
 839 }
 840 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 841
 842 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 843 {
 844         struct net_device *dev, *ret = NULL;
 845
 846         rcu_read_lock();
 847         for_each_netdev_rcu(net, dev)
 848                 if (dev->type == type) {
 849                         dev_hold(dev);
 850                         ret = dev;
 851                         break;
 852                 }
 853         rcu_read_unlock();
 854         return ret;
 855 }
 856 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 857
 858 /**
 859  *      dev_get_by_flags_rcu - find any device with given flags
 860  *      @net: the applicable net namespace
 861  *      @if_flags: IFF_* values
 862  *      @mask: bitmask of bits in if_flags to check
 863  *
 864  *      Search for any interface with the given flags. Returns NULL if a device
 865  *      is not found or a pointer to the device. Must be called inside
 866  *      rcu_read_lock(), and result refcount is unchanged.
 867  */
 868
 869 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 870                                     unsigned short mask)
 871 {
 872         struct net_device *dev, *ret;
 873
 874         ret = NULL;
 875         for_each_netdev_rcu(net, dev) {
 876                 if (((dev->flags ^ if_flags) & mask) == 0) {
 877                         ret = dev;
 878                         break;
 879                 }
 880         }
 881         return ret;
 882 }
 883 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 884
 885 /**
 886  *      dev_valid_name - check if name is okay for network device
 887  *      @name: name string
 888  *
 889  *      Network device names need to be valid file names to
 890  *      to allow sysfs to work.  We also disallow any kind of
 891  *      whitespace.
 892  */
 893 bool dev_valid_name(const char *name)
 894 {
 895         if (*name == '\0')
 896                 return false;
 897         if (strlen(name) >= IFNAMSIZ)
 898                 return false;
 899         if (!strcmp(name, ".") || !strcmp(name, ".."))
 900                 return false;
 901
 902         while (*name) {
 903                 if (*name == '/' || isspace(*name))
 904                         return false;
 905                 name++;
 906         }
 907         return true;
 908 }
 909 EXPORT_SYMBOL(dev_valid_name);
 910
 911 /**
 912  *      __dev_alloc_name - allocate a name for a device
 913  *      @net: network namespace to allocate the device name in
 914  *      @name: name format string
 915  *      @buf:  scratch buffer and result name string
 916  *
 917  *      Passed a format string - eg "lt%d" it will try and find a suitable
 918  *      id. It scans list of devices to build up a free map, then chooses
 919  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 920  *      while allocating the name and adding the device in order to avoid
 921  *      duplicates.
 922  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 923  *      Returns the number of the unit assigned or a negative errno code.
 924  */
 925
 926 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 927 {
 928         int i = 0;
 929         const char *p;
 930         const int max_netdevices = 8*PAGE_SIZE;
 931         unsigned long *inuse;
 932         struct net_device *d;
 933
 934         p = strnchr(name, IFNAMSIZ-1, '%');
 935         if (p) {
 936                 /*
 937                  * Verify the string as this thing may have come from
 938                  * the user.  There must be either one "%d" and no other "%"
 939                  * characters.
 940                  */
 941                 if (p[1] != 'd' || strchr(p + 2, '%'))
 942                         return -EINVAL;
 943
 944                 /* Use one page as a bit array of possible slots */
 945                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 946                 if (!inuse)
 947                         return -ENOMEM;
 948
 949                 for_each_netdev(net, d) {
 950                         if (!sscanf(d->name, name, &i))
 951                                 continue;
 952                         if (i < 0 || i >= max_netdevices)
 953                                 continue;
 954
 955                         /*  avoid cases where sscanf is not exact inverse of printf */
 956                         snprintf(buf, IFNAMSIZ, name, i);
 957                         if (!strncmp(buf, d->name, IFNAMSIZ))
 958                                 set_bit(i, inuse);
 959                 }
 960
 961                 i = find_first_zero_bit(inuse, max_netdevices);
 962                 free_page((unsigned long) inuse);
 963         }
 964
 965         if (buf != name)
 966                 snprintf(buf, IFNAMSIZ, name, i);
 967         if (!__dev_get_by_name(net, buf))
 968                 return i;
 969
 970         /* It is possible to run out of possible slots
 971          * when the name is long and there isn't enough space left
 972          * for the digits, or if all bits are used.
 973          */
 974         return -ENFILE;
 975 }
 976
 977 /**
 978  *      dev_alloc_name - allocate a name for a device
 979  *      @dev: device
 980  *      @name: name format string
 981  *
 982  *      Passed a format string - eg "lt%d" it will try and find a suitable
 983  *      id. It scans list of devices to build up a free map, then chooses
 984  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 985  *      while allocating the name and adding the device in order to avoid
 986  *      duplicates.
 987  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 988  *      Returns the number of the unit assigned or a negative errno code.
 989  */
 990
 991 int dev_alloc_name(struct net_device *dev, const char *name)
 992 {
 993         char buf[IFNAMSIZ];
 994         struct net *net;
 995         int ret;
 996
 997         BUG_ON(!dev_net(dev));
 998         net = dev_net(dev);
 999         ret = __dev_alloc_name(net, name, buf);
1000         if (ret >= 0)
1001                 strlcpy(dev->name, buf, IFNAMSIZ);
1002         return ret;
1003 }
1004 EXPORT_SYMBOL(dev_alloc_name);
1005
1006 static int dev_alloc_name_ns(struct net *net,
1007                              struct net_device *dev,
1008                              const char *name)
1009 {
1010         char buf[IFNAMSIZ];
1011         int ret;
1012
1013         ret = __dev_alloc_name(net, name, buf);
1014         if (ret >= 0)
1015                 strlcpy(dev->name, buf, IFNAMSIZ);
1016         return ret;
1017 }
1018
1019 static int dev_get_valid_name(struct net *net,
1020                               struct net_device *dev,
1021                               const char *name)
1022 {
1023         BUG_ON(!net);
1024
1025         if (!dev_valid_name(name))
1026                 return -EINVAL;
1027
1028         if (strchr(name, '%'))
1029                 return dev_alloc_name_ns(net, dev, name);
1030         else if (__dev_get_by_name(net, name))
1031                 return -EEXIST;
1032         else if (dev->name != name)
1033                 strlcpy(dev->name, name, IFNAMSIZ);
1034
1035         return 0;
1036 }
1037
1038 /**
1039  *      dev_change_name - change name of a device
1040  *      @dev: device
1041  *      @newname: name (or format string) must be at least IFNAMSIZ
1042  *
1043  *      Change name of a device, can pass format strings "eth%d".
1044  *      for wildcarding.
1045  */
1046 int dev_change_name(struct net_device *dev, const char *newname)
1047 {
1048         char oldname[IFNAMSIZ];
1049         int err = 0;
1050         int ret;
1051         struct net *net;
1052
1053         ASSERT_RTNL();
1054         BUG_ON(!dev_net(dev));
1055
1056         net = dev_net(dev);
1057         if (dev->flags & IFF_UP)
1058                 return -EBUSY;
1059
1060         write_seqcount_begin(&devnet_rename_seq);
1061
1062         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1063                 write_seqcount_end(&devnet_rename_seq);
1064                 return 0;
1065         }
1066
1067         memcpy(oldname, dev->name, IFNAMSIZ);
1068
1069         err = dev_get_valid_name(net, dev, newname);
1070         if (err < 0) {
1071                 write_seqcount_end(&devnet_rename_seq);
1072                 return err;
1073         }
1074
1075 rollback:
1076         ret = device_rename(&dev->dev, dev->name);
1077         if (ret) {
1078                 memcpy(dev->name, oldname, IFNAMSIZ);
1079                 write_seqcount_end(&devnet_rename_seq);
1080                 return ret;
1081         }
1082
1083         write_seqcount_end(&devnet_rename_seq);
1084
1085         write_lock_bh(&dev_base_lock);
1086         hlist_del_rcu(&dev->name_hlist);
1087         write_unlock_bh(&dev_base_lock);
1088
1089         synchronize_rcu();
1090
1091         write_lock_bh(&dev_base_lock);
1092         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1093         write_unlock_bh(&dev_base_lock);
1094
1095         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1096         ret = notifier_to_errno(ret);
1097
1098         if (ret) {
1099                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1100                 if (err >= 0) {
1101                         err = ret;
1102                         write_seqcount_begin(&devnet_rename_seq);
1103                         memcpy(dev->name, oldname, IFNAMSIZ);
1104                         goto rollback;
1105                 } else {
1106                         pr_err("%s: name change rollback failed: %d\n",
1107                                dev->name, ret);
1108                 }
1109         }
1110
1111         return err;
1112 }
1113
1114 /**
1115  *      dev_set_alias - change ifalias of a device
1116  *      @dev: device
1117  *      @alias: name up to IFALIASZ
1118  *      @len: limit of bytes to copy from info
1119  *
1120  *      Set ifalias for a device,
1121  */
1122 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1123 {
1124         char *new_ifalias;
1125
1126         ASSERT_RTNL();
1127
1128         if (len >= IFALIASZ)
1129                 return -EINVAL;
1130
1131         if (!len) {
1132                 kfree(dev->ifalias);
1133                 dev->ifalias = NULL;
1134                 return 0;
1135         }
1136
1137         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1138         if (!new_ifalias)
1139                 return -ENOMEM;
1140         dev->ifalias = new_ifalias;
1141
1142         strlcpy(dev->ifalias, alias, len+1);
1143         return len;
1144 }
1145
1146
1147 /**
1148  *      netdev_features_change - device changes features
1149  *      @dev: device to cause notification
1150  *
1151  *      Called to indicate a device has changed features.
1152  */
1153 void netdev_features_change(struct net_device *dev)
1154 {
1155         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1156 }
1157 EXPORT_SYMBOL(netdev_features_change);
1158
1159 /**
1160  *      netdev_state_change - device changes state
1161  *      @dev: device to cause notification
1162  *
1163  *      Called to indicate a device has changed state. This function calls
1164  *      the notifier chains for netdev_chain and sends a NEWLINK message
1165  *      to the routing socket.
1166  */
1167 void netdev_state_change(struct net_device *dev)
1168 {
1169         if (dev->flags & IFF_UP) {
1170                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1171                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1172         }
1173 }
1174 EXPORT_SYMBOL(netdev_state_change);
1175
1176 /**
1177  *      netdev_notify_peers - notify network peers about existence of @dev
1178  *      @dev: network device
1179  *
1180  * Generate traffic such that interested network peers are aware of
1181  * @dev, such as by generating a gratuitous ARP. This may be used when
1182  * a device wants to inform the rest of the network about some sort of
1183  * reconfiguration such as a failover event or virtual machine
1184  * migration.
1185  */
1186 void netdev_notify_peers(struct net_device *dev)
1187 {
1188         rtnl_lock();
1189         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1190         rtnl_unlock();
1191 }
1192 EXPORT_SYMBOL(netdev_notify_peers);
1193
1194 static int __dev_open(struct net_device *dev)
1195 {
1196         const struct net_device_ops *ops = dev->netdev_ops;
1197         int ret;
1198
1199         ASSERT_RTNL();
1200
1201         if (!netif_device_present(dev))
1202                 return -ENODEV;
1203
1204         /* Block netpoll from trying to do any rx path servicing.
1205          * If we don't do this there is a chance ndo_poll_controller
1206          * or ndo_poll may be running while we open the device
1207          */
1208         netpoll_rx_disable(dev);
1209
1210         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1211         ret = notifier_to_errno(ret);
1212         if (ret)
1213                 return ret;
1214
1215         set_bit(__LINK_STATE_START, &dev->state);
1216
1217         if (ops->ndo_validate_addr)
1218                 ret = ops->ndo_validate_addr(dev);
1219
1220         if (!ret && ops->ndo_open)
1221                 ret = ops->ndo_open(dev);
1222
1223         netpoll_rx_enable(dev);
1224
1225         if (ret)
1226                 clear_bit(__LINK_STATE_START, &dev->state);
1227         else {
1228                 dev->flags |= IFF_UP;
1229                 net_dmaengine_get();
1230                 dev_set_rx_mode(dev);
1231                 dev_activate(dev);
1232                 add_device_randomness(dev->dev_addr, dev->addr_len);
1233         }
1234
1235         return ret;
1236 }
1237
1238 /**
1239  *      dev_open        - prepare an interface for use.
1240  *      @dev:   device to open
1241  *
1242  *      Takes a device from down to up state. The device's private open
1243  *      function is invoked and then the multicast lists are loaded. Finally
1244  *      the device is moved into the up state and a %NETDEV_UP message is
1245  *      sent to the netdev notifier chain.
1246  *
1247  *      Calling this function on an active interface is a nop. On a failure
1248  *      a negative errno code is returned.
1249  */
1250 int dev_open(struct net_device *dev)
1251 {
1252         int ret;
1253
1254         if (dev->flags & IFF_UP)
1255                 return 0;
1256
1257         ret = __dev_open(dev);
1258         if (ret < 0)
1259                 return ret;
1260
1261         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1262         call_netdevice_notifiers(NETDEV_UP, dev);
1263
1264         return ret;
1265 }
1266 EXPORT_SYMBOL(dev_open);
1267
1268 static int __dev_close_many(struct list_head *head)
1269 {
1270         struct net_device *dev;
1271
1272         ASSERT_RTNL();
1273         might_sleep();
1274
1275         list_for_each_entry(dev, head, unreg_list) {
1276                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1277
1278                 clear_bit(__LINK_STATE_START, &dev->state);
1279
1280                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1281                  * can be even on different cpu. So just clear netif_running().
1282                  *
1283                  * dev->stop() will invoke napi_disable() on all of it's
1284                  * napi_struct instances on this device.
1285                  */
1286                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1287         }
1288
1289         dev_deactivate_many(head);
1290
1291         list_for_each_entry(dev, head, unreg_list) {
1292                 const struct net_device_ops *ops = dev->netdev_ops;
1293
1294                 /*
1295                  *      Call the device specific close. This cannot fail.
1296                  *      Only if device is UP
1297                  *
1298                  *      We allow it to be called even after a DETACH hot-plug
1299                  *      event.
1300                  */
1301                 if (ops->ndo_stop)
1302                         ops->ndo_stop(dev);
1303
1304                 dev->flags &= ~IFF_UP;
1305                 net_dmaengine_put();
1306         }
1307
1308         return 0;
1309 }
1310
1311 static int __dev_close(struct net_device *dev)
1312 {
1313         int retval;
1314         LIST_HEAD(single);
1315
1316         /* Temporarily disable netpoll until the interface is down */
1317         netpoll_rx_disable(dev);
1318
1319         list_add(&dev->unreg_list, &single);
1320         retval = __dev_close_many(&single);
1321         list_del(&single);
1322
1323         netpoll_rx_enable(dev);
1324         return retval;
1325 }
1326
1327 static int dev_close_many(struct list_head *head)
1328 {
1329         struct net_device *dev, *tmp;
1330         LIST_HEAD(tmp_list);
1331
1332         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1333                 if (!(dev->flags & IFF_UP))
1334                         list_move(&dev->unreg_list, &tmp_list);
1335
1336         __dev_close_many(head);
1337
1338         list_for_each_entry(dev, head, unreg_list) {
1339                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1340                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1341         }
1342
1343         /* rollback_registered_many needs the complete original list */
1344         list_splice(&tmp_list, head);
1345         return 0;
1346 }
1347
1348 /**
1349  *      dev_close - shutdown an interface.
1350  *      @dev: device to shutdown
1351  *
1352  *      This function moves an active device into down state. A
1353  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1354  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1355  *      chain.
1356  */
1357 int dev_close(struct net_device *dev)
1358 {
1359         if (dev->flags & IFF_UP) {
1360                 LIST_HEAD(single);
1361
1362                 /* Block netpoll rx while the interface is going down */
1363                 netpoll_rx_disable(dev);
1364
1365                 list_add(&dev->unreg_list, &single);
1366                 dev_close_many(&single);
1367                 list_del(&single);
1368
1369                 netpoll_rx_enable(dev);
1370         }
1371         return 0;
1372 }
1373 EXPORT_SYMBOL(dev_close);
1374
1375
1376 /**
1377  *      dev_disable_lro - disable Large Receive Offload on a device
1378  *      @dev: device
1379  *
1380  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1381  *      called under RTNL.  This is needed if received packets may be
1382  *      forwarded to another interface.
1383  */
1384 void dev_disable_lro(struct net_device *dev)
1385 {
1386         /*
1387          * If we're trying to disable lro on a vlan device
1388          * use the underlying physical device instead
1389          */
1390         if (is_vlan_dev(dev))
1391                 dev = vlan_dev_real_dev(dev);
1392
1393         dev->wanted_features &= ~NETIF_F_LRO;
1394         netdev_update_features(dev);
1395
1396         if (unlikely(dev->features & NETIF_F_LRO))
1397                 netdev_WARN(dev, "failed to disable LRO!\n");
1398 }
1399 EXPORT_SYMBOL(dev_disable_lro);
1400
1401 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1402                                    struct net_device *dev)
1403 {
1404         struct netdev_notifier_info info;
1405
1406         netdev_notifier_info_init(&info, dev);
1407         return nb->notifier_call(nb, val, &info);
1408 }
1409
1410 static int dev_boot_phase = 1;
1411
1412 /**
1413  *      register_netdevice_notifier - register a network notifier block
1414  *      @nb: notifier
1415  *
1416  *      Register a notifier to be called when network device events occur.
1417  *      The notifier passed is linked into the kernel structures and must
1418  *      not be reused until it has been unregistered. A negative errno code
1419  *      is returned on a failure.
1420  *
1421  *      When registered all registration and up events are replayed
1422  *      to the new notifier to allow device to have a race free
1423  *      view of the network device list.
1424  */
1425
1426 int register_netdevice_notifier(struct notifier_block *nb)
1427 {
1428         struct net_device *dev;
1429         struct net_device *last;
1430         struct net *net;
1431         int err;
1432
1433         rtnl_lock();
1434         err = raw_notifier_chain_register(&netdev_chain, nb);
1435         if (err)
1436                 goto unlock;
1437         if (dev_boot_phase)
1438                 goto unlock;
1439         for_each_net(net) {
1440                 for_each_netdev(net, dev) {
1441                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1442                         err = notifier_to_errno(err);
1443                         if (err)
1444                                 goto rollback;
1445
1446                         if (!(dev->flags & IFF_UP))
1447                                 continue;
1448
1449                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1450                 }
1451         }
1452
1453 unlock:
1454         rtnl_unlock();
1455         return err;
1456
1457 rollback:
1458         last = dev;
1459         for_each_net(net) {
1460                 for_each_netdev(net, dev) {
1461                         if (dev == last)
1462                                 goto outroll;
1463
1464                         if (dev->flags & IFF_UP) {
1465                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1466                                                         dev);
1467                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1468                         }
1469                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1470                 }
1471         }
1472
1473 outroll:
1474         raw_notifier_chain_unregister(&netdev_chain, nb);
1475         goto unlock;
1476 }
1477 EXPORT_SYMBOL(register_netdevice_notifier);
1478
1479 /**
1480  *      unregister_netdevice_notifier - unregister a network notifier block
1481  *      @nb: notifier
1482  *
1483  *      Unregister a notifier previously registered by
1484  *      register_netdevice_notifier(). The notifier is unlinked into the
1485  *      kernel structures and may then be reused. A negative errno code
1486  *      is returned on a failure.
1487  *
1488  *      After unregistering unregister and down device events are synthesized
1489  *      for all devices on the device list to the removed notifier to remove
1490  *      the need for special case cleanup code.
1491  */
1492
1493 int unregister_netdevice_notifier(struct notifier_block *nb)
1494 {
1495         struct net_device *dev;
1496         struct net *net;
1497         int err;
1498
1499         rtnl_lock();
1500         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1501         if (err)
1502                 goto unlock;
1503
1504         for_each_net(net) {
1505                 for_each_netdev(net, dev) {
1506                         if (dev->flags & IFF_UP) {
1507                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1508                                                         dev);
1509                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1510                         }
1511                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1512                 }
1513         }
1514 unlock:
1515         rtnl_unlock();
1516         return err;
1517 }
1518 EXPORT_SYMBOL(unregister_netdevice_notifier);
1519
1520 /**
1521  *      call_netdevice_notifiers_info - call all network notifier blocks
1522  *      @val: value passed unmodified to notifier function
1523  *      @dev: net_device pointer passed unmodified to notifier function
1524  *      @info: notifier information data
1525  *
1526  *      Call all network notifier blocks.  Parameters and return value
1527  *      are as for raw_notifier_call_chain().
1528  */
1529
1530 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1531                                   struct netdev_notifier_info *info)
1532 {
1533         ASSERT_RTNL();
1534         netdev_notifier_info_init(info, dev);
1535         return raw_notifier_call_chain(&netdev_chain, val, info);
1536 }
1537 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1538
1539 /**
1540  *      call_netdevice_notifiers - call all network notifier blocks
1541  *      @val: value passed unmodified to notifier function
1542  *      @dev: net_device pointer passed unmodified to notifier function
1543  *
1544  *      Call all network notifier blocks.  Parameters and return value
1545  *      are as for raw_notifier_call_chain().
1546  */
1547
1548 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1549 {
1550         struct netdev_notifier_info info;
1551
1552         return call_netdevice_notifiers_info(val, dev, &info);
1553 }
1554 EXPORT_SYMBOL(call_netdevice_notifiers);
1555
1556 static struct static_key netstamp_needed __read_mostly;
1557 #ifdef HAVE_JUMP_LABEL
1558 /* We are not allowed to call static_key_slow_dec() from irq context
1559  * If net_disable_timestamp() is called from irq context, defer the
1560  * static_key_slow_dec() calls.
1561  */
1562 static atomic_t netstamp_needed_deferred;
1563 #endif
1564
1565 void net_enable_timestamp(void)
1566 {
1567 #ifdef HAVE_JUMP_LABEL
1568         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1569
1570         if (deferred) {
1571                 while (--deferred)
1572                         static_key_slow_dec(&netstamp_needed);
1573                 return;
1574         }
1575 #endif
1576         static_key_slow_inc(&netstamp_needed);
1577 }
1578 EXPORT_SYMBOL(net_enable_timestamp);
1579
1580 void net_disable_timestamp(void)
1581 {
1582 #ifdef HAVE_JUMP_LABEL
1583         if (in_interrupt()) {
1584                 atomic_inc(&netstamp_needed_deferred);
1585                 return;
1586         }
1587 #endif
1588         static_key_slow_dec(&netstamp_needed);
1589 }
1590 EXPORT_SYMBOL(net_disable_timestamp);
1591
1592 static inline void net_timestamp_set(struct sk_buff *skb)
1593 {
1594         skb->tstamp.tv64 = 0;
1595         if (static_key_false(&netstamp_needed))
1596                 __net_timestamp(skb);
1597 }
1598
1599 #define net_timestamp_check(COND, SKB)                  \
1600         if (static_key_false(&netstamp_needed)) {               \
1601                 if ((COND) && !(SKB)->tstamp.tv64)      \
1602                         __net_timestamp(SKB);           \
1603         }                                               \
1604
1605 static inline bool is_skb_forwardable(struct net_device *dev,
1606                                       struct sk_buff *skb)
1607 {
1608         unsigned int len;
1609
1610         if (!(dev->flags & IFF_UP))
1611                 return false;
1612
1613         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1614         if (skb->len <= len)
1615                 return true;
1616
1617         /* if TSO is enabled, we don't care about the length as the packet
1618          * could be forwarded without being segmented before
1619          */
1620         if (skb_is_gso(skb))
1621                 return true;
1622
1623         return false;
1624 }
1625
1626 /**
1627  * dev_forward_skb - loopback an skb to another netif
1628  *
1629  * @dev: destination network device
1630  * @skb: buffer to forward
1631  *
1632  * return values:
1633  *      NET_RX_SUCCESS  (no congestion)
1634  *      NET_RX_DROP     (packet was dropped, but freed)
1635  *
1636  * dev_forward_skb can be used for injecting an skb from the
1637  * start_xmit function of one device into the receive queue
1638  * of another device.
1639  *
1640  * The receiving device may be in another namespace, so
1641  * we have to clear all information in the skb that could
1642  * impact namespace isolation.
1643  */
1644 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1645 {
1646         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1647                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1648                         atomic_long_inc(&dev->rx_dropped);
1649                         kfree_skb(skb);
1650                         return NET_RX_DROP;
1651                 }
1652         }
1653
1654         skb_orphan(skb);
1655
1656         if (unlikely(!is_skb_forwardable(dev, skb))) {
1657                 atomic_long_inc(&dev->rx_dropped);
1658                 kfree_skb(skb);
1659                 return NET_RX_DROP;
1660         }
1661         skb->skb_iif = 0;
1662         skb_dst_drop(skb);
1663         skb->tstamp.tv64 = 0;
1664         skb->pkt_type = PACKET_HOST;
1665         skb->protocol = eth_type_trans(skb, dev);
1666         skb->mark = 0;
1667         secpath_reset(skb);
1668         nf_reset(skb);
1669         nf_reset_trace(skb);
1670         return netif_rx(skb);
1671 }
1672 EXPORT_SYMBOL_GPL(dev_forward_skb);
1673
1674 static inline int deliver_skb(struct sk_buff *skb,
1675                               struct packet_type *pt_prev,
1676                               struct net_device *orig_dev)
1677 {
1678         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1679                 return -ENOMEM;
1680         atomic_inc(&skb->users);
1681         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1682 }
1683
1684 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1685 {
1686         if (!ptype->af_packet_priv || !skb->sk)
1687                 return false;
1688
1689         if (ptype->id_match)
1690                 return ptype->id_match(ptype, skb->sk);
1691         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1692                 return true;
1693
1694         return false;
1695 }
1696
1697 /*
1698  *      Support routine. Sends outgoing frames to any network
1699  *      taps currently in use.
1700  */
1701
1702 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1703 {
1704         struct packet_type *ptype;
1705         struct sk_buff *skb2 = NULL;
1706         struct packet_type *pt_prev = NULL;
1707
1708         rcu_read_lock();
1709         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1710                 /* Never send packets back to the socket
1711                  * they originated from - MvS (miquels@drinkel.ow.org)
1712                  */
1713                 if ((ptype->dev == dev || !ptype->dev) &&
1714                     (!skb_loop_sk(ptype, skb))) {
1715                         if (pt_prev) {
1716                                 deliver_skb(skb2, pt_prev, skb->dev);
1717                                 pt_prev = ptype;
1718                                 continue;
1719                         }
1720
1721                         skb2 = skb_clone(skb, GFP_ATOMIC);
1722                         if (!skb2)
1723                                 break;
1724
1725                         net_timestamp_set(skb2);
1726
1727                         /* skb->nh should be correctly
1728                            set by sender, so that the second statement is
1729                            just protection against buggy protocols.
1730                          */
1731                         skb_reset_mac_header(skb2);
1732
1733                         if (skb_network_header(skb2) < skb2->data ||
1734                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1735                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1736                                                      ntohs(skb2->protocol),
1737                                                      dev->name);
1738                                 skb_reset_network_header(skb2);
1739                         }
1740
1741                         skb2->transport_header = skb2->network_header;
1742                         skb2->pkt_type = PACKET_OUTGOING;
1743                         pt_prev = ptype;
1744                 }
1745         }
1746         if (pt_prev)
1747                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1748         rcu_read_unlock();
1749 }
1750
1751 /**
1752  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1753  * @dev: Network device
1754  * @txq: number of queues available
1755  *
1756  * If real_num_tx_queues is changed the tc mappings may no longer be
1757  * valid. To resolve this verify the tc mapping remains valid and if
1758  * not NULL the mapping. With no priorities mapping to this
1759  * offset/count pair it will no longer be used. In the worst case TC0
1760  * is invalid nothing can be done so disable priority mappings. If is
1761  * expected that drivers will fix this mapping if they can before
1762  * calling netif_set_real_num_tx_queues.
1763  */
1764 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1765 {
1766         int i;
1767         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1768
1769         /* If TC0 is invalidated disable TC mapping */
1770         if (tc->offset + tc->count > txq) {
1771                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1772                 dev->num_tc = 0;
1773                 return;
1774         }
1775
1776         /* Invalidated prio to tc mappings set to TC0 */
1777         for (i = 1; i < TC_BITMASK + 1; i++) {
1778                 int q = netdev_get_prio_tc_map(dev, i);
1779
1780                 tc = &dev->tc_to_txq[q];
1781                 if (tc->offset + tc->count > txq) {
1782                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1783                                 i, q);
1784                         netdev_set_prio_tc_map(dev, i, 0);
1785                 }
1786         }
1787 }
1788
1789 #ifdef CONFIG_XPS
1790 static DEFINE_MUTEX(xps_map_mutex);
1791 #define xmap_dereference(P)             \
1792         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1793
1794 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1795                                         int cpu, u16 index)
1796 {
1797         struct xps_map *map = NULL;
1798         int pos;
1799
1800         if (dev_maps)
1801                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1802
1803         for (pos = 0; map && pos < map->len; pos++) {
1804                 if (map->queues[pos] == index) {
1805                         if (map->len > 1) {
1806                                 map->queues[pos] = map->queues[--map->len];
1807                         } else {
1808                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1809                                 kfree_rcu(map, rcu);
1810                                 map = NULL;
1811                         }
1812                         break;
1813                 }
1814         }
1815
1816         return map;
1817 }
1818
1819 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1820 {
1821         struct xps_dev_maps *dev_maps;
1822         int cpu, i;
1823         bool active = false;
1824
1825         mutex_lock(&xps_map_mutex);
1826         dev_maps = xmap_dereference(dev->xps_maps);
1827
1828         if (!dev_maps)
1829                 goto out_no_maps;
1830
1831         for_each_possible_cpu(cpu) {
1832                 for (i = index; i < dev->num_tx_queues; i++) {
1833                         if (!remove_xps_queue(dev_maps, cpu, i))
1834                                 break;
1835                 }
1836                 if (i == dev->num_tx_queues)
1837                         active = true;
1838         }
1839
1840         if (!active) {
1841                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1842                 kfree_rcu(dev_maps, rcu);
1843         }
1844
1845         for (i = index; i < dev->num_tx_queues; i++)
1846                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1847                                              NUMA_NO_NODE);
1848
1849 out_no_maps:
1850         mutex_unlock(&xps_map_mutex);
1851 }
1852
1853 static struct xps_map *expand_xps_map(struct xps_map *map,
1854                                       int cpu, u16 index)
1855 {
1856         struct xps_map *new_map;
1857         int alloc_len = XPS_MIN_MAP_ALLOC;
1858         int i, pos;
1859
1860         for (pos = 0; map && pos < map->len; pos++) {
1861                 if (map->queues[pos] != index)
1862                         continue;
1863                 return map;
1864         }
1865
1866         /* Need to add queue to this CPU's existing map */
1867         if (map) {
1868                 if (pos < map->alloc_len)
1869                         return map;
1870
1871                 alloc_len = map->alloc_len * 2;
1872         }
1873
1874         /* Need to allocate new map to store queue on this CPU's map */
1875         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1876                                cpu_to_node(cpu));
1877         if (!new_map)
1878                 return NULL;
1879
1880         for (i = 0; i < pos; i++)
1881                 new_map->queues[i] = map->queues[i];
1882         new_map->alloc_len = alloc_len;
1883         new_map->len = pos;
1884
1885         return new_map;
1886 }
1887
1888 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1889 {
1890         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1891         struct xps_map *map, *new_map;
1892         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1893         int cpu, numa_node_id = -2;
1894         bool active = false;
1895
1896         mutex_lock(&xps_map_mutex);
1897
1898         dev_maps = xmap_dereference(dev->xps_maps);
1899
1900         /* allocate memory for queue storage */
1901         for_each_online_cpu(cpu) {
1902                 if (!cpumask_test_cpu(cpu, mask))
1903                         continue;
1904
1905                 if (!new_dev_maps)
1906                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1907                 if (!new_dev_maps) {
1908                         mutex_unlock(&xps_map_mutex);
1909                         return -ENOMEM;
1910                 }
1911
1912                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1913                                  NULL;
1914
1915                 map = expand_xps_map(map, cpu, index);
1916                 if (!map)
1917                         goto error;
1918
1919                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1920         }
1921
1922         if (!new_dev_maps)
1923                 goto out_no_new_maps;
1924
1925         for_each_possible_cpu(cpu) {
1926                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1927                         /* add queue to CPU maps */
1928                         int pos = 0;
1929
1930                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1931                         while ((pos < map->len) && (map->queues[pos] != index))
1932                                 pos++;
1933
1934                         if (pos == map->len)
1935                                 map->queues[map->len++] = index;
1936 #ifdef CONFIG_NUMA
1937                         if (numa_node_id == -2)
1938                                 numa_node_id = cpu_to_node(cpu);
1939                         else if (numa_node_id != cpu_to_node(cpu))
1940                                 numa_node_id = -1;
1941 #endif
1942                 } else if (dev_maps) {
1943                         /* fill in the new device map from the old device map */
1944                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1945                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1946                 }
1947
1948         }
1949
1950         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1951
1952         /* Cleanup old maps */
1953         if (dev_maps) {
1954                 for_each_possible_cpu(cpu) {
1955                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1956                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1957                         if (map && map != new_map)
1958                                 kfree_rcu(map, rcu);
1959                 }
1960
1961                 kfree_rcu(dev_maps, rcu);
1962         }
1963
1964         dev_maps = new_dev_maps;
1965         active = true;
1966
1967 out_no_new_maps:
1968         /* update Tx queue numa node */
1969         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1970                                      (numa_node_id >= 0) ? numa_node_id :
1971                                      NUMA_NO_NODE);
1972
1973         if (!dev_maps)
1974                 goto out_no_maps;
1975
1976         /* removes queue from unused CPUs */
1977         for_each_possible_cpu(cpu) {
1978                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1979                         continue;
1980
1981                 if (remove_xps_queue(dev_maps, cpu, index))
1982                         active = true;
1983         }
1984
1985         /* free map if not active */
1986         if (!active) {
1987                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1988                 kfree_rcu(dev_maps, rcu);
1989         }
1990
1991 out_no_maps:
1992         mutex_unlock(&xps_map_mutex);
1993
1994         return 0;
1995 error:
1996         /* remove any maps that we added */
1997         for_each_possible_cpu(cpu) {
1998                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1999                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2000                                  NULL;
2001                 if (new_map && new_map != map)
2002                         kfree(new_map);
2003         }
2004
2005         mutex_unlock(&xps_map_mutex);
2006
2007         kfree(new_dev_maps);
2008         return -ENOMEM;
2009 }
2010 EXPORT_SYMBOL(netif_set_xps_queue);
2011
2012 #endif
2013 /*
2014  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2015  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2016  */
2017 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2018 {
2019         int rc;
2020
2021         if (txq < 1 || txq > dev->num_tx_queues)
2022                 return -EINVAL;
2023
2024         if (dev->reg_state == NETREG_REGISTERED ||
2025             dev->reg_state == NETREG_UNREGISTERING) {
2026                 ASSERT_RTNL();
2027
2028                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2029                                                   txq);
2030                 if (rc)
2031                         return rc;
2032
2033                 if (dev->num_tc)
2034                         netif_setup_tc(dev, txq);
2035
2036                 if (txq < dev->real_num_tx_queues) {
2037                         qdisc_reset_all_tx_gt(dev, txq);
2038 #ifdef CONFIG_XPS
2039                         netif_reset_xps_queues_gt(dev, txq);
2040 #endif
2041                 }
2042         }
2043
2044         dev->real_num_tx_queues = txq;
2045         return 0;
2046 }
2047 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2048
2049 #ifdef CONFIG_RPS
2050 /**
2051  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2052  *      @dev: Network device
2053  *      @rxq: Actual number of RX queues
2054  *
2055  *      This must be called either with the rtnl_lock held or before
2056  *      registration of the net device.  Returns 0 on success, or a
2057  *      negative error code.  If called before registration, it always
2058  *      succeeds.
2059  */
2060 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2061 {
2062         int rc;
2063
2064         if (rxq < 1 || rxq > dev->num_rx_queues)
2065                 return -EINVAL;
2066
2067         if (dev->reg_state == NETREG_REGISTERED) {
2068                 ASSERT_RTNL();
2069
2070                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2071                                                   rxq);
2072                 if (rc)
2073                         return rc;
2074         }
2075
2076         dev->real_num_rx_queues = rxq;
2077         return 0;
2078 }
2079 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2080 #endif
2081
2082 /**
2083  * netif_get_num_default_rss_queues - default number of RSS queues
2084  *
2085  * This routine should set an upper limit on the number of RSS queues
2086  * used by default by multiqueue devices.
2087  */
2088 int netif_get_num_default_rss_queues(void)
2089 {
2090         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2091 }
2092 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2093
2094 static inline void __netif_reschedule(struct Qdisc *q)
2095 {
2096         struct softnet_data *sd;
2097         unsigned long flags;
2098
2099         local_irq_save(flags);
2100         sd = &__get_cpu_var(softnet_data);
2101         q->next_sched = NULL;
2102         *sd->output_queue_tailp = q;
2103         sd->output_queue_tailp = &q->next_sched;
2104         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2105         local_irq_restore(flags);
2106 }
2107
2108 void __netif_schedule(struct Qdisc *q)
2109 {
2110         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2111                 __netif_reschedule(q);
2112 }
2113 EXPORT_SYMBOL(__netif_schedule);
2114
2115 void dev_kfree_skb_irq(struct sk_buff *skb)
2116 {
2117         if (atomic_dec_and_test(&skb->users)) {
2118                 struct softnet_data *sd;
2119                 unsigned long flags;
2120
2121                 local_irq_save(flags);
2122                 sd = &__get_cpu_var(softnet_data);
2123                 skb->next = sd->completion_queue;
2124                 sd->completion_queue = skb;
2125                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2126                 local_irq_restore(flags);
2127         }
2128 }
2129 EXPORT_SYMBOL(dev_kfree_skb_irq);
2130
2131 void dev_kfree_skb_any(struct sk_buff *skb)
2132 {
2133         if (in_irq() || irqs_disabled())
2134                 dev_kfree_skb_irq(skb);
2135         else
2136                 dev_kfree_skb(skb);
2137 }
2138 EXPORT_SYMBOL(dev_kfree_skb_any);
2139
2140
2141 /**
2142  * netif_device_detach - mark device as removed
2143  * @dev: network device
2144  *
2145  * Mark device as removed from system and therefore no longer available.
2146  */
2147 void netif_device_detach(struct net_device *dev)
2148 {
2149         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2150             netif_running(dev)) {
2151                 netif_tx_stop_all_queues(dev);
2152         }
2153 }
2154 EXPORT_SYMBOL(netif_device_detach);
2155
2156 /**
2157  * netif_device_attach - mark device as attached
2158  * @dev: network device
2159  *
2160  * Mark device as attached from system and restart if needed.
2161  */
2162 void netif_device_attach(struct net_device *dev)
2163 {
2164         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2165             netif_running(dev)) {
2166                 netif_tx_wake_all_queues(dev);
2167                 __netdev_watchdog_up(dev);
2168         }
2169 }
2170 EXPORT_SYMBOL(netif_device_attach);
2171
2172 static void skb_warn_bad_offload(const struct sk_buff *skb)
2173 {
2174         static const netdev_features_t null_features = 0;
2175         struct net_device *dev = skb->dev;
2176         const char *driver = "";
2177
2178         if (!net_ratelimit())
2179                 return;
2180
2181         if (dev && dev->dev.parent)
2182                 driver = dev_driver_string(dev->dev.parent);
2183
2184         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2185              "gso_type=%d ip_summed=%d\n",
2186              driver, dev ? &dev->features : &null_features,
2187              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2188              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2189              skb_shinfo(skb)->gso_type, skb->ip_summed);
2190 }
2191
2192 /*
2193  * Invalidate hardware checksum when packet is to be mangled, and
2194  * complete checksum manually on outgoing path.
2195  */
2196 int skb_checksum_help(struct sk_buff *skb)
2197 {
2198         __wsum csum;
2199         int ret = 0, offset;
2200
2201         if (skb->ip_summed == CHECKSUM_COMPLETE)
2202                 goto out_set_summed;
2203
2204         if (unlikely(skb_shinfo(skb)->gso_size)) {
2205                 skb_warn_bad_offload(skb);
2206                 return -EINVAL;
2207         }
2208
2209         /* Before computing a checksum, we should make sure no frag could
2210          * be modified by an external entity : checksum could be wrong.
2211          */
2212         if (skb_has_shared_frag(skb)) {
2213                 ret = __skb_linearize(skb);
2214                 if (ret)
2215                         goto out;
2216         }
2217
2218         offset = skb_checksum_start_offset(skb);
2219         BUG_ON(offset >= skb_headlen(skb));
2220         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2221
2222         offset += skb->csum_offset;
2223         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2224
2225         if (skb_cloned(skb) &&
2226             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2227                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2228                 if (ret)
2229                         goto out;
2230         }
2231
2232         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2233 out_set_summed:
2234         skb->ip_summed = CHECKSUM_NONE;
2235 out:
2236         return ret;
2237 }
2238 EXPORT_SYMBOL(skb_checksum_help);
2239
2240 __be16 skb_network_protocol(struct sk_buff *skb)
2241 {
2242         __be16 type = skb->protocol;
2243         int vlan_depth = ETH_HLEN;
2244
2245         /* Tunnel gso handlers can set protocol to ethernet. */
2246         if (type == htons(ETH_P_TEB)) {
2247                 struct ethhdr *eth;
2248
2249                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2250                         return 0;
2251
2252                 eth = (struct ethhdr *)skb_mac_header(skb);
2253                 type = eth->h_proto;
2254         }
2255
2256         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2257                 struct vlan_hdr *vh;
2258
2259                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2260                         return 0;
2261
2262                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2263                 type = vh->h_vlan_encapsulated_proto;
2264                 vlan_depth += VLAN_HLEN;
2265         }
2266
2267         return type;
2268 }
2269
2270 /**
2271  *      skb_mac_gso_segment - mac layer segmentation handler.
2272  *      @skb: buffer to segment
2273  *      @features: features for the output path (see dev->features)
2274  */
2275 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2276                                     netdev_features_t features)
2277 {
2278         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2279         struct packet_offload *ptype;
2280         __be16 type = skb_network_protocol(skb);
2281
2282         if (unlikely(!type))
2283                 return ERR_PTR(-EINVAL);
2284
2285         __skb_pull(skb, skb->mac_len);
2286
2287         rcu_read_lock();
2288         list_for_each_entry_rcu(ptype, &offload_base, list) {
2289                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2290                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2291                                 int err;
2292
2293                                 err = ptype->callbacks.gso_send_check(skb);
2294                                 segs = ERR_PTR(err);
2295                                 if (err || skb_gso_ok(skb, features))
2296                                         break;
2297                                 __skb_push(skb, (skb->data -
2298                                                  skb_network_header(skb)));
2299                         }
2300                         segs = ptype->callbacks.gso_segment(skb, features);
2301                         break;
2302                 }
2303         }
2304         rcu_read_unlock();
2305
2306         __skb_push(skb, skb->data - skb_mac_header(skb));
2307
2308         return segs;
2309 }
2310 EXPORT_SYMBOL(skb_mac_gso_segment);
2311
2312
2313 /* openvswitch calls this on rx path, so we need a different check.
2314  */
2315 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2316 {
2317         if (tx_path)
2318                 return skb->ip_summed != CHECKSUM_PARTIAL;
2319         else
2320                 return skb->ip_summed == CHECKSUM_NONE;
2321 }
2322
2323 /**
2324  *      __skb_gso_segment - Perform segmentation on skb.
2325  *      @skb: buffer to segment
2326  *      @features: features for the output path (see dev->features)
2327  *      @tx_path: whether it is called in TX path
2328  *
2329  *      This function segments the given skb and returns a list of segments.
2330  *
2331  *      It may return NULL if the skb requires no segmentation.  This is
2332  *      only possible when GSO is used for verifying header integrity.
2333  */
2334 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2335                                   netdev_features_t features, bool tx_path)
2336 {
2337         if (unlikely(skb_needs_check(skb, tx_path))) {
2338                 int err;
2339
2340                 skb_warn_bad_offload(skb);
2341
2342                 if (skb_header_cloned(skb) &&
2343                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2344                         return ERR_PTR(err);
2345         }
2346
2347         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2348         skb_reset_mac_header(skb);
2349         skb_reset_mac_len(skb);
2350
2351         return skb_mac_gso_segment(skb, features);
2352 }
2353 EXPORT_SYMBOL(__skb_gso_segment);
2354
2355 /* Take action when hardware reception checksum errors are detected. */
2356 #ifdef CONFIG_BUG
2357 void netdev_rx_csum_fault(struct net_device *dev)
2358 {
2359         if (net_ratelimit()) {
2360                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2361                 dump_stack();
2362         }
2363 }
2364 EXPORT_SYMBOL(netdev_rx_csum_fault);
2365 #endif
2366
2367 /* Actually, we should eliminate this check as soon as we know, that:
2368  * 1. IOMMU is present and allows to map all the memory.
2369  * 2. No high memory really exists on this machine.
2370  */
2371
2372 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2373 {
2374 #ifdef CONFIG_HIGHMEM
2375         int i;
2376         if (!(dev->features & NETIF_F_HIGHDMA)) {
2377                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2378                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2379                         if (PageHighMem(skb_frag_page(frag)))
2380                                 return 1;
2381                 }
2382         }
2383
2384         if (PCI_DMA_BUS_IS_PHYS) {
2385                 struct device *pdev = dev->dev.parent;
2386
2387                 if (!pdev)
2388                         return 0;
2389                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2390                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2391                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2392                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2393                                 return 1;
2394                 }
2395         }
2396 #endif
2397         return 0;
2398 }
2399
2400 struct dev_gso_cb {
2401         void (*destructor)(struct sk_buff *skb);
2402 };
2403
2404 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2405
2406 static void dev_gso_skb_destructor(struct sk_buff *skb)
2407 {
2408         struct dev_gso_cb *cb;
2409
2410         do {
2411                 struct sk_buff *nskb = skb->next;
2412
2413                 skb->next = nskb->next;
2414                 nskb->next = NULL;
2415                 kfree_skb(nskb);
2416         } while (skb->next);
2417
2418         cb = DEV_GSO_CB(skb);
2419         if (cb->destructor)
2420                 cb->destructor(skb);
2421 }
2422
2423 /**
2424  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2425  *      @skb: buffer to segment
2426  *      @features: device features as applicable to this skb
2427  *
2428  *      This function segments the given skb and stores the list of segments
2429  *      in skb->next.
2430  */
2431 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2432 {
2433         struct sk_buff *segs;
2434
2435         segs = skb_gso_segment(skb, features);
2436
2437         /* Verifying header integrity only. */
2438         if (!segs)
2439                 return 0;
2440
2441         if (IS_ERR(segs))
2442                 return PTR_ERR(segs);
2443
2444         skb->next = segs;
2445         DEV_GSO_CB(skb)->destructor = skb->destructor;
2446         skb->destructor = dev_gso_skb_destructor;
2447
2448         return 0;
2449 }
2450
2451 static netdev_features_t harmonize_features(struct sk_buff *skb,
2452         __be16 protocol, netdev_features_t features)
2453 {
2454         if (skb->ip_summed != CHECKSUM_NONE &&
2455             !can_checksum_protocol(features, protocol)) {
2456                 features &= ~NETIF_F_ALL_CSUM;
2457         } else if (illegal_highdma(skb->dev, skb)) {
2458                 features &= ~NETIF_F_SG;
2459         }
2460
2461         return features;
2462 }
2463
2464 netdev_features_t netif_skb_features(struct sk_buff *skb)
2465 {
2466         __be16 protocol = skb->protocol;
2467         netdev_features_t features = skb->dev->features;
2468
2469         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2470                 features &= ~NETIF_F_GSO_MASK;
2471
2472         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2473                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2474                 protocol = veh->h_vlan_encapsulated_proto;
2475         } else if (!vlan_tx_tag_present(skb)) {
2476                 return harmonize_features(skb, protocol, features);
2477         }
2478
2479         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2480                                                NETIF_F_HW_VLAN_STAG_TX);
2481
2482         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2483                 return harmonize_features(skb, protocol, features);
2484         } else {
2485                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2486                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2487                                 NETIF_F_HW_VLAN_STAG_TX;
2488                 return harmonize_features(skb, protocol, features);
2489         }
2490 }
2491 EXPORT_SYMBOL(netif_skb_features);
2492
2493 /*
2494  * Returns true if either:
2495  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2496  *      2. skb is fragmented and the device does not support SG.
2497  */
2498 static inline int skb_needs_linearize(struct sk_buff *skb,
2499                                       netdev_features_t features)
2500 {
2501         return skb_is_nonlinear(skb) &&
2502                         ((skb_has_frag_list(skb) &&
2503                                 !(features & NETIF_F_FRAGLIST)) ||
2504                         (skb_shinfo(skb)->nr_frags &&
2505                                 !(features & NETIF_F_SG)));
2506 }
2507
2508 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2509                         struct netdev_queue *txq)
2510 {
2511         const struct net_device_ops *ops = dev->netdev_ops;
2512         int rc = NETDEV_TX_OK;
2513         unsigned int skb_len;
2514
2515         if (likely(!skb->next)) {
2516                 netdev_features_t features;
2517
2518                 /*
2519                  * If device doesn't need skb->dst, release it right now while
2520                  * its hot in this cpu cache
2521                  */
2522                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2523                         skb_dst_drop(skb);
2524
2525                 features = netif_skb_features(skb);
2526
2527                 if (vlan_tx_tag_present(skb) &&
2528                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2529                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2530                                              vlan_tx_tag_get(skb));
2531                         if (unlikely(!skb))
2532                                 goto out;
2533
2534                         skb->vlan_tci = 0;
2535                 }
2536
2537                 /* If encapsulation offload request, verify we are testing
2538                  * hardware encapsulation features instead of standard
2539                  * features for the netdev
2540                  */
2541                 if (skb->encapsulation)
2542                         features &= dev->hw_enc_features;
2543
2544                 if (netif_needs_gso(skb, features)) {
2545                         if (unlikely(dev_gso_segment(skb, features)))
2546                                 goto out_kfree_skb;
2547                         if (skb->next)
2548                                 goto gso;
2549                 } else {
2550                         if (skb_needs_linearize(skb, features) &&
2551                             __skb_linearize(skb))
2552                                 goto out_kfree_skb;
2553
2554                         /* If packet is not checksummed and device does not
2555                          * support checksumming for this protocol, complete
2556                          * checksumming here.
2557                          */
2558                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2559                                 if (skb->encapsulation)
2560                                         skb_set_inner_transport_header(skb,
2561                                                 skb_checksum_start_offset(skb));
2562                                 else
2563                                         skb_set_transport_header(skb,
2564                                                 skb_checksum_start_offset(skb));
2565                                 if (!(features & NETIF_F_ALL_CSUM) &&
2566                                      skb_checksum_help(skb))
2567                                         goto out_kfree_skb;
2568                         }
2569                 }
2570
2571                 if (!list_empty(&ptype_all))
2572                         dev_queue_xmit_nit(skb, dev);
2573
2574                 skb_len = skb->len;
2575                 rc = ops->ndo_start_xmit(skb, dev);
2576                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2577                 if (rc == NETDEV_TX_OK)
2578                         txq_trans_update(txq);
2579                 return rc;
2580         }
2581
2582 gso:
2583         do {
2584                 struct sk_buff *nskb = skb->next;
2585
2586                 skb->next = nskb->next;
2587                 nskb->next = NULL;
2588
2589                 if (!list_empty(&ptype_all))
2590                         dev_queue_xmit_nit(nskb, dev);
2591
2592                 skb_len = nskb->len;
2593                 rc = ops->ndo_start_xmit(nskb, dev);
2594                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2595                 if (unlikely(rc != NETDEV_TX_OK)) {
2596                         if (rc & ~NETDEV_TX_MASK)
2597                                 goto out_kfree_gso_skb;
2598                         nskb->next = skb->next;
2599                         skb->next = nskb;
2600                         return rc;
2601                 }
2602                 txq_trans_update(txq);
2603                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2604                         return NETDEV_TX_BUSY;
2605         } while (skb->next);
2606
2607 out_kfree_gso_skb:
2608         if (likely(skb->next == NULL)) {
2609                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2610                 consume_skb(skb);
2611                 return rc;
2612         }
2613 out_kfree_skb:
2614         kfree_skb(skb);
2615 out:
2616         return rc;
2617 }
2618
2619 static void qdisc_pkt_len_init(struct sk_buff *skb)
2620 {
2621         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2622
2623         qdisc_skb_cb(skb)->pkt_len = skb->len;
2624
2625         /* To get more precise estimation of bytes sent on wire,
2626          * we add to pkt_len the headers size of all segments
2627          */
2628         if (shinfo->gso_size)  {
2629                 unsigned int hdr_len;
2630                 u16 gso_segs = shinfo->gso_segs;
2631
2632                 /* mac layer + network layer */
2633                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2634
2635                 /* + transport layer */
2636                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2637                         hdr_len += tcp_hdrlen(skb);
2638                 else
2639                         hdr_len += sizeof(struct udphdr);
2640
2641                 if (shinfo->gso_type & SKB_GSO_DODGY)
2642                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2643                                                 shinfo->gso_size);
2644
2645                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2646         }
2647 }
2648
2649 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2650                                  struct net_device *dev,
2651                                  struct netdev_queue *txq)
2652 {
2653         spinlock_t *root_lock = qdisc_lock(q);
2654         bool contended;
2655         int rc;
2656
2657         qdisc_pkt_len_init(skb);
2658         qdisc_calculate_pkt_len(skb, q);
2659         /*
2660          * Heuristic to force contended enqueues to serialize on a
2661          * separate lock before trying to get qdisc main lock.
2662          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2663          * and dequeue packets faster.
2664          */
2665         contended = qdisc_is_running(q);
2666         if (unlikely(contended))
2667                 spin_lock(&q->busylock);
2668
2669         spin_lock(root_lock);
2670         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2671                 kfree_skb(skb);
2672                 rc = NET_XMIT_DROP;
2673         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2674                    qdisc_run_begin(q)) {
2675                 /*
2676                  * This is a work-conserving queue; there are no old skbs
2677                  * waiting to be sent out; and the qdisc is not running -
2678                  * xmit the skb directly.
2679                  */
2680                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2681                         skb_dst_force(skb);
2682
2683                 qdisc_bstats_update(q, skb);
2684
2685                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2686                         if (unlikely(contended)) {
2687                                 spin_unlock(&q->busylock);
2688                                 contended = false;
2689                         }
2690                         __qdisc_run(q);
2691                 } else
2692                         qdisc_run_end(q);
2693
2694                 rc = NET_XMIT_SUCCESS;
2695         } else {
2696                 skb_dst_force(skb);
2697                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2698                 if (qdisc_run_begin(q)) {
2699                         if (unlikely(contended)) {
2700                                 spin_unlock(&q->busylock);
2701                                 contended = false;
2702                         }
2703                         __qdisc_run(q);
2704                 }
2705         }
2706         spin_unlock(root_lock);
2707         if (unlikely(contended))
2708                 spin_unlock(&q->busylock);
2709         return rc;
2710 }
2711
2712 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2713 static void skb_update_prio(struct sk_buff *skb)
2714 {
2715         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2716
2717         if (!skb->priority && skb->sk && map) {
2718                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2719
2720                 if (prioidx < map->priomap_len)
2721                         skb->priority = map->priomap[prioidx];
2722         }
2723 }
2724 #else
2725 #define skb_update_prio(skb)
2726 #endif
2727
2728 static DEFINE_PER_CPU(int, xmit_recursion);
2729 #define RECURSION_LIMIT 10
2730
2731 /**
2732  *      dev_loopback_xmit - loop back @skb
2733  *      @skb: buffer to transmit
2734  */
2735 int dev_loopback_xmit(struct sk_buff *skb)
2736 {
2737         skb_reset_mac_header(skb);
2738         __skb_pull(skb, skb_network_offset(skb));
2739         skb->pkt_type = PACKET_LOOPBACK;
2740         skb->ip_summed = CHECKSUM_UNNECESSARY;
2741         WARN_ON(!skb_dst(skb));
2742         skb_dst_force(skb);
2743         netif_rx_ni(skb);
2744         return 0;
2745 }
2746 EXPORT_SYMBOL(dev_loopback_xmit);
2747
2748 /**
2749  *      dev_queue_xmit - transmit a buffer
2750  *      @skb: buffer to transmit
2751  *
2752  *      Queue a buffer for transmission to a network device. The caller must
2753  *      have set the device and priority and built the buffer before calling
2754  *      this function. The function can be called from an interrupt.
2755  *
2756  *      A negative errno code is returned on a failure. A success does not
2757  *      guarantee the frame will be transmitted as it may be dropped due
2758  *      to congestion or traffic shaping.
2759  *
2760  * -----------------------------------------------------------------------------------
2761  *      I notice this method can also return errors from the queue disciplines,
2762  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2763  *      be positive.
2764  *
2765  *      Regardless of the return value, the skb is consumed, so it is currently
2766  *      difficult to retry a send to this method.  (You can bump the ref count
2767  *      before sending to hold a reference for retry if you are careful.)
2768  *
2769  *      When calling this method, interrupts MUST be enabled.  This is because
2770  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2771  *          --BLG
2772  */
2773 int dev_queue_xmit(struct sk_buff *skb)
2774 {
2775         struct net_device *dev = skb->dev;
2776         struct netdev_queue *txq;
2777         struct Qdisc *q;
2778         int rc = -ENOMEM;
2779
2780         skb_reset_mac_header(skb);
2781
2782         /* Disable soft irqs for various locks below. Also
2783          * stops preemption for RCU.
2784          */
2785         rcu_read_lock_bh();
2786
2787         skb_update_prio(skb);
2788
2789         txq = netdev_pick_tx(dev, skb);
2790         q = rcu_dereference_bh(txq->qdisc);
2791
2792 #ifdef CONFIG_NET_CLS_ACT
2793         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2794 #endif
2795         trace_net_dev_queue(skb);
2796         if (q->enqueue) {
2797                 rc = __dev_xmit_skb(skb, q, dev, txq);
2798                 goto out;
2799         }
2800
2801         /* The device has no queue. Common case for software devices:
2802            loopback, all the sorts of tunnels...
2803
2804            Really, it is unlikely that netif_tx_lock protection is necessary
2805            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2806            counters.)
2807            However, it is possible, that they rely on protection
2808            made by us here.
2809
2810            Check this and shot the lock. It is not prone from deadlocks.
2811            Either shot noqueue qdisc, it is even simpler 8)
2812          */
2813         if (dev->flags & IFF_UP) {
2814                 int cpu = smp_processor_id(); /* ok because BHs are off */
2815
2816                 if (txq->xmit_lock_owner != cpu) {
2817
2818                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2819                                 goto recursion_alert;
2820
2821                         HARD_TX_LOCK(dev, txq, cpu);
2822
2823                         if (!netif_xmit_stopped(txq)) {
2824                                 __this_cpu_inc(xmit_recursion);
2825                                 rc = dev_hard_start_xmit(skb, dev, txq);
2826                                 __this_cpu_dec(xmit_recursion);
2827                                 if (dev_xmit_complete(rc)) {
2828                                         HARD_TX_UNLOCK(dev, txq);
2829                                         goto out;
2830                                 }
2831                         }
2832                         HARD_TX_UNLOCK(dev, txq);
2833                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2834                                              dev->name);
2835                 } else {
2836                         /* Recursion is detected! It is possible,
2837                          * unfortunately
2838                          */
2839 recursion_alert:
2840                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2841                                              dev->name);
2842                 }
2843         }
2844
2845         rc = -ENETDOWN;
2846         rcu_read_unlock_bh();
2847
2848         kfree_skb(skb);
2849         return rc;
2850 out:
2851         rcu_read_unlock_bh();
2852         return rc;
2853 }
2854 EXPORT_SYMBOL(dev_queue_xmit);
2855
2856
2857 /*=======================================================================
2858                         Receiver routines
2859   =======================================================================*/
2860
2861 int netdev_max_backlog __read_mostly = 1000;
2862 EXPORT_SYMBOL(netdev_max_backlog);
2863
2864 int netdev_tstamp_prequeue __read_mostly = 1;
2865 int netdev_budget __read_mostly = 300;
2866 int weight_p __read_mostly = 64;            /* old backlog weight */
2867
2868 /* Called with irq disabled */
2869 static inline void ____napi_schedule(struct softnet_data *sd,
2870                                      struct napi_struct *napi)
2871 {
2872         list_add_tail(&napi->poll_list, &sd->poll_list);
2873         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2874 }
2875
2876 #ifdef CONFIG_RPS
2877
2878 /* One global table that all flow-based protocols share. */
2879 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2880 EXPORT_SYMBOL(rps_sock_flow_table);
2881
2882 struct static_key rps_needed __read_mostly;
2883
2884 static struct rps_dev_flow *
2885 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2886             struct rps_dev_flow *rflow, u16 next_cpu)
2887 {
2888         if (next_cpu != RPS_NO_CPU) {
2889 #ifdef CONFIG_RFS_ACCEL
2890                 struct netdev_rx_queue *rxqueue;
2891                 struct rps_dev_flow_table *flow_table;
2892                 struct rps_dev_flow *old_rflow;
2893                 u32 flow_id;
2894                 u16 rxq_index;
2895                 int rc;
2896
2897                 /* Should we steer this flow to a different hardware queue? */
2898                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2899                     !(dev->features & NETIF_F_NTUPLE))
2900                         goto out;
2901                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2902                 if (rxq_index == skb_get_rx_queue(skb))
2903                         goto out;
2904
2905                 rxqueue = dev->_rx + rxq_index;
2906                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2907                 if (!flow_table)
2908                         goto out;
2909                 flow_id = skb->rxhash & flow_table->mask;
2910                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2911                                                         rxq_index, flow_id);
2912                 if (rc < 0)
2913                         goto out;
2914                 old_rflow = rflow;
2915                 rflow = &flow_table->flows[flow_id];
2916                 rflow->filter = rc;
2917                 if (old_rflow->filter == rflow->filter)
2918                         old_rflow->filter = RPS_NO_FILTER;
2919         out:
2920 #endif
2921                 rflow->last_qtail =
2922                         per_cpu(softnet_data, next_cpu).input_queue_head;
2923         }
2924
2925         rflow->cpu = next_cpu;
2926         return rflow;
2927 }
2928
2929 /*
2930  * get_rps_cpu is called from netif_receive_skb and returns the target
2931  * CPU from the RPS map of the receiving queue for a given skb.
2932  * rcu_read_lock must be held on entry.
2933  */
2934 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2935                        struct rps_dev_flow **rflowp)
2936 {
2937         struct netdev_rx_queue *rxqueue;
2938         struct rps_map *map;
2939         struct rps_dev_flow_table *flow_table;
2940         struct rps_sock_flow_table *sock_flow_table;
2941         int cpu = -1;
2942         u16 tcpu;
2943
2944         if (skb_rx_queue_recorded(skb)) {
2945                 u16 index = skb_get_rx_queue(skb);
2946                 if (unlikely(index >= dev->real_num_rx_queues)) {
2947                         WARN_ONCE(dev->real_num_rx_queues > 1,
2948                                   "%s received packet on queue %u, but number "
2949                                   "of RX queues is %u\n",
2950                                   dev->name, index, dev->real_num_rx_queues);
2951                         goto done;
2952                 }
2953                 rxqueue = dev->_rx + index;
2954         } else
2955                 rxqueue = dev->_rx;
2956
2957         map = rcu_dereference(rxqueue->rps_map);
2958         if (map) {
2959                 if (map->len == 1 &&
2960                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2961                         tcpu = map->cpus[0];
2962                         if (cpu_online(tcpu))
2963                                 cpu = tcpu;
2964                         goto done;
2965                 }
2966         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2967                 goto done;
2968         }
2969
2970         skb_reset_network_header(skb);
2971         if (!skb_get_rxhash(skb))
2972                 goto done;
2973
2974         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2975         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2976         if (flow_table && sock_flow_table) {
2977                 u16 next_cpu;
2978                 struct rps_dev_flow *rflow;
2979
2980                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2981                 tcpu = rflow->cpu;
2982
2983                 next_cpu = sock_flow_table->ents[skb->rxhash &
2984                     sock_flow_table->mask];
2985
2986                 /*
2987                  * If the desired CPU (where last recvmsg was done) is
2988                  * different from current CPU (one in the rx-queue flow
2989                  * table entry), switch if one of the following holds:
2990                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2991                  *   - Current CPU is offline.
2992                  *   - The current CPU's queue tail has advanced beyond the
2993                  *     last packet that was enqueued using this table entry.
2994                  *     This guarantees that all previous packets for the flow
2995                  *     have been dequeued, thus preserving in order delivery.
2996                  */
2997                 if (unlikely(tcpu != next_cpu) &&
2998                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2999                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3000                       rflow->last_qtail)) >= 0)) {
3001                         tcpu = next_cpu;
3002                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3003                 }
3004
3005                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3006                         *rflowp = rflow;
3007                         cpu = tcpu;
3008                         goto done;
3009                 }
3010         }
3011
3012         if (map) {
3013                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3014
3015                 if (cpu_online(tcpu)) {
3016                         cpu = tcpu;
3017                         goto done;
3018                 }
3019         }
3020
3021 done:
3022         return cpu;
3023 }
3024
3025 #ifdef CONFIG_RFS_ACCEL
3026
3027 /**
3028  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3029  * @dev: Device on which the filter was set
3030  * @rxq_index: RX queue index
3031  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3032  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3033  *
3034  * Drivers that implement ndo_rx_flow_steer() should periodically call
3035  * this function for each installed filter and remove the filters for
3036  * which it returns %true.
3037  */
3038 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3039                          u32 flow_id, u16 filter_id)
3040 {
3041         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3042         struct rps_dev_flow_table *flow_table;
3043         struct rps_dev_flow *rflow;
3044         bool expire = true;
3045         int cpu;
3046
3047         rcu_read_lock();
3048         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3049         if (flow_table && flow_id <= flow_table->mask) {
3050                 rflow = &flow_table->flows[flow_id];
3051                 cpu = ACCESS_ONCE(rflow->cpu);
3052                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3053                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3054                            rflow->last_qtail) <
3055                      (int)(10 * flow_table->mask)))
3056                         expire = false;
3057         }
3058         rcu_read_unlock();
3059         return expire;
3060 }
3061 EXPORT_SYMBOL(rps_may_expire_flow);
3062
3063 #endif /* CONFIG_RFS_ACCEL */
3064
3065 /* Called from hardirq (IPI) context */
3066 static void rps_trigger_softirq(void *data)
3067 {
3068         struct softnet_data *sd = data;
3069
3070         ____napi_schedule(sd, &sd->backlog);
3071         sd->received_rps++;
3072 }
3073
3074 #endif /* CONFIG_RPS */
3075
3076 /*
3077  * Check if this softnet_data structure is another cpu one
3078  * If yes, queue it to our IPI list and return 1
3079  * If no, return 0
3080  */
3081 static int rps_ipi_queued(struct softnet_data *sd)
3082 {
3083 #ifdef CONFIG_RPS
3084         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3085
3086         if (sd != mysd) {
3087                 sd->rps_ipi_next = mysd->rps_ipi_list;
3088                 mysd->rps_ipi_list = sd;
3089
3090                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3091                 return 1;
3092         }
3093 #endif /* CONFIG_RPS */
3094         return 0;
3095 }
3096
3097 #ifdef CONFIG_NET_FLOW_LIMIT
3098 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3099 #endif
3100
3101 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3102 {
3103 #ifdef CONFIG_NET_FLOW_LIMIT
3104         struct sd_flow_limit *fl;
3105         struct softnet_data *sd;
3106         unsigned int old_flow, new_flow;
3107
3108         if (qlen < (netdev_max_backlog >> 1))
3109                 return false;
3110
3111         sd = &__get_cpu_var(softnet_data);
3112
3113         rcu_read_lock();
3114         fl = rcu_dereference(sd->flow_limit);
3115         if (fl) {
3116                 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3117                 old_flow = fl->history[fl->history_head];
3118                 fl->history[fl->history_head] = new_flow;
3119
3120                 fl->history_head++;
3121                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3122
3123                 if (likely(fl->buckets[old_flow]))
3124                         fl->buckets[old_flow]--;
3125
3126                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3127                         fl->count++;
3128                         rcu_read_unlock();
3129                         return true;
3130                 }
3131         }
3132         rcu_read_unlock();
3133 #endif
3134         return false;
3135 }
3136
3137 /*
3138  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3139  * queue (may be a remote CPU queue).
3140  */
3141 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3142                               unsigned int *qtail)
3143 {
3144         struct softnet_data *sd;
3145         unsigned long flags;
3146         unsigned int qlen;
3147
3148         sd = &per_cpu(softnet_data, cpu);
3149
3150         local_irq_save(flags);
3151
3152         rps_lock(sd);
3153         qlen = skb_queue_len(&sd->input_pkt_queue);
3154         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3155                 if (skb_queue_len(&sd->input_pkt_queue)) {
3156 enqueue:
3157                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3158                         input_queue_tail_incr_save(sd, qtail);
3159                         rps_unlock(sd);
3160                         local_irq_restore(flags);
3161                         return NET_RX_SUCCESS;
3162                 }
3163
3164                 /* Schedule NAPI for backlog device
3165                  * We can use non atomic operation since we own the queue lock
3166                  */
3167                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3168                         if (!rps_ipi_queued(sd))
3169                                 ____napi_schedule(sd, &sd->backlog);
3170                 }
3171                 goto enqueue;
3172         }
3173
3174         sd->dropped++;
3175         rps_unlock(sd);
3176
3177         local_irq_restore(flags);
3178
3179         atomic_long_inc(&skb->dev->rx_dropped);
3180         kfree_skb(skb);
3181         return NET_RX_DROP;
3182 }
3183
3184 /**
3185  *      netif_rx        -       post buffer to the network code
3186  *      @skb: buffer to post
3187  *
3188  *      This function receives a packet from a device driver and queues it for
3189  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3190  *      may be dropped during processing for congestion control or by the
3191  *      protocol layers.
3192  *
3193  *      return values:
3194  *      NET_RX_SUCCESS  (no congestion)
3195  *      NET_RX_DROP     (packet was dropped)
3196  *
3197  */
3198
3199 int netif_rx(struct sk_buff *skb)
3200 {
3201         int ret;
3202
3203         /* if netpoll wants it, pretend we never saw it */
3204         if (netpoll_rx(skb))
3205                 return NET_RX_DROP;
3206
3207         net_timestamp_check(netdev_tstamp_prequeue, skb);
3208
3209         trace_netif_rx(skb);
3210 #ifdef CONFIG_RPS
3211         if (static_key_false(&rps_needed)) {
3212                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3213                 int cpu;
3214
3215                 preempt_disable();
3216                 rcu_read_lock();
3217
3218                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3219                 if (cpu < 0)
3220                         cpu = smp_processor_id();
3221
3222                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3223
3224                 rcu_read_unlock();
3225                 preempt_enable();
3226         } else
3227 #endif
3228         {
3229                 unsigned int qtail;
3230                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3231                 put_cpu();
3232         }
3233         return ret;
3234 }
3235 EXPORT_SYMBOL(netif_rx);
3236
3237 int netif_rx_ni(struct sk_buff *skb)
3238 {
3239         int err;
3240
3241         preempt_disable();
3242         err = netif_rx(skb);
3243         if (local_softirq_pending())
3244                 do_softirq();
3245         preempt_enable();
3246
3247         return err;
3248 }
3249 EXPORT_SYMBOL(netif_rx_ni);
3250
3251 static void net_tx_action(struct softirq_action *h)
3252 {
3253         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3254
3255         if (sd->completion_queue) {
3256                 struct sk_buff *clist;
3257
3258                 local_irq_disable();
3259                 clist = sd->completion_queue;
3260                 sd->completion_queue = NULL;
3261                 local_irq_enable();
3262
3263                 while (clist) {
3264                         struct sk_buff *skb = clist;
3265                         clist = clist->next;
3266
3267                         WARN_ON(atomic_read(&skb->users));
3268                         trace_kfree_skb(skb, net_tx_action);
3269                         __kfree_skb(skb);
3270                 }
3271         }
3272
3273         if (sd->output_queue) {
3274                 struct Qdisc *head;
3275
3276                 local_irq_disable();
3277                 head = sd->output_queue;
3278                 sd->output_queue = NULL;
3279                 sd->output_queue_tailp = &sd->output_queue;
3280                 local_irq_enable();
3281
3282                 while (head) {
3283                         struct Qdisc *q = head;
3284                         spinlock_t *root_lock;
3285
3286                         head = head->next_sched;
3287
3288                         root_lock = qdisc_lock(q);
3289                         if (spin_trylock(root_lock)) {
3290                                 smp_mb__before_clear_bit();
3291                                 clear_bit(__QDISC_STATE_SCHED,
3292                                           &q->state);
3293                                 qdisc_run(q);
3294                                 spin_unlock(root_lock);
3295                         } else {
3296                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3297                                               &q->state)) {
3298                                         __netif_reschedule(q);
3299                                 } else {
3300                                         smp_mb__before_clear_bit();
3301                                         clear_bit(__QDISC_STATE_SCHED,
3302                                                   &q->state);
3303                                 }
3304                         }
3305                 }
3306         }
3307 }
3308
3309 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3310     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3311 /* This hook is defined here for ATM LANE */
3312 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3313                              unsigned char *addr) __read_mostly;
3314 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3315 #endif
3316
3317 #ifdef CONFIG_NET_CLS_ACT
3318 /* TODO: Maybe we should just force sch_ingress to be compiled in
3319  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3320  * a compare and 2 stores extra right now if we dont have it on
3321  * but have CONFIG_NET_CLS_ACT
3322  * NOTE: This doesn't stop any functionality; if you dont have
3323  * the ingress scheduler, you just can't add policies on ingress.
3324  *
3325  */
3326 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3327 {
3328         struct net_device *dev = skb->dev;
3329         u32 ttl = G_TC_RTTL(skb->tc_verd);
3330         int result = TC_ACT_OK;
3331         struct Qdisc *q;
3332
3333         if (unlikely(MAX_RED_LOOP < ttl++)) {
3334                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3335                                      skb->skb_iif, dev->ifindex);
3336                 return TC_ACT_SHOT;
3337         }
3338
3339         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3340         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3341
3342         q = rxq->qdisc;
3343         if (q != &noop_qdisc) {
3344                 spin_lock(qdisc_lock(q));
3345                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3346                         result = qdisc_enqueue_root(skb, q);
3347                 spin_unlock(qdisc_lock(q));
3348         }
3349
3350         return result;
3351 }
3352
3353 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3354                                          struct packet_type **pt_prev,
3355                                          int *ret, struct net_device *orig_dev)
3356 {
3357         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3358
3359         if (!rxq || rxq->qdisc == &noop_qdisc)
3360                 goto out;
3361
3362         if (*pt_prev) {
3363                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3364                 *pt_prev = NULL;
3365         }
3366
3367         switch (ing_filter(skb, rxq)) {
3368         case TC_ACT_SHOT:
3369         case TC_ACT_STOLEN:
3370                 kfree_skb(skb);
3371                 return NULL;
3372         }
3373
3374 out:
3375         skb->tc_verd = 0;
3376         return skb;
3377 }
3378 #endif
3379
3380 /**
3381  *      netdev_rx_handler_register - register receive handler
3382  *      @dev: device to register a handler for
3383  *      @rx_handler: receive handler to register
3384  *      @rx_handler_data: data pointer that is used by rx handler
3385  *
3386  *      Register a receive hander for a device. This handler will then be
3387  *      called from __netif_receive_skb. A negative errno code is returned
3388  *      on a failure.
3389  *
3390  *      The caller must hold the rtnl_mutex.
3391  *
3392  *      For a general description of rx_handler, see enum rx_handler_result.
3393  */
3394 int netdev_rx_handler_register(struct net_device *dev,
3395                                rx_handler_func_t *rx_handler,
3396                                void *rx_handler_data)
3397 {
3398         ASSERT_RTNL();
3399
3400         if (dev->rx_handler)
3401                 return -EBUSY;
3402
3403         /* Note: rx_handler_data must be set before rx_handler */
3404         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3405         rcu_assign_pointer(dev->rx_handler, rx_handler);
3406
3407         return 0;
3408 }
3409 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3410
3411 /**
3412  *      netdev_rx_handler_unregister - unregister receive handler
3413  *      @dev: device to unregister a handler from
3414  *
3415  *      Unregister a receive handler from a device.
3416  *
3417  *      The caller must hold the rtnl_mutex.
3418  */
3419 void netdev_rx_handler_unregister(struct net_device *dev)
3420 {
3421
3422         ASSERT_RTNL();
3423         RCU_INIT_POINTER(dev->rx_handler, NULL);
3424         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3425          * section has a guarantee to see a non NULL rx_handler_data
3426          * as well.
3427          */
3428         synchronize_net();
3429         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3430 }
3431 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3432
3433 /*
3434  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3435  * the special handling of PFMEMALLOC skbs.
3436  */
3437 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3438 {
3439         switch (skb->protocol) {
3440         case __constant_htons(ETH_P_ARP):
3441         case __constant_htons(ETH_P_IP):
3442         case __constant_htons(ETH_P_IPV6):
3443         case __constant_htons(ETH_P_8021Q):
3444         case __constant_htons(ETH_P_8021AD):
3445                 return true;
3446         default:
3447                 return false;
3448         }
3449 }
3450
3451 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3452 {
3453         struct packet_type *ptype, *pt_prev;
3454         rx_handler_func_t *rx_handler;
3455         struct net_device *orig_dev;
3456         struct net_device *null_or_dev;
3457         bool deliver_exact = false;
3458         int ret = NET_RX_DROP;
3459         __be16 type;
3460
3461         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3462
3463         trace_netif_receive_skb(skb);
3464
3465         /* if we've gotten here through NAPI, check netpoll */
3466         if (netpoll_receive_skb(skb))
3467                 goto out;
3468
3469         orig_dev = skb->dev;
3470
3471         skb_reset_network_header(skb);
3472         if (!skb_transport_header_was_set(skb))
3473                 skb_reset_transport_header(skb);
3474         skb_reset_mac_len(skb);
3475
3476         pt_prev = NULL;
3477
3478         rcu_read_lock();
3479
3480 another_round:
3481         skb->skb_iif = skb->dev->ifindex;
3482
3483         __this_cpu_inc(softnet_data.processed);
3484
3485         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3486             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3487                 skb = vlan_untag(skb);
3488                 if (unlikely(!skb))
3489                         goto unlock;
3490         }
3491
3492 #ifdef CONFIG_NET_CLS_ACT
3493         if (skb->tc_verd & TC_NCLS) {
3494                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3495                 goto ncls;
3496         }
3497 #endif
3498
3499         if (pfmemalloc)
3500                 goto skip_taps;
3501
3502         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3503                 if (!ptype->dev || ptype->dev == skb->dev) {
3504                         if (pt_prev)
3505                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3506                         pt_prev = ptype;
3507                 }
3508         }
3509
3510 skip_taps:
3511 #ifdef CONFIG_NET_CLS_ACT
3512         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3513         if (!skb)
3514                 goto unlock;
3515 ncls:
3516 #endif
3517
3518         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3519                 goto drop;
3520
3521         if (vlan_tx_tag_present(skb)) {
3522                 if (pt_prev) {
3523                         ret = deliver_skb(skb, pt_prev, orig_dev);
3524                         pt_prev = NULL;
3525                 }
3526                 if (vlan_do_receive(&skb))
3527                         goto another_round;
3528                 else if (unlikely(!skb))
3529                         goto unlock;
3530         }
3531
3532         rx_handler = rcu_dereference(skb->dev->rx_handler);
3533         if (rx_handler) {
3534                 if (pt_prev) {
3535                         ret = deliver_skb(skb, pt_prev, orig_dev);
3536                         pt_prev = NULL;
3537                 }
3538                 switch (rx_handler(&skb)) {
3539                 case RX_HANDLER_CONSUMED:
3540                         ret = NET_RX_SUCCESS;
3541                         goto unlock;
3542                 case RX_HANDLER_ANOTHER:
3543                         goto another_round;
3544                 case RX_HANDLER_EXACT:
3545                         deliver_exact = true;
3546                 case RX_HANDLER_PASS:
3547                         break;
3548                 default:
3549                         BUG();
3550                 }
3551         }
3552
3553         if (vlan_tx_nonzero_tag_present(skb))
3554                 skb->pkt_type = PACKET_OTHERHOST;
3555
3556         /* deliver only exact match when indicated */
3557         null_or_dev = deliver_exact ? skb->dev : NULL;
3558
3559         type = skb->protocol;
3560         list_for_each_entry_rcu(ptype,
3561                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3562                 if (ptype->type == type &&
3563                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3564                      ptype->dev == orig_dev)) {
3565                         if (pt_prev)
3566                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3567                         pt_prev = ptype;
3568                 }
3569         }
3570
3571         if (pt_prev) {
3572                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3573                         goto drop;
3574                 else
3575                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3576         } else {
3577 drop:
3578                 atomic_long_inc(&skb->dev->rx_dropped);
3579                 kfree_skb(skb);
3580                 /* Jamal, now you will not able to escape explaining
3581                  * me how you were going to use this. :-)
3582                  */
3583                 ret = NET_RX_DROP;
3584         }
3585
3586 unlock:
3587         rcu_read_unlock();
3588 out:
3589         return ret;
3590 }
3591
3592 static int __netif_receive_skb(struct sk_buff *skb)
3593 {
3594         int ret;
3595
3596         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3597                 unsigned long pflags = current->flags;
3598
3599                 /*
3600                  * PFMEMALLOC skbs are special, they should
3601                  * - be delivered to SOCK_MEMALLOC sockets only
3602                  * - stay away from userspace
3603                  * - have bounded memory usage
3604                  *
3605                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3606                  * context down to all allocation sites.
3607                  */
3608                 current->flags |= PF_MEMALLOC;
3609                 ret = __netif_receive_skb_core(skb, true);
3610                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3611         } else
3612                 ret = __netif_receive_skb_core(skb, false);
3613
3614         return ret;
3615 }
3616
3617 /**
3618  *      netif_receive_skb - process receive buffer from network
3619  *      @skb: buffer to process
3620  *
3621  *      netif_receive_skb() is the main receive data processing function.
3622  *      It always succeeds. The buffer may be dropped during processing
3623  *      for congestion control or by the protocol layers.
3624  *
3625  *      This function may only be called from softirq context and interrupts
3626  *      should be enabled.
3627  *
3628  *      Return values (usually ignored):
3629  *      NET_RX_SUCCESS: no congestion
3630  *      NET_RX_DROP: packet was dropped
3631  */
3632 int netif_receive_skb(struct sk_buff *skb)
3633 {
3634         net_timestamp_check(netdev_tstamp_prequeue, skb);
3635
3636         if (skb_defer_rx_timestamp(skb))
3637                 return NET_RX_SUCCESS;
3638
3639 #ifdef CONFIG_RPS
3640         if (static_key_false(&rps_needed)) {
3641                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3642                 int cpu, ret;
3643
3644                 rcu_read_lock();
3645
3646                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3647
3648                 if (cpu >= 0) {
3649                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3650                         rcu_read_unlock();
3651                         return ret;
3652                 }
3653                 rcu_read_unlock();
3654         }
3655 #endif
3656         return __netif_receive_skb(skb);
3657 }
3658 EXPORT_SYMBOL(netif_receive_skb);
3659
3660 /* Network device is going away, flush any packets still pending
3661  * Called with irqs disabled.
3662  */
3663 static void flush_backlog(void *arg)
3664 {
3665         struct net_device *dev = arg;
3666         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3667         struct sk_buff *skb, *tmp;
3668
3669         rps_lock(sd);
3670         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3671                 if (skb->dev == dev) {
3672                         __skb_unlink(skb, &sd->input_pkt_queue);
3673                         kfree_skb(skb);
3674                         input_queue_head_incr(sd);
3675                 }
3676         }
3677         rps_unlock(sd);
3678
3679         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3680                 if (skb->dev == dev) {
3681                         __skb_unlink(skb, &sd->process_queue);
3682                         kfree_skb(skb);
3683                         input_queue_head_incr(sd);
3684                 }
3685         }
3686 }
3687
3688 static int napi_gro_complete(struct sk_buff *skb)
3689 {
3690         struct packet_offload *ptype;
3691         __be16 type = skb->protocol;
3692         struct list_head *head = &offload_base;
3693         int err = -ENOENT;
3694
3695         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3696
3697         if (NAPI_GRO_CB(skb)->count == 1) {
3698                 skb_shinfo(skb)->gso_size = 0;
3699                 goto out;
3700         }
3701
3702         rcu_read_lock();
3703         list_for_each_entry_rcu(ptype, head, list) {
3704                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3705                         continue;
3706
3707                 err = ptype->callbacks.gro_complete(skb);
3708                 break;
3709         }
3710         rcu_read_unlock();
3711
3712         if (err) {
3713                 WARN_ON(&ptype->list == head);
3714                 kfree_skb(skb);
3715                 return NET_RX_SUCCESS;
3716         }
3717
3718 out:
3719         return netif_receive_skb(skb);
3720 }
3721
3722 /* napi->gro_list contains packets ordered by age.
3723  * youngest packets at the head of it.
3724  * Complete skbs in reverse order to reduce latencies.
3725  */
3726 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3727 {
3728         struct sk_buff *skb, *prev = NULL;
3729
3730         /* scan list and build reverse chain */
3731         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3732                 skb->prev = prev;
3733                 prev = skb;
3734         }
3735
3736         for (skb = prev; skb; skb = prev) {
3737                 skb->next = NULL;
3738
3739                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3740                         return;
3741
3742                 prev = skb->prev;
3743                 napi_gro_complete(skb);
3744                 napi->gro_count--;
3745         }
3746
3747         napi->gro_list = NULL;
3748 }
3749 EXPORT_SYMBOL(napi_gro_flush);
3750
3751 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3752 {
3753         struct sk_buff *p;
3754         unsigned int maclen = skb->dev->hard_header_len;
3755
3756         for (p = napi->gro_list; p; p = p->next) {
3757                 unsigned long diffs;
3758
3759                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3760                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3761                 if (maclen == ETH_HLEN)
3762                         diffs |= compare_ether_header(skb_mac_header(p),
3763                                                       skb_gro_mac_header(skb));
3764                 else if (!diffs)
3765                         diffs = memcmp(skb_mac_header(p),
3766                                        skb_gro_mac_header(skb),
3767                                        maclen);
3768                 NAPI_GRO_CB(p)->same_flow = !diffs;
3769                 NAPI_GRO_CB(p)->flush = 0;
3770         }
3771 }
3772
3773 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3774 {
3775         struct sk_buff **pp = NULL;
3776         struct packet_offload *ptype;
3777         __be16 type = skb->protocol;
3778         struct list_head *head = &offload_base;
3779         int same_flow;
3780         enum gro_result ret;
3781
3782         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3783                 goto normal;
3784
3785         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3786                 goto normal;
3787
3788         gro_list_prepare(napi, skb);
3789
3790         rcu_read_lock();
3791         list_for_each_entry_rcu(ptype, head, list) {
3792                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3793                         continue;
3794
3795                 skb_set_network_header(skb, skb_gro_offset(skb));
3796                 skb_reset_mac_len(skb);
3797                 NAPI_GRO_CB(skb)->same_flow = 0;
3798                 NAPI_GRO_CB(skb)->flush = 0;
3799                 NAPI_GRO_CB(skb)->free = 0;
3800
3801                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3802                 break;
3803         }
3804         rcu_read_unlock();
3805
3806         if (&ptype->list == head)
3807                 goto normal;
3808
3809         same_flow = NAPI_GRO_CB(skb)->same_flow;
3810         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3811
3812         if (pp) {
3813                 struct sk_buff *nskb = *pp;
3814
3815                 *pp = nskb->next;
3816                 nskb->next = NULL;
3817                 napi_gro_complete(nskb);
3818                 napi->gro_count--;
3819         }
3820
3821         if (same_flow)
3822                 goto ok;
3823
3824         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3825                 goto normal;
3826
3827         napi->gro_count++;
3828         NAPI_GRO_CB(skb)->count = 1;
3829         NAPI_GRO_CB(skb)->age = jiffies;
3830         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3831         skb->next = napi->gro_list;
3832         napi->gro_list = skb;
3833         ret = GRO_HELD;
3834
3835 pull:
3836         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3837                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3838
3839                 BUG_ON(skb->end - skb->tail < grow);
3840
3841                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3842
3843                 skb->tail += grow;
3844                 skb->data_len -= grow;
3845
3846                 skb_shinfo(skb)->frags[0].page_offset += grow;
3847                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3848
3849                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3850                         skb_frag_unref(skb, 0);
3851                         memmove(skb_shinfo(skb)->frags,
3852                                 skb_shinfo(skb)->frags + 1,
3853                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3854                 }
3855         }
3856
3857 ok:
3858         return ret;
3859
3860 normal:
3861         ret = GRO_NORMAL;
3862         goto pull;
3863 }
3864
3865
3866 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3867 {
3868         switch (ret) {
3869         case GRO_NORMAL:
3870                 if (netif_receive_skb(skb))
3871                         ret = GRO_DROP;
3872                 break;
3873
3874         case GRO_DROP:
3875                 kfree_skb(skb);
3876                 break;
3877
3878         case GRO_MERGED_FREE:
3879                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3880                         kmem_cache_free(skbuff_head_cache, skb);
3881                 else
3882                         __kfree_skb(skb);
3883                 break;
3884
3885         case GRO_HELD:
3886         case GRO_MERGED:
3887                 break;
3888         }
3889
3890         return ret;
3891 }
3892
3893 static void skb_gro_reset_offset(struct sk_buff *skb)
3894 {
3895         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3896         const skb_frag_t *frag0 = &pinfo->frags[0];
3897
3898         NAPI_GRO_CB(skb)->data_offset = 0;
3899         NAPI_GRO_CB(skb)->frag0 = NULL;
3900         NAPI_GRO_CB(skb)->frag0_len = 0;
3901
3902         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3903             pinfo->nr_frags &&
3904             !PageHighMem(skb_frag_page(frag0))) {
3905                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3906                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3907         }
3908 }
3909
3910 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3911 {
3912         skb_gro_reset_offset(skb);
3913
3914         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3915 }
3916 EXPORT_SYMBOL(napi_gro_receive);
3917
3918 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3919 {
3920         __skb_pull(skb, skb_headlen(skb));
3921         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3922         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3923         skb->vlan_tci = 0;
3924         skb->dev = napi->dev;
3925         skb->skb_iif = 0;
3926
3927         napi->skb = skb;
3928 }
3929
3930 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3931 {
3932         struct sk_buff *skb = napi->skb;
3933
3934         if (!skb) {
3935                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3936                 if (skb)
3937                         napi->skb = skb;
3938         }
3939         return skb;
3940 }
3941 EXPORT_SYMBOL(napi_get_frags);
3942
3943 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3944                                gro_result_t ret)
3945 {
3946         switch (ret) {
3947         case GRO_NORMAL:
3948         case GRO_HELD:
3949                 skb->protocol = eth_type_trans(skb, skb->dev);
3950
3951                 if (ret == GRO_HELD)
3952                         skb_gro_pull(skb, -ETH_HLEN);
3953                 else if (netif_receive_skb(skb))
3954                         ret = GRO_DROP;
3955                 break;
3956
3957         case GRO_DROP:
3958         case GRO_MERGED_FREE:
3959                 napi_reuse_skb(napi, skb);
3960                 break;
3961
3962         case GRO_MERGED:
3963                 break;
3964         }
3965
3966         return ret;
3967 }
3968
3969 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3970 {
3971         struct sk_buff *skb = napi->skb;
3972         struct ethhdr *eth;
3973         unsigned int hlen;
3974         unsigned int off;
3975
3976         napi->skb = NULL;
3977
3978         skb_reset_mac_header(skb);
3979         skb_gro_reset_offset(skb);
3980
3981         off = skb_gro_offset(skb);
3982         hlen = off + sizeof(*eth);
3983         eth = skb_gro_header_fast(skb, off);
3984         if (skb_gro_header_hard(skb, hlen)) {
3985                 eth = skb_gro_header_slow(skb, hlen, off);
3986                 if (unlikely(!eth)) {
3987                         napi_reuse_skb(napi, skb);
3988                         skb = NULL;
3989                         goto out;
3990                 }
3991         }
3992
3993         skb_gro_pull(skb, sizeof(*eth));
3994
3995         /*
3996          * This works because the only protocols we care about don't require
3997          * special handling.  We'll fix it up properly at the end.
3998          */
3999         skb->protocol = eth->h_proto;
4000
4001 out:
4002         return skb;
4003 }
4004
4005 gro_result_t napi_gro_frags(struct napi_struct *napi)
4006 {
4007         struct sk_buff *skb = napi_frags_skb(napi);
4008
4009         if (!skb)
4010                 return GRO_DROP;
4011
4012         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4013 }
4014 EXPORT_SYMBOL(napi_gro_frags);
4015
4016 /*
4017  * net_rps_action sends any pending IPI's for rps.
4018  * Note: called with local irq disabled, but exits with local irq enabled.
4019  */
4020 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4021 {
4022 #ifdef CONFIG_RPS
4023         struct softnet_data *remsd = sd->rps_ipi_list;
4024
4025         if (remsd) {
4026                 sd->rps_ipi_list = NULL;
4027
4028                 local_irq_enable();
4029
4030                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4031                 while (remsd) {
4032                         struct softnet_data *next = remsd->rps_ipi_next;
4033
4034                         if (cpu_online(remsd->cpu))
4035                                 __smp_call_function_single(remsd->cpu,
4036                                                            &remsd->csd, 0);
4037                         remsd = next;
4038                 }
4039         } else
4040 #endif
4041                 local_irq_enable();
4042 }
4043
4044 static int process_backlog(struct napi_struct *napi, int quota)
4045 {
4046         int work = 0;
4047         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4048
4049 #ifdef CONFIG_RPS
4050         /* Check if we have pending ipi, its better to send them now,
4051          * not waiting net_rx_action() end.
4052          */
4053         if (sd->rps_ipi_list) {
4054                 local_irq_disable();
4055                 net_rps_action_and_irq_enable(sd);
4056         }
4057 #endif
4058         napi->weight = weight_p;
4059         local_irq_disable();
4060         while (work < quota) {
4061                 struct sk_buff *skb;
4062                 unsigned int qlen;
4063
4064                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4065                         local_irq_enable();
4066                         __netif_receive_skb(skb);
4067                         local_irq_disable();
4068                         input_queue_head_incr(sd);
4069                         if (++work >= quota) {
4070                                 local_irq_enable();
4071                                 return work;
4072                         }
4073                 }
4074
4075                 rps_lock(sd);
4076                 qlen = skb_queue_len(&sd->input_pkt_queue);
4077                 if (qlen)
4078                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4079                                                    &sd->process_queue);
4080
4081                 if (qlen < quota - work) {
4082                         /*
4083                          * Inline a custom version of __napi_complete().
4084                          * only current cpu owns and manipulates this napi,
4085                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4086                          * we can use a plain write instead of clear_bit(),
4087                          * and we dont need an smp_mb() memory barrier.
4088                          */
4089                         list_del(&napi->poll_list);
4090                         napi->state = 0;
4091
4092                         quota = work + qlen;
4093                 }
4094                 rps_unlock(sd);
4095         }
4096         local_irq_enable();
4097
4098         return work;
4099 }
4100
4101 /**
4102  * __napi_schedule - schedule for receive
4103  * @n: entry to schedule
4104  *
4105  * The entry's receive function will be scheduled to run
4106  */
4107 void __napi_schedule(struct napi_struct *n)
4108 {
4109         unsigned long flags;
4110
4111         local_irq_save(flags);
4112         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4113         local_irq_restore(flags);
4114 }
4115 EXPORT_SYMBOL(__napi_schedule);
4116
4117 void __napi_complete(struct napi_struct *n)
4118 {
4119         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4120         BUG_ON(n->gro_list);
4121
4122         list_del(&n->poll_list);
4123         smp_mb__before_clear_bit();
4124         clear_bit(NAPI_STATE_SCHED, &n->state);
4125 }
4126 EXPORT_SYMBOL(__napi_complete);
4127
4128 void napi_complete(struct napi_struct *n)
4129 {
4130         unsigned long flags;
4131
4132         /*
4133          * don't let napi dequeue from the cpu poll list
4134          * just in case its running on a different cpu
4135          */
4136         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4137                 return;
4138
4139         napi_gro_flush(n, false);
4140         local_irq_save(flags);
4141         __napi_complete(n);
4142         local_irq_restore(flags);
4143 }
4144 EXPORT_SYMBOL(napi_complete);
4145
4146 /* must be called under rcu_read_lock(), as we dont take a reference */
4147 struct napi_struct *napi_by_id(unsigned int napi_id)
4148 {
4149         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4150         struct napi_struct *napi;
4151
4152         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4153                 if (napi->napi_id == napi_id)
4154                         return napi;
4155
4156         return NULL;
4157 }
4158 EXPORT_SYMBOL_GPL(napi_by_id);
4159
4160 void napi_hash_add(struct napi_struct *napi)
4161 {
4162         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4163
4164                 spin_lock(&napi_hash_lock);
4165
4166                 /* 0 is not a valid id, we also skip an id that is taken
4167                  * we expect both events to be extremely rare
4168                  */
4169                 napi->napi_id = 0;
4170                 while (!napi->napi_id) {
4171                         napi->napi_id = ++napi_gen_id;
4172                         if (napi_by_id(napi->napi_id))
4173                                 napi->napi_id = 0;
4174                 }
4175
4176                 hlist_add_head_rcu(&napi->napi_hash_node,
4177                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4178
4179                 spin_unlock(&napi_hash_lock);
4180         }
4181 }
4182 EXPORT_SYMBOL_GPL(napi_hash_add);
4183
4184 /* Warning : caller is responsible to make sure rcu grace period
4185  * is respected before freeing memory containing @napi
4186  */
4187 void napi_hash_del(struct napi_struct *napi)
4188 {
4189         spin_lock(&napi_hash_lock);
4190
4191         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4192                 hlist_del_rcu(&napi->napi_hash_node);
4193
4194         spin_unlock(&napi_hash_lock);
4195 }
4196 EXPORT_SYMBOL_GPL(napi_hash_del);
4197
4198 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4199                     int (*poll)(struct napi_struct *, int), int weight)
4200 {
4201         INIT_LIST_HEAD(&napi->poll_list);
4202         napi->gro_count = 0;
4203         napi->gro_list = NULL;
4204         napi->skb = NULL;
4205         napi->poll = poll;
4206         if (weight > NAPI_POLL_WEIGHT)
4207                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4208                             weight, dev->name);
4209         napi->weight = weight;
4210         list_add(&napi->dev_list, &dev->napi_list);
4211         napi->dev = dev;
4212 #ifdef CONFIG_NETPOLL
4213         spin_lock_init(&napi->poll_lock);
4214         napi->poll_owner = -1;
4215 #endif
4216         set_bit(NAPI_STATE_SCHED, &napi->state);
4217 }
4218 EXPORT_SYMBOL(netif_napi_add);
4219
4220 void netif_napi_del(struct napi_struct *napi)
4221 {
4222         struct sk_buff *skb, *next;
4223
4224         list_del_init(&napi->dev_list);
4225         napi_free_frags(napi);
4226
4227         for (skb = napi->gro_list; skb; skb = next) {
4228                 next = skb->next;
4229                 skb->next = NULL;
4230                 kfree_skb(skb);
4231         }
4232
4233         napi->gro_list = NULL;
4234         napi->gro_count = 0;
4235 }
4236 EXPORT_SYMBOL(netif_napi_del);
4237
4238 static void net_rx_action(struct softirq_action *h)
4239 {
4240         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4241         unsigned long time_limit = jiffies + 2;
4242         int budget = netdev_budget;
4243         void *have;
4244
4245         local_irq_disable();
4246
4247         while (!list_empty(&sd->poll_list)) {
4248                 struct napi_struct *n;
4249                 int work, weight;
4250
4251                 /* If softirq window is exhuasted then punt.
4252                  * Allow this to run for 2 jiffies since which will allow
4253                  * an average latency of 1.5/HZ.
4254                  */
4255                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4256                         goto softnet_break;
4257
4258                 local_irq_enable();
4259
4260                 /* Even though interrupts have been re-enabled, this
4261                  * access is safe because interrupts can only add new
4262                  * entries to the tail of this list, and only ->poll()
4263                  * calls can remove this head entry from the list.
4264                  */
4265                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4266
4267                 have = netpoll_poll_lock(n);
4268
4269                 weight = n->weight;
4270
4271                 /* This NAPI_STATE_SCHED test is for avoiding a race
4272                  * with netpoll's poll_napi().  Only the entity which
4273                  * obtains the lock and sees NAPI_STATE_SCHED set will
4274                  * actually make the ->poll() call.  Therefore we avoid
4275                  * accidentally calling ->poll() when NAPI is not scheduled.
4276                  */
4277                 work = 0;
4278                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4279                         work = n->poll(n, weight);
4280                         trace_napi_poll(n);
4281                 }
4282
4283                 WARN_ON_ONCE(work > weight);
4284
4285                 budget -= work;
4286
4287                 local_irq_disable();
4288
4289                 /* Drivers must not modify the NAPI state if they
4290                  * consume the entire weight.  In such cases this code
4291                  * still "owns" the NAPI instance and therefore can
4292                  * move the instance around on the list at-will.
4293                  */
4294                 if (unlikely(work == weight)) {
4295                         if (unlikely(napi_disable_pending(n))) {
4296                                 local_irq_enable();
4297                                 napi_complete(n);
4298                                 local_irq_disable();
4299                         } else {
4300                                 if (n->gro_list) {
4301                                         /* flush too old packets
4302                                          * If HZ < 1000, flush all packets.
4303                                          */
4304                                         local_irq_enable();
4305                                         napi_gro_flush(n, HZ >= 1000);
4306                                         local_irq_disable();
4307                                 }
4308                                 list_move_tail(&n->poll_list, &sd->poll_list);
4309                         }
4310                 }
4311
4312                 netpoll_poll_unlock(have);
4313         }
4314 out:
4315         net_rps_action_and_irq_enable(sd);
4316
4317 #ifdef CONFIG_NET_DMA
4318         /*
4319          * There may not be any more sk_buffs coming right now, so push
4320          * any pending DMA copies to hardware
4321          */
4322         dma_issue_pending_all();
4323 #endif
4324
4325         return;
4326
4327 softnet_break:
4328         sd->time_squeeze++;
4329         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4330         goto out;
4331 }
4332
4333 struct netdev_upper {
4334         struct net_device *dev;
4335         bool master;
4336         struct list_head list;
4337         struct rcu_head rcu;
4338         struct list_head search_list;
4339 };
4340
4341 static void __append_search_uppers(struct list_head *search_list,
4342                                    struct net_device *dev)
4343 {
4344         struct netdev_upper *upper;
4345
4346         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4347                 /* check if this upper is not already in search list */
4348                 if (list_empty(&upper->search_list))
4349                         list_add_tail(&upper->search_list, search_list);
4350         }
4351 }
4352
4353 static bool __netdev_search_upper_dev(struct net_device *dev,
4354                                       struct net_device *upper_dev)
4355 {
4356         LIST_HEAD(search_list);
4357         struct netdev_upper *upper;
4358         struct netdev_upper *tmp;
4359         bool ret = false;
4360
4361         __append_search_uppers(&search_list, dev);
4362         list_for_each_entry(upper, &search_list, search_list) {
4363                 if (upper->dev == upper_dev) {
4364                         ret = true;
4365                         break;
4366                 }
4367                 __append_search_uppers(&search_list, upper->dev);
4368         }
4369         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4370                 INIT_LIST_HEAD(&upper->search_list);
4371         return ret;
4372 }
4373
4374 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4375                                                 struct net_device *upper_dev)
4376 {
4377         struct netdev_upper *upper;
4378
4379         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4380                 if (upper->dev == upper_dev)
4381                         return upper;
4382         }
4383         return NULL;
4384 }
4385
4386 /**
4387  * netdev_has_upper_dev - Check if device is linked to an upper device
4388  * @dev: device
4389  * @upper_dev: upper device to check
4390  *
4391  * Find out if a device is linked to specified upper device and return true
4392  * in case it is. Note that this checks only immediate upper device,
4393  * not through a complete stack of devices. The caller must hold the RTNL lock.
4394  */
4395 bool netdev_has_upper_dev(struct net_device *dev,
4396                           struct net_device *upper_dev)
4397 {
4398         ASSERT_RTNL();
4399
4400         return __netdev_find_upper(dev, upper_dev);
4401 }
4402 EXPORT_SYMBOL(netdev_has_upper_dev);
4403
4404 /**
4405  * netdev_has_any_upper_dev - Check if device is linked to some device
4406  * @dev: device
4407  *
4408  * Find out if a device is linked to an upper device and return true in case
4409  * it is. The caller must hold the RTNL lock.
4410  */
4411 bool netdev_has_any_upper_dev(struct net_device *dev)
4412 {
4413         ASSERT_RTNL();
4414
4415         return !list_empty(&dev->upper_dev_list);
4416 }
4417 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4418
4419 /**
4420  * netdev_master_upper_dev_get - Get master upper device
4421  * @dev: device
4422  *
4423  * Find a master upper device and return pointer to it or NULL in case
4424  * it's not there. The caller must hold the RTNL lock.
4425  */
4426 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4427 {
4428         struct netdev_upper *upper;
4429
4430         ASSERT_RTNL();
4431
4432         if (list_empty(&dev->upper_dev_list))
4433                 return NULL;
4434
4435         upper = list_first_entry(&dev->upper_dev_list,
4436                                  struct netdev_upper, list);
4437         if (likely(upper->master))
4438                 return upper->dev;
4439         return NULL;
4440 }
4441 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4442
4443 /**
4444  * netdev_master_upper_dev_get_rcu - Get master upper device
4445  * @dev: device
4446  *
4447  * Find a master upper device and return pointer to it or NULL in case
4448  * it's not there. The caller must hold the RCU read lock.
4449  */
4450 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4451 {
4452         struct netdev_upper *upper;
4453
4454         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4455                                        struct netdev_upper, list);
4456         if (upper && likely(upper->master))
4457                 return upper->dev;
4458         return NULL;
4459 }
4460 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4461
4462 static int __netdev_upper_dev_link(struct net_device *dev,
4463                                    struct net_device *upper_dev, bool master)
4464 {
4465         struct netdev_upper *upper;
4466
4467         ASSERT_RTNL();
4468
4469         if (dev == upper_dev)
4470                 return -EBUSY;
4471
4472         /* To prevent loops, check if dev is not upper device to upper_dev. */
4473         if (__netdev_search_upper_dev(upper_dev, dev))
4474                 return -EBUSY;
4475
4476         if (__netdev_find_upper(dev, upper_dev))
4477                 return -EEXIST;
4478
4479         if (master && netdev_master_upper_dev_get(dev))
4480                 return -EBUSY;
4481
4482         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4483         if (!upper)
4484                 return -ENOMEM;
4485
4486         upper->dev = upper_dev;
4487         upper->master = master;
4488         INIT_LIST_HEAD(&upper->search_list);
4489
4490         /* Ensure that master upper link is always the first item in list. */
4491         if (master)
4492                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4493         else
4494                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4495         dev_hold(upper_dev);
4496         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4497         return 0;
4498 }
4499
4500 /**
4501  * netdev_upper_dev_link - Add a link to the upper device
4502  * @dev: device
4503  * @upper_dev: new upper device
4504  *
4505  * Adds a link to device which is upper to this one. The caller must hold
4506  * the RTNL lock. On a failure a negative errno code is returned.
4507  * On success the reference counts are adjusted and the function
4508  * returns zero.
4509  */
4510 int netdev_upper_dev_link(struct net_device *dev,
4511                           struct net_device *upper_dev)
4512 {
4513         return __netdev_upper_dev_link(dev, upper_dev, false);
4514 }
4515 EXPORT_SYMBOL(netdev_upper_dev_link);
4516
4517 /**
4518  * netdev_master_upper_dev_link - Add a master link to the upper device
4519  * @dev: device
4520  * @upper_dev: new upper device
4521  *
4522  * Adds a link to device which is upper to this one. In this case, only
4523  * one master upper device can be linked, although other non-master devices
4524  * might be linked as well. The caller must hold the RTNL lock.
4525  * On a failure a negative errno code is returned. On success the reference
4526  * counts are adjusted and the function returns zero.
4527  */
4528 int netdev_master_upper_dev_link(struct net_device *dev,
4529                                  struct net_device *upper_dev)
4530 {
4531         return __netdev_upper_dev_link(dev, upper_dev, true);
4532 }
4533 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4534
4535 /**
4536  * netdev_upper_dev_unlink - Removes a link to upper device
4537  * @dev: device
4538  * @upper_dev: new upper device
4539  *
4540  * Removes a link to device which is upper to this one. The caller must hold
4541  * the RTNL lock.
4542  */
4543 void netdev_upper_dev_unlink(struct net_device *dev,
4544                              struct net_device *upper_dev)
4545 {
4546         struct netdev_upper *upper;
4547
4548         ASSERT_RTNL();
4549
4550         upper = __netdev_find_upper(dev, upper_dev);
4551         if (!upper)
4552                 return;
4553         list_del_rcu(&upper->list);
4554         dev_put(upper_dev);
4555         kfree_rcu(upper, rcu);
4556         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4557 }
4558 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4559
4560 static void dev_change_rx_flags(struct net_device *dev, int flags)
4561 {
4562         const struct net_device_ops *ops = dev->netdev_ops;
4563
4564         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4565                 ops->ndo_change_rx_flags(dev, flags);
4566 }
4567
4568 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4569 {
4570         unsigned int old_flags = dev->flags;
4571         kuid_t uid;
4572         kgid_t gid;
4573
4574         ASSERT_RTNL();
4575
4576         dev->flags |= IFF_PROMISC;
4577         dev->promiscuity += inc;
4578         if (dev->promiscuity == 0) {
4579                 /*
4580                  * Avoid overflow.
4581                  * If inc causes overflow, untouch promisc and return error.
4582                  */
4583                 if (inc < 0)
4584                         dev->flags &= ~IFF_PROMISC;
4585                 else {
4586                         dev->promiscuity -= inc;
4587                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4588                                 dev->name);
4589                         return -EOVERFLOW;
4590                 }
4591         }
4592         if (dev->flags != old_flags) {
4593                 pr_info("device %s %s promiscuous mode\n",
4594                         dev->name,
4595                         dev->flags & IFF_PROMISC ? "entered" : "left");
4596                 if (audit_enabled) {
4597                         current_uid_gid(&uid, &gid);
4598                         audit_log(current->audit_context, GFP_ATOMIC,
4599                                 AUDIT_ANOM_PROMISCUOUS,
4600                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4601                                 dev->name, (dev->flags & IFF_PROMISC),
4602                                 (old_flags & IFF_PROMISC),
4603                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4604                                 from_kuid(&init_user_ns, uid),
4605                                 from_kgid(&init_user_ns, gid),
4606                                 audit_get_sessionid(current));
4607                 }
4608
4609                 dev_change_rx_flags(dev, IFF_PROMISC);
4610         }
4611         return 0;
4612 }
4613
4614 /**
4615  *      dev_set_promiscuity     - update promiscuity count on a device
4616  *      @dev: device
4617  *      @inc: modifier
4618  *
4619  *      Add or remove promiscuity from a device. While the count in the device
4620  *      remains above zero the interface remains promiscuous. Once it hits zero
4621  *      the device reverts back to normal filtering operation. A negative inc
4622  *      value is used to drop promiscuity on the device.
4623  *      Return 0 if successful or a negative errno code on error.
4624  */
4625 int dev_set_promiscuity(struct net_device *dev, int inc)
4626 {
4627         unsigned int old_flags = dev->flags;
4628         int err;
4629
4630         err = __dev_set_promiscuity(dev, inc);
4631         if (err < 0)
4632                 return err;
4633         if (dev->flags != old_flags)
4634                 dev_set_rx_mode(dev);
4635         return err;
4636 }
4637 EXPORT_SYMBOL(dev_set_promiscuity);
4638
4639 /**
4640  *      dev_set_allmulti        - update allmulti count on a device
4641  *      @dev: device
4642  *      @inc: modifier
4643  *
4644  *      Add or remove reception of all multicast frames to a device. While the
4645  *      count in the device remains above zero the interface remains listening
4646  *      to all interfaces. Once it hits zero the device reverts back to normal
4647  *      filtering operation. A negative @inc value is used to drop the counter
4648  *      when releasing a resource needing all multicasts.
4649  *      Return 0 if successful or a negative errno code on error.
4650  */
4651
4652 int dev_set_allmulti(struct net_device *dev, int inc)
4653 {
4654         unsigned int old_flags = dev->flags;
4655
4656         ASSERT_RTNL();
4657
4658         dev->flags |= IFF_ALLMULTI;
4659         dev->allmulti += inc;
4660         if (dev->allmulti == 0) {
4661                 /*
4662                  * Avoid overflow.
4663                  * If inc causes overflow, untouch allmulti and return error.
4664                  */
4665                 if (inc < 0)
4666                         dev->flags &= ~IFF_ALLMULTI;
4667                 else {
4668                         dev->allmulti -= inc;
4669                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4670                                 dev->name);
4671                         return -EOVERFLOW;
4672                 }
4673         }
4674         if (dev->flags ^ old_flags) {
4675                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4676                 dev_set_rx_mode(dev);
4677         }
4678         return 0;
4679 }
4680 EXPORT_SYMBOL(dev_set_allmulti);
4681
4682 /*
4683  *      Upload unicast and multicast address lists to device and
4684  *      configure RX filtering. When the device doesn't support unicast
4685  *      filtering it is put in promiscuous mode while unicast addresses
4686  *      are present.
4687  */
4688 void __dev_set_rx_mode(struct net_device *dev)
4689 {
4690         const struct net_device_ops *ops = dev->netdev_ops;
4691
4692         /* dev_open will call this function so the list will stay sane. */
4693         if (!(dev->flags&IFF_UP))
4694                 return;
4695
4696         if (!netif_device_present(dev))
4697                 return;
4698
4699         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4700                 /* Unicast addresses changes may only happen under the rtnl,
4701                  * therefore calling __dev_set_promiscuity here is safe.
4702                  */
4703                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4704                         __dev_set_promiscuity(dev, 1);
4705                         dev->uc_promisc = true;
4706                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4707                         __dev_set_promiscuity(dev, -1);
4708                         dev->uc_promisc = false;
4709                 }
4710         }
4711
4712         if (ops->ndo_set_rx_mode)
4713                 ops->ndo_set_rx_mode(dev);
4714 }
4715
4716 void dev_set_rx_mode(struct net_device *dev)
4717 {
4718         netif_addr_lock_bh(dev);
4719         __dev_set_rx_mode(dev);
4720         netif_addr_unlock_bh(dev);
4721 }
4722
4723 /**
4724  *      dev_get_flags - get flags reported to userspace
4725  *      @dev: device
4726  *
4727  *      Get the combination of flag bits exported through APIs to userspace.
4728  */
4729 unsigned int dev_get_flags(const struct net_device *dev)
4730 {
4731         unsigned int flags;
4732
4733         flags = (dev->flags & ~(IFF_PROMISC |
4734                                 IFF_ALLMULTI |
4735                                 IFF_RUNNING |
4736                                 IFF_LOWER_UP |
4737                                 IFF_DORMANT)) |
4738                 (dev->gflags & (IFF_PROMISC |
4739                                 IFF_ALLMULTI));
4740
4741         if (netif_running(dev)) {
4742                 if (netif_oper_up(dev))
4743                         flags |= IFF_RUNNING;
4744                 if (netif_carrier_ok(dev))
4745                         flags |= IFF_LOWER_UP;
4746                 if (netif_dormant(dev))
4747                         flags |= IFF_DORMANT;
4748         }
4749
4750         return flags;
4751 }
4752 EXPORT_SYMBOL(dev_get_flags);
4753
4754 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4755 {
4756         unsigned int old_flags = dev->flags;
4757         int ret;
4758
4759         ASSERT_RTNL();
4760
4761         /*
4762          *      Set the flags on our device.
4763          */
4764
4765         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4766                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4767                                IFF_AUTOMEDIA)) |
4768                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4769                                     IFF_ALLMULTI));
4770
4771         /*
4772          *      Load in the correct multicast list now the flags have changed.
4773          */
4774
4775         if ((old_flags ^ flags) & IFF_MULTICAST)
4776                 dev_change_rx_flags(dev, IFF_MULTICAST);
4777
4778         dev_set_rx_mode(dev);
4779
4780         /*
4781          *      Have we downed the interface. We handle IFF_UP ourselves
4782          *      according to user attempts to set it, rather than blindly
4783          *      setting it.
4784          */
4785
4786         ret = 0;
4787         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4788                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4789
4790                 if (!ret)
4791                         dev_set_rx_mode(dev);
4792         }
4793
4794         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4795                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4796
4797                 dev->gflags ^= IFF_PROMISC;
4798                 dev_set_promiscuity(dev, inc);
4799         }
4800
4801         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4802            is important. Some (broken) drivers set IFF_PROMISC, when
4803            IFF_ALLMULTI is requested not asking us and not reporting.
4804          */
4805         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4806                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4807
4808                 dev->gflags ^= IFF_ALLMULTI;
4809                 dev_set_allmulti(dev, inc);
4810         }
4811
4812         return ret;
4813 }
4814
4815 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4816 {
4817         unsigned int changes = dev->flags ^ old_flags;
4818
4819         if (changes & IFF_UP) {
4820                 if (dev->flags & IFF_UP)
4821                         call_netdevice_notifiers(NETDEV_UP, dev);
4822                 else
4823                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4824         }
4825
4826         if (dev->flags & IFF_UP &&
4827             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4828                 struct netdev_notifier_change_info change_info;
4829
4830                 change_info.flags_changed = changes;
4831                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4832                                               &change_info.info);
4833         }
4834 }
4835
4836 /**
4837  *      dev_change_flags - change device settings
4838  *      @dev: device
4839  *      @flags: device state flags
4840  *
4841  *      Change settings on device based state flags. The flags are
4842  *      in the userspace exported format.
4843  */
4844 int dev_change_flags(struct net_device *dev, unsigned int flags)
4845 {
4846         int ret;
4847         unsigned int changes, old_flags = dev->flags;
4848
4849         ret = __dev_change_flags(dev, flags);
4850         if (ret < 0)
4851                 return ret;
4852
4853         changes = old_flags ^ dev->flags;
4854         if (changes)
4855                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4856
4857         __dev_notify_flags(dev, old_flags);
4858         return ret;
4859 }
4860 EXPORT_SYMBOL(dev_change_flags);
4861
4862 /**
4863  *      dev_set_mtu - Change maximum transfer unit
4864  *      @dev: device
4865  *      @new_mtu: new transfer unit
4866  *
4867  *      Change the maximum transfer size of the network device.
4868  */
4869 int dev_set_mtu(struct net_device *dev, int new_mtu)
4870 {
4871         const struct net_device_ops *ops = dev->netdev_ops;
4872         int err;
4873
4874         if (new_mtu == dev->mtu)
4875                 return 0;
4876
4877         /*      MTU must be positive.    */
4878         if (new_mtu < 0)
4879                 return -EINVAL;
4880
4881         if (!netif_device_present(dev))
4882                 return -ENODEV;
4883
4884         err = 0;
4885         if (ops->ndo_change_mtu)
4886                 err = ops->ndo_change_mtu(dev, new_mtu);
4887         else
4888                 dev->mtu = new_mtu;
4889
4890         if (!err)
4891                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4892         return err;
4893 }
4894 EXPORT_SYMBOL(dev_set_mtu);
4895
4896 /**
4897  *      dev_set_group - Change group this device belongs to
4898  *      @dev: device
4899  *      @new_group: group this device should belong to
4900  */
4901 void dev_set_group(struct net_device *dev, int new_group)
4902 {
4903         dev->group = new_group;
4904 }
4905 EXPORT_SYMBOL(dev_set_group);
4906
4907 /**
4908  *      dev_set_mac_address - Change Media Access Control Address
4909  *      @dev: device
4910  *      @sa: new address
4911  *
4912  *      Change the hardware (MAC) address of the device
4913  */
4914 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4915 {
4916         const struct net_device_ops *ops = dev->netdev_ops;
4917         int err;
4918
4919         if (!ops->ndo_set_mac_address)
4920                 return -EOPNOTSUPP;
4921         if (sa->sa_family != dev->type)
4922                 return -EINVAL;
4923         if (!netif_device_present(dev))
4924                 return -ENODEV;
4925         err = ops->ndo_set_mac_address(dev, sa);
4926         if (err)
4927                 return err;
4928         dev->addr_assign_type = NET_ADDR_SET;
4929         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4930         add_device_randomness(dev->dev_addr, dev->addr_len);
4931         return 0;
4932 }
4933 EXPORT_SYMBOL(dev_set_mac_address);
4934
4935 /**
4936  *      dev_change_carrier - Change device carrier
4937  *      @dev: device
4938  *      @new_carrier: new value
4939  *
4940  *      Change device carrier
4941  */
4942 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4943 {
4944         const struct net_device_ops *ops = dev->netdev_ops;
4945
4946         if (!ops->ndo_change_carrier)
4947                 return -EOPNOTSUPP;
4948         if (!netif_device_present(dev))
4949                 return -ENODEV;
4950         return ops->ndo_change_carrier(dev, new_carrier);
4951 }
4952 EXPORT_SYMBOL(dev_change_carrier);
4953
4954 /**
4955  *      dev_new_index   -       allocate an ifindex
4956  *      @net: the applicable net namespace
4957  *
4958  *      Returns a suitable unique value for a new device interface
4959  *      number.  The caller must hold the rtnl semaphore or the
4960  *      dev_base_lock to be sure it remains unique.
4961  */
4962 static int dev_new_index(struct net *net)
4963 {
4964         int ifindex = net->ifindex;
4965         for (;;) {
4966                 if (++ifindex <= 0)
4967                         ifindex = 1;
4968                 if (!__dev_get_by_index(net, ifindex))
4969                         return net->ifindex = ifindex;
4970         }
4971 }
4972
4973 /* Delayed registration/unregisteration */
4974 static LIST_HEAD(net_todo_list);
4975
4976 static void net_set_todo(struct net_device *dev)
4977 {
4978         list_add_tail(&dev->todo_list, &net_todo_list);
4979 }
4980
4981 static void rollback_registered_many(struct list_head *head)
4982 {
4983         struct net_device *dev, *tmp;
4984
4985         BUG_ON(dev_boot_phase);
4986         ASSERT_RTNL();
4987
4988         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4989                 /* Some devices call without registering
4990                  * for initialization unwind. Remove those
4991                  * devices and proceed with the remaining.
4992                  */
4993                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4994                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4995                                  dev->name, dev);
4996
4997                         WARN_ON(1);
4998                         list_del(&dev->unreg_list);
4999                         continue;
5000                 }
5001                 dev->dismantle = true;
5002                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5003         }
5004
5005         /* If device is running, close it first. */
5006         dev_close_many(head);
5007
5008         list_for_each_entry(dev, head, unreg_list) {
5009                 /* And unlink it from device chain. */
5010                 unlist_netdevice(dev);
5011
5012                 dev->reg_state = NETREG_UNREGISTERING;
5013         }
5014
5015         synchronize_net();
5016
5017         list_for_each_entry(dev, head, unreg_list) {
5018                 /* Shutdown queueing discipline. */
5019                 dev_shutdown(dev);
5020
5021
5022                 /* Notify protocols, that we are about to destroy
5023                    this device. They should clean all the things.
5024                 */
5025                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5026
5027                 if (!dev->rtnl_link_ops ||
5028                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5029                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5030
5031                 /*
5032                  *      Flush the unicast and multicast chains
5033                  */
5034                 dev_uc_flush(dev);
5035                 dev_mc_flush(dev);
5036
5037                 if (dev->netdev_ops->ndo_uninit)
5038                         dev->netdev_ops->ndo_uninit(dev);
5039
5040                 /* Notifier chain MUST detach us all upper devices. */
5041                 WARN_ON(netdev_has_any_upper_dev(dev));
5042
5043                 /* Remove entries from kobject tree */
5044                 netdev_unregister_kobject(dev);
5045 #ifdef CONFIG_XPS
5046                 /* Remove XPS queueing entries */
5047                 netif_reset_xps_queues_gt(dev, 0);
5048 #endif
5049         }
5050
5051         synchronize_net();
5052
5053         list_for_each_entry(dev, head, unreg_list)
5054                 dev_put(dev);
5055 }
5056
5057 static void rollback_registered(struct net_device *dev)
5058 {
5059         LIST_HEAD(single);
5060
5061         list_add(&dev->unreg_list, &single);
5062         rollback_registered_many(&single);
5063         list_del(&single);
5064 }
5065
5066 static netdev_features_t netdev_fix_features(struct net_device *dev,
5067         netdev_features_t features)
5068 {
5069         /* Fix illegal checksum combinations */
5070         if ((features & NETIF_F_HW_CSUM) &&
5071             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5072                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5073                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5074         }
5075
5076         /* TSO requires that SG is present as well. */
5077         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5078                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5079                 features &= ~NETIF_F_ALL_TSO;
5080         }
5081
5082         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5083                                         !(features & NETIF_F_IP_CSUM)) {
5084                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5085                 features &= ~NETIF_F_TSO;
5086                 features &= ~NETIF_F_TSO_ECN;
5087         }
5088
5089         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5090                                          !(features & NETIF_F_IPV6_CSUM)) {
5091                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5092                 features &= ~NETIF_F_TSO6;
5093         }
5094
5095         /* TSO ECN requires that TSO is present as well. */
5096         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5097                 features &= ~NETIF_F_TSO_ECN;
5098
5099         /* Software GSO depends on SG. */
5100         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5101                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5102                 features &= ~NETIF_F_GSO;
5103         }
5104
5105         /* UFO needs SG and checksumming */
5106         if (features & NETIF_F_UFO) {
5107                 /* maybe split UFO into V4 and V6? */
5108                 if (!((features & NETIF_F_GEN_CSUM) ||
5109                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5110                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5111                         netdev_dbg(dev,
5112                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5113                         features &= ~NETIF_F_UFO;
5114                 }
5115
5116                 if (!(features & NETIF_F_SG)) {
5117                         netdev_dbg(dev,
5118                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5119                         features &= ~NETIF_F_UFO;
5120                 }
5121         }
5122
5123         return features;
5124 }
5125
5126 int __netdev_update_features(struct net_device *dev)
5127 {
5128         netdev_features_t features;
5129         int err = 0;
5130
5131         ASSERT_RTNL();
5132
5133         features = netdev_get_wanted_features(dev);
5134
5135         if (dev->netdev_ops->ndo_fix_features)
5136                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5137
5138         /* driver might be less strict about feature dependencies */
5139         features = netdev_fix_features(dev, features);
5140
5141         if (dev->features == features)
5142                 return 0;
5143
5144         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5145                 &dev->features, &features);
5146
5147         if (dev->netdev_ops->ndo_set_features)
5148                 err = dev->netdev_ops->ndo_set_features(dev, features);
5149
5150         if (unlikely(err < 0)) {
5151                 netdev_err(dev,
5152                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5153                         err, &features, &dev->features);
5154                 return -1;
5155         }
5156
5157         if (!err)
5158                 dev->features = features;
5159
5160         return 1;
5161 }
5162
5163 /**
5164  *      netdev_update_features - recalculate device features
5165  *      @dev: the device to check
5166  *
5167  *      Recalculate dev->features set and send notifications if it
5168  *      has changed. Should be called after driver or hardware dependent
5169  *      conditions might have changed that influence the features.
5170  */
5171 void netdev_update_features(struct net_device *dev)
5172 {
5173         if (__netdev_update_features(dev))
5174                 netdev_features_change(dev);
5175 }
5176 EXPORT_SYMBOL(netdev_update_features);
5177
5178 /**
5179  *      netdev_change_features - recalculate device features
5180  *      @dev: the device to check
5181  *
5182  *      Recalculate dev->features set and send notifications even
5183  *      if they have not changed. Should be called instead of
5184  *      netdev_update_features() if also dev->vlan_features might
5185  *      have changed to allow the changes to be propagated to stacked
5186  *      VLAN devices.
5187  */
5188 void netdev_change_features(struct net_device *dev)
5189 {
5190         __netdev_update_features(dev);
5191         netdev_features_change(dev);
5192 }
5193 EXPORT_SYMBOL(netdev_change_features);
5194
5195 /**
5196  *      netif_stacked_transfer_operstate -      transfer operstate
5197  *      @rootdev: the root or lower level device to transfer state from
5198  *      @dev: the device to transfer operstate to
5199  *
5200  *      Transfer operational state from root to device. This is normally
5201  *      called when a stacking relationship exists between the root
5202  *      device and the device(a leaf device).
5203  */
5204 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5205                                         struct net_device *dev)
5206 {
5207         if (rootdev->operstate == IF_OPER_DORMANT)
5208                 netif_dormant_on(dev);
5209         else
5210                 netif_dormant_off(dev);
5211
5212         if (netif_carrier_ok(rootdev)) {
5213                 if (!netif_carrier_ok(dev))
5214                         netif_carrier_on(dev);
5215         } else {
5216                 if (netif_carrier_ok(dev))
5217                         netif_carrier_off(dev);
5218         }
5219 }
5220 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5221
5222 #ifdef CONFIG_RPS
5223 static int netif_alloc_rx_queues(struct net_device *dev)
5224 {
5225         unsigned int i, count = dev->num_rx_queues;
5226         struct netdev_rx_queue *rx;
5227
5228         BUG_ON(count < 1);
5229
5230         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5231         if (!rx)
5232                 return -ENOMEM;
5233
5234         dev->_rx = rx;
5235
5236         for (i = 0; i < count; i++)
5237                 rx[i].dev = dev;
5238         return 0;
5239 }
5240 #endif
5241
5242 static void netdev_init_one_queue(struct net_device *dev,
5243                                   struct netdev_queue *queue, void *_unused)
5244 {
5245         /* Initialize queue lock */
5246         spin_lock_init(&queue->_xmit_lock);
5247         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5248         queue->xmit_lock_owner = -1;
5249         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5250         queue->dev = dev;
5251 #ifdef CONFIG_BQL
5252         dql_init(&queue->dql, HZ);
5253 #endif
5254 }
5255
5256 static int netif_alloc_netdev_queues(struct net_device *dev)
5257 {
5258         unsigned int count = dev->num_tx_queues;
5259         struct netdev_queue *tx;
5260
5261         BUG_ON(count < 1);
5262
5263         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5264         if (!tx)
5265                 return -ENOMEM;
5266
5267         dev->_tx = tx;
5268
5269         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5270         spin_lock_init(&dev->tx_global_lock);
5271
5272         return 0;
5273 }
5274
5275 /**
5276  *      register_netdevice      - register a network device
5277  *      @dev: device to register
5278  *
5279  *      Take a completed network device structure and add it to the kernel
5280  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5281  *      chain. 0 is returned on success. A negative errno code is returned
5282  *      on a failure to set up the device, or if the name is a duplicate.
5283  *
5284  *      Callers must hold the rtnl semaphore. You may want
5285  *      register_netdev() instead of this.
5286  *
5287  *      BUGS:
5288  *      The locking appears insufficient to guarantee two parallel registers
5289  *      will not get the same name.
5290  */
5291
5292 int register_netdevice(struct net_device *dev)
5293 {
5294         int ret;
5295         struct net *net = dev_net(dev);
5296
5297         BUG_ON(dev_boot_phase);
5298         ASSERT_RTNL();
5299
5300         might_sleep();
5301
5302         /* When net_device's are persistent, this will be fatal. */
5303         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5304         BUG_ON(!net);
5305
5306         spin_lock_init(&dev->addr_list_lock);
5307         netdev_set_addr_lockdep_class(dev);
5308
5309         dev->iflink = -1;
5310
5311         ret = dev_get_valid_name(net, dev, dev->name);
5312         if (ret < 0)
5313                 goto out;
5314
5315         /* Init, if this function is available */
5316         if (dev->netdev_ops->ndo_init) {
5317                 ret = dev->netdev_ops->ndo_init(dev);
5318                 if (ret) {
5319                         if (ret > 0)
5320                                 ret = -EIO;
5321                         goto out;
5322                 }
5323         }
5324
5325         if (((dev->hw_features | dev->features) &
5326              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5327             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5328              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5329                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5330                 ret = -EINVAL;
5331                 goto err_uninit;
5332         }
5333
5334         ret = -EBUSY;
5335         if (!dev->ifindex)
5336                 dev->ifindex = dev_new_index(net);
5337         else if (__dev_get_by_index(net, dev->ifindex))
5338                 goto err_uninit;
5339
5340         if (dev->iflink == -1)
5341                 dev->iflink = dev->ifindex;
5342
5343         /* Transfer changeable features to wanted_features and enable
5344          * software offloads (GSO and GRO).
5345          */
5346         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5347         dev->features |= NETIF_F_SOFT_FEATURES;
5348         dev->wanted_features = dev->features & dev->hw_features;
5349
5350         /* Turn on no cache copy if HW is doing checksum */
5351         if (!(dev->flags & IFF_LOOPBACK)) {
5352                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5353                 if (dev->features & NETIF_F_ALL_CSUM) {
5354                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5355                         dev->features |= NETIF_F_NOCACHE_COPY;
5356                 }
5357         }
5358
5359         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5360          */
5361         dev->vlan_features |= NETIF_F_HIGHDMA;
5362
5363         /* Make NETIF_F_SG inheritable to tunnel devices.
5364          */
5365         dev->hw_enc_features |= NETIF_F_SG;
5366
5367         /* Make NETIF_F_SG inheritable to MPLS.
5368          */
5369         dev->mpls_features |= NETIF_F_SG;
5370
5371         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5372         ret = notifier_to_errno(ret);
5373         if (ret)
5374                 goto err_uninit;
5375
5376         ret = netdev_register_kobject(dev);
5377         if (ret)
5378                 goto err_uninit;
5379         dev->reg_state = NETREG_REGISTERED;
5380
5381         __netdev_update_features(dev);
5382
5383         /*
5384          *      Default initial state at registry is that the
5385          *      device is present.
5386          */
5387
5388         set_bit(__LINK_STATE_PRESENT, &dev->state);
5389
5390         linkwatch_init_dev(dev);
5391
5392         dev_init_scheduler(dev);
5393         dev_hold(dev);
5394         list_netdevice(dev);
5395         add_device_randomness(dev->dev_addr, dev->addr_len);
5396
5397         /* If the device has permanent device address, driver should
5398          * set dev_addr and also addr_assign_type should be set to
5399          * NET_ADDR_PERM (default value).
5400          */
5401         if (dev->addr_assign_type == NET_ADDR_PERM)
5402                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5403
5404         /* Notify protocols, that a new device appeared. */
5405         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5406         ret = notifier_to_errno(ret);
5407         if (ret) {
5408                 rollback_registered(dev);
5409                 dev->reg_state = NETREG_UNREGISTERED;
5410         }
5411         /*
5412          *      Prevent userspace races by waiting until the network
5413          *      device is fully setup before sending notifications.
5414          */
5415         if (!dev->rtnl_link_ops ||
5416             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5417                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5418
5419 out:
5420         return ret;
5421
5422 err_uninit:
5423         if (dev->netdev_ops->ndo_uninit)
5424                 dev->netdev_ops->ndo_uninit(dev);
5425         goto out;
5426 }
5427 EXPORT_SYMBOL(register_netdevice);
5428
5429 /**
5430  *      init_dummy_netdev       - init a dummy network device for NAPI
5431  *      @dev: device to init
5432  *
5433  *      This takes a network device structure and initialize the minimum
5434  *      amount of fields so it can be used to schedule NAPI polls without
5435  *      registering a full blown interface. This is to be used by drivers
5436  *      that need to tie several hardware interfaces to a single NAPI
5437  *      poll scheduler due to HW limitations.
5438  */
5439 int init_dummy_netdev(struct net_device *dev)
5440 {
5441         /* Clear everything. Note we don't initialize spinlocks
5442          * are they aren't supposed to be taken by any of the
5443          * NAPI code and this dummy netdev is supposed to be
5444          * only ever used for NAPI polls
5445          */
5446         memset(dev, 0, sizeof(struct net_device));
5447
5448         /* make sure we BUG if trying to hit standard
5449          * register/unregister code path
5450          */
5451         dev->reg_state = NETREG_DUMMY;
5452
5453         /* NAPI wants this */
5454         INIT_LIST_HEAD(&dev->napi_list);
5455
5456         /* a dummy interface is started by default */
5457         set_bit(__LINK_STATE_PRESENT, &dev->state);
5458         set_bit(__LINK_STATE_START, &dev->state);
5459
5460         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5461          * because users of this 'device' dont need to change
5462          * its refcount.
5463          */
5464
5465         return 0;
5466 }
5467 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5468
5469
5470 /**
5471  *      register_netdev - register a network device
5472  *      @dev: device to register
5473  *
5474  *      Take a completed network device structure and add it to the kernel
5475  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5476  *      chain. 0 is returned on success. A negative errno code is returned
5477  *      on a failure to set up the device, or if the name is a duplicate.
5478  *
5479  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5480  *      and expands the device name if you passed a format string to
5481  *      alloc_netdev.
5482  */
5483 int register_netdev(struct net_device *dev)
5484 {
5485         int err;
5486
5487         rtnl_lock();
5488         err = register_netdevice(dev);
5489         rtnl_unlock();
5490         return err;
5491 }
5492 EXPORT_SYMBOL(register_netdev);
5493
5494 int netdev_refcnt_read(const struct net_device *dev)
5495 {
5496         int i, refcnt = 0;
5497
5498         for_each_possible_cpu(i)
5499                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5500         return refcnt;
5501 }
5502 EXPORT_SYMBOL(netdev_refcnt_read);
5503
5504 /**
5505  * netdev_wait_allrefs - wait until all references are gone.
5506  * @dev: target net_device
5507  *
5508  * This is called when unregistering network devices.
5509  *
5510  * Any protocol or device that holds a reference should register
5511  * for netdevice notification, and cleanup and put back the
5512  * reference if they receive an UNREGISTER event.
5513  * We can get stuck here if buggy protocols don't correctly
5514  * call dev_put.
5515  */
5516 static void netdev_wait_allrefs(struct net_device *dev)
5517 {
5518         unsigned long rebroadcast_time, warning_time;
5519         int refcnt;
5520
5521         linkwatch_forget_dev(dev);
5522
5523         rebroadcast_time = warning_time = jiffies;
5524         refcnt = netdev_refcnt_read(dev);
5525
5526         while (refcnt != 0) {
5527                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5528                         rtnl_lock();
5529
5530                         /* Rebroadcast unregister notification */
5531                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5532
5533                         __rtnl_unlock();
5534                         rcu_barrier();
5535                         rtnl_lock();
5536
5537                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5538                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5539                                      &dev->state)) {
5540                                 /* We must not have linkwatch events
5541                                  * pending on unregister. If this
5542                                  * happens, we simply run the queue
5543                                  * unscheduled, resulting in a noop
5544                                  * for this device.
5545                                  */
5546                                 linkwatch_run_queue();
5547                         }
5548
5549                         __rtnl_unlock();
5550
5551                         rebroadcast_time = jiffies;
5552                 }
5553
5554                 msleep(250);
5555
5556                 refcnt = netdev_refcnt_read(dev);
5557
5558                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5559                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5560                                  dev->name, refcnt);
5561                         warning_time = jiffies;
5562                 }
5563         }
5564 }
5565
5566 /* The sequence is:
5567  *
5568  *      rtnl_lock();
5569  *      ...
5570  *      register_netdevice(x1);
5571  *      register_netdevice(x2);
5572  *      ...
5573  *      unregister_netdevice(y1);
5574  *      unregister_netdevice(y2);
5575  *      ...
5576  *      rtnl_unlock();
5577  *      free_netdev(y1);
5578  *      free_netdev(y2);
5579  *
5580  * We are invoked by rtnl_unlock().
5581  * This allows us to deal with problems:
5582  * 1) We can delete sysfs objects which invoke hotplug
5583  *    without deadlocking with linkwatch via keventd.
5584  * 2) Since we run with the RTNL semaphore not held, we can sleep
5585  *    safely in order to wait for the netdev refcnt to drop to zero.
5586  *
5587  * We must not return until all unregister events added during
5588  * the interval the lock was held have been completed.
5589  */
5590 void netdev_run_todo(void)
5591 {
5592         struct list_head list;
5593
5594         /* Snapshot list, allow later requests */
5595         list_replace_init(&net_todo_list, &list);
5596
5597         __rtnl_unlock();
5598
5599
5600         /* Wait for rcu callbacks to finish before next phase */
5601         if (!list_empty(&list))
5602                 rcu_barrier();
5603
5604         while (!list_empty(&list)) {
5605                 struct net_device *dev
5606                         = list_first_entry(&list, struct net_device, todo_list);
5607                 list_del(&dev->todo_list);
5608
5609                 rtnl_lock();
5610                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5611                 __rtnl_unlock();
5612
5613                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5614                         pr_err("network todo '%s' but state %d\n",
5615                                dev->name, dev->reg_state);
5616                         dump_stack();
5617                         continue;
5618                 }
5619
5620                 dev->reg_state = NETREG_UNREGISTERED;
5621
5622                 on_each_cpu(flush_backlog, dev, 1);
5623
5624                 netdev_wait_allrefs(dev);
5625
5626                 /* paranoia */
5627                 BUG_ON(netdev_refcnt_read(dev));
5628                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5629                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5630                 WARN_ON(dev->dn_ptr);
5631
5632                 if (dev->destructor)
5633                         dev->destructor(dev);
5634
5635                 /* Free network device */
5636                 kobject_put(&dev->dev.kobj);
5637         }
5638 }
5639
5640 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5641  * fields in the same order, with only the type differing.
5642  */
5643 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5644                              const struct net_device_stats *netdev_stats)
5645 {
5646 #if BITS_PER_LONG == 64
5647         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5648         memcpy(stats64, netdev_stats, sizeof(*stats64));
5649 #else
5650         size_t i, n = sizeof(*stats64) / sizeof(u64);
5651         const unsigned long *src = (const unsigned long *)netdev_stats;
5652         u64 *dst = (u64 *)stats64;
5653
5654         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5655                      sizeof(*stats64) / sizeof(u64));
5656         for (i = 0; i < n; i++)
5657                 dst[i] = src[i];
5658 #endif
5659 }
5660 EXPORT_SYMBOL(netdev_stats_to_stats64);
5661
5662 /**
5663  *      dev_get_stats   - get network device statistics
5664  *      @dev: device to get statistics from
5665  *      @storage: place to store stats
5666  *
5667  *      Get network statistics from device. Return @storage.
5668  *      The device driver may provide its own method by setting
5669  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5670  *      otherwise the internal statistics structure is used.
5671  */
5672 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5673                                         struct rtnl_link_stats64 *storage)
5674 {
5675         const struct net_device_ops *ops = dev->netdev_ops;
5676
5677         if (ops->ndo_get_stats64) {
5678                 memset(storage, 0, sizeof(*storage));
5679                 ops->ndo_get_stats64(dev, storage);
5680         } else if (ops->ndo_get_stats) {
5681                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5682         } else {
5683                 netdev_stats_to_stats64(storage, &dev->stats);
5684         }
5685         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5686         return storage;
5687 }
5688 EXPORT_SYMBOL(dev_get_stats);
5689
5690 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5691 {
5692         struct netdev_queue *queue = dev_ingress_queue(dev);
5693
5694 #ifdef CONFIG_NET_CLS_ACT
5695         if (queue)
5696                 return queue;
5697         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5698         if (!queue)
5699                 return NULL;
5700         netdev_init_one_queue(dev, queue, NULL);
5701         queue->qdisc = &noop_qdisc;
5702         queue->qdisc_sleeping = &noop_qdisc;
5703         rcu_assign_pointer(dev->ingress_queue, queue);
5704 #endif
5705         return queue;
5706 }
5707
5708 static const struct ethtool_ops default_ethtool_ops;
5709
5710 void netdev_set_default_ethtool_ops(struct net_device *dev,
5711                                     const struct ethtool_ops *ops)
5712 {
5713         if (dev->ethtool_ops == &default_ethtool_ops)
5714                 dev->ethtool_ops = ops;
5715 }
5716 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5717
5718 /**
5719  *      alloc_netdev_mqs - allocate network device
5720  *      @sizeof_priv:   size of private data to allocate space for
5721  *      @name:          device name format string
5722  *      @setup:         callback to initialize device
5723  *      @txqs:          the number of TX subqueues to allocate
5724  *      @rxqs:          the number of RX subqueues to allocate
5725  *
5726  *      Allocates a struct net_device with private data area for driver use
5727  *      and performs basic initialization.  Also allocates subquue structs
5728  *      for each queue on the device.
5729  */
5730 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5731                 void (*setup)(struct net_device *),
5732                 unsigned int txqs, unsigned int rxqs)
5733 {
5734         struct net_device *dev;
5735         size_t alloc_size;
5736         struct net_device *p;
5737
5738         BUG_ON(strlen(name) >= sizeof(dev->name));
5739
5740         if (txqs < 1) {
5741                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5742                 return NULL;
5743         }
5744
5745 #ifdef CONFIG_RPS
5746         if (rxqs < 1) {
5747                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5748                 return NULL;
5749         }
5750 #endif
5751
5752         alloc_size = sizeof(struct net_device);
5753         if (sizeof_priv) {
5754                 /* ensure 32-byte alignment of private area */
5755                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5756                 alloc_size += sizeof_priv;
5757         }
5758         /* ensure 32-byte alignment of whole construct */
5759         alloc_size += NETDEV_ALIGN - 1;
5760
5761         p = kzalloc(alloc_size, GFP_KERNEL);
5762         if (!p)
5763                 return NULL;
5764
5765         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5766         dev->padded = (char *)dev - (char *)p;
5767
5768         dev->pcpu_refcnt = alloc_percpu(int);
5769         if (!dev->pcpu_refcnt)
5770                 goto free_p;
5771
5772         if (dev_addr_init(dev))
5773                 goto free_pcpu;
5774
5775         dev_mc_init(dev);
5776         dev_uc_init(dev);
5777
5778         dev_net_set(dev, &init_net);
5779
5780         dev->gso_max_size = GSO_MAX_SIZE;
5781         dev->gso_max_segs = GSO_MAX_SEGS;
5782
5783         INIT_LIST_HEAD(&dev->napi_list);
5784         INIT_LIST_HEAD(&dev->unreg_list);
5785         INIT_LIST_HEAD(&dev->link_watch_list);
5786         INIT_LIST_HEAD(&dev->upper_dev_list);
5787         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5788         setup(dev);
5789
5790         dev->num_tx_queues = txqs;
5791         dev->real_num_tx_queues = txqs;
5792         if (netif_alloc_netdev_queues(dev))
5793                 goto free_all;
5794
5795 #ifdef CONFIG_RPS
5796         dev->num_rx_queues = rxqs;
5797         dev->real_num_rx_queues = rxqs;
5798         if (netif_alloc_rx_queues(dev))
5799                 goto free_all;
5800 #endif
5801
5802         strcpy(dev->name, name);
5803         dev->group = INIT_NETDEV_GROUP;
5804         if (!dev->ethtool_ops)
5805                 dev->ethtool_ops = &default_ethtool_ops;
5806         return dev;
5807
5808 free_all:
5809         free_netdev(dev);
5810         return NULL;
5811
5812 free_pcpu:
5813         free_percpu(dev->pcpu_refcnt);
5814         kfree(dev->_tx);
5815 #ifdef CONFIG_RPS
5816         kfree(dev->_rx);
5817 #endif
5818
5819 free_p:
5820         kfree(p);
5821         return NULL;
5822 }
5823 EXPORT_SYMBOL(alloc_netdev_mqs);
5824
5825 /**
5826  *      free_netdev - free network device
5827  *      @dev: device
5828  *
5829  *      This function does the last stage of destroying an allocated device
5830  *      interface. The reference to the device object is released.
5831  *      If this is the last reference then it will be freed.
5832  */
5833 void free_netdev(struct net_device *dev)
5834 {
5835         struct napi_struct *p, *n;
5836
5837         release_net(dev_net(dev));
5838
5839         kfree(dev->_tx);
5840 #ifdef CONFIG_RPS
5841         kfree(dev->_rx);
5842 #endif
5843
5844         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5845
5846         /* Flush device addresses */
5847         dev_addr_flush(dev);
5848
5849         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5850                 netif_napi_del(p);
5851
5852         free_percpu(dev->pcpu_refcnt);
5853         dev->pcpu_refcnt = NULL;
5854
5855         /*  Compatibility with error handling in drivers */
5856         if (dev->reg_state == NETREG_UNINITIALIZED) {
5857                 kfree((char *)dev - dev->padded);
5858                 return;
5859         }
5860
5861         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5862         dev->reg_state = NETREG_RELEASED;
5863
5864         /* will free via device release */
5865         put_device(&dev->dev);
5866 }
5867 EXPORT_SYMBOL(free_netdev);
5868
5869 /**
5870  *      synchronize_net -  Synchronize with packet receive processing
5871  *
5872  *      Wait for packets currently being received to be done.
5873  *      Does not block later packets from starting.
5874  */
5875 void synchronize_net(void)
5876 {
5877         might_sleep();
5878         if (rtnl_is_locked())
5879                 synchronize_rcu_expedited();
5880         else
5881                 synchronize_rcu();
5882 }
5883 EXPORT_SYMBOL(synchronize_net);
5884
5885 /**
5886  *      unregister_netdevice_queue - remove device from the kernel
5887  *      @dev: device
5888  *      @head: list
5889  *
5890  *      This function shuts down a device interface and removes it
5891  *      from the kernel tables.
5892  *      If head not NULL, device is queued to be unregistered later.
5893  *
5894  *      Callers must hold the rtnl semaphore.  You may want
5895  *      unregister_netdev() instead of this.
5896  */
5897
5898 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5899 {
5900         ASSERT_RTNL();
5901
5902         if (head) {
5903                 list_move_tail(&dev->unreg_list, head);
5904         } else {
5905                 rollback_registered(dev);
5906                 /* Finish processing unregister after unlock */
5907                 net_set_todo(dev);
5908         }
5909 }
5910 EXPORT_SYMBOL(unregister_netdevice_queue);
5911
5912 /**
5913  *      unregister_netdevice_many - unregister many devices
5914  *      @head: list of devices
5915  */
5916 void unregister_netdevice_many(struct list_head *head)
5917 {
5918         struct net_device *dev;
5919
5920         if (!list_empty(head)) {
5921                 rollback_registered_many(head);
5922                 list_for_each_entry(dev, head, unreg_list)
5923                         net_set_todo(dev);
5924         }
5925 }
5926 EXPORT_SYMBOL(unregister_netdevice_many);
5927
5928 /**
5929  *      unregister_netdev - remove device from the kernel
5930  *      @dev: device
5931  *
5932  *      This function shuts down a device interface and removes it
5933  *      from the kernel tables.
5934  *
5935  *      This is just a wrapper for unregister_netdevice that takes
5936  *      the rtnl semaphore.  In general you want to use this and not
5937  *      unregister_netdevice.
5938  */
5939 void unregister_netdev(struct net_device *dev)
5940 {
5941         rtnl_lock();
5942         unregister_netdevice(dev);
5943         rtnl_unlock();
5944 }
5945 EXPORT_SYMBOL(unregister_netdev);
5946
5947 /**
5948  *      dev_change_net_namespace - move device to different nethost namespace
5949  *      @dev: device
5950  *      @net: network namespace
5951  *      @pat: If not NULL name pattern to try if the current device name
5952  *            is already taken in the destination network namespace.
5953  *
5954  *      This function shuts down a device interface and moves it
5955  *      to a new network namespace. On success 0 is returned, on
5956  *      a failure a netagive errno code is returned.
5957  *
5958  *      Callers must hold the rtnl semaphore.
5959  */
5960
5961 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5962 {
5963         int err;
5964
5965         ASSERT_RTNL();
5966
5967         /* Don't allow namespace local devices to be moved. */
5968         err = -EINVAL;
5969         if (dev->features & NETIF_F_NETNS_LOCAL)
5970                 goto out;
5971
5972         /* Ensure the device has been registrered */
5973         if (dev->reg_state != NETREG_REGISTERED)
5974                 goto out;
5975
5976         /* Get out if there is nothing todo */
5977         err = 0;
5978         if (net_eq(dev_net(dev), net))
5979                 goto out;
5980
5981         /* Pick the destination device name, and ensure
5982          * we can use it in the destination network namespace.
5983          */
5984         err = -EEXIST;
5985         if (__dev_get_by_name(net, dev->name)) {
5986                 /* We get here if we can't use the current device name */
5987                 if (!pat)
5988                         goto out;
5989                 if (dev_get_valid_name(net, dev, pat) < 0)
5990                         goto out;
5991         }
5992
5993         /*
5994          * And now a mini version of register_netdevice unregister_netdevice.
5995          */
5996
5997         /* If device is running close it first. */
5998         dev_close(dev);
5999
6000         /* And unlink it from device chain */
6001         err = -ENODEV;
6002         unlist_netdevice(dev);
6003
6004         synchronize_net();
6005
6006         /* Shutdown queueing discipline. */
6007         dev_shutdown(dev);
6008
6009         /* Notify protocols, that we are about to destroy
6010            this device. They should clean all the things.
6011
6012            Note that dev->reg_state stays at NETREG_REGISTERED.
6013            This is wanted because this way 8021q and macvlan know
6014            the device is just moving and can keep their slaves up.
6015         */
6016         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6017         rcu_barrier();
6018         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6019         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6020
6021         /*
6022          *      Flush the unicast and multicast chains
6023          */
6024         dev_uc_flush(dev);
6025         dev_mc_flush(dev);
6026
6027         /* Send a netdev-removed uevent to the old namespace */
6028         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6029
6030         /* Actually switch the network namespace */
6031         dev_net_set(dev, net);
6032
6033         /* If there is an ifindex conflict assign a new one */
6034         if (__dev_get_by_index(net, dev->ifindex)) {
6035                 int iflink = (dev->iflink == dev->ifindex);
6036                 dev->ifindex = dev_new_index(net);
6037                 if (iflink)
6038                         dev->iflink = dev->ifindex;
6039         }
6040
6041         /* Send a netdev-add uevent to the new namespace */
6042         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6043
6044         /* Fixup kobjects */
6045         err = device_rename(&dev->dev, dev->name);
6046         WARN_ON(err);
6047
6048         /* Add the device back in the hashes */
6049         list_netdevice(dev);
6050
6051         /* Notify protocols, that a new device appeared. */
6052         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6053
6054         /*
6055          *      Prevent userspace races by waiting until the network
6056          *      device is fully setup before sending notifications.
6057          */
6058         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6059
6060         synchronize_net();
6061         err = 0;
6062 out:
6063         return err;
6064 }
6065 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6066
6067 static int dev_cpu_callback(struct notifier_block *nfb,
6068                             unsigned long action,
6069                             void *ocpu)
6070 {
6071         struct sk_buff **list_skb;
6072         struct sk_buff *skb;
6073         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6074         struct softnet_data *sd, *oldsd;
6075
6076         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6077                 return NOTIFY_OK;
6078
6079         local_irq_disable();
6080         cpu = smp_processor_id();
6081         sd = &per_cpu(softnet_data, cpu);
6082         oldsd = &per_cpu(softnet_data, oldcpu);
6083
6084         /* Find end of our completion_queue. */
6085         list_skb = &sd->completion_queue;
6086         while (*list_skb)
6087                 list_skb = &(*list_skb)->next;
6088         /* Append completion queue from offline CPU. */
6089         *list_skb = oldsd->completion_queue;
6090         oldsd->completion_queue = NULL;
6091
6092         /* Append output queue from offline CPU. */
6093         if (oldsd->output_queue) {
6094                 *sd->output_queue_tailp = oldsd->output_queue;
6095                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6096                 oldsd->output_queue = NULL;
6097                 oldsd->output_queue_tailp = &oldsd->output_queue;
6098         }
6099         /* Append NAPI poll list from offline CPU. */
6100         if (!list_empty(&oldsd->poll_list)) {
6101                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6102                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6103         }
6104
6105         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6106         local_irq_enable();
6107
6108         /* Process offline CPU's input_pkt_queue */
6109         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6110                 netif_rx(skb);
6111                 input_queue_head_incr(oldsd);
6112         }
6113         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6114                 netif_rx(skb);
6115                 input_queue_head_incr(oldsd);
6116         }
6117
6118         return NOTIFY_OK;
6119 }
6120
6121
6122 /**
6123  *      netdev_increment_features - increment feature set by one
6124  *      @all: current feature set
6125  *      @one: new feature set
6126  *      @mask: mask feature set
6127  *
6128  *      Computes a new feature set after adding a device with feature set
6129  *      @one to the master device with current feature set @all.  Will not
6130  *      enable anything that is off in @mask. Returns the new feature set.
6131  */
6132 netdev_features_t netdev_increment_features(netdev_features_t all,
6133         netdev_features_t one, netdev_features_t mask)
6134 {
6135         if (mask & NETIF_F_GEN_CSUM)
6136                 mask |= NETIF_F_ALL_CSUM;
6137         mask |= NETIF_F_VLAN_CHALLENGED;
6138
6139         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6140         all &= one | ~NETIF_F_ALL_FOR_ALL;
6141
6142         /* If one device supports hw checksumming, set for all. */
6143         if (all & NETIF_F_GEN_CSUM)
6144                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6145
6146         return all;
6147 }
6148 EXPORT_SYMBOL(netdev_increment_features);
6149
6150 static struct hlist_head * __net_init netdev_create_hash(void)
6151 {
6152         int i;
6153         struct hlist_head *hash;
6154
6155         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6156         if (hash != NULL)
6157                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6158                         INIT_HLIST_HEAD(&hash[i]);
6159
6160         return hash;
6161 }
6162
6163 /* Initialize per network namespace state */
6164 static int __net_init netdev_init(struct net *net)
6165 {
6166         if (net != &init_net)
6167                 INIT_LIST_HEAD(&net->dev_base_head);
6168
6169         net->dev_name_head = netdev_create_hash();
6170         if (net->dev_name_head == NULL)
6171                 goto err_name;
6172
6173         net->dev_index_head = netdev_create_hash();
6174         if (net->dev_index_head == NULL)
6175                 goto err_idx;
6176
6177         return 0;
6178
6179 err_idx:
6180         kfree(net->dev_name_head);
6181 err_name:
6182         return -ENOMEM;
6183 }
6184
6185 /**
6186  *      netdev_drivername - network driver for the device
6187  *      @dev: network device
6188  *
6189  *      Determine network driver for device.
6190  */
6191 const char *netdev_drivername(const struct net_device *dev)
6192 {
6193         const struct device_driver *driver;
6194         const struct device *parent;
6195         const char *empty = "";
6196
6197         parent = dev->dev.parent;
6198         if (!parent)
6199                 return empty;
6200
6201         driver = parent->driver;
6202         if (driver && driver->name)
6203                 return driver->name;
6204         return empty;
6205 }
6206
6207 static int __netdev_printk(const char *level, const struct net_device *dev,
6208                            struct va_format *vaf)
6209 {
6210         int r;
6211
6212         if (dev && dev->dev.parent) {
6213                 r = dev_printk_emit(level[1] - '0',
6214                                     dev->dev.parent,
6215                                     "%s %s %s: %pV",
6216                                     dev_driver_string(dev->dev.parent),
6217                                     dev_name(dev->dev.parent),
6218                                     netdev_name(dev), vaf);
6219         } else if (dev) {
6220                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6221         } else {
6222                 r = printk("%s(NULL net_device): %pV", level, vaf);
6223         }
6224
6225         return r;
6226 }
6227
6228 int netdev_printk(const char *level, const struct net_device *dev,
6229                   const char *format, ...)
6230 {
6231         struct va_format vaf;
6232         va_list args;
6233         int r;
6234
6235         va_start(args, format);
6236
6237         vaf.fmt = format;
6238         vaf.va = &args;
6239
6240         r = __netdev_printk(level, dev, &vaf);
6241
6242         va_end(args);
6243
6244         return r;
6245 }
6246 EXPORT_SYMBOL(netdev_printk);
6247
6248 #define define_netdev_printk_level(func, level)                 \
6249 int func(const struct net_device *dev, const char *fmt, ...)    \
6250 {                                                               \
6251         int r;                                                  \
6252         struct va_format vaf;                                   \
6253         va_list args;                                           \
6254                                                                 \
6255         va_start(args, fmt);                                    \
6256                                                                 \
6257         vaf.fmt = fmt;                                          \
6258         vaf.va = &args;                                         \
6259                                                                 \
6260         r = __netdev_printk(level, dev, &vaf);                  \
6261                                                                 \
6262         va_end(args);                                           \
6263                                                                 \
6264         return r;                                               \
6265 }                                                               \
6266 EXPORT_SYMBOL(func);
6267
6268 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6269 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6270 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6271 define_netdev_printk_level(netdev_err, KERN_ERR);
6272 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6273 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6274 define_netdev_printk_level(netdev_info, KERN_INFO);
6275
6276 static void __net_exit netdev_exit(struct net *net)
6277 {
6278         kfree(net->dev_name_head);
6279         kfree(net->dev_index_head);
6280 }
6281
6282 static struct pernet_operations __net_initdata netdev_net_ops = {
6283         .init = netdev_init,
6284         .exit = netdev_exit,
6285 };
6286
6287 static void __net_exit default_device_exit(struct net *net)
6288 {
6289         struct net_device *dev, *aux;
6290         /*
6291          * Push all migratable network devices back to the
6292          * initial network namespace
6293          */
6294         rtnl_lock();
6295         for_each_netdev_safe(net, dev, aux) {
6296                 int err;
6297                 char fb_name[IFNAMSIZ];
6298
6299                 /* Ignore unmoveable devices (i.e. loopback) */
6300                 if (dev->features & NETIF_F_NETNS_LOCAL)
6301                         continue;
6302
6303                 /* Leave virtual devices for the generic cleanup */
6304                 if (dev->rtnl_link_ops)
6305                         continue;
6306
6307                 /* Push remaining network devices to init_net */
6308                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6309                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6310                 if (err) {
6311                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6312                                  __func__, dev->name, err);
6313                         BUG();
6314                 }
6315         }
6316         rtnl_unlock();
6317 }
6318
6319 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6320 {
6321         /* At exit all network devices most be removed from a network
6322          * namespace.  Do this in the reverse order of registration.
6323          * Do this across as many network namespaces as possible to
6324          * improve batching efficiency.
6325          */
6326         struct net_device *dev;
6327         struct net *net;
6328         LIST_HEAD(dev_kill_list);
6329
6330         rtnl_lock();
6331         list_for_each_entry(net, net_list, exit_list) {
6332                 for_each_netdev_reverse(net, dev) {
6333                         if (dev->rtnl_link_ops)
6334                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6335                         else
6336                                 unregister_netdevice_queue(dev, &dev_kill_list);
6337                 }
6338         }
6339         unregister_netdevice_many(&dev_kill_list);
6340         list_del(&dev_kill_list);
6341         rtnl_unlock();
6342 }
6343
6344 static struct pernet_operations __net_initdata default_device_ops = {
6345         .exit = default_device_exit,
6346         .exit_batch = default_device_exit_batch,
6347 };
6348
6349 /*
6350  *      Initialize the DEV module. At boot time this walks the device list and
6351  *      unhooks any devices that fail to initialise (normally hardware not
6352  *      present) and leaves us with a valid list of present and active devices.
6353  *
6354  */
6355
6356 /*
6357  *       This is called single threaded during boot, so no need
6358  *       to take the rtnl semaphore.
6359  */
6360 static int __init net_dev_init(void)
6361 {
6362         int i, rc = -ENOMEM;
6363
6364         BUG_ON(!dev_boot_phase);
6365
6366         if (dev_proc_init())
6367                 goto out;
6368
6369         if (netdev_kobject_init())
6370                 goto out;
6371
6372         INIT_LIST_HEAD(&ptype_all);
6373         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6374                 INIT_LIST_HEAD(&ptype_base[i]);
6375
6376         INIT_LIST_HEAD(&offload_base);
6377
6378         if (register_pernet_subsys(&netdev_net_ops))
6379                 goto out;
6380
6381         /*
6382          *      Initialise the packet receive queues.
6383          */
6384
6385         for_each_possible_cpu(i) {
6386                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6387
6388                 memset(sd, 0, sizeof(*sd));
6389                 skb_queue_head_init(&sd->input_pkt_queue);
6390                 skb_queue_head_init(&sd->process_queue);
6391                 sd->completion_queue = NULL;
6392                 INIT_LIST_HEAD(&sd->poll_list);
6393                 sd->output_queue = NULL;
6394                 sd->output_queue_tailp = &sd->output_queue;
6395 #ifdef CONFIG_RPS
6396                 sd->csd.func = rps_trigger_softirq;
6397                 sd->csd.info = sd;
6398                 sd->csd.flags = 0;
6399                 sd->cpu = i;
6400 #endif
6401
6402                 sd->backlog.poll = process_backlog;
6403                 sd->backlog.weight = weight_p;
6404                 sd->backlog.gro_list = NULL;
6405                 sd->backlog.gro_count = 0;
6406
6407 #ifdef CONFIG_NET_FLOW_LIMIT
6408                 sd->flow_limit = NULL;
6409 #endif
6410         }
6411
6412         dev_boot_phase = 0;
6413
6414         /* The loopback device is special if any other network devices
6415          * is present in a network namespace the loopback device must
6416          * be present. Since we now dynamically allocate and free the
6417          * loopback device ensure this invariant is maintained by
6418          * keeping the loopback device as the first device on the
6419          * list of network devices.  Ensuring the loopback devices
6420          * is the first device that appears and the last network device
6421          * that disappears.
6422          */
6423         if (register_pernet_device(&loopback_net_ops))
6424                 goto out;
6425
6426         if (register_pernet_device(&default_device_ops))
6427                 goto out;
6428
6429         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6430         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6431
6432         hotcpu_notifier(dev_cpu_callback, 0);
6433         dst_init();
6434         rc = 0;
6435 out:
6436         return rc;
6437 }
6438
6439 subsys_initcall(net_dev_init);