Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132
 133 #include "net-sysfs.h"
 134
 135 /* Instead of increasing this, you should create a hash table. */
 136 #define MAX_GRO_SKBS 8
 137
 138 /* This should be increased if a protocol with a bigger head is added. */
 139 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 140
 141 static DEFINE_SPINLOCK(ptype_lock);
 142 static DEFINE_SPINLOCK(offload_lock);
 143 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 144 struct list_head ptype_all __read_mostly;       /* Taps */
 145 static struct list_head offload_base __read_mostly;
 146
 147 /*
 148  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 149  * semaphore.
 150  *
 151  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 152  *
 153  * Writers must hold the rtnl semaphore while they loop through the
 154  * dev_base_head list, and hold dev_base_lock for writing when they do the
 155  * actual updates.  This allows pure readers to access the list even
 156  * while a writer is preparing to update it.
 157  *
 158  * To put it another way, dev_base_lock is held for writing only to
 159  * protect against pure readers; the rtnl semaphore provides the
 160  * protection against other writers.
 161  *
 162  * See, for example usages, register_netdevice() and
 163  * unregister_netdevice(), which must be called with the rtnl
 164  * semaphore held.
 165  */
 166 DEFINE_RWLOCK(dev_base_lock);
 167 EXPORT_SYMBOL(dev_base_lock);
 168
 169 seqcount_t devnet_rename_seq;
 170
 171 static inline void dev_base_seq_inc(struct net *net)
 172 {
 173         while (++net->dev_base_seq == 0);
 174 }
 175
 176 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 177 {
 178         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 179
 180         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 181 }
 182
 183 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 184 {
 185         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 186 }
 187
 188 static inline void rps_lock(struct softnet_data *sd)
 189 {
 190 #ifdef CONFIG_RPS
 191         spin_lock(&sd->input_pkt_queue.lock);
 192 #endif
 193 }
 194
 195 static inline void rps_unlock(struct softnet_data *sd)
 196 {
 197 #ifdef CONFIG_RPS
 198         spin_unlock(&sd->input_pkt_queue.lock);
 199 #endif
 200 }
 201
 202 /* Device list insertion */
 203 static void list_netdevice(struct net_device *dev)
 204 {
 205         struct net *net = dev_net(dev);
 206
 207         ASSERT_RTNL();
 208
 209         write_lock_bh(&dev_base_lock);
 210         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 211         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 212         hlist_add_head_rcu(&dev->index_hlist,
 213                            dev_index_hash(net, dev->ifindex));
 214         write_unlock_bh(&dev_base_lock);
 215
 216         dev_base_seq_inc(net);
 217 }
 218
 219 /* Device list removal
 220  * caller must respect a RCU grace period before freeing/reusing dev
 221  */
 222 static void unlist_netdevice(struct net_device *dev)
 223 {
 224         ASSERT_RTNL();
 225
 226         /* Unlink dev from the device chain */
 227         write_lock_bh(&dev_base_lock);
 228         list_del_rcu(&dev->dev_list);
 229         hlist_del_rcu(&dev->name_hlist);
 230         hlist_del_rcu(&dev->index_hlist);
 231         write_unlock_bh(&dev_base_lock);
 232
 233         dev_base_seq_inc(dev_net(dev));
 234 }
 235
 236 /*
 237  *      Our notifier list
 238  */
 239
 240 static RAW_NOTIFIER_HEAD(netdev_chain);
 241
 242 /*
 243  *      Device drivers call our routines to queue packets here. We empty the
 244  *      queue in the local softnet handler.
 245  */
 246
 247 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 248 EXPORT_PER_CPU_SYMBOL(softnet_data);
 249
 250 #ifdef CONFIG_LOCKDEP
 251 /*
 252  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 253  * according to dev->type
 254  */
 255 static const unsigned short netdev_lock_type[] =
 256         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 257          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 258          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 259          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 260          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 261          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 262          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 263          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 264          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 265          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 266          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 267          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 268          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 269          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 270          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 271
 272 static const char *const netdev_lock_name[] =
 273         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 274          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 275          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 276          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 277          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 278          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 279          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 280          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 281          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 282          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 283          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 284          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 285          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 286          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 287          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 288
 289 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 290 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 291
 292 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 293 {
 294         int i;
 295
 296         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 297                 if (netdev_lock_type[i] == dev_type)
 298                         return i;
 299         /* the last key is used by default */
 300         return ARRAY_SIZE(netdev_lock_type) - 1;
 301 }
 302
 303 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 304                                                  unsigned short dev_type)
 305 {
 306         int i;
 307
 308         i = netdev_lock_pos(dev_type);
 309         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 310                                    netdev_lock_name[i]);
 311 }
 312
 313 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev->type);
 318         lockdep_set_class_and_name(&dev->addr_list_lock,
 319                                    &netdev_addr_lock_key[i],
 320                                    netdev_lock_name[i]);
 321 }
 322 #else
 323 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 324                                                  unsigned short dev_type)
 325 {
 326 }
 327 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 328 {
 329 }
 330 #endif
 331
 332 /*******************************************************************************
 333
 334                 Protocol management and registration routines
 335
 336 *******************************************************************************/
 337
 338 /*
 339  *      Add a protocol ID to the list. Now that the input handler is
 340  *      smarter we can dispense with all the messy stuff that used to be
 341  *      here.
 342  *
 343  *      BEWARE!!! Protocol handlers, mangling input packets,
 344  *      MUST BE last in hash buckets and checking protocol handlers
 345  *      MUST start from promiscuous ptype_all chain in net_bh.
 346  *      It is true now, do not change it.
 347  *      Explanation follows: if protocol handler, mangling packet, will
 348  *      be the first on list, it is not able to sense, that packet
 349  *      is cloned and should be copied-on-write, so that it will
 350  *      change it and subsequent readers will get broken packet.
 351  *                                                      --ANK (980803)
 352  */
 353
 354 static inline struct list_head *ptype_head(const struct packet_type *pt)
 355 {
 356         if (pt->type == htons(ETH_P_ALL))
 357                 return &ptype_all;
 358         else
 359                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 360 }
 361
 362 /**
 363  *      dev_add_pack - add packet handler
 364  *      @pt: packet type declaration
 365  *
 366  *      Add a protocol handler to the networking stack. The passed &packet_type
 367  *      is linked into kernel lists and may not be freed until it has been
 368  *      removed from the kernel lists.
 369  *
 370  *      This call does not sleep therefore it can not
 371  *      guarantee all CPU's that are in middle of receiving packets
 372  *      will see the new packet type (until the next received packet).
 373  */
 374
 375 void dev_add_pack(struct packet_type *pt)
 376 {
 377         struct list_head *head = ptype_head(pt);
 378
 379         spin_lock(&ptype_lock);
 380         list_add_rcu(&pt->list, head);
 381         spin_unlock(&ptype_lock);
 382 }
 383 EXPORT_SYMBOL(dev_add_pack);
 384
 385 /**
 386  *      __dev_remove_pack        - remove packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Remove a protocol handler that was previously added to the kernel
 390  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 391  *      from the kernel lists and can be freed or reused once this function
 392  *      returns.
 393  *
 394  *      The packet type might still be in use by receivers
 395  *      and must not be freed until after all the CPU's have gone
 396  *      through a quiescent state.
 397  */
 398 void __dev_remove_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401         struct packet_type *pt1;
 402
 403         spin_lock(&ptype_lock);
 404
 405         list_for_each_entry(pt1, head, list) {
 406                 if (pt == pt1) {
 407                         list_del_rcu(&pt->list);
 408                         goto out;
 409                 }
 410         }
 411
 412         pr_warn("dev_remove_pack: %p not found\n", pt);
 413 out:
 414         spin_unlock(&ptype_lock);
 415 }
 416 EXPORT_SYMBOL(__dev_remove_pack);
 417
 418 /**
 419  *      dev_remove_pack  - remove packet handler
 420  *      @pt: packet type declaration
 421  *
 422  *      Remove a protocol handler that was previously added to the kernel
 423  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 424  *      from the kernel lists and can be freed or reused once this function
 425  *      returns.
 426  *
 427  *      This call sleeps to guarantee that no CPU is looking at the packet
 428  *      type after return.
 429  */
 430 void dev_remove_pack(struct packet_type *pt)
 431 {
 432         __dev_remove_pack(pt);
 433
 434         synchronize_net();
 435 }
 436 EXPORT_SYMBOL(dev_remove_pack);
 437
 438
 439 /**
 440  *      dev_add_offload - register offload handlers
 441  *      @po: protocol offload declaration
 442  *
 443  *      Add protocol offload handlers to the networking stack. The passed
 444  *      &proto_offload is linked into kernel lists and may not be freed until
 445  *      it has been removed from the kernel lists.
 446  *
 447  *      This call does not sleep therefore it can not
 448  *      guarantee all CPU's that are in middle of receiving packets
 449  *      will see the new offload handlers (until the next received packet).
 450  */
 451 void dev_add_offload(struct packet_offload *po)
 452 {
 453         struct list_head *head = &offload_base;
 454
 455         spin_lock(&offload_lock);
 456         list_add_rcu(&po->list, head);
 457         spin_unlock(&offload_lock);
 458 }
 459 EXPORT_SYMBOL(dev_add_offload);
 460
 461 /**
 462  *      __dev_remove_offload     - remove offload handler
 463  *      @po: packet offload declaration
 464  *
 465  *      Remove a protocol offload handler that was previously added to the
 466  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 467  *      is removed from the kernel lists and can be freed or reused once this
 468  *      function returns.
 469  *
 470  *      The packet type might still be in use by receivers
 471  *      and must not be freed until after all the CPU's have gone
 472  *      through a quiescent state.
 473  */
 474 void __dev_remove_offload(struct packet_offload *po)
 475 {
 476         struct list_head *head = &offload_base;
 477         struct packet_offload *po1;
 478
 479         spin_lock(&offload_lock);
 480
 481         list_for_each_entry(po1, head, list) {
 482                 if (po == po1) {
 483                         list_del_rcu(&po->list);
 484                         goto out;
 485                 }
 486         }
 487
 488         pr_warn("dev_remove_offload: %p not found\n", po);
 489 out:
 490         spin_unlock(&offload_lock);
 491 }
 492 EXPORT_SYMBOL(__dev_remove_offload);
 493
 494 /**
 495  *      dev_remove_offload       - remove packet offload handler
 496  *      @po: packet offload declaration
 497  *
 498  *      Remove a packet offload handler that was previously added to the kernel
 499  *      offload handlers by dev_add_offload(). The passed &offload_type is
 500  *      removed from the kernel lists and can be freed or reused once this
 501  *      function returns.
 502  *
 503  *      This call sleeps to guarantee that no CPU is looking at the packet
 504  *      type after return.
 505  */
 506 void dev_remove_offload(struct packet_offload *po)
 507 {
 508         __dev_remove_offload(po);
 509
 510         synchronize_net();
 511 }
 512 EXPORT_SYMBOL(dev_remove_offload);
 513
 514 /******************************************************************************
 515
 516                       Device Boot-time Settings Routines
 517
 518 *******************************************************************************/
 519
 520 /* Boot time configuration table */
 521 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 522
 523 /**
 524  *      netdev_boot_setup_add   - add new setup entry
 525  *      @name: name of the device
 526  *      @map: configured settings for the device
 527  *
 528  *      Adds new setup entry to the dev_boot_setup list.  The function
 529  *      returns 0 on error and 1 on success.  This is a generic routine to
 530  *      all netdevices.
 531  */
 532 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 533 {
 534         struct netdev_boot_setup *s;
 535         int i;
 536
 537         s = dev_boot_setup;
 538         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 539                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 540                         memset(s[i].name, 0, sizeof(s[i].name));
 541                         strlcpy(s[i].name, name, IFNAMSIZ);
 542                         memcpy(&s[i].map, map, sizeof(s[i].map));
 543                         break;
 544                 }
 545         }
 546
 547         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 548 }
 549
 550 /**
 551  *      netdev_boot_setup_check - check boot time settings
 552  *      @dev: the netdevice
 553  *
 554  *      Check boot time settings for the device.
 555  *      The found settings are set for the device to be used
 556  *      later in the device probing.
 557  *      Returns 0 if no settings found, 1 if they are.
 558  */
 559 int netdev_boot_setup_check(struct net_device *dev)
 560 {
 561         struct netdev_boot_setup *s = dev_boot_setup;
 562         int i;
 563
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 566                     !strcmp(dev->name, s[i].name)) {
 567                         dev->irq        = s[i].map.irq;
 568                         dev->base_addr  = s[i].map.base_addr;
 569                         dev->mem_start  = s[i].map.mem_start;
 570                         dev->mem_end    = s[i].map.mem_end;
 571                         return 1;
 572                 }
 573         }
 574         return 0;
 575 }
 576 EXPORT_SYMBOL(netdev_boot_setup_check);
 577
 578
 579 /**
 580  *      netdev_boot_base        - get address from boot time settings
 581  *      @prefix: prefix for network device
 582  *      @unit: id for network device
 583  *
 584  *      Check boot time settings for the base address of device.
 585  *      The found settings are set for the device to be used
 586  *      later in the device probing.
 587  *      Returns 0 if no settings found.
 588  */
 589 unsigned long netdev_boot_base(const char *prefix, int unit)
 590 {
 591         const struct netdev_boot_setup *s = dev_boot_setup;
 592         char name[IFNAMSIZ];
 593         int i;
 594
 595         sprintf(name, "%s%d", prefix, unit);
 596
 597         /*
 598          * If device already registered then return base of 1
 599          * to indicate not to probe for this interface
 600          */
 601         if (__dev_get_by_name(&init_net, name))
 602                 return 1;
 603
 604         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 605                 if (!strcmp(name, s[i].name))
 606                         return s[i].map.base_addr;
 607         return 0;
 608 }
 609
 610 /*
 611  * Saves at boot time configured settings for any netdevice.
 612  */
 613 int __init netdev_boot_setup(char *str)
 614 {
 615         int ints[5];
 616         struct ifmap map;
 617
 618         str = get_options(str, ARRAY_SIZE(ints), ints);
 619         if (!str || !*str)
 620                 return 0;
 621
 622         /* Save settings */
 623         memset(&map, 0, sizeof(map));
 624         if (ints[0] > 0)
 625                 map.irq = ints[1];
 626         if (ints[0] > 1)
 627                 map.base_addr = ints[2];
 628         if (ints[0] > 2)
 629                 map.mem_start = ints[3];
 630         if (ints[0] > 3)
 631                 map.mem_end = ints[4];
 632
 633         /* Add new entry to the list */
 634         return netdev_boot_setup_add(str, &map);
 635 }
 636
 637 __setup("netdev=", netdev_boot_setup);
 638
 639 /*******************************************************************************
 640
 641                             Device Interface Subroutines
 642
 643 *******************************************************************************/
 644
 645 /**
 646  *      __dev_get_by_name       - find a device by its name
 647  *      @net: the applicable net namespace
 648  *      @name: name to find
 649  *
 650  *      Find an interface by name. Must be called under RTNL semaphore
 651  *      or @dev_base_lock. If the name is found a pointer to the device
 652  *      is returned. If the name is not found then %NULL is returned. The
 653  *      reference counters are not incremented so the caller must be
 654  *      careful with locks.
 655  */
 656
 657 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 658 {
 659         struct net_device *dev;
 660         struct hlist_head *head = dev_name_hash(net, name);
 661
 662         hlist_for_each_entry(dev, head, name_hlist)
 663                 if (!strncmp(dev->name, name, IFNAMSIZ))
 664                         return dev;
 665
 666         return NULL;
 667 }
 668 EXPORT_SYMBOL(__dev_get_by_name);
 669
 670 /**
 671  *      dev_get_by_name_rcu     - find a device by its name
 672  *      @net: the applicable net namespace
 673  *      @name: name to find
 674  *
 675  *      Find an interface by name.
 676  *      If the name is found a pointer to the device is returned.
 677  *      If the name is not found then %NULL is returned.
 678  *      The reference counters are not incremented so the caller must be
 679  *      careful with locks. The caller must hold RCU lock.
 680  */
 681
 682 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 683 {
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_name_hash(net, name);
 686
 687         hlist_for_each_entry_rcu(dev, head, name_hlist)
 688                 if (!strncmp(dev->name, name, IFNAMSIZ))
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(dev_get_by_name_rcu);
 694
 695 /**
 696  *      dev_get_by_name         - find a device by its name
 697  *      @net: the applicable net namespace
 698  *      @name: name to find
 699  *
 700  *      Find an interface by name. This can be called from any
 701  *      context and does its own locking. The returned handle has
 702  *      the usage count incremented and the caller must use dev_put() to
 703  *      release it when it is no longer needed. %NULL is returned if no
 704  *      matching device is found.
 705  */
 706
 707 struct net_device *dev_get_by_name(struct net *net, const char *name)
 708 {
 709         struct net_device *dev;
 710
 711         rcu_read_lock();
 712         dev = dev_get_by_name_rcu(net, name);
 713         if (dev)
 714                 dev_hold(dev);
 715         rcu_read_unlock();
 716         return dev;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_name);
 719
 720 /**
 721  *      __dev_get_by_index - find a device by its ifindex
 722  *      @net: the applicable net namespace
 723  *      @ifindex: index of device
 724  *
 725  *      Search for an interface by index. Returns %NULL if the device
 726  *      is not found or a pointer to the device. The device has not
 727  *      had its reference counter increased so the caller must be careful
 728  *      about locking. The caller must hold either the RTNL semaphore
 729  *      or @dev_base_lock.
 730  */
 731
 732 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735         struct hlist_head *head = dev_index_hash(net, ifindex);
 736
 737         hlist_for_each_entry(dev, head, index_hlist)
 738                 if (dev->ifindex == ifindex)
 739                         return dev;
 740
 741         return NULL;
 742 }
 743 EXPORT_SYMBOL(__dev_get_by_index);
 744
 745 /**
 746  *      dev_get_by_index_rcu - find a device by its ifindex
 747  *      @net: the applicable net namespace
 748  *      @ifindex: index of device
 749  *
 750  *      Search for an interface by index. Returns %NULL if the device
 751  *      is not found or a pointer to the device. The device has not
 752  *      had its reference counter increased so the caller must be careful
 753  *      about locking. The caller must hold RCU lock.
 754  */
 755
 756 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 757 {
 758         struct net_device *dev;
 759         struct hlist_head *head = dev_index_hash(net, ifindex);
 760
 761         hlist_for_each_entry_rcu(dev, head, index_hlist)
 762                 if (dev->ifindex == ifindex)
 763                         return dev;
 764
 765         return NULL;
 766 }
 767 EXPORT_SYMBOL(dev_get_by_index_rcu);
 768
 769
 770 /**
 771  *      dev_get_by_index - find a device by its ifindex
 772  *      @net: the applicable net namespace
 773  *      @ifindex: index of device
 774  *
 775  *      Search for an interface by index. Returns NULL if the device
 776  *      is not found or a pointer to the device. The device returned has
 777  *      had a reference added and the pointer is safe until the user calls
 778  *      dev_put to indicate they have finished with it.
 779  */
 780
 781 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 782 {
 783         struct net_device *dev;
 784
 785         rcu_read_lock();
 786         dev = dev_get_by_index_rcu(net, ifindex);
 787         if (dev)
 788                 dev_hold(dev);
 789         rcu_read_unlock();
 790         return dev;
 791 }
 792 EXPORT_SYMBOL(dev_get_by_index);
 793
 794 /**
 795  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 796  *      @net: the applicable net namespace
 797  *      @type: media type of device
 798  *      @ha: hardware address
 799  *
 800  *      Search for an interface by MAC address. Returns NULL if the device
 801  *      is not found or a pointer to the device.
 802  *      The caller must hold RCU or RTNL.
 803  *      The returned device has not had its ref count increased
 804  *      and the caller must therefore be careful about locking
 805  *
 806  */
 807
 808 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 809                                        const char *ha)
 810 {
 811         struct net_device *dev;
 812
 813         for_each_netdev_rcu(net, dev)
 814                 if (dev->type == type &&
 815                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 816                         return dev;
 817
 818         return NULL;
 819 }
 820 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 821
 822 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 823 {
 824         struct net_device *dev;
 825
 826         ASSERT_RTNL();
 827         for_each_netdev(net, dev)
 828                 if (dev->type == type)
 829                         return dev;
 830
 831         return NULL;
 832 }
 833 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 834
 835 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 836 {
 837         struct net_device *dev, *ret = NULL;
 838
 839         rcu_read_lock();
 840         for_each_netdev_rcu(net, dev)
 841                 if (dev->type == type) {
 842                         dev_hold(dev);
 843                         ret = dev;
 844                         break;
 845                 }
 846         rcu_read_unlock();
 847         return ret;
 848 }
 849 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 850
 851 /**
 852  *      dev_get_by_flags_rcu - find any device with given flags
 853  *      @net: the applicable net namespace
 854  *      @if_flags: IFF_* values
 855  *      @mask: bitmask of bits in if_flags to check
 856  *
 857  *      Search for any interface with the given flags. Returns NULL if a device
 858  *      is not found or a pointer to the device. Must be called inside
 859  *      rcu_read_lock(), and result refcount is unchanged.
 860  */
 861
 862 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 863                                     unsigned short mask)
 864 {
 865         struct net_device *dev, *ret;
 866
 867         ret = NULL;
 868         for_each_netdev_rcu(net, dev) {
 869                 if (((dev->flags ^ if_flags) & mask) == 0) {
 870                         ret = dev;
 871                         break;
 872                 }
 873         }
 874         return ret;
 875 }
 876 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 877
 878 /**
 879  *      dev_valid_name - check if name is okay for network device
 880  *      @name: name string
 881  *
 882  *      Network device names need to be valid file names to
 883  *      to allow sysfs to work.  We also disallow any kind of
 884  *      whitespace.
 885  */
 886 bool dev_valid_name(const char *name)
 887 {
 888         if (*name == '\0')
 889                 return false;
 890         if (strlen(name) >= IFNAMSIZ)
 891                 return false;
 892         if (!strcmp(name, ".") || !strcmp(name, ".."))
 893                 return false;
 894
 895         while (*name) {
 896                 if (*name == '/' || isspace(*name))
 897                         return false;
 898                 name++;
 899         }
 900         return true;
 901 }
 902 EXPORT_SYMBOL(dev_valid_name);
 903
 904 /**
 905  *      __dev_alloc_name - allocate a name for a device
 906  *      @net: network namespace to allocate the device name in
 907  *      @name: name format string
 908  *      @buf:  scratch buffer and result name string
 909  *
 910  *      Passed a format string - eg "lt%d" it will try and find a suitable
 911  *      id. It scans list of devices to build up a free map, then chooses
 912  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 913  *      while allocating the name and adding the device in order to avoid
 914  *      duplicates.
 915  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 916  *      Returns the number of the unit assigned or a negative errno code.
 917  */
 918
 919 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 920 {
 921         int i = 0;
 922         const char *p;
 923         const int max_netdevices = 8*PAGE_SIZE;
 924         unsigned long *inuse;
 925         struct net_device *d;
 926
 927         p = strnchr(name, IFNAMSIZ-1, '%');
 928         if (p) {
 929                 /*
 930                  * Verify the string as this thing may have come from
 931                  * the user.  There must be either one "%d" and no other "%"
 932                  * characters.
 933                  */
 934                 if (p[1] != 'd' || strchr(p + 2, '%'))
 935                         return -EINVAL;
 936
 937                 /* Use one page as a bit array of possible slots */
 938                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 939                 if (!inuse)
 940                         return -ENOMEM;
 941
 942                 for_each_netdev(net, d) {
 943                         if (!sscanf(d->name, name, &i))
 944                                 continue;
 945                         if (i < 0 || i >= max_netdevices)
 946                                 continue;
 947
 948                         /*  avoid cases where sscanf is not exact inverse of printf */
 949                         snprintf(buf, IFNAMSIZ, name, i);
 950                         if (!strncmp(buf, d->name, IFNAMSIZ))
 951                                 set_bit(i, inuse);
 952                 }
 953
 954                 i = find_first_zero_bit(inuse, max_netdevices);
 955                 free_page((unsigned long) inuse);
 956         }
 957
 958         if (buf != name)
 959                 snprintf(buf, IFNAMSIZ, name, i);
 960         if (!__dev_get_by_name(net, buf))
 961                 return i;
 962
 963         /* It is possible to run out of possible slots
 964          * when the name is long and there isn't enough space left
 965          * for the digits, or if all bits are used.
 966          */
 967         return -ENFILE;
 968 }
 969
 970 /**
 971  *      dev_alloc_name - allocate a name for a device
 972  *      @dev: device
 973  *      @name: name format string
 974  *
 975  *      Passed a format string - eg "lt%d" it will try and find a suitable
 976  *      id. It scans list of devices to build up a free map, then chooses
 977  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 978  *      while allocating the name and adding the device in order to avoid
 979  *      duplicates.
 980  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 981  *      Returns the number of the unit assigned or a negative errno code.
 982  */
 983
 984 int dev_alloc_name(struct net_device *dev, const char *name)
 985 {
 986         char buf[IFNAMSIZ];
 987         struct net *net;
 988         int ret;
 989
 990         BUG_ON(!dev_net(dev));
 991         net = dev_net(dev);
 992         ret = __dev_alloc_name(net, name, buf);
 993         if (ret >= 0)
 994                 strlcpy(dev->name, buf, IFNAMSIZ);
 995         return ret;
 996 }
 997 EXPORT_SYMBOL(dev_alloc_name);
 998
 999 static int dev_alloc_name_ns(struct net *net,
1000                              struct net_device *dev,
1001                              const char *name)
1002 {
1003         char buf[IFNAMSIZ];
1004         int ret;
1005
1006         ret = __dev_alloc_name(net, name, buf);
1007         if (ret >= 0)
1008                 strlcpy(dev->name, buf, IFNAMSIZ);
1009         return ret;
1010 }
1011
1012 static int dev_get_valid_name(struct net *net,
1013                               struct net_device *dev,
1014                               const char *name)
1015 {
1016         BUG_ON(!net);
1017
1018         if (!dev_valid_name(name))
1019                 return -EINVAL;
1020
1021         if (strchr(name, '%'))
1022                 return dev_alloc_name_ns(net, dev, name);
1023         else if (__dev_get_by_name(net, name))
1024                 return -EEXIST;
1025         else if (dev->name != name)
1026                 strlcpy(dev->name, name, IFNAMSIZ);
1027
1028         return 0;
1029 }
1030
1031 /**
1032  *      dev_change_name - change name of a device
1033  *      @dev: device
1034  *      @newname: name (or format string) must be at least IFNAMSIZ
1035  *
1036  *      Change name of a device, can pass format strings "eth%d".
1037  *      for wildcarding.
1038  */
1039 int dev_change_name(struct net_device *dev, const char *newname)
1040 {
1041         char oldname[IFNAMSIZ];
1042         int err = 0;
1043         int ret;
1044         struct net *net;
1045
1046         ASSERT_RTNL();
1047         BUG_ON(!dev_net(dev));
1048
1049         net = dev_net(dev);
1050         if (dev->flags & IFF_UP)
1051                 return -EBUSY;
1052
1053         write_seqcount_begin(&devnet_rename_seq);
1054
1055         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1056                 write_seqcount_end(&devnet_rename_seq);
1057                 return 0;
1058         }
1059
1060         memcpy(oldname, dev->name, IFNAMSIZ);
1061
1062         err = dev_get_valid_name(net, dev, newname);
1063         if (err < 0) {
1064                 write_seqcount_end(&devnet_rename_seq);
1065                 return err;
1066         }
1067
1068 rollback:
1069         ret = device_rename(&dev->dev, dev->name);
1070         if (ret) {
1071                 memcpy(dev->name, oldname, IFNAMSIZ);
1072                 write_seqcount_end(&devnet_rename_seq);
1073                 return ret;
1074         }
1075
1076         write_seqcount_end(&devnet_rename_seq);
1077
1078         write_lock_bh(&dev_base_lock);
1079         hlist_del_rcu(&dev->name_hlist);
1080         write_unlock_bh(&dev_base_lock);
1081
1082         synchronize_rcu();
1083
1084         write_lock_bh(&dev_base_lock);
1085         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1086         write_unlock_bh(&dev_base_lock);
1087
1088         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1089         ret = notifier_to_errno(ret);
1090
1091         if (ret) {
1092                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1093                 if (err >= 0) {
1094                         err = ret;
1095                         write_seqcount_begin(&devnet_rename_seq);
1096                         memcpy(dev->name, oldname, IFNAMSIZ);
1097                         goto rollback;
1098                 } else {
1099                         pr_err("%s: name change rollback failed: %d\n",
1100                                dev->name, ret);
1101                 }
1102         }
1103
1104         return err;
1105 }
1106
1107 /**
1108  *      dev_set_alias - change ifalias of a device
1109  *      @dev: device
1110  *      @alias: name up to IFALIASZ
1111  *      @len: limit of bytes to copy from info
1112  *
1113  *      Set ifalias for a device,
1114  */
1115 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1116 {
1117         char *new_ifalias;
1118
1119         ASSERT_RTNL();
1120
1121         if (len >= IFALIASZ)
1122                 return -EINVAL;
1123
1124         if (!len) {
1125                 kfree(dev->ifalias);
1126                 dev->ifalias = NULL;
1127                 return 0;
1128         }
1129
1130         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1131         if (!new_ifalias)
1132                 return -ENOMEM;
1133         dev->ifalias = new_ifalias;
1134
1135         strlcpy(dev->ifalias, alias, len+1);
1136         return len;
1137 }
1138
1139
1140 /**
1141  *      netdev_features_change - device changes features
1142  *      @dev: device to cause notification
1143  *
1144  *      Called to indicate a device has changed features.
1145  */
1146 void netdev_features_change(struct net_device *dev)
1147 {
1148         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1149 }
1150 EXPORT_SYMBOL(netdev_features_change);
1151
1152 /**
1153  *      netdev_state_change - device changes state
1154  *      @dev: device to cause notification
1155  *
1156  *      Called to indicate a device has changed state. This function calls
1157  *      the notifier chains for netdev_chain and sends a NEWLINK message
1158  *      to the routing socket.
1159  */
1160 void netdev_state_change(struct net_device *dev)
1161 {
1162         if (dev->flags & IFF_UP) {
1163                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1164                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1165         }
1166 }
1167 EXPORT_SYMBOL(netdev_state_change);
1168
1169 /**
1170  *      netdev_notify_peers - notify network peers about existence of @dev
1171  *      @dev: network device
1172  *
1173  * Generate traffic such that interested network peers are aware of
1174  * @dev, such as by generating a gratuitous ARP. This may be used when
1175  * a device wants to inform the rest of the network about some sort of
1176  * reconfiguration such as a failover event or virtual machine
1177  * migration.
1178  */
1179 void netdev_notify_peers(struct net_device *dev)
1180 {
1181         rtnl_lock();
1182         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1183         rtnl_unlock();
1184 }
1185 EXPORT_SYMBOL(netdev_notify_peers);
1186
1187 static int __dev_open(struct net_device *dev)
1188 {
1189         const struct net_device_ops *ops = dev->netdev_ops;
1190         int ret;
1191
1192         ASSERT_RTNL();
1193
1194         if (!netif_device_present(dev))
1195                 return -ENODEV;
1196
1197         /* Block netpoll from trying to do any rx path servicing.
1198          * If we don't do this there is a chance ndo_poll_controller
1199          * or ndo_poll may be running while we open the device
1200          */
1201         netpoll_rx_disable(dev);
1202
1203         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1204         ret = notifier_to_errno(ret);
1205         if (ret)
1206                 return ret;
1207
1208         set_bit(__LINK_STATE_START, &dev->state);
1209
1210         if (ops->ndo_validate_addr)
1211                 ret = ops->ndo_validate_addr(dev);
1212
1213         if (!ret && ops->ndo_open)
1214                 ret = ops->ndo_open(dev);
1215
1216         netpoll_rx_enable(dev);
1217
1218         if (ret)
1219                 clear_bit(__LINK_STATE_START, &dev->state);
1220         else {
1221                 dev->flags |= IFF_UP;
1222                 net_dmaengine_get();
1223                 dev_set_rx_mode(dev);
1224                 dev_activate(dev);
1225                 add_device_randomness(dev->dev_addr, dev->addr_len);
1226         }
1227
1228         return ret;
1229 }
1230
1231 /**
1232  *      dev_open        - prepare an interface for use.
1233  *      @dev:   device to open
1234  *
1235  *      Takes a device from down to up state. The device's private open
1236  *      function is invoked and then the multicast lists are loaded. Finally
1237  *      the device is moved into the up state and a %NETDEV_UP message is
1238  *      sent to the netdev notifier chain.
1239  *
1240  *      Calling this function on an active interface is a nop. On a failure
1241  *      a negative errno code is returned.
1242  */
1243 int dev_open(struct net_device *dev)
1244 {
1245         int ret;
1246
1247         if (dev->flags & IFF_UP)
1248                 return 0;
1249
1250         ret = __dev_open(dev);
1251         if (ret < 0)
1252                 return ret;
1253
1254         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1255         call_netdevice_notifiers(NETDEV_UP, dev);
1256
1257         return ret;
1258 }
1259 EXPORT_SYMBOL(dev_open);
1260
1261 static int __dev_close_many(struct list_head *head)
1262 {
1263         struct net_device *dev;
1264
1265         ASSERT_RTNL();
1266         might_sleep();
1267
1268         list_for_each_entry(dev, head, unreg_list) {
1269                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1270
1271                 clear_bit(__LINK_STATE_START, &dev->state);
1272
1273                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1274                  * can be even on different cpu. So just clear netif_running().
1275                  *
1276                  * dev->stop() will invoke napi_disable() on all of it's
1277                  * napi_struct instances on this device.
1278                  */
1279                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1280         }
1281
1282         dev_deactivate_many(head);
1283
1284         list_for_each_entry(dev, head, unreg_list) {
1285                 const struct net_device_ops *ops = dev->netdev_ops;
1286
1287                 /*
1288                  *      Call the device specific close. This cannot fail.
1289                  *      Only if device is UP
1290                  *
1291                  *      We allow it to be called even after a DETACH hot-plug
1292                  *      event.
1293                  */
1294                 if (ops->ndo_stop)
1295                         ops->ndo_stop(dev);
1296
1297                 dev->flags &= ~IFF_UP;
1298                 net_dmaengine_put();
1299         }
1300
1301         return 0;
1302 }
1303
1304 static int __dev_close(struct net_device *dev)
1305 {
1306         int retval;
1307         LIST_HEAD(single);
1308
1309         /* Temporarily disable netpoll until the interface is down */
1310         netpoll_rx_disable(dev);
1311
1312         list_add(&dev->unreg_list, &single);
1313         retval = __dev_close_many(&single);
1314         list_del(&single);
1315
1316         netpoll_rx_enable(dev);
1317         return retval;
1318 }
1319
1320 static int dev_close_many(struct list_head *head)
1321 {
1322         struct net_device *dev, *tmp;
1323         LIST_HEAD(tmp_list);
1324
1325         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1326                 if (!(dev->flags & IFF_UP))
1327                         list_move(&dev->unreg_list, &tmp_list);
1328
1329         __dev_close_many(head);
1330
1331         list_for_each_entry(dev, head, unreg_list) {
1332                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1333                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1334         }
1335
1336         /* rollback_registered_many needs the complete original list */
1337         list_splice(&tmp_list, head);
1338         return 0;
1339 }
1340
1341 /**
1342  *      dev_close - shutdown an interface.
1343  *      @dev: device to shutdown
1344  *
1345  *      This function moves an active device into down state. A
1346  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1347  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1348  *      chain.
1349  */
1350 int dev_close(struct net_device *dev)
1351 {
1352         if (dev->flags & IFF_UP) {
1353                 LIST_HEAD(single);
1354
1355                 /* Block netpoll rx while the interface is going down */
1356                 netpoll_rx_disable(dev);
1357
1358                 list_add(&dev->unreg_list, &single);
1359                 dev_close_many(&single);
1360                 list_del(&single);
1361
1362                 netpoll_rx_enable(dev);
1363         }
1364         return 0;
1365 }
1366 EXPORT_SYMBOL(dev_close);
1367
1368
1369 /**
1370  *      dev_disable_lro - disable Large Receive Offload on a device
1371  *      @dev: device
1372  *
1373  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1374  *      called under RTNL.  This is needed if received packets may be
1375  *      forwarded to another interface.
1376  */
1377 void dev_disable_lro(struct net_device *dev)
1378 {
1379         /*
1380          * If we're trying to disable lro on a vlan device
1381          * use the underlying physical device instead
1382          */
1383         if (is_vlan_dev(dev))
1384                 dev = vlan_dev_real_dev(dev);
1385
1386         dev->wanted_features &= ~NETIF_F_LRO;
1387         netdev_update_features(dev);
1388
1389         if (unlikely(dev->features & NETIF_F_LRO))
1390                 netdev_WARN(dev, "failed to disable LRO!\n");
1391 }
1392 EXPORT_SYMBOL(dev_disable_lro);
1393
1394
1395 static int dev_boot_phase = 1;
1396
1397 /**
1398  *      register_netdevice_notifier - register a network notifier block
1399  *      @nb: notifier
1400  *
1401  *      Register a notifier to be called when network device events occur.
1402  *      The notifier passed is linked into the kernel structures and must
1403  *      not be reused until it has been unregistered. A negative errno code
1404  *      is returned on a failure.
1405  *
1406  *      When registered all registration and up events are replayed
1407  *      to the new notifier to allow device to have a race free
1408  *      view of the network device list.
1409  */
1410
1411 int register_netdevice_notifier(struct notifier_block *nb)
1412 {
1413         struct net_device *dev;
1414         struct net_device *last;
1415         struct net *net;
1416         int err;
1417
1418         rtnl_lock();
1419         err = raw_notifier_chain_register(&netdev_chain, nb);
1420         if (err)
1421                 goto unlock;
1422         if (dev_boot_phase)
1423                 goto unlock;
1424         for_each_net(net) {
1425                 for_each_netdev(net, dev) {
1426                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1427                         err = notifier_to_errno(err);
1428                         if (err)
1429                                 goto rollback;
1430
1431                         if (!(dev->flags & IFF_UP))
1432                                 continue;
1433
1434                         nb->notifier_call(nb, NETDEV_UP, dev);
1435                 }
1436         }
1437
1438 unlock:
1439         rtnl_unlock();
1440         return err;
1441
1442 rollback:
1443         last = dev;
1444         for_each_net(net) {
1445                 for_each_netdev(net, dev) {
1446                         if (dev == last)
1447                                 goto outroll;
1448
1449                         if (dev->flags & IFF_UP) {
1450                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1451                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1452                         }
1453                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1454                 }
1455         }
1456
1457 outroll:
1458         raw_notifier_chain_unregister(&netdev_chain, nb);
1459         goto unlock;
1460 }
1461 EXPORT_SYMBOL(register_netdevice_notifier);
1462
1463 /**
1464  *      unregister_netdevice_notifier - unregister a network notifier block
1465  *      @nb: notifier
1466  *
1467  *      Unregister a notifier previously registered by
1468  *      register_netdevice_notifier(). The notifier is unlinked into the
1469  *      kernel structures and may then be reused. A negative errno code
1470  *      is returned on a failure.
1471  *
1472  *      After unregistering unregister and down device events are synthesized
1473  *      for all devices on the device list to the removed notifier to remove
1474  *      the need for special case cleanup code.
1475  */
1476
1477 int unregister_netdevice_notifier(struct notifier_block *nb)
1478 {
1479         struct net_device *dev;
1480         struct net *net;
1481         int err;
1482
1483         rtnl_lock();
1484         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1485         if (err)
1486                 goto unlock;
1487
1488         for_each_net(net) {
1489                 for_each_netdev(net, dev) {
1490                         if (dev->flags & IFF_UP) {
1491                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1492                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1493                         }
1494                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1495                 }
1496         }
1497 unlock:
1498         rtnl_unlock();
1499         return err;
1500 }
1501 EXPORT_SYMBOL(unregister_netdevice_notifier);
1502
1503 /**
1504  *      call_netdevice_notifiers - call all network notifier blocks
1505  *      @val: value passed unmodified to notifier function
1506  *      @dev: net_device pointer passed unmodified to notifier function
1507  *
1508  *      Call all network notifier blocks.  Parameters and return value
1509  *      are as for raw_notifier_call_chain().
1510  */
1511
1512 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1513 {
1514         ASSERT_RTNL();
1515         return raw_notifier_call_chain(&netdev_chain, val, dev);
1516 }
1517 EXPORT_SYMBOL(call_netdevice_notifiers);
1518
1519 static struct static_key netstamp_needed __read_mostly;
1520 #ifdef HAVE_JUMP_LABEL
1521 /* We are not allowed to call static_key_slow_dec() from irq context
1522  * If net_disable_timestamp() is called from irq context, defer the
1523  * static_key_slow_dec() calls.
1524  */
1525 static atomic_t netstamp_needed_deferred;
1526 #endif
1527
1528 void net_enable_timestamp(void)
1529 {
1530 #ifdef HAVE_JUMP_LABEL
1531         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1532
1533         if (deferred) {
1534                 while (--deferred)
1535                         static_key_slow_dec(&netstamp_needed);
1536                 return;
1537         }
1538 #endif
1539         static_key_slow_inc(&netstamp_needed);
1540 }
1541 EXPORT_SYMBOL(net_enable_timestamp);
1542
1543 void net_disable_timestamp(void)
1544 {
1545 #ifdef HAVE_JUMP_LABEL
1546         if (in_interrupt()) {
1547                 atomic_inc(&netstamp_needed_deferred);
1548                 return;
1549         }
1550 #endif
1551         static_key_slow_dec(&netstamp_needed);
1552 }
1553 EXPORT_SYMBOL(net_disable_timestamp);
1554
1555 static inline void net_timestamp_set(struct sk_buff *skb)
1556 {
1557         skb->tstamp.tv64 = 0;
1558         if (static_key_false(&netstamp_needed))
1559                 __net_timestamp(skb);
1560 }
1561
1562 #define net_timestamp_check(COND, SKB)                  \
1563         if (static_key_false(&netstamp_needed)) {               \
1564                 if ((COND) && !(SKB)->tstamp.tv64)      \
1565                         __net_timestamp(SKB);           \
1566         }                                               \
1567
1568 static inline bool is_skb_forwardable(struct net_device *dev,
1569                                       struct sk_buff *skb)
1570 {
1571         unsigned int len;
1572
1573         if (!(dev->flags & IFF_UP))
1574                 return false;
1575
1576         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1577         if (skb->len <= len)
1578                 return true;
1579
1580         /* if TSO is enabled, we don't care about the length as the packet
1581          * could be forwarded without being segmented before
1582          */
1583         if (skb_is_gso(skb))
1584                 return true;
1585
1586         return false;
1587 }
1588
1589 /**
1590  * dev_forward_skb - loopback an skb to another netif
1591  *
1592  * @dev: destination network device
1593  * @skb: buffer to forward
1594  *
1595  * return values:
1596  *      NET_RX_SUCCESS  (no congestion)
1597  *      NET_RX_DROP     (packet was dropped, but freed)
1598  *
1599  * dev_forward_skb can be used for injecting an skb from the
1600  * start_xmit function of one device into the receive queue
1601  * of another device.
1602  *
1603  * The receiving device may be in another namespace, so
1604  * we have to clear all information in the skb that could
1605  * impact namespace isolation.
1606  */
1607 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1608 {
1609         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1610                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1611                         atomic_long_inc(&dev->rx_dropped);
1612                         kfree_skb(skb);
1613                         return NET_RX_DROP;
1614                 }
1615         }
1616
1617         skb_orphan(skb);
1618
1619         if (unlikely(!is_skb_forwardable(dev, skb))) {
1620                 atomic_long_inc(&dev->rx_dropped);
1621                 kfree_skb(skb);
1622                 return NET_RX_DROP;
1623         }
1624         skb->skb_iif = 0;
1625         skb_dst_drop(skb);
1626         skb->tstamp.tv64 = 0;
1627         skb->pkt_type = PACKET_HOST;
1628         skb->protocol = eth_type_trans(skb, dev);
1629         skb->mark = 0;
1630         secpath_reset(skb);
1631         nf_reset(skb);
1632         nf_reset_trace(skb);
1633         return netif_rx(skb);
1634 }
1635 EXPORT_SYMBOL_GPL(dev_forward_skb);
1636
1637 static inline int deliver_skb(struct sk_buff *skb,
1638                               struct packet_type *pt_prev,
1639                               struct net_device *orig_dev)
1640 {
1641         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1642                 return -ENOMEM;
1643         atomic_inc(&skb->users);
1644         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1645 }
1646
1647 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1648 {
1649         if (!ptype->af_packet_priv || !skb->sk)
1650                 return false;
1651
1652         if (ptype->id_match)
1653                 return ptype->id_match(ptype, skb->sk);
1654         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1655                 return true;
1656
1657         return false;
1658 }
1659
1660 /*
1661  *      Support routine. Sends outgoing frames to any network
1662  *      taps currently in use.
1663  */
1664
1665 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1666 {
1667         struct packet_type *ptype;
1668         struct sk_buff *skb2 = NULL;
1669         struct packet_type *pt_prev = NULL;
1670
1671         rcu_read_lock();
1672         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1673                 /* Never send packets back to the socket
1674                  * they originated from - MvS (miquels@drinkel.ow.org)
1675                  */
1676                 if ((ptype->dev == dev || !ptype->dev) &&
1677                     (!skb_loop_sk(ptype, skb))) {
1678                         if (pt_prev) {
1679                                 deliver_skb(skb2, pt_prev, skb->dev);
1680                                 pt_prev = ptype;
1681                                 continue;
1682                         }
1683
1684                         skb2 = skb_clone(skb, GFP_ATOMIC);
1685                         if (!skb2)
1686                                 break;
1687
1688                         net_timestamp_set(skb2);
1689
1690                         /* skb->nh should be correctly
1691                            set by sender, so that the second statement is
1692                            just protection against buggy protocols.
1693                          */
1694                         skb_reset_mac_header(skb2);
1695
1696                         if (skb_network_header(skb2) < skb2->data ||
1697                             skb2->network_header > skb2->tail) {
1698                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1699                                                      ntohs(skb2->protocol),
1700                                                      dev->name);
1701                                 skb_reset_network_header(skb2);
1702                         }
1703
1704                         skb2->transport_header = skb2->network_header;
1705                         skb2->pkt_type = PACKET_OUTGOING;
1706                         pt_prev = ptype;
1707                 }
1708         }
1709         if (pt_prev)
1710                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1711         rcu_read_unlock();
1712 }
1713
1714 /**
1715  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1716  * @dev: Network device
1717  * @txq: number of queues available
1718  *
1719  * If real_num_tx_queues is changed the tc mappings may no longer be
1720  * valid. To resolve this verify the tc mapping remains valid and if
1721  * not NULL the mapping. With no priorities mapping to this
1722  * offset/count pair it will no longer be used. In the worst case TC0
1723  * is invalid nothing can be done so disable priority mappings. If is
1724  * expected that drivers will fix this mapping if they can before
1725  * calling netif_set_real_num_tx_queues.
1726  */
1727 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1728 {
1729         int i;
1730         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1731
1732         /* If TC0 is invalidated disable TC mapping */
1733         if (tc->offset + tc->count > txq) {
1734                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1735                 dev->num_tc = 0;
1736                 return;
1737         }
1738
1739         /* Invalidated prio to tc mappings set to TC0 */
1740         for (i = 1; i < TC_BITMASK + 1; i++) {
1741                 int q = netdev_get_prio_tc_map(dev, i);
1742
1743                 tc = &dev->tc_to_txq[q];
1744                 if (tc->offset + tc->count > txq) {
1745                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1746                                 i, q);
1747                         netdev_set_prio_tc_map(dev, i, 0);
1748                 }
1749         }
1750 }
1751
1752 #ifdef CONFIG_XPS
1753 static DEFINE_MUTEX(xps_map_mutex);
1754 #define xmap_dereference(P)             \
1755         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1756
1757 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1758                                         int cpu, u16 index)
1759 {
1760         struct xps_map *map = NULL;
1761         int pos;
1762
1763         if (dev_maps)
1764                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1765
1766         for (pos = 0; map && pos < map->len; pos++) {
1767                 if (map->queues[pos] == index) {
1768                         if (map->len > 1) {
1769                                 map->queues[pos] = map->queues[--map->len];
1770                         } else {
1771                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1772                                 kfree_rcu(map, rcu);
1773                                 map = NULL;
1774                         }
1775                         break;
1776                 }
1777         }
1778
1779         return map;
1780 }
1781
1782 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1783 {
1784         struct xps_dev_maps *dev_maps;
1785         int cpu, i;
1786         bool active = false;
1787
1788         mutex_lock(&xps_map_mutex);
1789         dev_maps = xmap_dereference(dev->xps_maps);
1790
1791         if (!dev_maps)
1792                 goto out_no_maps;
1793
1794         for_each_possible_cpu(cpu) {
1795                 for (i = index; i < dev->num_tx_queues; i++) {
1796                         if (!remove_xps_queue(dev_maps, cpu, i))
1797                                 break;
1798                 }
1799                 if (i == dev->num_tx_queues)
1800                         active = true;
1801         }
1802
1803         if (!active) {
1804                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1805                 kfree_rcu(dev_maps, rcu);
1806         }
1807
1808         for (i = index; i < dev->num_tx_queues; i++)
1809                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1810                                              NUMA_NO_NODE);
1811
1812 out_no_maps:
1813         mutex_unlock(&xps_map_mutex);
1814 }
1815
1816 static struct xps_map *expand_xps_map(struct xps_map *map,
1817                                       int cpu, u16 index)
1818 {
1819         struct xps_map *new_map;
1820         int alloc_len = XPS_MIN_MAP_ALLOC;
1821         int i, pos;
1822
1823         for (pos = 0; map && pos < map->len; pos++) {
1824                 if (map->queues[pos] != index)
1825                         continue;
1826                 return map;
1827         }
1828
1829         /* Need to add queue to this CPU's existing map */
1830         if (map) {
1831                 if (pos < map->alloc_len)
1832                         return map;
1833
1834                 alloc_len = map->alloc_len * 2;
1835         }
1836
1837         /* Need to allocate new map to store queue on this CPU's map */
1838         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1839                                cpu_to_node(cpu));
1840         if (!new_map)
1841                 return NULL;
1842
1843         for (i = 0; i < pos; i++)
1844                 new_map->queues[i] = map->queues[i];
1845         new_map->alloc_len = alloc_len;
1846         new_map->len = pos;
1847
1848         return new_map;
1849 }
1850
1851 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1852 {
1853         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1854         struct xps_map *map, *new_map;
1855         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1856         int cpu, numa_node_id = -2;
1857         bool active = false;
1858
1859         mutex_lock(&xps_map_mutex);
1860
1861         dev_maps = xmap_dereference(dev->xps_maps);
1862
1863         /* allocate memory for queue storage */
1864         for_each_online_cpu(cpu) {
1865                 if (!cpumask_test_cpu(cpu, mask))
1866                         continue;
1867
1868                 if (!new_dev_maps)
1869                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1870                 if (!new_dev_maps) {
1871                         mutex_unlock(&xps_map_mutex);
1872                         return -ENOMEM;
1873                 }
1874
1875                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1876                                  NULL;
1877
1878                 map = expand_xps_map(map, cpu, index);
1879                 if (!map)
1880                         goto error;
1881
1882                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1883         }
1884
1885         if (!new_dev_maps)
1886                 goto out_no_new_maps;
1887
1888         for_each_possible_cpu(cpu) {
1889                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1890                         /* add queue to CPU maps */
1891                         int pos = 0;
1892
1893                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1894                         while ((pos < map->len) && (map->queues[pos] != index))
1895                                 pos++;
1896
1897                         if (pos == map->len)
1898                                 map->queues[map->len++] = index;
1899 #ifdef CONFIG_NUMA
1900                         if (numa_node_id == -2)
1901                                 numa_node_id = cpu_to_node(cpu);
1902                         else if (numa_node_id != cpu_to_node(cpu))
1903                                 numa_node_id = -1;
1904 #endif
1905                 } else if (dev_maps) {
1906                         /* fill in the new device map from the old device map */
1907                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1908                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1909                 }
1910
1911         }
1912
1913         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1914
1915         /* Cleanup old maps */
1916         if (dev_maps) {
1917                 for_each_possible_cpu(cpu) {
1918                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1919                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1920                         if (map && map != new_map)
1921                                 kfree_rcu(map, rcu);
1922                 }
1923
1924                 kfree_rcu(dev_maps, rcu);
1925         }
1926
1927         dev_maps = new_dev_maps;
1928         active = true;
1929
1930 out_no_new_maps:
1931         /* update Tx queue numa node */
1932         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1933                                      (numa_node_id >= 0) ? numa_node_id :
1934                                      NUMA_NO_NODE);
1935
1936         if (!dev_maps)
1937                 goto out_no_maps;
1938
1939         /* removes queue from unused CPUs */
1940         for_each_possible_cpu(cpu) {
1941                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1942                         continue;
1943
1944                 if (remove_xps_queue(dev_maps, cpu, index))
1945                         active = true;
1946         }
1947
1948         /* free map if not active */
1949         if (!active) {
1950                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1951                 kfree_rcu(dev_maps, rcu);
1952         }
1953
1954 out_no_maps:
1955         mutex_unlock(&xps_map_mutex);
1956
1957         return 0;
1958 error:
1959         /* remove any maps that we added */
1960         for_each_possible_cpu(cpu) {
1961                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1962                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1963                                  NULL;
1964                 if (new_map && new_map != map)
1965                         kfree(new_map);
1966         }
1967
1968         mutex_unlock(&xps_map_mutex);
1969
1970         kfree(new_dev_maps);
1971         return -ENOMEM;
1972 }
1973 EXPORT_SYMBOL(netif_set_xps_queue);
1974
1975 #endif
1976 /*
1977  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1978  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1979  */
1980 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
1981 {
1982         int rc;
1983
1984         if (txq < 1 || txq > dev->num_tx_queues)
1985                 return -EINVAL;
1986
1987         if (dev->reg_state == NETREG_REGISTERED ||
1988             dev->reg_state == NETREG_UNREGISTERING) {
1989                 ASSERT_RTNL();
1990
1991                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1992                                                   txq);
1993                 if (rc)
1994                         return rc;
1995
1996                 if (dev->num_tc)
1997                         netif_setup_tc(dev, txq);
1998
1999                 if (txq < dev->real_num_tx_queues) {
2000                         qdisc_reset_all_tx_gt(dev, txq);
2001 #ifdef CONFIG_XPS
2002                         netif_reset_xps_queues_gt(dev, txq);
2003 #endif
2004                 }
2005         }
2006
2007         dev->real_num_tx_queues = txq;
2008         return 0;
2009 }
2010 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2011
2012 #ifdef CONFIG_RPS
2013 /**
2014  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2015  *      @dev: Network device
2016  *      @rxq: Actual number of RX queues
2017  *
2018  *      This must be called either with the rtnl_lock held or before
2019  *      registration of the net device.  Returns 0 on success, or a
2020  *      negative error code.  If called before registration, it always
2021  *      succeeds.
2022  */
2023 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2024 {
2025         int rc;
2026
2027         if (rxq < 1 || rxq > dev->num_rx_queues)
2028                 return -EINVAL;
2029
2030         if (dev->reg_state == NETREG_REGISTERED) {
2031                 ASSERT_RTNL();
2032
2033                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2034                                                   rxq);
2035                 if (rc)
2036                         return rc;
2037         }
2038
2039         dev->real_num_rx_queues = rxq;
2040         return 0;
2041 }
2042 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2043 #endif
2044
2045 /**
2046  * netif_get_num_default_rss_queues - default number of RSS queues
2047  *
2048  * This routine should set an upper limit on the number of RSS queues
2049  * used by default by multiqueue devices.
2050  */
2051 int netif_get_num_default_rss_queues(void)
2052 {
2053         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2054 }
2055 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2056
2057 static inline void __netif_reschedule(struct Qdisc *q)
2058 {
2059         struct softnet_data *sd;
2060         unsigned long flags;
2061
2062         local_irq_save(flags);
2063         sd = &__get_cpu_var(softnet_data);
2064         q->next_sched = NULL;
2065         *sd->output_queue_tailp = q;
2066         sd->output_queue_tailp = &q->next_sched;
2067         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2068         local_irq_restore(flags);
2069 }
2070
2071 void __netif_schedule(struct Qdisc *q)
2072 {
2073         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2074                 __netif_reschedule(q);
2075 }
2076 EXPORT_SYMBOL(__netif_schedule);
2077
2078 void dev_kfree_skb_irq(struct sk_buff *skb)
2079 {
2080         if (atomic_dec_and_test(&skb->users)) {
2081                 struct softnet_data *sd;
2082                 unsigned long flags;
2083
2084                 local_irq_save(flags);
2085                 sd = &__get_cpu_var(softnet_data);
2086                 skb->next = sd->completion_queue;
2087                 sd->completion_queue = skb;
2088                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2089                 local_irq_restore(flags);
2090         }
2091 }
2092 EXPORT_SYMBOL(dev_kfree_skb_irq);
2093
2094 void dev_kfree_skb_any(struct sk_buff *skb)
2095 {
2096         if (in_irq() || irqs_disabled())
2097                 dev_kfree_skb_irq(skb);
2098         else
2099                 dev_kfree_skb(skb);
2100 }
2101 EXPORT_SYMBOL(dev_kfree_skb_any);
2102
2103
2104 /**
2105  * netif_device_detach - mark device as removed
2106  * @dev: network device
2107  *
2108  * Mark device as removed from system and therefore no longer available.
2109  */
2110 void netif_device_detach(struct net_device *dev)
2111 {
2112         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2113             netif_running(dev)) {
2114                 netif_tx_stop_all_queues(dev);
2115         }
2116 }
2117 EXPORT_SYMBOL(netif_device_detach);
2118
2119 /**
2120  * netif_device_attach - mark device as attached
2121  * @dev: network device
2122  *
2123  * Mark device as attached from system and restart if needed.
2124  */
2125 void netif_device_attach(struct net_device *dev)
2126 {
2127         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2128             netif_running(dev)) {
2129                 netif_tx_wake_all_queues(dev);
2130                 __netdev_watchdog_up(dev);
2131         }
2132 }
2133 EXPORT_SYMBOL(netif_device_attach);
2134
2135 static void skb_warn_bad_offload(const struct sk_buff *skb)
2136 {
2137         static const netdev_features_t null_features = 0;
2138         struct net_device *dev = skb->dev;
2139         const char *driver = "";
2140
2141         if (!net_ratelimit())
2142                 return;
2143
2144         if (dev && dev->dev.parent)
2145                 driver = dev_driver_string(dev->dev.parent);
2146
2147         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2148              "gso_type=%d ip_summed=%d\n",
2149              driver, dev ? &dev->features : &null_features,
2150              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2151              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2152              skb_shinfo(skb)->gso_type, skb->ip_summed);
2153 }
2154
2155 /*
2156  * Invalidate hardware checksum when packet is to be mangled, and
2157  * complete checksum manually on outgoing path.
2158  */
2159 int skb_checksum_help(struct sk_buff *skb)
2160 {
2161         __wsum csum;
2162         int ret = 0, offset;
2163
2164         if (skb->ip_summed == CHECKSUM_COMPLETE)
2165                 goto out_set_summed;
2166
2167         if (unlikely(skb_shinfo(skb)->gso_size)) {
2168                 skb_warn_bad_offload(skb);
2169                 return -EINVAL;
2170         }
2171
2172         /* Before computing a checksum, we should make sure no frag could
2173          * be modified by an external entity : checksum could be wrong.
2174          */
2175         if (skb_has_shared_frag(skb)) {
2176                 ret = __skb_linearize(skb);
2177                 if (ret)
2178                         goto out;
2179         }
2180
2181         offset = skb_checksum_start_offset(skb);
2182         BUG_ON(offset >= skb_headlen(skb));
2183         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2184
2185         offset += skb->csum_offset;
2186         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2187
2188         if (skb_cloned(skb) &&
2189             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2190                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2191                 if (ret)
2192                         goto out;
2193         }
2194
2195         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2196 out_set_summed:
2197         skb->ip_summed = CHECKSUM_NONE;
2198 out:
2199         return ret;
2200 }
2201 EXPORT_SYMBOL(skb_checksum_help);
2202
2203 __be16 skb_network_protocol(struct sk_buff *skb)
2204 {
2205         __be16 type = skb->protocol;
2206         int vlan_depth = ETH_HLEN;
2207
2208         /* Tunnel gso handlers can set protocol to ethernet. */
2209         if (type == htons(ETH_P_TEB)) {
2210                 struct ethhdr *eth;
2211
2212                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2213                         return 0;
2214
2215                 eth = (struct ethhdr *)skb_mac_header(skb);
2216                 type = eth->h_proto;
2217         }
2218
2219         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2220                 struct vlan_hdr *vh;
2221
2222                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2223                         return 0;
2224
2225                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2226                 type = vh->h_vlan_encapsulated_proto;
2227                 vlan_depth += VLAN_HLEN;
2228         }
2229
2230         return type;
2231 }
2232
2233 /**
2234  *      skb_mac_gso_segment - mac layer segmentation handler.
2235  *      @skb: buffer to segment
2236  *      @features: features for the output path (see dev->features)
2237  */
2238 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2239                                     netdev_features_t features)
2240 {
2241         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2242         struct packet_offload *ptype;
2243         __be16 type = skb_network_protocol(skb);
2244
2245         if (unlikely(!type))
2246                 return ERR_PTR(-EINVAL);
2247
2248         __skb_pull(skb, skb->mac_len);
2249
2250         rcu_read_lock();
2251         list_for_each_entry_rcu(ptype, &offload_base, list) {
2252                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2253                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2254                                 int err;
2255
2256                                 err = ptype->callbacks.gso_send_check(skb);
2257                                 segs = ERR_PTR(err);
2258                                 if (err || skb_gso_ok(skb, features))
2259                                         break;
2260                                 __skb_push(skb, (skb->data -
2261                                                  skb_network_header(skb)));
2262                         }
2263                         segs = ptype->callbacks.gso_segment(skb, features);
2264                         break;
2265                 }
2266         }
2267         rcu_read_unlock();
2268
2269         __skb_push(skb, skb->data - skb_mac_header(skb));
2270
2271         return segs;
2272 }
2273 EXPORT_SYMBOL(skb_mac_gso_segment);
2274
2275
2276 /* openvswitch calls this on rx path, so we need a different check.
2277  */
2278 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2279 {
2280         if (tx_path)
2281                 return skb->ip_summed != CHECKSUM_PARTIAL;
2282         else
2283                 return skb->ip_summed == CHECKSUM_NONE;
2284 }
2285
2286 /**
2287  *      __skb_gso_segment - Perform segmentation on skb.
2288  *      @skb: buffer to segment
2289  *      @features: features for the output path (see dev->features)
2290  *      @tx_path: whether it is called in TX path
2291  *
2292  *      This function segments the given skb and returns a list of segments.
2293  *
2294  *      It may return NULL if the skb requires no segmentation.  This is
2295  *      only possible when GSO is used for verifying header integrity.
2296  */
2297 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2298                                   netdev_features_t features, bool tx_path)
2299 {
2300         if (unlikely(skb_needs_check(skb, tx_path))) {
2301                 int err;
2302
2303                 skb_warn_bad_offload(skb);
2304
2305                 if (skb_header_cloned(skb) &&
2306                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2307                         return ERR_PTR(err);
2308         }
2309
2310         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2311         skb_reset_mac_header(skb);
2312         skb_reset_mac_len(skb);
2313
2314         return skb_mac_gso_segment(skb, features);
2315 }
2316 EXPORT_SYMBOL(__skb_gso_segment);
2317
2318 /* Take action when hardware reception checksum errors are detected. */
2319 #ifdef CONFIG_BUG
2320 void netdev_rx_csum_fault(struct net_device *dev)
2321 {
2322         if (net_ratelimit()) {
2323                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2324                 dump_stack();
2325         }
2326 }
2327 EXPORT_SYMBOL(netdev_rx_csum_fault);
2328 #endif
2329
2330 /* Actually, we should eliminate this check as soon as we know, that:
2331  * 1. IOMMU is present and allows to map all the memory.
2332  * 2. No high memory really exists on this machine.
2333  */
2334
2335 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2336 {
2337 #ifdef CONFIG_HIGHMEM
2338         int i;
2339         if (!(dev->features & NETIF_F_HIGHDMA)) {
2340                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2341                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2342                         if (PageHighMem(skb_frag_page(frag)))
2343                                 return 1;
2344                 }
2345         }
2346
2347         if (PCI_DMA_BUS_IS_PHYS) {
2348                 struct device *pdev = dev->dev.parent;
2349
2350                 if (!pdev)
2351                         return 0;
2352                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2353                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2354                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2355                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2356                                 return 1;
2357                 }
2358         }
2359 #endif
2360         return 0;
2361 }
2362
2363 struct dev_gso_cb {
2364         void (*destructor)(struct sk_buff *skb);
2365 };
2366
2367 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2368
2369 static void dev_gso_skb_destructor(struct sk_buff *skb)
2370 {
2371         struct dev_gso_cb *cb;
2372
2373         do {
2374                 struct sk_buff *nskb = skb->next;
2375
2376                 skb->next = nskb->next;
2377                 nskb->next = NULL;
2378                 kfree_skb(nskb);
2379         } while (skb->next);
2380
2381         cb = DEV_GSO_CB(skb);
2382         if (cb->destructor)
2383                 cb->destructor(skb);
2384 }
2385
2386 /**
2387  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2388  *      @skb: buffer to segment
2389  *      @features: device features as applicable to this skb
2390  *
2391  *      This function segments the given skb and stores the list of segments
2392  *      in skb->next.
2393  */
2394 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2395 {
2396         struct sk_buff *segs;
2397
2398         segs = skb_gso_segment(skb, features);
2399
2400         /* Verifying header integrity only. */
2401         if (!segs)
2402                 return 0;
2403
2404         if (IS_ERR(segs))
2405                 return PTR_ERR(segs);
2406
2407         skb->next = segs;
2408         DEV_GSO_CB(skb)->destructor = skb->destructor;
2409         skb->destructor = dev_gso_skb_destructor;
2410
2411         return 0;
2412 }
2413
2414 static netdev_features_t harmonize_features(struct sk_buff *skb,
2415         __be16 protocol, netdev_features_t features)
2416 {
2417         if (skb->ip_summed != CHECKSUM_NONE &&
2418             !can_checksum_protocol(features, protocol)) {
2419                 features &= ~NETIF_F_ALL_CSUM;
2420         } else if (illegal_highdma(skb->dev, skb)) {
2421                 features &= ~NETIF_F_SG;
2422         }
2423
2424         return features;
2425 }
2426
2427 netdev_features_t netif_skb_features(struct sk_buff *skb)
2428 {
2429         __be16 protocol = skb->protocol;
2430         netdev_features_t features = skb->dev->features;
2431
2432         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2433                 features &= ~NETIF_F_GSO_MASK;
2434
2435         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2436                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2437                 protocol = veh->h_vlan_encapsulated_proto;
2438         } else if (!vlan_tx_tag_present(skb)) {
2439                 return harmonize_features(skb, protocol, features);
2440         }
2441
2442         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2443                                                NETIF_F_HW_VLAN_STAG_TX);
2444
2445         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2446                 return harmonize_features(skb, protocol, features);
2447         } else {
2448                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2449                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2450                                 NETIF_F_HW_VLAN_STAG_TX;
2451                 return harmonize_features(skb, protocol, features);
2452         }
2453 }
2454 EXPORT_SYMBOL(netif_skb_features);
2455
2456 /*
2457  * Returns true if either:
2458  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2459  *      2. skb is fragmented and the device does not support SG.
2460  */
2461 static inline int skb_needs_linearize(struct sk_buff *skb,
2462                                       netdev_features_t features)
2463 {
2464         return skb_is_nonlinear(skb) &&
2465                         ((skb_has_frag_list(skb) &&
2466                                 !(features & NETIF_F_FRAGLIST)) ||
2467                         (skb_shinfo(skb)->nr_frags &&
2468                                 !(features & NETIF_F_SG)));
2469 }
2470
2471 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2472                         struct netdev_queue *txq)
2473 {
2474         const struct net_device_ops *ops = dev->netdev_ops;
2475         int rc = NETDEV_TX_OK;
2476         unsigned int skb_len;
2477
2478         if (likely(!skb->next)) {
2479                 netdev_features_t features;
2480
2481                 /*
2482                  * If device doesn't need skb->dst, release it right now while
2483                  * its hot in this cpu cache
2484                  */
2485                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2486                         skb_dst_drop(skb);
2487
2488                 features = netif_skb_features(skb);
2489
2490                 if (vlan_tx_tag_present(skb) &&
2491                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2492                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2493                                              vlan_tx_tag_get(skb));
2494                         if (unlikely(!skb))
2495                                 goto out;
2496
2497                         skb->vlan_tci = 0;
2498                 }
2499
2500                 /* If encapsulation offload request, verify we are testing
2501                  * hardware encapsulation features instead of standard
2502                  * features for the netdev
2503                  */
2504                 if (skb->encapsulation)
2505                         features &= dev->hw_enc_features;
2506
2507                 if (netif_needs_gso(skb, features)) {
2508                         if (unlikely(dev_gso_segment(skb, features)))
2509                                 goto out_kfree_skb;
2510                         if (skb->next)
2511                                 goto gso;
2512                 } else {
2513                         if (skb_needs_linearize(skb, features) &&
2514                             __skb_linearize(skb))
2515                                 goto out_kfree_skb;
2516
2517                         /* If packet is not checksummed and device does not
2518                          * support checksumming for this protocol, complete
2519                          * checksumming here.
2520                          */
2521                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2522                                 if (skb->encapsulation)
2523                                         skb_set_inner_transport_header(skb,
2524                                                 skb_checksum_start_offset(skb));
2525                                 else
2526                                         skb_set_transport_header(skb,
2527                                                 skb_checksum_start_offset(skb));
2528                                 if (!(features & NETIF_F_ALL_CSUM) &&
2529                                      skb_checksum_help(skb))
2530                                         goto out_kfree_skb;
2531                         }
2532                 }
2533
2534                 if (!list_empty(&ptype_all))
2535                         dev_queue_xmit_nit(skb, dev);
2536
2537                 skb_len = skb->len;
2538                 rc = ops->ndo_start_xmit(skb, dev);
2539                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2540                 if (rc == NETDEV_TX_OK)
2541                         txq_trans_update(txq);
2542                 return rc;
2543         }
2544
2545 gso:
2546         do {
2547                 struct sk_buff *nskb = skb->next;
2548
2549                 skb->next = nskb->next;
2550                 nskb->next = NULL;
2551
2552                 if (!list_empty(&ptype_all))
2553                         dev_queue_xmit_nit(nskb, dev);
2554
2555                 skb_len = nskb->len;
2556                 rc = ops->ndo_start_xmit(nskb, dev);
2557                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2558                 if (unlikely(rc != NETDEV_TX_OK)) {
2559                         if (rc & ~NETDEV_TX_MASK)
2560                                 goto out_kfree_gso_skb;
2561                         nskb->next = skb->next;
2562                         skb->next = nskb;
2563                         return rc;
2564                 }
2565                 txq_trans_update(txq);
2566                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2567                         return NETDEV_TX_BUSY;
2568         } while (skb->next);
2569
2570 out_kfree_gso_skb:
2571         if (likely(skb->next == NULL)) {
2572                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2573                 consume_skb(skb);
2574                 return rc;
2575         }
2576 out_kfree_skb:
2577         kfree_skb(skb);
2578 out:
2579         return rc;
2580 }
2581
2582 static void qdisc_pkt_len_init(struct sk_buff *skb)
2583 {
2584         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2585
2586         qdisc_skb_cb(skb)->pkt_len = skb->len;
2587
2588         /* To get more precise estimation of bytes sent on wire,
2589          * we add to pkt_len the headers size of all segments
2590          */
2591         if (shinfo->gso_size)  {
2592                 unsigned int hdr_len;
2593                 u16 gso_segs = shinfo->gso_segs;
2594
2595                 /* mac layer + network layer */
2596                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2597
2598                 /* + transport layer */
2599                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2600                         hdr_len += tcp_hdrlen(skb);
2601                 else
2602                         hdr_len += sizeof(struct udphdr);
2603
2604                 if (shinfo->gso_type & SKB_GSO_DODGY)
2605                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2606                                                 shinfo->gso_size);
2607
2608                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2609         }
2610 }
2611
2612 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2613                                  struct net_device *dev,
2614                                  struct netdev_queue *txq)
2615 {
2616         spinlock_t *root_lock = qdisc_lock(q);
2617         bool contended;
2618         int rc;
2619
2620         qdisc_pkt_len_init(skb);
2621         qdisc_calculate_pkt_len(skb, q);
2622         /*
2623          * Heuristic to force contended enqueues to serialize on a
2624          * separate lock before trying to get qdisc main lock.
2625          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2626          * and dequeue packets faster.
2627          */
2628         contended = qdisc_is_running(q);
2629         if (unlikely(contended))
2630                 spin_lock(&q->busylock);
2631
2632         spin_lock(root_lock);
2633         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2634                 kfree_skb(skb);
2635                 rc = NET_XMIT_DROP;
2636         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2637                    qdisc_run_begin(q)) {
2638                 /*
2639                  * This is a work-conserving queue; there are no old skbs
2640                  * waiting to be sent out; and the qdisc is not running -
2641                  * xmit the skb directly.
2642                  */
2643                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2644                         skb_dst_force(skb);
2645
2646                 qdisc_bstats_update(q, skb);
2647
2648                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2649                         if (unlikely(contended)) {
2650                                 spin_unlock(&q->busylock);
2651                                 contended = false;
2652                         }
2653                         __qdisc_run(q);
2654                 } else
2655                         qdisc_run_end(q);
2656
2657                 rc = NET_XMIT_SUCCESS;
2658         } else {
2659                 skb_dst_force(skb);
2660                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2661                 if (qdisc_run_begin(q)) {
2662                         if (unlikely(contended)) {
2663                                 spin_unlock(&q->busylock);
2664                                 contended = false;
2665                         }
2666                         __qdisc_run(q);
2667                 }
2668         }
2669         spin_unlock(root_lock);
2670         if (unlikely(contended))
2671                 spin_unlock(&q->busylock);
2672         return rc;
2673 }
2674
2675 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2676 static void skb_update_prio(struct sk_buff *skb)
2677 {
2678         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2679
2680         if (!skb->priority && skb->sk && map) {
2681                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2682
2683                 if (prioidx < map->priomap_len)
2684                         skb->priority = map->priomap[prioidx];
2685         }
2686 }
2687 #else
2688 #define skb_update_prio(skb)
2689 #endif
2690
2691 static DEFINE_PER_CPU(int, xmit_recursion);
2692 #define RECURSION_LIMIT 10
2693
2694 /**
2695  *      dev_loopback_xmit - loop back @skb
2696  *      @skb: buffer to transmit
2697  */
2698 int dev_loopback_xmit(struct sk_buff *skb)
2699 {
2700         skb_reset_mac_header(skb);
2701         __skb_pull(skb, skb_network_offset(skb));
2702         skb->pkt_type = PACKET_LOOPBACK;
2703         skb->ip_summed = CHECKSUM_UNNECESSARY;
2704         WARN_ON(!skb_dst(skb));
2705         skb_dst_force(skb);
2706         netif_rx_ni(skb);
2707         return 0;
2708 }
2709 EXPORT_SYMBOL(dev_loopback_xmit);
2710
2711 /**
2712  *      dev_queue_xmit - transmit a buffer
2713  *      @skb: buffer to transmit
2714  *
2715  *      Queue a buffer for transmission to a network device. The caller must
2716  *      have set the device and priority and built the buffer before calling
2717  *      this function. The function can be called from an interrupt.
2718  *
2719  *      A negative errno code is returned on a failure. A success does not
2720  *      guarantee the frame will be transmitted as it may be dropped due
2721  *      to congestion or traffic shaping.
2722  *
2723  * -----------------------------------------------------------------------------------
2724  *      I notice this method can also return errors from the queue disciplines,
2725  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2726  *      be positive.
2727  *
2728  *      Regardless of the return value, the skb is consumed, so it is currently
2729  *      difficult to retry a send to this method.  (You can bump the ref count
2730  *      before sending to hold a reference for retry if you are careful.)
2731  *
2732  *      When calling this method, interrupts MUST be enabled.  This is because
2733  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2734  *          --BLG
2735  */
2736 int dev_queue_xmit(struct sk_buff *skb)
2737 {
2738         struct net_device *dev = skb->dev;
2739         struct netdev_queue *txq;
2740         struct Qdisc *q;
2741         int rc = -ENOMEM;
2742
2743         skb_reset_mac_header(skb);
2744
2745         /* Disable soft irqs for various locks below. Also
2746          * stops preemption for RCU.
2747          */
2748         rcu_read_lock_bh();
2749
2750         skb_update_prio(skb);
2751
2752         txq = netdev_pick_tx(dev, skb);
2753         q = rcu_dereference_bh(txq->qdisc);
2754
2755 #ifdef CONFIG_NET_CLS_ACT
2756         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2757 #endif
2758         trace_net_dev_queue(skb);
2759         if (q->enqueue) {
2760                 rc = __dev_xmit_skb(skb, q, dev, txq);
2761                 goto out;
2762         }
2763
2764         /* The device has no queue. Common case for software devices:
2765            loopback, all the sorts of tunnels...
2766
2767            Really, it is unlikely that netif_tx_lock protection is necessary
2768            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2769            counters.)
2770            However, it is possible, that they rely on protection
2771            made by us here.
2772
2773            Check this and shot the lock. It is not prone from deadlocks.
2774            Either shot noqueue qdisc, it is even simpler 8)
2775          */
2776         if (dev->flags & IFF_UP) {
2777                 int cpu = smp_processor_id(); /* ok because BHs are off */
2778
2779                 if (txq->xmit_lock_owner != cpu) {
2780
2781                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2782                                 goto recursion_alert;
2783
2784                         HARD_TX_LOCK(dev, txq, cpu);
2785
2786                         if (!netif_xmit_stopped(txq)) {
2787                                 __this_cpu_inc(xmit_recursion);
2788                                 rc = dev_hard_start_xmit(skb, dev, txq);
2789                                 __this_cpu_dec(xmit_recursion);
2790                                 if (dev_xmit_complete(rc)) {
2791                                         HARD_TX_UNLOCK(dev, txq);
2792                                         goto out;
2793                                 }
2794                         }
2795                         HARD_TX_UNLOCK(dev, txq);
2796                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2797                                              dev->name);
2798                 } else {
2799                         /* Recursion is detected! It is possible,
2800                          * unfortunately
2801                          */
2802 recursion_alert:
2803                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2804                                              dev->name);
2805                 }
2806         }
2807
2808         rc = -ENETDOWN;
2809         rcu_read_unlock_bh();
2810
2811         kfree_skb(skb);
2812         return rc;
2813 out:
2814         rcu_read_unlock_bh();
2815         return rc;
2816 }
2817 EXPORT_SYMBOL(dev_queue_xmit);
2818
2819
2820 /*=======================================================================
2821                         Receiver routines
2822   =======================================================================*/
2823
2824 int netdev_max_backlog __read_mostly = 1000;
2825 EXPORT_SYMBOL(netdev_max_backlog);
2826
2827 int netdev_tstamp_prequeue __read_mostly = 1;
2828 int netdev_budget __read_mostly = 300;
2829 int weight_p __read_mostly = 64;            /* old backlog weight */
2830
2831 /* Called with irq disabled */
2832 static inline void ____napi_schedule(struct softnet_data *sd,
2833                                      struct napi_struct *napi)
2834 {
2835         list_add_tail(&napi->poll_list, &sd->poll_list);
2836         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2837 }
2838
2839 #ifdef CONFIG_RPS
2840
2841 /* One global table that all flow-based protocols share. */
2842 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2843 EXPORT_SYMBOL(rps_sock_flow_table);
2844
2845 struct static_key rps_needed __read_mostly;
2846
2847 static struct rps_dev_flow *
2848 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2849             struct rps_dev_flow *rflow, u16 next_cpu)
2850 {
2851         if (next_cpu != RPS_NO_CPU) {
2852 #ifdef CONFIG_RFS_ACCEL
2853                 struct netdev_rx_queue *rxqueue;
2854                 struct rps_dev_flow_table *flow_table;
2855                 struct rps_dev_flow *old_rflow;
2856                 u32 flow_id;
2857                 u16 rxq_index;
2858                 int rc;
2859
2860                 /* Should we steer this flow to a different hardware queue? */
2861                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2862                     !(dev->features & NETIF_F_NTUPLE))
2863                         goto out;
2864                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2865                 if (rxq_index == skb_get_rx_queue(skb))
2866                         goto out;
2867
2868                 rxqueue = dev->_rx + rxq_index;
2869                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2870                 if (!flow_table)
2871                         goto out;
2872                 flow_id = skb->rxhash & flow_table->mask;
2873                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2874                                                         rxq_index, flow_id);
2875                 if (rc < 0)
2876                         goto out;
2877                 old_rflow = rflow;
2878                 rflow = &flow_table->flows[flow_id];
2879                 rflow->filter = rc;
2880                 if (old_rflow->filter == rflow->filter)
2881                         old_rflow->filter = RPS_NO_FILTER;
2882         out:
2883 #endif
2884                 rflow->last_qtail =
2885                         per_cpu(softnet_data, next_cpu).input_queue_head;
2886         }
2887
2888         rflow->cpu = next_cpu;
2889         return rflow;
2890 }
2891
2892 /*
2893  * get_rps_cpu is called from netif_receive_skb and returns the target
2894  * CPU from the RPS map of the receiving queue for a given skb.
2895  * rcu_read_lock must be held on entry.
2896  */
2897 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2898                        struct rps_dev_flow **rflowp)
2899 {
2900         struct netdev_rx_queue *rxqueue;
2901         struct rps_map *map;
2902         struct rps_dev_flow_table *flow_table;
2903         struct rps_sock_flow_table *sock_flow_table;
2904         int cpu = -1;
2905         u16 tcpu;
2906
2907         if (skb_rx_queue_recorded(skb)) {
2908                 u16 index = skb_get_rx_queue(skb);
2909                 if (unlikely(index >= dev->real_num_rx_queues)) {
2910                         WARN_ONCE(dev->real_num_rx_queues > 1,
2911                                   "%s received packet on queue %u, but number "
2912                                   "of RX queues is %u\n",
2913                                   dev->name, index, dev->real_num_rx_queues);
2914                         goto done;
2915                 }
2916                 rxqueue = dev->_rx + index;
2917         } else
2918                 rxqueue = dev->_rx;
2919
2920         map = rcu_dereference(rxqueue->rps_map);
2921         if (map) {
2922                 if (map->len == 1 &&
2923                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2924                         tcpu = map->cpus[0];
2925                         if (cpu_online(tcpu))
2926                                 cpu = tcpu;
2927                         goto done;
2928                 }
2929         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2930                 goto done;
2931         }
2932
2933         skb_reset_network_header(skb);
2934         if (!skb_get_rxhash(skb))
2935                 goto done;
2936
2937         flow_table = rcu_dereference(rxqueue->rps_flow_table);
2938         sock_flow_table = rcu_dereference(rps_sock_flow_table);
2939         if (flow_table && sock_flow_table) {
2940                 u16 next_cpu;
2941                 struct rps_dev_flow *rflow;
2942
2943                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2944                 tcpu = rflow->cpu;
2945
2946                 next_cpu = sock_flow_table->ents[skb->rxhash &
2947                     sock_flow_table->mask];
2948
2949                 /*
2950                  * If the desired CPU (where last recvmsg was done) is
2951                  * different from current CPU (one in the rx-queue flow
2952                  * table entry), switch if one of the following holds:
2953                  *   - Current CPU is unset (equal to RPS_NO_CPU).
2954                  *   - Current CPU is offline.
2955                  *   - The current CPU's queue tail has advanced beyond the
2956                  *     last packet that was enqueued using this table entry.
2957                  *     This guarantees that all previous packets for the flow
2958                  *     have been dequeued, thus preserving in order delivery.
2959                  */
2960                 if (unlikely(tcpu != next_cpu) &&
2961                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2962                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2963                       rflow->last_qtail)) >= 0)) {
2964                         tcpu = next_cpu;
2965                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2966                 }
2967
2968                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2969                         *rflowp = rflow;
2970                         cpu = tcpu;
2971                         goto done;
2972                 }
2973         }
2974
2975         if (map) {
2976                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2977
2978                 if (cpu_online(tcpu)) {
2979                         cpu = tcpu;
2980                         goto done;
2981                 }
2982         }
2983
2984 done:
2985         return cpu;
2986 }
2987
2988 #ifdef CONFIG_RFS_ACCEL
2989
2990 /**
2991  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2992  * @dev: Device on which the filter was set
2993  * @rxq_index: RX queue index
2994  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2995  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2996  *
2997  * Drivers that implement ndo_rx_flow_steer() should periodically call
2998  * this function for each installed filter and remove the filters for
2999  * which it returns %true.
3000  */
3001 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3002                          u32 flow_id, u16 filter_id)
3003 {
3004         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3005         struct rps_dev_flow_table *flow_table;
3006         struct rps_dev_flow *rflow;
3007         bool expire = true;
3008         int cpu;
3009
3010         rcu_read_lock();
3011         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3012         if (flow_table && flow_id <= flow_table->mask) {
3013                 rflow = &flow_table->flows[flow_id];
3014                 cpu = ACCESS_ONCE(rflow->cpu);
3015                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3016                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3017                            rflow->last_qtail) <
3018                      (int)(10 * flow_table->mask)))
3019                         expire = false;
3020         }
3021         rcu_read_unlock();
3022         return expire;
3023 }
3024 EXPORT_SYMBOL(rps_may_expire_flow);
3025
3026 #endif /* CONFIG_RFS_ACCEL */
3027
3028 /* Called from hardirq (IPI) context */
3029 static void rps_trigger_softirq(void *data)
3030 {
3031         struct softnet_data *sd = data;
3032
3033         ____napi_schedule(sd, &sd->backlog);
3034         sd->received_rps++;
3035 }
3036
3037 #endif /* CONFIG_RPS */
3038
3039 /*
3040  * Check if this softnet_data structure is another cpu one
3041  * If yes, queue it to our IPI list and return 1
3042  * If no, return 0
3043  */
3044 static int rps_ipi_queued(struct softnet_data *sd)
3045 {
3046 #ifdef CONFIG_RPS
3047         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3048
3049         if (sd != mysd) {
3050                 sd->rps_ipi_next = mysd->rps_ipi_list;
3051                 mysd->rps_ipi_list = sd;
3052
3053                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3054                 return 1;
3055         }
3056 #endif /* CONFIG_RPS */
3057         return 0;
3058 }
3059
3060 #ifdef CONFIG_NET_FLOW_LIMIT
3061 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3062 #endif
3063
3064 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3065 {
3066 #ifdef CONFIG_NET_FLOW_LIMIT
3067         struct sd_flow_limit *fl;
3068         struct softnet_data *sd;
3069         unsigned int old_flow, new_flow;
3070
3071         if (qlen < (netdev_max_backlog >> 1))
3072                 return false;
3073
3074         sd = &__get_cpu_var(softnet_data);
3075
3076         rcu_read_lock();
3077         fl = rcu_dereference(sd->flow_limit);
3078         if (fl) {
3079                 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3080                 old_flow = fl->history[fl->history_head];
3081                 fl->history[fl->history_head] = new_flow;
3082
3083                 fl->history_head++;
3084                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3085
3086                 if (likely(fl->buckets[old_flow]))
3087                         fl->buckets[old_flow]--;
3088
3089                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3090                         fl->count++;
3091                         rcu_read_unlock();
3092                         return true;
3093                 }
3094         }
3095         rcu_read_unlock();
3096 #endif
3097         return false;
3098 }
3099
3100 /*
3101  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3102  * queue (may be a remote CPU queue).
3103  */
3104 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3105                               unsigned int *qtail)
3106 {
3107         struct softnet_data *sd;
3108         unsigned long flags;
3109         unsigned int qlen;
3110
3111         sd = &per_cpu(softnet_data, cpu);
3112
3113         local_irq_save(flags);
3114
3115         rps_lock(sd);
3116         qlen = skb_queue_len(&sd->input_pkt_queue);
3117         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3118                 if (skb_queue_len(&sd->input_pkt_queue)) {
3119 enqueue:
3120                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3121                         input_queue_tail_incr_save(sd, qtail);
3122                         rps_unlock(sd);
3123                         local_irq_restore(flags);
3124                         return NET_RX_SUCCESS;
3125                 }
3126
3127                 /* Schedule NAPI for backlog device
3128                  * We can use non atomic operation since we own the queue lock
3129                  */
3130                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3131                         if (!rps_ipi_queued(sd))
3132                                 ____napi_schedule(sd, &sd->backlog);
3133                 }
3134                 goto enqueue;
3135         }
3136
3137         sd->dropped++;
3138         rps_unlock(sd);
3139
3140         local_irq_restore(flags);
3141
3142         atomic_long_inc(&skb->dev->rx_dropped);
3143         kfree_skb(skb);
3144         return NET_RX_DROP;
3145 }
3146
3147 /**
3148  *      netif_rx        -       post buffer to the network code
3149  *      @skb: buffer to post
3150  *
3151  *      This function receives a packet from a device driver and queues it for
3152  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3153  *      may be dropped during processing for congestion control or by the
3154  *      protocol layers.
3155  *
3156  *      return values:
3157  *      NET_RX_SUCCESS  (no congestion)
3158  *      NET_RX_DROP     (packet was dropped)
3159  *
3160  */
3161
3162 int netif_rx(struct sk_buff *skb)
3163 {
3164         int ret;
3165
3166         /* if netpoll wants it, pretend we never saw it */
3167         if (netpoll_rx(skb))
3168                 return NET_RX_DROP;
3169
3170         net_timestamp_check(netdev_tstamp_prequeue, skb);
3171
3172         trace_netif_rx(skb);
3173 #ifdef CONFIG_RPS
3174         if (static_key_false(&rps_needed)) {
3175                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3176                 int cpu;
3177
3178                 preempt_disable();
3179                 rcu_read_lock();
3180
3181                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3182                 if (cpu < 0)
3183                         cpu = smp_processor_id();
3184
3185                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3186
3187                 rcu_read_unlock();
3188                 preempt_enable();
3189         } else
3190 #endif
3191         {
3192                 unsigned int qtail;
3193                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3194                 put_cpu();
3195         }
3196         return ret;
3197 }
3198 EXPORT_SYMBOL(netif_rx);
3199
3200 int netif_rx_ni(struct sk_buff *skb)
3201 {
3202         int err;
3203
3204         preempt_disable();
3205         err = netif_rx(skb);
3206         if (local_softirq_pending())
3207                 do_softirq();
3208         preempt_enable();
3209
3210         return err;
3211 }
3212 EXPORT_SYMBOL(netif_rx_ni);
3213
3214 static void net_tx_action(struct softirq_action *h)
3215 {
3216         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3217
3218         if (sd->completion_queue) {
3219                 struct sk_buff *clist;
3220
3221                 local_irq_disable();
3222                 clist = sd->completion_queue;
3223                 sd->completion_queue = NULL;
3224                 local_irq_enable();
3225
3226                 while (clist) {
3227                         struct sk_buff *skb = clist;
3228                         clist = clist->next;
3229
3230                         WARN_ON(atomic_read(&skb->users));
3231                         trace_kfree_skb(skb, net_tx_action);
3232                         __kfree_skb(skb);
3233                 }
3234         }
3235
3236         if (sd->output_queue) {
3237                 struct Qdisc *head;
3238
3239                 local_irq_disable();
3240                 head = sd->output_queue;
3241                 sd->output_queue = NULL;
3242                 sd->output_queue_tailp = &sd->output_queue;
3243                 local_irq_enable();
3244
3245                 while (head) {
3246                         struct Qdisc *q = head;
3247                         spinlock_t *root_lock;
3248
3249                         head = head->next_sched;
3250
3251                         root_lock = qdisc_lock(q);
3252                         if (spin_trylock(root_lock)) {
3253                                 smp_mb__before_clear_bit();
3254                                 clear_bit(__QDISC_STATE_SCHED,
3255                                           &q->state);
3256                                 qdisc_run(q);
3257                                 spin_unlock(root_lock);
3258                         } else {
3259                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3260                                               &q->state)) {
3261                                         __netif_reschedule(q);
3262                                 } else {
3263                                         smp_mb__before_clear_bit();
3264                                         clear_bit(__QDISC_STATE_SCHED,
3265                                                   &q->state);
3266                                 }
3267                         }
3268                 }
3269         }
3270 }
3271
3272 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3273     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3274 /* This hook is defined here for ATM LANE */
3275 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3276                              unsigned char *addr) __read_mostly;
3277 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3278 #endif
3279
3280 #ifdef CONFIG_NET_CLS_ACT
3281 /* TODO: Maybe we should just force sch_ingress to be compiled in
3282  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3283  * a compare and 2 stores extra right now if we dont have it on
3284  * but have CONFIG_NET_CLS_ACT
3285  * NOTE: This doesn't stop any functionality; if you dont have
3286  * the ingress scheduler, you just can't add policies on ingress.
3287  *
3288  */
3289 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3290 {
3291         struct net_device *dev = skb->dev;
3292         u32 ttl = G_TC_RTTL(skb->tc_verd);
3293         int result = TC_ACT_OK;
3294         struct Qdisc *q;
3295
3296         if (unlikely(MAX_RED_LOOP < ttl++)) {
3297                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3298                                      skb->skb_iif, dev->ifindex);
3299                 return TC_ACT_SHOT;
3300         }
3301
3302         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3303         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3304
3305         q = rxq->qdisc;
3306         if (q != &noop_qdisc) {
3307                 spin_lock(qdisc_lock(q));
3308                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3309                         result = qdisc_enqueue_root(skb, q);
3310                 spin_unlock(qdisc_lock(q));
3311         }
3312
3313         return result;
3314 }
3315
3316 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3317                                          struct packet_type **pt_prev,
3318                                          int *ret, struct net_device *orig_dev)
3319 {
3320         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3321
3322         if (!rxq || rxq->qdisc == &noop_qdisc)
3323                 goto out;
3324
3325         if (*pt_prev) {
3326                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3327                 *pt_prev = NULL;
3328         }
3329
3330         switch (ing_filter(skb, rxq)) {
3331         case TC_ACT_SHOT:
3332         case TC_ACT_STOLEN:
3333                 kfree_skb(skb);
3334                 return NULL;
3335         }
3336
3337 out:
3338         skb->tc_verd = 0;
3339         return skb;
3340 }
3341 #endif
3342
3343 /**
3344  *      netdev_rx_handler_register - register receive handler
3345  *      @dev: device to register a handler for
3346  *      @rx_handler: receive handler to register
3347  *      @rx_handler_data: data pointer that is used by rx handler
3348  *
3349  *      Register a receive hander for a device. This handler will then be
3350  *      called from __netif_receive_skb. A negative errno code is returned
3351  *      on a failure.
3352  *
3353  *      The caller must hold the rtnl_mutex.
3354  *
3355  *      For a general description of rx_handler, see enum rx_handler_result.
3356  */
3357 int netdev_rx_handler_register(struct net_device *dev,
3358                                rx_handler_func_t *rx_handler,
3359                                void *rx_handler_data)
3360 {
3361         ASSERT_RTNL();
3362
3363         if (dev->rx_handler)
3364                 return -EBUSY;
3365
3366         /* Note: rx_handler_data must be set before rx_handler */
3367         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3368         rcu_assign_pointer(dev->rx_handler, rx_handler);
3369
3370         return 0;
3371 }
3372 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3373
3374 /**
3375  *      netdev_rx_handler_unregister - unregister receive handler
3376  *      @dev: device to unregister a handler from
3377  *
3378  *      Unregister a receive handler from a device.
3379  *
3380  *      The caller must hold the rtnl_mutex.
3381  */
3382 void netdev_rx_handler_unregister(struct net_device *dev)
3383 {
3384
3385         ASSERT_RTNL();
3386         RCU_INIT_POINTER(dev->rx_handler, NULL);
3387         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3388          * section has a guarantee to see a non NULL rx_handler_data
3389          * as well.
3390          */
3391         synchronize_net();
3392         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3393 }
3394 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3395
3396 /*
3397  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3398  * the special handling of PFMEMALLOC skbs.
3399  */
3400 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3401 {
3402         switch (skb->protocol) {
3403         case __constant_htons(ETH_P_ARP):
3404         case __constant_htons(ETH_P_IP):
3405         case __constant_htons(ETH_P_IPV6):
3406         case __constant_htons(ETH_P_8021Q):
3407         case __constant_htons(ETH_P_8021AD):
3408                 return true;
3409         default:
3410                 return false;
3411         }
3412 }
3413
3414 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3415 {
3416         struct packet_type *ptype, *pt_prev;
3417         rx_handler_func_t *rx_handler;
3418         struct net_device *orig_dev;
3419         struct net_device *null_or_dev;
3420         bool deliver_exact = false;
3421         int ret = NET_RX_DROP;
3422         __be16 type;
3423
3424         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3425
3426         trace_netif_receive_skb(skb);
3427
3428         /* if we've gotten here through NAPI, check netpoll */
3429         if (netpoll_receive_skb(skb))
3430                 goto out;
3431
3432         orig_dev = skb->dev;
3433
3434         skb_reset_network_header(skb);
3435         if (!skb_transport_header_was_set(skb))
3436                 skb_reset_transport_header(skb);
3437         skb_reset_mac_len(skb);
3438
3439         pt_prev = NULL;
3440
3441         rcu_read_lock();
3442
3443 another_round:
3444         skb->skb_iif = skb->dev->ifindex;
3445
3446         __this_cpu_inc(softnet_data.processed);
3447
3448         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3449             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3450                 skb = vlan_untag(skb);
3451                 if (unlikely(!skb))
3452                         goto unlock;
3453         }
3454
3455 #ifdef CONFIG_NET_CLS_ACT
3456         if (skb->tc_verd & TC_NCLS) {
3457                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3458                 goto ncls;
3459         }
3460 #endif
3461
3462         if (pfmemalloc)
3463                 goto skip_taps;
3464
3465         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3466                 if (!ptype->dev || ptype->dev == skb->dev) {
3467                         if (pt_prev)
3468                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3469                         pt_prev = ptype;
3470                 }
3471         }
3472
3473 skip_taps:
3474 #ifdef CONFIG_NET_CLS_ACT
3475         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3476         if (!skb)
3477                 goto unlock;
3478 ncls:
3479 #endif
3480
3481         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3482                 goto drop;
3483
3484         if (vlan_tx_tag_present(skb)) {
3485                 if (pt_prev) {
3486                         ret = deliver_skb(skb, pt_prev, orig_dev);
3487                         pt_prev = NULL;
3488                 }
3489                 if (vlan_do_receive(&skb))
3490                         goto another_round;
3491                 else if (unlikely(!skb))
3492                         goto unlock;
3493         }
3494
3495         rx_handler = rcu_dereference(skb->dev->rx_handler);
3496         if (rx_handler) {
3497                 if (pt_prev) {
3498                         ret = deliver_skb(skb, pt_prev, orig_dev);
3499                         pt_prev = NULL;
3500                 }
3501                 switch (rx_handler(&skb)) {
3502                 case RX_HANDLER_CONSUMED:
3503                         ret = NET_RX_SUCCESS;
3504                         goto unlock;
3505                 case RX_HANDLER_ANOTHER:
3506                         goto another_round;
3507                 case RX_HANDLER_EXACT:
3508                         deliver_exact = true;
3509                 case RX_HANDLER_PASS:
3510                         break;
3511                 default:
3512                         BUG();
3513                 }
3514         }
3515
3516         if (vlan_tx_nonzero_tag_present(skb))
3517                 skb->pkt_type = PACKET_OTHERHOST;
3518
3519         /* deliver only exact match when indicated */
3520         null_or_dev = deliver_exact ? skb->dev : NULL;
3521
3522         type = skb->protocol;
3523         list_for_each_entry_rcu(ptype,
3524                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3525                 if (ptype->type == type &&
3526                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3527                      ptype->dev == orig_dev)) {
3528                         if (pt_prev)
3529                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3530                         pt_prev = ptype;
3531                 }
3532         }
3533
3534         if (pt_prev) {
3535                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3536                         goto drop;
3537                 else
3538                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3539         } else {
3540 drop:
3541                 atomic_long_inc(&skb->dev->rx_dropped);
3542                 kfree_skb(skb);
3543                 /* Jamal, now you will not able to escape explaining
3544                  * me how you were going to use this. :-)
3545                  */
3546                 ret = NET_RX_DROP;
3547         }
3548
3549 unlock:
3550         rcu_read_unlock();
3551 out:
3552         return ret;
3553 }
3554
3555 static int __netif_receive_skb(struct sk_buff *skb)
3556 {
3557         int ret;
3558
3559         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3560                 unsigned long pflags = current->flags;
3561
3562                 /*
3563                  * PFMEMALLOC skbs are special, they should
3564                  * - be delivered to SOCK_MEMALLOC sockets only
3565                  * - stay away from userspace
3566                  * - have bounded memory usage
3567                  *
3568                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3569                  * context down to all allocation sites.
3570                  */
3571                 current->flags |= PF_MEMALLOC;
3572                 ret = __netif_receive_skb_core(skb, true);
3573                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3574         } else
3575                 ret = __netif_receive_skb_core(skb, false);
3576
3577         return ret;
3578 }
3579
3580 /**
3581  *      netif_receive_skb - process receive buffer from network
3582  *      @skb: buffer to process
3583  *
3584  *      netif_receive_skb() is the main receive data processing function.
3585  *      It always succeeds. The buffer may be dropped during processing
3586  *      for congestion control or by the protocol layers.
3587  *
3588  *      This function may only be called from softirq context and interrupts
3589  *      should be enabled.
3590  *
3591  *      Return values (usually ignored):
3592  *      NET_RX_SUCCESS: no congestion
3593  *      NET_RX_DROP: packet was dropped
3594  */
3595 int netif_receive_skb(struct sk_buff *skb)
3596 {
3597         net_timestamp_check(netdev_tstamp_prequeue, skb);
3598
3599         if (skb_defer_rx_timestamp(skb))
3600                 return NET_RX_SUCCESS;
3601
3602 #ifdef CONFIG_RPS
3603         if (static_key_false(&rps_needed)) {
3604                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3605                 int cpu, ret;
3606
3607                 rcu_read_lock();
3608
3609                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3610
3611                 if (cpu >= 0) {
3612                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3613                         rcu_read_unlock();
3614                         return ret;
3615                 }
3616                 rcu_read_unlock();
3617         }
3618 #endif
3619         return __netif_receive_skb(skb);
3620 }
3621 EXPORT_SYMBOL(netif_receive_skb);
3622
3623 /* Network device is going away, flush any packets still pending
3624  * Called with irqs disabled.
3625  */
3626 static void flush_backlog(void *arg)
3627 {
3628         struct net_device *dev = arg;
3629         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3630         struct sk_buff *skb, *tmp;
3631
3632         rps_lock(sd);
3633         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3634                 if (skb->dev == dev) {
3635                         __skb_unlink(skb, &sd->input_pkt_queue);
3636                         kfree_skb(skb);
3637                         input_queue_head_incr(sd);
3638                 }
3639         }
3640         rps_unlock(sd);
3641
3642         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3643                 if (skb->dev == dev) {
3644                         __skb_unlink(skb, &sd->process_queue);
3645                         kfree_skb(skb);
3646                         input_queue_head_incr(sd);
3647                 }
3648         }
3649 }
3650
3651 static int napi_gro_complete(struct sk_buff *skb)
3652 {
3653         struct packet_offload *ptype;
3654         __be16 type = skb->protocol;
3655         struct list_head *head = &offload_base;
3656         int err = -ENOENT;
3657
3658         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3659
3660         if (NAPI_GRO_CB(skb)->count == 1) {
3661                 skb_shinfo(skb)->gso_size = 0;
3662                 goto out;
3663         }
3664
3665         rcu_read_lock();
3666         list_for_each_entry_rcu(ptype, head, list) {
3667                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3668                         continue;
3669
3670                 err = ptype->callbacks.gro_complete(skb);
3671                 break;
3672         }
3673         rcu_read_unlock();
3674
3675         if (err) {
3676                 WARN_ON(&ptype->list == head);
3677                 kfree_skb(skb);
3678                 return NET_RX_SUCCESS;
3679         }
3680
3681 out:
3682         return netif_receive_skb(skb);
3683 }
3684
3685 /* napi->gro_list contains packets ordered by age.
3686  * youngest packets at the head of it.
3687  * Complete skbs in reverse order to reduce latencies.
3688  */
3689 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3690 {
3691         struct sk_buff *skb, *prev = NULL;
3692
3693         /* scan list and build reverse chain */
3694         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3695                 skb->prev = prev;
3696                 prev = skb;
3697         }
3698
3699         for (skb = prev; skb; skb = prev) {
3700                 skb->next = NULL;
3701
3702                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3703                         return;
3704
3705                 prev = skb->prev;
3706                 napi_gro_complete(skb);
3707                 napi->gro_count--;
3708         }
3709
3710         napi->gro_list = NULL;
3711 }
3712 EXPORT_SYMBOL(napi_gro_flush);
3713
3714 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3715 {
3716         struct sk_buff *p;
3717         unsigned int maclen = skb->dev->hard_header_len;
3718
3719         for (p = napi->gro_list; p; p = p->next) {
3720                 unsigned long diffs;
3721
3722                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3723                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3724                 if (maclen == ETH_HLEN)
3725                         diffs |= compare_ether_header(skb_mac_header(p),
3726                                                       skb_gro_mac_header(skb));
3727                 else if (!diffs)
3728                         diffs = memcmp(skb_mac_header(p),
3729                                        skb_gro_mac_header(skb),
3730                                        maclen);
3731                 NAPI_GRO_CB(p)->same_flow = !diffs;
3732                 NAPI_GRO_CB(p)->flush = 0;
3733         }
3734 }
3735
3736 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3737 {
3738         struct sk_buff **pp = NULL;
3739         struct packet_offload *ptype;
3740         __be16 type = skb->protocol;
3741         struct list_head *head = &offload_base;
3742         int same_flow;
3743         enum gro_result ret;
3744
3745         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3746                 goto normal;
3747
3748         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3749                 goto normal;
3750
3751         gro_list_prepare(napi, skb);
3752
3753         rcu_read_lock();
3754         list_for_each_entry_rcu(ptype, head, list) {
3755                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3756                         continue;
3757
3758                 skb_set_network_header(skb, skb_gro_offset(skb));
3759                 skb_reset_mac_len(skb);
3760                 NAPI_GRO_CB(skb)->same_flow = 0;
3761                 NAPI_GRO_CB(skb)->flush = 0;
3762                 NAPI_GRO_CB(skb)->free = 0;
3763
3764                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3765                 break;
3766         }
3767         rcu_read_unlock();
3768
3769         if (&ptype->list == head)
3770                 goto normal;
3771
3772         same_flow = NAPI_GRO_CB(skb)->same_flow;
3773         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3774
3775         if (pp) {
3776                 struct sk_buff *nskb = *pp;
3777
3778                 *pp = nskb->next;
3779                 nskb->next = NULL;
3780                 napi_gro_complete(nskb);
3781                 napi->gro_count--;
3782         }
3783
3784         if (same_flow)
3785                 goto ok;
3786
3787         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3788                 goto normal;
3789
3790         napi->gro_count++;
3791         NAPI_GRO_CB(skb)->count = 1;
3792         NAPI_GRO_CB(skb)->age = jiffies;
3793         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3794         skb->next = napi->gro_list;
3795         napi->gro_list = skb;
3796         ret = GRO_HELD;
3797
3798 pull:
3799         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3800                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3801
3802                 BUG_ON(skb->end - skb->tail < grow);
3803
3804                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3805
3806                 skb->tail += grow;
3807                 skb->data_len -= grow;
3808
3809                 skb_shinfo(skb)->frags[0].page_offset += grow;
3810                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3811
3812                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3813                         skb_frag_unref(skb, 0);
3814                         memmove(skb_shinfo(skb)->frags,
3815                                 skb_shinfo(skb)->frags + 1,
3816                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3817                 }
3818         }
3819
3820 ok:
3821         return ret;
3822
3823 normal:
3824         ret = GRO_NORMAL;
3825         goto pull;
3826 }
3827
3828
3829 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3830 {
3831         switch (ret) {
3832         case GRO_NORMAL:
3833                 if (netif_receive_skb(skb))
3834                         ret = GRO_DROP;
3835                 break;
3836
3837         case GRO_DROP:
3838                 kfree_skb(skb);
3839                 break;
3840
3841         case GRO_MERGED_FREE:
3842                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3843                         kmem_cache_free(skbuff_head_cache, skb);
3844                 else
3845                         __kfree_skb(skb);
3846                 break;
3847
3848         case GRO_HELD:
3849         case GRO_MERGED:
3850                 break;
3851         }
3852
3853         return ret;
3854 }
3855
3856 static void skb_gro_reset_offset(struct sk_buff *skb)
3857 {
3858         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3859         const skb_frag_t *frag0 = &pinfo->frags[0];
3860
3861         NAPI_GRO_CB(skb)->data_offset = 0;
3862         NAPI_GRO_CB(skb)->frag0 = NULL;
3863         NAPI_GRO_CB(skb)->frag0_len = 0;
3864
3865         if (skb->mac_header == skb->tail &&
3866             pinfo->nr_frags &&
3867             !PageHighMem(skb_frag_page(frag0))) {
3868                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3869                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3870         }
3871 }
3872
3873 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3874 {
3875         skb_gro_reset_offset(skb);
3876
3877         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3878 }
3879 EXPORT_SYMBOL(napi_gro_receive);
3880
3881 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3882 {
3883         __skb_pull(skb, skb_headlen(skb));
3884         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3885         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3886         skb->vlan_tci = 0;
3887         skb->dev = napi->dev;
3888         skb->skb_iif = 0;
3889
3890         napi->skb = skb;
3891 }
3892
3893 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3894 {
3895         struct sk_buff *skb = napi->skb;
3896
3897         if (!skb) {
3898                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3899                 if (skb)
3900                         napi->skb = skb;
3901         }
3902         return skb;
3903 }
3904 EXPORT_SYMBOL(napi_get_frags);
3905
3906 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3907                                gro_result_t ret)
3908 {
3909         switch (ret) {
3910         case GRO_NORMAL:
3911         case GRO_HELD:
3912                 skb->protocol = eth_type_trans(skb, skb->dev);
3913
3914                 if (ret == GRO_HELD)
3915                         skb_gro_pull(skb, -ETH_HLEN);
3916                 else if (netif_receive_skb(skb))
3917                         ret = GRO_DROP;
3918                 break;
3919
3920         case GRO_DROP:
3921         case GRO_MERGED_FREE:
3922                 napi_reuse_skb(napi, skb);
3923                 break;
3924
3925         case GRO_MERGED:
3926                 break;
3927         }
3928
3929         return ret;
3930 }
3931
3932 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3933 {
3934         struct sk_buff *skb = napi->skb;
3935         struct ethhdr *eth;
3936         unsigned int hlen;
3937         unsigned int off;
3938
3939         napi->skb = NULL;
3940
3941         skb_reset_mac_header(skb);
3942         skb_gro_reset_offset(skb);
3943
3944         off = skb_gro_offset(skb);
3945         hlen = off + sizeof(*eth);
3946         eth = skb_gro_header_fast(skb, off);
3947         if (skb_gro_header_hard(skb, hlen)) {
3948                 eth = skb_gro_header_slow(skb, hlen, off);
3949                 if (unlikely(!eth)) {
3950                         napi_reuse_skb(napi, skb);
3951                         skb = NULL;
3952                         goto out;
3953                 }
3954         }
3955
3956         skb_gro_pull(skb, sizeof(*eth));
3957
3958         /*
3959          * This works because the only protocols we care about don't require
3960          * special handling.  We'll fix it up properly at the end.
3961          */
3962         skb->protocol = eth->h_proto;
3963
3964 out:
3965         return skb;
3966 }
3967
3968 gro_result_t napi_gro_frags(struct napi_struct *napi)
3969 {
3970         struct sk_buff *skb = napi_frags_skb(napi);
3971
3972         if (!skb)
3973                 return GRO_DROP;
3974
3975         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
3976 }
3977 EXPORT_SYMBOL(napi_gro_frags);
3978
3979 /*
3980  * net_rps_action sends any pending IPI's for rps.
3981  * Note: called with local irq disabled, but exits with local irq enabled.
3982  */
3983 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3984 {
3985 #ifdef CONFIG_RPS
3986         struct softnet_data *remsd = sd->rps_ipi_list;
3987
3988         if (remsd) {
3989                 sd->rps_ipi_list = NULL;
3990
3991                 local_irq_enable();
3992
3993                 /* Send pending IPI's to kick RPS processing on remote cpus. */
3994                 while (remsd) {
3995                         struct softnet_data *next = remsd->rps_ipi_next;
3996
3997                         if (cpu_online(remsd->cpu))
3998                                 __smp_call_function_single(remsd->cpu,
3999                                                            &remsd->csd, 0);
4000                         remsd = next;
4001                 }
4002         } else
4003 #endif
4004                 local_irq_enable();
4005 }
4006
4007 static int process_backlog(struct napi_struct *napi, int quota)
4008 {
4009         int work = 0;
4010         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4011
4012 #ifdef CONFIG_RPS
4013         /* Check if we have pending ipi, its better to send them now,
4014          * not waiting net_rx_action() end.
4015          */
4016         if (sd->rps_ipi_list) {
4017                 local_irq_disable();
4018                 net_rps_action_and_irq_enable(sd);
4019         }
4020 #endif
4021         napi->weight = weight_p;
4022         local_irq_disable();
4023         while (work < quota) {
4024                 struct sk_buff *skb;
4025                 unsigned int qlen;
4026
4027                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4028                         local_irq_enable();
4029                         __netif_receive_skb(skb);
4030                         local_irq_disable();
4031                         input_queue_head_incr(sd);
4032                         if (++work >= quota) {
4033                                 local_irq_enable();
4034                                 return work;
4035                         }
4036                 }
4037
4038                 rps_lock(sd);
4039                 qlen = skb_queue_len(&sd->input_pkt_queue);
4040                 if (qlen)
4041                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4042                                                    &sd->process_queue);
4043
4044                 if (qlen < quota - work) {
4045                         /*
4046                          * Inline a custom version of __napi_complete().
4047                          * only current cpu owns and manipulates this napi,
4048                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4049                          * we can use a plain write instead of clear_bit(),
4050                          * and we dont need an smp_mb() memory barrier.
4051                          */
4052                         list_del(&napi->poll_list);
4053                         napi->state = 0;
4054
4055                         quota = work + qlen;
4056                 }
4057                 rps_unlock(sd);
4058         }
4059         local_irq_enable();
4060
4061         return work;
4062 }
4063
4064 /**
4065  * __napi_schedule - schedule for receive
4066  * @n: entry to schedule
4067  *
4068  * The entry's receive function will be scheduled to run
4069  */
4070 void __napi_schedule(struct napi_struct *n)
4071 {
4072         unsigned long flags;
4073
4074         local_irq_save(flags);
4075         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4076         local_irq_restore(flags);
4077 }
4078 EXPORT_SYMBOL(__napi_schedule);
4079
4080 void __napi_complete(struct napi_struct *n)
4081 {
4082         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4083         BUG_ON(n->gro_list);
4084
4085         list_del(&n->poll_list);
4086         smp_mb__before_clear_bit();
4087         clear_bit(NAPI_STATE_SCHED, &n->state);
4088 }
4089 EXPORT_SYMBOL(__napi_complete);
4090
4091 void napi_complete(struct napi_struct *n)
4092 {
4093         unsigned long flags;
4094
4095         /*
4096          * don't let napi dequeue from the cpu poll list
4097          * just in case its running on a different cpu
4098          */
4099         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4100                 return;
4101
4102         napi_gro_flush(n, false);
4103         local_irq_save(flags);
4104         __napi_complete(n);
4105         local_irq_restore(flags);
4106 }
4107 EXPORT_SYMBOL(napi_complete);
4108
4109 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4110                     int (*poll)(struct napi_struct *, int), int weight)
4111 {
4112         INIT_LIST_HEAD(&napi->poll_list);
4113         napi->gro_count = 0;
4114         napi->gro_list = NULL;
4115         napi->skb = NULL;
4116         napi->poll = poll;
4117         if (weight > NAPI_POLL_WEIGHT)
4118                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4119                             weight, dev->name);
4120         napi->weight = weight;
4121         list_add(&napi->dev_list, &dev->napi_list);
4122         napi->dev = dev;
4123 #ifdef CONFIG_NETPOLL
4124         spin_lock_init(&napi->poll_lock);
4125         napi->poll_owner = -1;
4126 #endif
4127         set_bit(NAPI_STATE_SCHED, &napi->state);
4128 }
4129 EXPORT_SYMBOL(netif_napi_add);
4130
4131 void netif_napi_del(struct napi_struct *napi)
4132 {
4133         struct sk_buff *skb, *next;
4134
4135         list_del_init(&napi->dev_list);
4136         napi_free_frags(napi);
4137
4138         for (skb = napi->gro_list; skb; skb = next) {
4139                 next = skb->next;
4140                 skb->next = NULL;
4141                 kfree_skb(skb);
4142         }
4143
4144         napi->gro_list = NULL;
4145         napi->gro_count = 0;
4146 }
4147 EXPORT_SYMBOL(netif_napi_del);
4148
4149 static void net_rx_action(struct softirq_action *h)
4150 {
4151         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4152         unsigned long time_limit = jiffies + 2;
4153         int budget = netdev_budget;
4154         void *have;
4155
4156         local_irq_disable();
4157
4158         while (!list_empty(&sd->poll_list)) {
4159                 struct napi_struct *n;
4160                 int work, weight;
4161
4162                 /* If softirq window is exhuasted then punt.
4163                  * Allow this to run for 2 jiffies since which will allow
4164                  * an average latency of 1.5/HZ.
4165                  */
4166                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4167                         goto softnet_break;
4168
4169                 local_irq_enable();
4170
4171                 /* Even though interrupts have been re-enabled, this
4172                  * access is safe because interrupts can only add new
4173                  * entries to the tail of this list, and only ->poll()
4174                  * calls can remove this head entry from the list.
4175                  */
4176                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4177
4178                 have = netpoll_poll_lock(n);
4179
4180                 weight = n->weight;
4181
4182                 /* This NAPI_STATE_SCHED test is for avoiding a race
4183                  * with netpoll's poll_napi().  Only the entity which
4184                  * obtains the lock and sees NAPI_STATE_SCHED set will
4185                  * actually make the ->poll() call.  Therefore we avoid
4186                  * accidentally calling ->poll() when NAPI is not scheduled.
4187                  */
4188                 work = 0;
4189                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4190                         work = n->poll(n, weight);
4191                         trace_napi_poll(n);
4192                 }
4193
4194                 WARN_ON_ONCE(work > weight);
4195
4196                 budget -= work;
4197
4198                 local_irq_disable();
4199
4200                 /* Drivers must not modify the NAPI state if they
4201                  * consume the entire weight.  In such cases this code
4202                  * still "owns" the NAPI instance and therefore can
4203                  * move the instance around on the list at-will.
4204                  */
4205                 if (unlikely(work == weight)) {
4206                         if (unlikely(napi_disable_pending(n))) {
4207                                 local_irq_enable();
4208                                 napi_complete(n);
4209                                 local_irq_disable();
4210                         } else {
4211                                 if (n->gro_list) {
4212                                         /* flush too old packets
4213                                          * If HZ < 1000, flush all packets.
4214                                          */
4215                                         local_irq_enable();
4216                                         napi_gro_flush(n, HZ >= 1000);
4217                                         local_irq_disable();
4218                                 }
4219                                 list_move_tail(&n->poll_list, &sd->poll_list);
4220                         }
4221                 }
4222
4223                 netpoll_poll_unlock(have);
4224         }
4225 out:
4226         net_rps_action_and_irq_enable(sd);
4227
4228 #ifdef CONFIG_NET_DMA
4229         /*
4230          * There may not be any more sk_buffs coming right now, so push
4231          * any pending DMA copies to hardware
4232          */
4233         dma_issue_pending_all();
4234 #endif
4235
4236         return;
4237
4238 softnet_break:
4239         sd->time_squeeze++;
4240         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4241         goto out;
4242 }
4243
4244 struct netdev_upper {
4245         struct net_device *dev;
4246         bool master;
4247         struct list_head list;
4248         struct rcu_head rcu;
4249         struct list_head search_list;
4250 };
4251
4252 static void __append_search_uppers(struct list_head *search_list,
4253                                    struct net_device *dev)
4254 {
4255         struct netdev_upper *upper;
4256
4257         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4258                 /* check if this upper is not already in search list */
4259                 if (list_empty(&upper->search_list))
4260                         list_add_tail(&upper->search_list, search_list);
4261         }
4262 }
4263
4264 static bool __netdev_search_upper_dev(struct net_device *dev,
4265                                       struct net_device *upper_dev)
4266 {
4267         LIST_HEAD(search_list);
4268         struct netdev_upper *upper;
4269         struct netdev_upper *tmp;
4270         bool ret = false;
4271
4272         __append_search_uppers(&search_list, dev);
4273         list_for_each_entry(upper, &search_list, search_list) {
4274                 if (upper->dev == upper_dev) {
4275                         ret = true;
4276                         break;
4277                 }
4278                 __append_search_uppers(&search_list, upper->dev);
4279         }
4280         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4281                 INIT_LIST_HEAD(&upper->search_list);
4282         return ret;
4283 }
4284
4285 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4286                                                 struct net_device *upper_dev)
4287 {
4288         struct netdev_upper *upper;
4289
4290         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4291                 if (upper->dev == upper_dev)
4292                         return upper;
4293         }
4294         return NULL;
4295 }
4296
4297 /**
4298  * netdev_has_upper_dev - Check if device is linked to an upper device
4299  * @dev: device
4300  * @upper_dev: upper device to check
4301  *
4302  * Find out if a device is linked to specified upper device and return true
4303  * in case it is. Note that this checks only immediate upper device,
4304  * not through a complete stack of devices. The caller must hold the RTNL lock.
4305  */
4306 bool netdev_has_upper_dev(struct net_device *dev,
4307                           struct net_device *upper_dev)
4308 {
4309         ASSERT_RTNL();
4310
4311         return __netdev_find_upper(dev, upper_dev);
4312 }
4313 EXPORT_SYMBOL(netdev_has_upper_dev);
4314
4315 /**
4316  * netdev_has_any_upper_dev - Check if device is linked to some device
4317  * @dev: device
4318  *
4319  * Find out if a device is linked to an upper device and return true in case
4320  * it is. The caller must hold the RTNL lock.
4321  */
4322 bool netdev_has_any_upper_dev(struct net_device *dev)
4323 {
4324         ASSERT_RTNL();
4325
4326         return !list_empty(&dev->upper_dev_list);
4327 }
4328 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4329
4330 /**
4331  * netdev_master_upper_dev_get - Get master upper device
4332  * @dev: device
4333  *
4334  * Find a master upper device and return pointer to it or NULL in case
4335  * it's not there. The caller must hold the RTNL lock.
4336  */
4337 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4338 {
4339         struct netdev_upper *upper;
4340
4341         ASSERT_RTNL();
4342
4343         if (list_empty(&dev->upper_dev_list))
4344                 return NULL;
4345
4346         upper = list_first_entry(&dev->upper_dev_list,
4347                                  struct netdev_upper, list);
4348         if (likely(upper->master))
4349                 return upper->dev;
4350         return NULL;
4351 }
4352 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4353
4354 /**
4355  * netdev_master_upper_dev_get_rcu - Get master upper device
4356  * @dev: device
4357  *
4358  * Find a master upper device and return pointer to it or NULL in case
4359  * it's not there. The caller must hold the RCU read lock.
4360  */
4361 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4362 {
4363         struct netdev_upper *upper;
4364
4365         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4366                                        struct netdev_upper, list);
4367         if (upper && likely(upper->master))
4368                 return upper->dev;
4369         return NULL;
4370 }
4371 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4372
4373 static int __netdev_upper_dev_link(struct net_device *dev,
4374                                    struct net_device *upper_dev, bool master)
4375 {
4376         struct netdev_upper *upper;
4377
4378         ASSERT_RTNL();
4379
4380         if (dev == upper_dev)
4381                 return -EBUSY;
4382
4383         /* To prevent loops, check if dev is not upper device to upper_dev. */
4384         if (__netdev_search_upper_dev(upper_dev, dev))
4385                 return -EBUSY;
4386
4387         if (__netdev_find_upper(dev, upper_dev))
4388                 return -EEXIST;
4389
4390         if (master && netdev_master_upper_dev_get(dev))
4391                 return -EBUSY;
4392
4393         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4394         if (!upper)
4395                 return -ENOMEM;
4396
4397         upper->dev = upper_dev;
4398         upper->master = master;
4399         INIT_LIST_HEAD(&upper->search_list);
4400
4401         /* Ensure that master upper link is always the first item in list. */
4402         if (master)
4403                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4404         else
4405                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4406         dev_hold(upper_dev);
4407         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4408         return 0;
4409 }
4410
4411 /**
4412  * netdev_upper_dev_link - Add a link to the upper device
4413  * @dev: device
4414  * @upper_dev: new upper device
4415  *
4416  * Adds a link to device which is upper to this one. The caller must hold
4417  * the RTNL lock. On a failure a negative errno code is returned.
4418  * On success the reference counts are adjusted and the function
4419  * returns zero.
4420  */
4421 int netdev_upper_dev_link(struct net_device *dev,
4422                           struct net_device *upper_dev)
4423 {
4424         return __netdev_upper_dev_link(dev, upper_dev, false);
4425 }
4426 EXPORT_SYMBOL(netdev_upper_dev_link);
4427
4428 /**
4429  * netdev_master_upper_dev_link - Add a master link to the upper device
4430  * @dev: device
4431  * @upper_dev: new upper device
4432  *
4433  * Adds a link to device which is upper to this one. In this case, only
4434  * one master upper device can be linked, although other non-master devices
4435  * might be linked as well. The caller must hold the RTNL lock.
4436  * On a failure a negative errno code is returned. On success the reference
4437  * counts are adjusted and the function returns zero.
4438  */
4439 int netdev_master_upper_dev_link(struct net_device *dev,
4440                                  struct net_device *upper_dev)
4441 {
4442         return __netdev_upper_dev_link(dev, upper_dev, true);
4443 }
4444 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4445
4446 /**
4447  * netdev_upper_dev_unlink - Removes a link to upper device
4448  * @dev: device
4449  * @upper_dev: new upper device
4450  *
4451  * Removes a link to device which is upper to this one. The caller must hold
4452  * the RTNL lock.
4453  */
4454 void netdev_upper_dev_unlink(struct net_device *dev,
4455                              struct net_device *upper_dev)
4456 {
4457         struct netdev_upper *upper;
4458
4459         ASSERT_RTNL();
4460
4461         upper = __netdev_find_upper(dev, upper_dev);
4462         if (!upper)
4463                 return;
4464         list_del_rcu(&upper->list);
4465         dev_put(upper_dev);
4466         kfree_rcu(upper, rcu);
4467         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4468 }
4469 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4470
4471 static void dev_change_rx_flags(struct net_device *dev, int flags)
4472 {
4473         const struct net_device_ops *ops = dev->netdev_ops;
4474
4475         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4476                 ops->ndo_change_rx_flags(dev, flags);
4477 }
4478
4479 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4480 {
4481         unsigned int old_flags = dev->flags;
4482         kuid_t uid;
4483         kgid_t gid;
4484
4485         ASSERT_RTNL();
4486
4487         dev->flags |= IFF_PROMISC;
4488         dev->promiscuity += inc;
4489         if (dev->promiscuity == 0) {
4490                 /*
4491                  * Avoid overflow.
4492                  * If inc causes overflow, untouch promisc and return error.
4493                  */
4494                 if (inc < 0)
4495                         dev->flags &= ~IFF_PROMISC;
4496                 else {
4497                         dev->promiscuity -= inc;
4498                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4499                                 dev->name);
4500                         return -EOVERFLOW;
4501                 }
4502         }
4503         if (dev->flags != old_flags) {
4504                 pr_info("device %s %s promiscuous mode\n",
4505                         dev->name,
4506                         dev->flags & IFF_PROMISC ? "entered" : "left");
4507                 if (audit_enabled) {
4508                         current_uid_gid(&uid, &gid);
4509                         audit_log(current->audit_context, GFP_ATOMIC,
4510                                 AUDIT_ANOM_PROMISCUOUS,
4511                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4512                                 dev->name, (dev->flags & IFF_PROMISC),
4513                                 (old_flags & IFF_PROMISC),
4514                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4515                                 from_kuid(&init_user_ns, uid),
4516                                 from_kgid(&init_user_ns, gid),
4517                                 audit_get_sessionid(current));
4518                 }
4519
4520                 dev_change_rx_flags(dev, IFF_PROMISC);
4521         }
4522         return 0;
4523 }
4524
4525 /**
4526  *      dev_set_promiscuity     - update promiscuity count on a device
4527  *      @dev: device
4528  *      @inc: modifier
4529  *
4530  *      Add or remove promiscuity from a device. While the count in the device
4531  *      remains above zero the interface remains promiscuous. Once it hits zero
4532  *      the device reverts back to normal filtering operation. A negative inc
4533  *      value is used to drop promiscuity on the device.
4534  *      Return 0 if successful or a negative errno code on error.
4535  */
4536 int dev_set_promiscuity(struct net_device *dev, int inc)
4537 {
4538         unsigned int old_flags = dev->flags;
4539         int err;
4540
4541         err = __dev_set_promiscuity(dev, inc);
4542         if (err < 0)
4543                 return err;
4544         if (dev->flags != old_flags)
4545                 dev_set_rx_mode(dev);
4546         return err;
4547 }
4548 EXPORT_SYMBOL(dev_set_promiscuity);
4549
4550 /**
4551  *      dev_set_allmulti        - update allmulti count on a device
4552  *      @dev: device
4553  *      @inc: modifier
4554  *
4555  *      Add or remove reception of all multicast frames to a device. While the
4556  *      count in the device remains above zero the interface remains listening
4557  *      to all interfaces. Once it hits zero the device reverts back to normal
4558  *      filtering operation. A negative @inc value is used to drop the counter
4559  *      when releasing a resource needing all multicasts.
4560  *      Return 0 if successful or a negative errno code on error.
4561  */
4562
4563 int dev_set_allmulti(struct net_device *dev, int inc)
4564 {
4565         unsigned int old_flags = dev->flags;
4566
4567         ASSERT_RTNL();
4568
4569         dev->flags |= IFF_ALLMULTI;
4570         dev->allmulti += inc;
4571         if (dev->allmulti == 0) {
4572                 /*
4573                  * Avoid overflow.
4574                  * If inc causes overflow, untouch allmulti and return error.
4575                  */
4576                 if (inc < 0)
4577                         dev->flags &= ~IFF_ALLMULTI;
4578                 else {
4579                         dev->allmulti -= inc;
4580                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4581                                 dev->name);
4582                         return -EOVERFLOW;
4583                 }
4584         }
4585         if (dev->flags ^ old_flags) {
4586                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4587                 dev_set_rx_mode(dev);
4588         }
4589         return 0;
4590 }
4591 EXPORT_SYMBOL(dev_set_allmulti);
4592
4593 /*
4594  *      Upload unicast and multicast address lists to device and
4595  *      configure RX filtering. When the device doesn't support unicast
4596  *      filtering it is put in promiscuous mode while unicast addresses
4597  *      are present.
4598  */
4599 void __dev_set_rx_mode(struct net_device *dev)
4600 {
4601         const struct net_device_ops *ops = dev->netdev_ops;
4602
4603         /* dev_open will call this function so the list will stay sane. */
4604         if (!(dev->flags&IFF_UP))
4605                 return;
4606
4607         if (!netif_device_present(dev))
4608                 return;
4609
4610         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4611                 /* Unicast addresses changes may only happen under the rtnl,
4612                  * therefore calling __dev_set_promiscuity here is safe.
4613                  */
4614                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4615                         __dev_set_promiscuity(dev, 1);
4616                         dev->uc_promisc = true;
4617                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4618                         __dev_set_promiscuity(dev, -1);
4619                         dev->uc_promisc = false;
4620                 }
4621         }
4622
4623         if (ops->ndo_set_rx_mode)
4624                 ops->ndo_set_rx_mode(dev);
4625 }
4626
4627 void dev_set_rx_mode(struct net_device *dev)
4628 {
4629         netif_addr_lock_bh(dev);
4630         __dev_set_rx_mode(dev);
4631         netif_addr_unlock_bh(dev);
4632 }
4633
4634 /**
4635  *      dev_get_flags - get flags reported to userspace
4636  *      @dev: device
4637  *
4638  *      Get the combination of flag bits exported through APIs to userspace.
4639  */
4640 unsigned int dev_get_flags(const struct net_device *dev)
4641 {
4642         unsigned int flags;
4643
4644         flags = (dev->flags & ~(IFF_PROMISC |
4645                                 IFF_ALLMULTI |
4646                                 IFF_RUNNING |
4647                                 IFF_LOWER_UP |
4648                                 IFF_DORMANT)) |
4649                 (dev->gflags & (IFF_PROMISC |
4650                                 IFF_ALLMULTI));
4651
4652         if (netif_running(dev)) {
4653                 if (netif_oper_up(dev))
4654                         flags |= IFF_RUNNING;
4655                 if (netif_carrier_ok(dev))
4656                         flags |= IFF_LOWER_UP;
4657                 if (netif_dormant(dev))
4658                         flags |= IFF_DORMANT;
4659         }
4660
4661         return flags;
4662 }
4663 EXPORT_SYMBOL(dev_get_flags);
4664
4665 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4666 {
4667         unsigned int old_flags = dev->flags;
4668         int ret;
4669
4670         ASSERT_RTNL();
4671
4672         /*
4673          *      Set the flags on our device.
4674          */
4675
4676         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4677                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4678                                IFF_AUTOMEDIA)) |
4679                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4680                                     IFF_ALLMULTI));
4681
4682         /*
4683          *      Load in the correct multicast list now the flags have changed.
4684          */
4685
4686         if ((old_flags ^ flags) & IFF_MULTICAST)
4687                 dev_change_rx_flags(dev, IFF_MULTICAST);
4688
4689         dev_set_rx_mode(dev);
4690
4691         /*
4692          *      Have we downed the interface. We handle IFF_UP ourselves
4693          *      according to user attempts to set it, rather than blindly
4694          *      setting it.
4695          */
4696
4697         ret = 0;
4698         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4699                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4700
4701                 if (!ret)
4702                         dev_set_rx_mode(dev);
4703         }
4704
4705         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4706                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4707
4708                 dev->gflags ^= IFF_PROMISC;
4709                 dev_set_promiscuity(dev, inc);
4710         }
4711
4712         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4713            is important. Some (broken) drivers set IFF_PROMISC, when
4714            IFF_ALLMULTI is requested not asking us and not reporting.
4715          */
4716         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4717                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4718
4719                 dev->gflags ^= IFF_ALLMULTI;
4720                 dev_set_allmulti(dev, inc);
4721         }
4722
4723         return ret;
4724 }
4725
4726 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4727 {
4728         unsigned int changes = dev->flags ^ old_flags;
4729
4730         if (changes & IFF_UP) {
4731                 if (dev->flags & IFF_UP)
4732                         call_netdevice_notifiers(NETDEV_UP, dev);
4733                 else
4734                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4735         }
4736
4737         if (dev->flags & IFF_UP &&
4738             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4739                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4740 }
4741
4742 /**
4743  *      dev_change_flags - change device settings
4744  *      @dev: device
4745  *      @flags: device state flags
4746  *
4747  *      Change settings on device based state flags. The flags are
4748  *      in the userspace exported format.
4749  */
4750 int dev_change_flags(struct net_device *dev, unsigned int flags)
4751 {
4752         int ret;
4753         unsigned int changes, old_flags = dev->flags;
4754
4755         ret = __dev_change_flags(dev, flags);
4756         if (ret < 0)
4757                 return ret;
4758
4759         changes = old_flags ^ dev->flags;
4760         if (changes)
4761                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4762
4763         __dev_notify_flags(dev, old_flags);
4764         return ret;
4765 }
4766 EXPORT_SYMBOL(dev_change_flags);
4767
4768 /**
4769  *      dev_set_mtu - Change maximum transfer unit
4770  *      @dev: device
4771  *      @new_mtu: new transfer unit
4772  *
4773  *      Change the maximum transfer size of the network device.
4774  */
4775 int dev_set_mtu(struct net_device *dev, int new_mtu)
4776 {
4777         const struct net_device_ops *ops = dev->netdev_ops;
4778         int err;
4779
4780         if (new_mtu == dev->mtu)
4781                 return 0;
4782
4783         /*      MTU must be positive.    */
4784         if (new_mtu < 0)
4785                 return -EINVAL;
4786
4787         if (!netif_device_present(dev))
4788                 return -ENODEV;
4789
4790         err = 0;
4791         if (ops->ndo_change_mtu)
4792                 err = ops->ndo_change_mtu(dev, new_mtu);
4793         else
4794                 dev->mtu = new_mtu;
4795
4796         if (!err)
4797                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4798         return err;
4799 }
4800 EXPORT_SYMBOL(dev_set_mtu);
4801
4802 /**
4803  *      dev_set_group - Change group this device belongs to
4804  *      @dev: device
4805  *      @new_group: group this device should belong to
4806  */
4807 void dev_set_group(struct net_device *dev, int new_group)
4808 {
4809         dev->group = new_group;
4810 }
4811 EXPORT_SYMBOL(dev_set_group);
4812
4813 /**
4814  *      dev_set_mac_address - Change Media Access Control Address
4815  *      @dev: device
4816  *      @sa: new address
4817  *
4818  *      Change the hardware (MAC) address of the device
4819  */
4820 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4821 {
4822         const struct net_device_ops *ops = dev->netdev_ops;
4823         int err;
4824
4825         if (!ops->ndo_set_mac_address)
4826                 return -EOPNOTSUPP;
4827         if (sa->sa_family != dev->type)
4828                 return -EINVAL;
4829         if (!netif_device_present(dev))
4830                 return -ENODEV;
4831         err = ops->ndo_set_mac_address(dev, sa);
4832         if (err)
4833                 return err;
4834         dev->addr_assign_type = NET_ADDR_SET;
4835         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4836         add_device_randomness(dev->dev_addr, dev->addr_len);
4837         return 0;
4838 }
4839 EXPORT_SYMBOL(dev_set_mac_address);
4840
4841 /**
4842  *      dev_change_carrier - Change device carrier
4843  *      @dev: device
4844  *      @new_carrier: new value
4845  *
4846  *      Change device carrier
4847  */
4848 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4849 {
4850         const struct net_device_ops *ops = dev->netdev_ops;
4851
4852         if (!ops->ndo_change_carrier)
4853                 return -EOPNOTSUPP;
4854         if (!netif_device_present(dev))
4855                 return -ENODEV;
4856         return ops->ndo_change_carrier(dev, new_carrier);
4857 }
4858 EXPORT_SYMBOL(dev_change_carrier);
4859
4860 /**
4861  *      dev_new_index   -       allocate an ifindex
4862  *      @net: the applicable net namespace
4863  *
4864  *      Returns a suitable unique value for a new device interface
4865  *      number.  The caller must hold the rtnl semaphore or the
4866  *      dev_base_lock to be sure it remains unique.
4867  */
4868 static int dev_new_index(struct net *net)
4869 {
4870         int ifindex = net->ifindex;
4871         for (;;) {
4872                 if (++ifindex <= 0)
4873                         ifindex = 1;
4874                 if (!__dev_get_by_index(net, ifindex))
4875                         return net->ifindex = ifindex;
4876         }
4877 }
4878
4879 /* Delayed registration/unregisteration */
4880 static LIST_HEAD(net_todo_list);
4881
4882 static void net_set_todo(struct net_device *dev)
4883 {
4884         list_add_tail(&dev->todo_list, &net_todo_list);
4885 }
4886
4887 static void rollback_registered_many(struct list_head *head)
4888 {
4889         struct net_device *dev, *tmp;
4890
4891         BUG_ON(dev_boot_phase);
4892         ASSERT_RTNL();
4893
4894         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4895                 /* Some devices call without registering
4896                  * for initialization unwind. Remove those
4897                  * devices and proceed with the remaining.
4898                  */
4899                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4900                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4901                                  dev->name, dev);
4902
4903                         WARN_ON(1);
4904                         list_del(&dev->unreg_list);
4905                         continue;
4906                 }
4907                 dev->dismantle = true;
4908                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4909         }
4910
4911         /* If device is running, close it first. */
4912         dev_close_many(head);
4913
4914         list_for_each_entry(dev, head, unreg_list) {
4915                 /* And unlink it from device chain. */
4916                 unlist_netdevice(dev);
4917
4918                 dev->reg_state = NETREG_UNREGISTERING;
4919         }
4920
4921         synchronize_net();
4922
4923         list_for_each_entry(dev, head, unreg_list) {
4924                 /* Shutdown queueing discipline. */
4925                 dev_shutdown(dev);
4926
4927
4928                 /* Notify protocols, that we are about to destroy
4929                    this device. They should clean all the things.
4930                 */
4931                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4932
4933                 if (!dev->rtnl_link_ops ||
4934                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4935                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4936
4937                 /*
4938                  *      Flush the unicast and multicast chains
4939                  */
4940                 dev_uc_flush(dev);
4941                 dev_mc_flush(dev);
4942
4943                 if (dev->netdev_ops->ndo_uninit)
4944                         dev->netdev_ops->ndo_uninit(dev);
4945
4946                 /* Notifier chain MUST detach us all upper devices. */
4947                 WARN_ON(netdev_has_any_upper_dev(dev));
4948
4949                 /* Remove entries from kobject tree */
4950                 netdev_unregister_kobject(dev);
4951 #ifdef CONFIG_XPS
4952                 /* Remove XPS queueing entries */
4953                 netif_reset_xps_queues_gt(dev, 0);
4954 #endif
4955         }
4956
4957         synchronize_net();
4958
4959         list_for_each_entry(dev, head, unreg_list)
4960                 dev_put(dev);
4961 }
4962
4963 static void rollback_registered(struct net_device *dev)
4964 {
4965         LIST_HEAD(single);
4966
4967         list_add(&dev->unreg_list, &single);
4968         rollback_registered_many(&single);
4969         list_del(&single);
4970 }
4971
4972 static netdev_features_t netdev_fix_features(struct net_device *dev,
4973         netdev_features_t features)
4974 {
4975         /* Fix illegal checksum combinations */
4976         if ((features & NETIF_F_HW_CSUM) &&
4977             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4978                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
4979                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4980         }
4981
4982         /* TSO requires that SG is present as well. */
4983         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
4984                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
4985                 features &= ~NETIF_F_ALL_TSO;
4986         }
4987
4988         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
4989                                         !(features & NETIF_F_IP_CSUM)) {
4990                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
4991                 features &= ~NETIF_F_TSO;
4992                 features &= ~NETIF_F_TSO_ECN;
4993         }
4994
4995         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
4996                                          !(features & NETIF_F_IPV6_CSUM)) {
4997                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
4998                 features &= ~NETIF_F_TSO6;
4999         }
5000
5001         /* TSO ECN requires that TSO is present as well. */
5002         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5003                 features &= ~NETIF_F_TSO_ECN;
5004
5005         /* Software GSO depends on SG. */
5006         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5007                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5008                 features &= ~NETIF_F_GSO;
5009         }
5010
5011         /* UFO needs SG and checksumming */
5012         if (features & NETIF_F_UFO) {
5013                 /* maybe split UFO into V4 and V6? */
5014                 if (!((features & NETIF_F_GEN_CSUM) ||
5015                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5016                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5017                         netdev_dbg(dev,
5018                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5019                         features &= ~NETIF_F_UFO;
5020                 }
5021
5022                 if (!(features & NETIF_F_SG)) {
5023                         netdev_dbg(dev,
5024                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5025                         features &= ~NETIF_F_UFO;
5026                 }
5027         }
5028
5029         return features;
5030 }
5031
5032 int __netdev_update_features(struct net_device *dev)
5033 {
5034         netdev_features_t features;
5035         int err = 0;
5036
5037         ASSERT_RTNL();
5038
5039         features = netdev_get_wanted_features(dev);
5040
5041         if (dev->netdev_ops->ndo_fix_features)
5042                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5043
5044         /* driver might be less strict about feature dependencies */
5045         features = netdev_fix_features(dev, features);
5046
5047         if (dev->features == features)
5048                 return 0;
5049
5050         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5051                 &dev->features, &features);
5052
5053         if (dev->netdev_ops->ndo_set_features)
5054                 err = dev->netdev_ops->ndo_set_features(dev, features);
5055
5056         if (unlikely(err < 0)) {
5057                 netdev_err(dev,
5058                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5059                         err, &features, &dev->features);
5060                 return -1;
5061         }
5062
5063         if (!err)
5064                 dev->features = features;
5065
5066         return 1;
5067 }
5068
5069 /**
5070  *      netdev_update_features - recalculate device features
5071  *      @dev: the device to check
5072  *
5073  *      Recalculate dev->features set and send notifications if it
5074  *      has changed. Should be called after driver or hardware dependent
5075  *      conditions might have changed that influence the features.
5076  */
5077 void netdev_update_features(struct net_device *dev)
5078 {
5079         if (__netdev_update_features(dev))
5080                 netdev_features_change(dev);
5081 }
5082 EXPORT_SYMBOL(netdev_update_features);
5083
5084 /**
5085  *      netdev_change_features - recalculate device features
5086  *      @dev: the device to check
5087  *
5088  *      Recalculate dev->features set and send notifications even
5089  *      if they have not changed. Should be called instead of
5090  *      netdev_update_features() if also dev->vlan_features might
5091  *      have changed to allow the changes to be propagated to stacked
5092  *      VLAN devices.
5093  */
5094 void netdev_change_features(struct net_device *dev)
5095 {
5096         __netdev_update_features(dev);
5097         netdev_features_change(dev);
5098 }
5099 EXPORT_SYMBOL(netdev_change_features);
5100
5101 /**
5102  *      netif_stacked_transfer_operstate -      transfer operstate
5103  *      @rootdev: the root or lower level device to transfer state from
5104  *      @dev: the device to transfer operstate to
5105  *
5106  *      Transfer operational state from root to device. This is normally
5107  *      called when a stacking relationship exists between the root
5108  *      device and the device(a leaf device).
5109  */
5110 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5111                                         struct net_device *dev)
5112 {
5113         if (rootdev->operstate == IF_OPER_DORMANT)
5114                 netif_dormant_on(dev);
5115         else
5116                 netif_dormant_off(dev);
5117
5118         if (netif_carrier_ok(rootdev)) {
5119                 if (!netif_carrier_ok(dev))
5120                         netif_carrier_on(dev);
5121         } else {
5122                 if (netif_carrier_ok(dev))
5123                         netif_carrier_off(dev);
5124         }
5125 }
5126 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5127
5128 #ifdef CONFIG_RPS
5129 static int netif_alloc_rx_queues(struct net_device *dev)
5130 {
5131         unsigned int i, count = dev->num_rx_queues;
5132         struct netdev_rx_queue *rx;
5133
5134         BUG_ON(count < 1);
5135
5136         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5137         if (!rx)
5138                 return -ENOMEM;
5139
5140         dev->_rx = rx;
5141
5142         for (i = 0; i < count; i++)
5143                 rx[i].dev = dev;
5144         return 0;
5145 }
5146 #endif
5147
5148 static void netdev_init_one_queue(struct net_device *dev,
5149                                   struct netdev_queue *queue, void *_unused)
5150 {
5151         /* Initialize queue lock */
5152         spin_lock_init(&queue->_xmit_lock);
5153         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5154         queue->xmit_lock_owner = -1;
5155         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5156         queue->dev = dev;
5157 #ifdef CONFIG_BQL
5158         dql_init(&queue->dql, HZ);
5159 #endif
5160 }
5161
5162 static int netif_alloc_netdev_queues(struct net_device *dev)
5163 {
5164         unsigned int count = dev->num_tx_queues;
5165         struct netdev_queue *tx;
5166
5167         BUG_ON(count < 1);
5168
5169         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5170         if (!tx)
5171                 return -ENOMEM;
5172
5173         dev->_tx = tx;
5174
5175         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5176         spin_lock_init(&dev->tx_global_lock);
5177
5178         return 0;
5179 }
5180
5181 /**
5182  *      register_netdevice      - register a network device
5183  *      @dev: device to register
5184  *
5185  *      Take a completed network device structure and add it to the kernel
5186  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5187  *      chain. 0 is returned on success. A negative errno code is returned
5188  *      on a failure to set up the device, or if the name is a duplicate.
5189  *
5190  *      Callers must hold the rtnl semaphore. You may want
5191  *      register_netdev() instead of this.
5192  *
5193  *      BUGS:
5194  *      The locking appears insufficient to guarantee two parallel registers
5195  *      will not get the same name.
5196  */
5197
5198 int register_netdevice(struct net_device *dev)
5199 {
5200         int ret;
5201         struct net *net = dev_net(dev);
5202
5203         BUG_ON(dev_boot_phase);
5204         ASSERT_RTNL();
5205
5206         might_sleep();
5207
5208         /* When net_device's are persistent, this will be fatal. */
5209         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5210         BUG_ON(!net);
5211
5212         spin_lock_init(&dev->addr_list_lock);
5213         netdev_set_addr_lockdep_class(dev);
5214
5215         dev->iflink = -1;
5216
5217         ret = dev_get_valid_name(net, dev, dev->name);
5218         if (ret < 0)
5219                 goto out;
5220
5221         /* Init, if this function is available */
5222         if (dev->netdev_ops->ndo_init) {
5223                 ret = dev->netdev_ops->ndo_init(dev);
5224                 if (ret) {
5225                         if (ret > 0)
5226                                 ret = -EIO;
5227                         goto out;
5228                 }
5229         }
5230
5231         if (((dev->hw_features | dev->features) &
5232              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5233             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5234              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5235                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5236                 ret = -EINVAL;
5237                 goto err_uninit;
5238         }
5239
5240         ret = -EBUSY;
5241         if (!dev->ifindex)
5242                 dev->ifindex = dev_new_index(net);
5243         else if (__dev_get_by_index(net, dev->ifindex))
5244                 goto err_uninit;
5245
5246         if (dev->iflink == -1)
5247                 dev->iflink = dev->ifindex;
5248
5249         /* Transfer changeable features to wanted_features and enable
5250          * software offloads (GSO and GRO).
5251          */
5252         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5253         dev->features |= NETIF_F_SOFT_FEATURES;
5254         dev->wanted_features = dev->features & dev->hw_features;
5255
5256         /* Turn on no cache copy if HW is doing checksum */
5257         if (!(dev->flags & IFF_LOOPBACK)) {
5258                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5259                 if (dev->features & NETIF_F_ALL_CSUM) {
5260                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5261                         dev->features |= NETIF_F_NOCACHE_COPY;
5262                 }
5263         }
5264
5265         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5266          */
5267         dev->vlan_features |= NETIF_F_HIGHDMA;
5268
5269         /* Make NETIF_F_SG inheritable to tunnel devices.
5270          */
5271         dev->hw_enc_features |= NETIF_F_SG;
5272
5273         /* Make NETIF_F_SG inheritable to MPLS.
5274          */
5275         dev->mpls_features |= NETIF_F_SG;
5276
5277         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5278         ret = notifier_to_errno(ret);
5279         if (ret)
5280                 goto err_uninit;
5281
5282         ret = netdev_register_kobject(dev);
5283         if (ret)
5284                 goto err_uninit;
5285         dev->reg_state = NETREG_REGISTERED;
5286
5287         __netdev_update_features(dev);
5288
5289         /*
5290          *      Default initial state at registry is that the
5291          *      device is present.
5292          */
5293
5294         set_bit(__LINK_STATE_PRESENT, &dev->state);
5295
5296         linkwatch_init_dev(dev);
5297
5298         dev_init_scheduler(dev);
5299         dev_hold(dev);
5300         list_netdevice(dev);
5301         add_device_randomness(dev->dev_addr, dev->addr_len);
5302
5303         /* If the device has permanent device address, driver should
5304          * set dev_addr and also addr_assign_type should be set to
5305          * NET_ADDR_PERM (default value).
5306          */
5307         if (dev->addr_assign_type == NET_ADDR_PERM)
5308                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5309
5310         /* Notify protocols, that a new device appeared. */
5311         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5312         ret = notifier_to_errno(ret);
5313         if (ret) {
5314                 rollback_registered(dev);
5315                 dev->reg_state = NETREG_UNREGISTERED;
5316         }
5317         /*
5318          *      Prevent userspace races by waiting until the network
5319          *      device is fully setup before sending notifications.
5320          */
5321         if (!dev->rtnl_link_ops ||
5322             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5323                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5324
5325 out:
5326         return ret;
5327
5328 err_uninit:
5329         if (dev->netdev_ops->ndo_uninit)
5330                 dev->netdev_ops->ndo_uninit(dev);
5331         goto out;
5332 }
5333 EXPORT_SYMBOL(register_netdevice);
5334
5335 /**
5336  *      init_dummy_netdev       - init a dummy network device for NAPI
5337  *      @dev: device to init
5338  *
5339  *      This takes a network device structure and initialize the minimum
5340  *      amount of fields so it can be used to schedule NAPI polls without
5341  *      registering a full blown interface. This is to be used by drivers
5342  *      that need to tie several hardware interfaces to a single NAPI
5343  *      poll scheduler due to HW limitations.
5344  */
5345 int init_dummy_netdev(struct net_device *dev)
5346 {
5347         /* Clear everything. Note we don't initialize spinlocks
5348          * are they aren't supposed to be taken by any of the
5349          * NAPI code and this dummy netdev is supposed to be
5350          * only ever used for NAPI polls
5351          */
5352         memset(dev, 0, sizeof(struct net_device));
5353
5354         /* make sure we BUG if trying to hit standard
5355          * register/unregister code path
5356          */
5357         dev->reg_state = NETREG_DUMMY;
5358
5359         /* NAPI wants this */
5360         INIT_LIST_HEAD(&dev->napi_list);
5361
5362         /* a dummy interface is started by default */
5363         set_bit(__LINK_STATE_PRESENT, &dev->state);
5364         set_bit(__LINK_STATE_START, &dev->state);
5365
5366         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5367          * because users of this 'device' dont need to change
5368          * its refcount.
5369          */
5370
5371         return 0;
5372 }
5373 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5374
5375
5376 /**
5377  *      register_netdev - register a network device
5378  *      @dev: device to register
5379  *
5380  *      Take a completed network device structure and add it to the kernel
5381  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5382  *      chain. 0 is returned on success. A negative errno code is returned
5383  *      on a failure to set up the device, or if the name is a duplicate.
5384  *
5385  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5386  *      and expands the device name if you passed a format string to
5387  *      alloc_netdev.
5388  */
5389 int register_netdev(struct net_device *dev)
5390 {
5391         int err;
5392
5393         rtnl_lock();
5394         err = register_netdevice(dev);
5395         rtnl_unlock();
5396         return err;
5397 }
5398 EXPORT_SYMBOL(register_netdev);
5399
5400 int netdev_refcnt_read(const struct net_device *dev)
5401 {
5402         int i, refcnt = 0;
5403
5404         for_each_possible_cpu(i)
5405                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5406         return refcnt;
5407 }
5408 EXPORT_SYMBOL(netdev_refcnt_read);
5409
5410 /**
5411  * netdev_wait_allrefs - wait until all references are gone.
5412  * @dev: target net_device
5413  *
5414  * This is called when unregistering network devices.
5415  *
5416  * Any protocol or device that holds a reference should register
5417  * for netdevice notification, and cleanup and put back the
5418  * reference if they receive an UNREGISTER event.
5419  * We can get stuck here if buggy protocols don't correctly
5420  * call dev_put.
5421  */
5422 static void netdev_wait_allrefs(struct net_device *dev)
5423 {
5424         unsigned long rebroadcast_time, warning_time;
5425         int refcnt;
5426
5427         linkwatch_forget_dev(dev);
5428
5429         rebroadcast_time = warning_time = jiffies;
5430         refcnt = netdev_refcnt_read(dev);
5431
5432         while (refcnt != 0) {
5433                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5434                         rtnl_lock();
5435
5436                         /* Rebroadcast unregister notification */
5437                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5438
5439                         __rtnl_unlock();
5440                         rcu_barrier();
5441                         rtnl_lock();
5442
5443                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5444                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5445                                      &dev->state)) {
5446                                 /* We must not have linkwatch events
5447                                  * pending on unregister. If this
5448                                  * happens, we simply run the queue
5449                                  * unscheduled, resulting in a noop
5450                                  * for this device.
5451                                  */
5452                                 linkwatch_run_queue();
5453                         }
5454
5455                         __rtnl_unlock();
5456
5457                         rebroadcast_time = jiffies;
5458                 }
5459
5460                 msleep(250);
5461
5462                 refcnt = netdev_refcnt_read(dev);
5463
5464                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5465                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5466                                  dev->name, refcnt);
5467                         warning_time = jiffies;
5468                 }
5469         }
5470 }
5471
5472 /* The sequence is:
5473  *
5474  *      rtnl_lock();
5475  *      ...
5476  *      register_netdevice(x1);
5477  *      register_netdevice(x2);
5478  *      ...
5479  *      unregister_netdevice(y1);
5480  *      unregister_netdevice(y2);
5481  *      ...
5482  *      rtnl_unlock();
5483  *      free_netdev(y1);
5484  *      free_netdev(y2);
5485  *
5486  * We are invoked by rtnl_unlock().
5487  * This allows us to deal with problems:
5488  * 1) We can delete sysfs objects which invoke hotplug
5489  *    without deadlocking with linkwatch via keventd.
5490  * 2) Since we run with the RTNL semaphore not held, we can sleep
5491  *    safely in order to wait for the netdev refcnt to drop to zero.
5492  *
5493  * We must not return until all unregister events added during
5494  * the interval the lock was held have been completed.
5495  */
5496 void netdev_run_todo(void)
5497 {
5498         struct list_head list;
5499
5500         /* Snapshot list, allow later requests */
5501         list_replace_init(&net_todo_list, &list);
5502
5503         __rtnl_unlock();
5504
5505
5506         /* Wait for rcu callbacks to finish before next phase */
5507         if (!list_empty(&list))
5508                 rcu_barrier();
5509
5510         while (!list_empty(&list)) {
5511                 struct net_device *dev
5512                         = list_first_entry(&list, struct net_device, todo_list);
5513                 list_del(&dev->todo_list);
5514
5515                 rtnl_lock();
5516                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5517                 __rtnl_unlock();
5518
5519                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5520                         pr_err("network todo '%s' but state %d\n",
5521                                dev->name, dev->reg_state);
5522                         dump_stack();
5523                         continue;
5524                 }
5525
5526                 dev->reg_state = NETREG_UNREGISTERED;
5527
5528                 on_each_cpu(flush_backlog, dev, 1);
5529
5530                 netdev_wait_allrefs(dev);
5531
5532                 /* paranoia */
5533                 BUG_ON(netdev_refcnt_read(dev));
5534                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5535                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5536                 WARN_ON(dev->dn_ptr);
5537
5538                 if (dev->destructor)
5539                         dev->destructor(dev);
5540
5541                 /* Free network device */
5542                 kobject_put(&dev->dev.kobj);
5543         }
5544 }
5545
5546 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5547  * fields in the same order, with only the type differing.
5548  */
5549 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5550                              const struct net_device_stats *netdev_stats)
5551 {
5552 #if BITS_PER_LONG == 64
5553         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5554         memcpy(stats64, netdev_stats, sizeof(*stats64));
5555 #else
5556         size_t i, n = sizeof(*stats64) / sizeof(u64);
5557         const unsigned long *src = (const unsigned long *)netdev_stats;
5558         u64 *dst = (u64 *)stats64;
5559
5560         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5561                      sizeof(*stats64) / sizeof(u64));
5562         for (i = 0; i < n; i++)
5563                 dst[i] = src[i];
5564 #endif
5565 }
5566 EXPORT_SYMBOL(netdev_stats_to_stats64);
5567
5568 /**
5569  *      dev_get_stats   - get network device statistics
5570  *      @dev: device to get statistics from
5571  *      @storage: place to store stats
5572  *
5573  *      Get network statistics from device. Return @storage.
5574  *      The device driver may provide its own method by setting
5575  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5576  *      otherwise the internal statistics structure is used.
5577  */
5578 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5579                                         struct rtnl_link_stats64 *storage)
5580 {
5581         const struct net_device_ops *ops = dev->netdev_ops;
5582
5583         if (ops->ndo_get_stats64) {
5584                 memset(storage, 0, sizeof(*storage));
5585                 ops->ndo_get_stats64(dev, storage);
5586         } else if (ops->ndo_get_stats) {
5587                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5588         } else {
5589                 netdev_stats_to_stats64(storage, &dev->stats);
5590         }
5591         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5592         return storage;
5593 }
5594 EXPORT_SYMBOL(dev_get_stats);
5595
5596 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5597 {
5598         struct netdev_queue *queue = dev_ingress_queue(dev);
5599
5600 #ifdef CONFIG_NET_CLS_ACT
5601         if (queue)
5602                 return queue;
5603         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5604         if (!queue)
5605                 return NULL;
5606         netdev_init_one_queue(dev, queue, NULL);
5607         queue->qdisc = &noop_qdisc;
5608         queue->qdisc_sleeping = &noop_qdisc;
5609         rcu_assign_pointer(dev->ingress_queue, queue);
5610 #endif
5611         return queue;
5612 }
5613
5614 static const struct ethtool_ops default_ethtool_ops;
5615
5616 void netdev_set_default_ethtool_ops(struct net_device *dev,
5617                                     const struct ethtool_ops *ops)
5618 {
5619         if (dev->ethtool_ops == &default_ethtool_ops)
5620                 dev->ethtool_ops = ops;
5621 }
5622 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5623
5624 /**
5625  *      alloc_netdev_mqs - allocate network device
5626  *      @sizeof_priv:   size of private data to allocate space for
5627  *      @name:          device name format string
5628  *      @setup:         callback to initialize device
5629  *      @txqs:          the number of TX subqueues to allocate
5630  *      @rxqs:          the number of RX subqueues to allocate
5631  *
5632  *      Allocates a struct net_device with private data area for driver use
5633  *      and performs basic initialization.  Also allocates subquue structs
5634  *      for each queue on the device.
5635  */
5636 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5637                 void (*setup)(struct net_device *),
5638                 unsigned int txqs, unsigned int rxqs)
5639 {
5640         struct net_device *dev;
5641         size_t alloc_size;
5642         struct net_device *p;
5643
5644         BUG_ON(strlen(name) >= sizeof(dev->name));
5645
5646         if (txqs < 1) {
5647                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5648                 return NULL;
5649         }
5650
5651 #ifdef CONFIG_RPS
5652         if (rxqs < 1) {
5653                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5654                 return NULL;
5655         }
5656 #endif
5657
5658         alloc_size = sizeof(struct net_device);
5659         if (sizeof_priv) {
5660                 /* ensure 32-byte alignment of private area */
5661                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5662                 alloc_size += sizeof_priv;
5663         }
5664         /* ensure 32-byte alignment of whole construct */
5665         alloc_size += NETDEV_ALIGN - 1;
5666
5667         p = kzalloc(alloc_size, GFP_KERNEL);
5668         if (!p)
5669                 return NULL;
5670
5671         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5672         dev->padded = (char *)dev - (char *)p;
5673
5674         dev->pcpu_refcnt = alloc_percpu(int);
5675         if (!dev->pcpu_refcnt)
5676                 goto free_p;
5677
5678         if (dev_addr_init(dev))
5679                 goto free_pcpu;
5680
5681         dev_mc_init(dev);
5682         dev_uc_init(dev);
5683
5684         dev_net_set(dev, &init_net);
5685
5686         dev->gso_max_size = GSO_MAX_SIZE;
5687         dev->gso_max_segs = GSO_MAX_SEGS;
5688
5689         INIT_LIST_HEAD(&dev->napi_list);
5690         INIT_LIST_HEAD(&dev->unreg_list);
5691         INIT_LIST_HEAD(&dev->link_watch_list);
5692         INIT_LIST_HEAD(&dev->upper_dev_list);
5693         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5694         setup(dev);
5695
5696         dev->num_tx_queues = txqs;
5697         dev->real_num_tx_queues = txqs;
5698         if (netif_alloc_netdev_queues(dev))
5699                 goto free_all;
5700
5701 #ifdef CONFIG_RPS
5702         dev->num_rx_queues = rxqs;
5703         dev->real_num_rx_queues = rxqs;
5704         if (netif_alloc_rx_queues(dev))
5705                 goto free_all;
5706 #endif
5707
5708         strcpy(dev->name, name);
5709         dev->group = INIT_NETDEV_GROUP;
5710         if (!dev->ethtool_ops)
5711                 dev->ethtool_ops = &default_ethtool_ops;
5712         return dev;
5713
5714 free_all:
5715         free_netdev(dev);
5716         return NULL;
5717
5718 free_pcpu:
5719         free_percpu(dev->pcpu_refcnt);
5720         kfree(dev->_tx);
5721 #ifdef CONFIG_RPS
5722         kfree(dev->_rx);
5723 #endif
5724
5725 free_p:
5726         kfree(p);
5727         return NULL;
5728 }
5729 EXPORT_SYMBOL(alloc_netdev_mqs);
5730
5731 /**
5732  *      free_netdev - free network device
5733  *      @dev: device
5734  *
5735  *      This function does the last stage of destroying an allocated device
5736  *      interface. The reference to the device object is released.
5737  *      If this is the last reference then it will be freed.
5738  */
5739 void free_netdev(struct net_device *dev)
5740 {
5741         struct napi_struct *p, *n;
5742
5743         release_net(dev_net(dev));
5744
5745         kfree(dev->_tx);
5746 #ifdef CONFIG_RPS
5747         kfree(dev->_rx);
5748 #endif
5749
5750         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5751
5752         /* Flush device addresses */
5753         dev_addr_flush(dev);
5754
5755         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5756                 netif_napi_del(p);
5757
5758         free_percpu(dev->pcpu_refcnt);
5759         dev->pcpu_refcnt = NULL;
5760
5761         /*  Compatibility with error handling in drivers */
5762         if (dev->reg_state == NETREG_UNINITIALIZED) {
5763                 kfree((char *)dev - dev->padded);
5764                 return;
5765         }
5766
5767         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5768         dev->reg_state = NETREG_RELEASED;
5769
5770         /* will free via device release */
5771         put_device(&dev->dev);
5772 }
5773 EXPORT_SYMBOL(free_netdev);
5774
5775 /**
5776  *      synchronize_net -  Synchronize with packet receive processing
5777  *
5778  *      Wait for packets currently being received to be done.
5779  *      Does not block later packets from starting.
5780  */
5781 void synchronize_net(void)
5782 {
5783         might_sleep();
5784         if (rtnl_is_locked())
5785                 synchronize_rcu_expedited();
5786         else
5787                 synchronize_rcu();
5788 }
5789 EXPORT_SYMBOL(synchronize_net);
5790
5791 /**
5792  *      unregister_netdevice_queue - remove device from the kernel
5793  *      @dev: device
5794  *      @head: list
5795  *
5796  *      This function shuts down a device interface and removes it
5797  *      from the kernel tables.
5798  *      If head not NULL, device is queued to be unregistered later.
5799  *
5800  *      Callers must hold the rtnl semaphore.  You may want
5801  *      unregister_netdev() instead of this.
5802  */
5803
5804 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5805 {
5806         ASSERT_RTNL();
5807
5808         if (head) {
5809                 list_move_tail(&dev->unreg_list, head);
5810         } else {
5811                 rollback_registered(dev);
5812                 /* Finish processing unregister after unlock */
5813                 net_set_todo(dev);
5814         }
5815 }
5816 EXPORT_SYMBOL(unregister_netdevice_queue);
5817
5818 /**
5819  *      unregister_netdevice_many - unregister many devices
5820  *      @head: list of devices
5821  */
5822 void unregister_netdevice_many(struct list_head *head)
5823 {
5824         struct net_device *dev;
5825
5826         if (!list_empty(head)) {
5827                 rollback_registered_many(head);
5828                 list_for_each_entry(dev, head, unreg_list)
5829                         net_set_todo(dev);
5830         }
5831 }
5832 EXPORT_SYMBOL(unregister_netdevice_many);
5833
5834 /**
5835  *      unregister_netdev - remove device from the kernel
5836  *      @dev: device
5837  *
5838  *      This function shuts down a device interface and removes it
5839  *      from the kernel tables.
5840  *
5841  *      This is just a wrapper for unregister_netdevice that takes
5842  *      the rtnl semaphore.  In general you want to use this and not
5843  *      unregister_netdevice.
5844  */
5845 void unregister_netdev(struct net_device *dev)
5846 {
5847         rtnl_lock();
5848         unregister_netdevice(dev);
5849         rtnl_unlock();
5850 }
5851 EXPORT_SYMBOL(unregister_netdev);
5852
5853 /**
5854  *      dev_change_net_namespace - move device to different nethost namespace
5855  *      @dev: device
5856  *      @net: network namespace
5857  *      @pat: If not NULL name pattern to try if the current device name
5858  *            is already taken in the destination network namespace.
5859  *
5860  *      This function shuts down a device interface and moves it
5861  *      to a new network namespace. On success 0 is returned, on
5862  *      a failure a netagive errno code is returned.
5863  *
5864  *      Callers must hold the rtnl semaphore.
5865  */
5866
5867 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5868 {
5869         int err;
5870
5871         ASSERT_RTNL();
5872
5873         /* Don't allow namespace local devices to be moved. */
5874         err = -EINVAL;
5875         if (dev->features & NETIF_F_NETNS_LOCAL)
5876                 goto out;
5877
5878         /* Ensure the device has been registrered */
5879         if (dev->reg_state != NETREG_REGISTERED)
5880                 goto out;
5881
5882         /* Get out if there is nothing todo */
5883         err = 0;
5884         if (net_eq(dev_net(dev), net))
5885                 goto out;
5886
5887         /* Pick the destination device name, and ensure
5888          * we can use it in the destination network namespace.
5889          */
5890         err = -EEXIST;
5891         if (__dev_get_by_name(net, dev->name)) {
5892                 /* We get here if we can't use the current device name */
5893                 if (!pat)
5894                         goto out;
5895                 if (dev_get_valid_name(net, dev, pat) < 0)
5896                         goto out;
5897         }
5898
5899         /*
5900          * And now a mini version of register_netdevice unregister_netdevice.
5901          */
5902
5903         /* If device is running close it first. */
5904         dev_close(dev);
5905
5906         /* And unlink it from device chain */
5907         err = -ENODEV;
5908         unlist_netdevice(dev);
5909
5910         synchronize_net();
5911
5912         /* Shutdown queueing discipline. */
5913         dev_shutdown(dev);
5914
5915         /* Notify protocols, that we are about to destroy
5916            this device. They should clean all the things.
5917
5918            Note that dev->reg_state stays at NETREG_REGISTERED.
5919            This is wanted because this way 8021q and macvlan know
5920            the device is just moving and can keep their slaves up.
5921         */
5922         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5923         rcu_barrier();
5924         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5925         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5926
5927         /*
5928          *      Flush the unicast and multicast chains
5929          */
5930         dev_uc_flush(dev);
5931         dev_mc_flush(dev);
5932
5933         /* Send a netdev-removed uevent to the old namespace */
5934         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5935
5936         /* Actually switch the network namespace */
5937         dev_net_set(dev, net);
5938
5939         /* If there is an ifindex conflict assign a new one */
5940         if (__dev_get_by_index(net, dev->ifindex)) {
5941                 int iflink = (dev->iflink == dev->ifindex);
5942                 dev->ifindex = dev_new_index(net);
5943                 if (iflink)
5944                         dev->iflink = dev->ifindex;
5945         }
5946
5947         /* Send a netdev-add uevent to the new namespace */
5948         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5949
5950         /* Fixup kobjects */
5951         err = device_rename(&dev->dev, dev->name);
5952         WARN_ON(err);
5953
5954         /* Add the device back in the hashes */
5955         list_netdevice(dev);
5956
5957         /* Notify protocols, that a new device appeared. */
5958         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5959
5960         /*
5961          *      Prevent userspace races by waiting until the network
5962          *      device is fully setup before sending notifications.
5963          */
5964         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5965
5966         synchronize_net();
5967         err = 0;
5968 out:
5969         return err;
5970 }
5971 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
5972
5973 static int dev_cpu_callback(struct notifier_block *nfb,
5974                             unsigned long action,
5975                             void *ocpu)
5976 {
5977         struct sk_buff **list_skb;
5978         struct sk_buff *skb;
5979         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5980         struct softnet_data *sd, *oldsd;
5981
5982         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5983                 return NOTIFY_OK;
5984
5985         local_irq_disable();
5986         cpu = smp_processor_id();
5987         sd = &per_cpu(softnet_data, cpu);
5988         oldsd = &per_cpu(softnet_data, oldcpu);
5989
5990         /* Find end of our completion_queue. */
5991         list_skb = &sd->completion_queue;
5992         while (*list_skb)
5993                 list_skb = &(*list_skb)->next;
5994         /* Append completion queue from offline CPU. */
5995         *list_skb = oldsd->completion_queue;
5996         oldsd->completion_queue = NULL;
5997
5998         /* Append output queue from offline CPU. */
5999         if (oldsd->output_queue) {
6000                 *sd->output_queue_tailp = oldsd->output_queue;
6001                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6002                 oldsd->output_queue = NULL;
6003                 oldsd->output_queue_tailp = &oldsd->output_queue;
6004         }
6005         /* Append NAPI poll list from offline CPU. */
6006         if (!list_empty(&oldsd->poll_list)) {
6007                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6008                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6009         }
6010
6011         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6012         local_irq_enable();
6013
6014         /* Process offline CPU's input_pkt_queue */
6015         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6016                 netif_rx(skb);
6017                 input_queue_head_incr(oldsd);
6018         }
6019         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6020                 netif_rx(skb);
6021                 input_queue_head_incr(oldsd);
6022         }
6023
6024         return NOTIFY_OK;
6025 }
6026
6027
6028 /**
6029  *      netdev_increment_features - increment feature set by one
6030  *      @all: current feature set
6031  *      @one: new feature set
6032  *      @mask: mask feature set
6033  *
6034  *      Computes a new feature set after adding a device with feature set
6035  *      @one to the master device with current feature set @all.  Will not
6036  *      enable anything that is off in @mask. Returns the new feature set.
6037  */
6038 netdev_features_t netdev_increment_features(netdev_features_t all,
6039         netdev_features_t one, netdev_features_t mask)
6040 {
6041         if (mask & NETIF_F_GEN_CSUM)
6042                 mask |= NETIF_F_ALL_CSUM;
6043         mask |= NETIF_F_VLAN_CHALLENGED;
6044
6045         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6046         all &= one | ~NETIF_F_ALL_FOR_ALL;
6047
6048         /* If one device supports hw checksumming, set for all. */
6049         if (all & NETIF_F_GEN_CSUM)
6050                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6051
6052         return all;
6053 }
6054 EXPORT_SYMBOL(netdev_increment_features);
6055
6056 static struct hlist_head *netdev_create_hash(void)
6057 {
6058         int i;
6059         struct hlist_head *hash;
6060
6061         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6062         if (hash != NULL)
6063                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6064                         INIT_HLIST_HEAD(&hash[i]);
6065
6066         return hash;
6067 }
6068
6069 /* Initialize per network namespace state */
6070 static int __net_init netdev_init(struct net *net)
6071 {
6072         if (net != &init_net)
6073                 INIT_LIST_HEAD(&net->dev_base_head);
6074
6075         net->dev_name_head = netdev_create_hash();
6076         if (net->dev_name_head == NULL)
6077                 goto err_name;
6078
6079         net->dev_index_head = netdev_create_hash();
6080         if (net->dev_index_head == NULL)
6081                 goto err_idx;
6082
6083         return 0;
6084
6085 err_idx:
6086         kfree(net->dev_name_head);
6087 err_name:
6088         return -ENOMEM;
6089 }
6090
6091 /**
6092  *      netdev_drivername - network driver for the device
6093  *      @dev: network device
6094  *
6095  *      Determine network driver for device.
6096  */
6097 const char *netdev_drivername(const struct net_device *dev)
6098 {
6099         const struct device_driver *driver;
6100         const struct device *parent;
6101         const char *empty = "";
6102
6103         parent = dev->dev.parent;
6104         if (!parent)
6105                 return empty;
6106
6107         driver = parent->driver;
6108         if (driver && driver->name)
6109                 return driver->name;
6110         return empty;
6111 }
6112
6113 static int __netdev_printk(const char *level, const struct net_device *dev,
6114                            struct va_format *vaf)
6115 {
6116         int r;
6117
6118         if (dev && dev->dev.parent) {
6119                 r = dev_printk_emit(level[1] - '0',
6120                                     dev->dev.parent,
6121                                     "%s %s %s: %pV",
6122                                     dev_driver_string(dev->dev.parent),
6123                                     dev_name(dev->dev.parent),
6124                                     netdev_name(dev), vaf);
6125         } else if (dev) {
6126                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6127         } else {
6128                 r = printk("%s(NULL net_device): %pV", level, vaf);
6129         }
6130
6131         return r;
6132 }
6133
6134 int netdev_printk(const char *level, const struct net_device *dev,
6135                   const char *format, ...)
6136 {
6137         struct va_format vaf;
6138         va_list args;
6139         int r;
6140
6141         va_start(args, format);
6142
6143         vaf.fmt = format;
6144         vaf.va = &args;
6145
6146         r = __netdev_printk(level, dev, &vaf);
6147
6148         va_end(args);
6149
6150         return r;
6151 }
6152 EXPORT_SYMBOL(netdev_printk);
6153
6154 #define define_netdev_printk_level(func, level)                 \
6155 int func(const struct net_device *dev, const char *fmt, ...)    \
6156 {                                                               \
6157         int r;                                                  \
6158         struct va_format vaf;                                   \
6159         va_list args;                                           \
6160                                                                 \
6161         va_start(args, fmt);                                    \
6162                                                                 \
6163         vaf.fmt = fmt;                                          \
6164         vaf.va = &args;                                         \
6165                                                                 \
6166         r = __netdev_printk(level, dev, &vaf);                  \
6167                                                                 \
6168         va_end(args);                                           \
6169                                                                 \
6170         return r;                                               \
6171 }                                                               \
6172 EXPORT_SYMBOL(func);
6173
6174 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6175 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6176 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6177 define_netdev_printk_level(netdev_err, KERN_ERR);
6178 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6179 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6180 define_netdev_printk_level(netdev_info, KERN_INFO);
6181
6182 static void __net_exit netdev_exit(struct net *net)
6183 {
6184         kfree(net->dev_name_head);
6185         kfree(net->dev_index_head);
6186 }
6187
6188 static struct pernet_operations __net_initdata netdev_net_ops = {
6189         .init = netdev_init,
6190         .exit = netdev_exit,
6191 };
6192
6193 static void __net_exit default_device_exit(struct net *net)
6194 {
6195         struct net_device *dev, *aux;
6196         /*
6197          * Push all migratable network devices back to the
6198          * initial network namespace
6199          */
6200         rtnl_lock();
6201         for_each_netdev_safe(net, dev, aux) {
6202                 int err;
6203                 char fb_name[IFNAMSIZ];
6204
6205                 /* Ignore unmoveable devices (i.e. loopback) */
6206                 if (dev->features & NETIF_F_NETNS_LOCAL)
6207                         continue;
6208
6209                 /* Leave virtual devices for the generic cleanup */
6210                 if (dev->rtnl_link_ops)
6211                         continue;
6212
6213                 /* Push remaining network devices to init_net */
6214                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6215                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6216                 if (err) {
6217                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6218                                  __func__, dev->name, err);
6219                         BUG();
6220                 }
6221         }
6222         rtnl_unlock();
6223 }
6224
6225 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6226 {
6227         /* At exit all network devices most be removed from a network
6228          * namespace.  Do this in the reverse order of registration.
6229          * Do this across as many network namespaces as possible to
6230          * improve batching efficiency.
6231          */
6232         struct net_device *dev;
6233         struct net *net;
6234         LIST_HEAD(dev_kill_list);
6235
6236         rtnl_lock();
6237         list_for_each_entry(net, net_list, exit_list) {
6238                 for_each_netdev_reverse(net, dev) {
6239                         if (dev->rtnl_link_ops)
6240                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6241                         else
6242                                 unregister_netdevice_queue(dev, &dev_kill_list);
6243                 }
6244         }
6245         unregister_netdevice_many(&dev_kill_list);
6246         list_del(&dev_kill_list);
6247         rtnl_unlock();
6248 }
6249
6250 static struct pernet_operations __net_initdata default_device_ops = {
6251         .exit = default_device_exit,
6252         .exit_batch = default_device_exit_batch,
6253 };
6254
6255 /*
6256  *      Initialize the DEV module. At boot time this walks the device list and
6257  *      unhooks any devices that fail to initialise (normally hardware not
6258  *      present) and leaves us with a valid list of present and active devices.
6259  *
6260  */
6261
6262 /*
6263  *       This is called single threaded during boot, so no need
6264  *       to take the rtnl semaphore.
6265  */
6266 static int __init net_dev_init(void)
6267 {
6268         int i, rc = -ENOMEM;
6269
6270         BUG_ON(!dev_boot_phase);
6271
6272         if (dev_proc_init())
6273                 goto out;
6274
6275         if (netdev_kobject_init())
6276                 goto out;
6277
6278         INIT_LIST_HEAD(&ptype_all);
6279         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6280                 INIT_LIST_HEAD(&ptype_base[i]);
6281
6282         INIT_LIST_HEAD(&offload_base);
6283
6284         if (register_pernet_subsys(&netdev_net_ops))
6285                 goto out;
6286
6287         /*
6288          *      Initialise the packet receive queues.
6289          */
6290
6291         for_each_possible_cpu(i) {
6292                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6293
6294                 memset(sd, 0, sizeof(*sd));
6295                 skb_queue_head_init(&sd->input_pkt_queue);
6296                 skb_queue_head_init(&sd->process_queue);
6297                 sd->completion_queue = NULL;
6298                 INIT_LIST_HEAD(&sd->poll_list);
6299                 sd->output_queue = NULL;
6300                 sd->output_queue_tailp = &sd->output_queue;
6301 #ifdef CONFIG_RPS
6302                 sd->csd.func = rps_trigger_softirq;
6303                 sd->csd.info = sd;
6304                 sd->csd.flags = 0;
6305                 sd->cpu = i;
6306 #endif
6307
6308                 sd->backlog.poll = process_backlog;
6309                 sd->backlog.weight = weight_p;
6310                 sd->backlog.gro_list = NULL;
6311                 sd->backlog.gro_count = 0;
6312
6313 #ifdef CONFIG_NET_FLOW_LIMIT
6314                 sd->flow_limit = NULL;
6315 #endif
6316         }
6317
6318         dev_boot_phase = 0;
6319
6320         /* The loopback device is special if any other network devices
6321          * is present in a network namespace the loopback device must
6322          * be present. Since we now dynamically allocate and free the
6323          * loopback device ensure this invariant is maintained by
6324          * keeping the loopback device as the first device on the
6325          * list of network devices.  Ensuring the loopback devices
6326          * is the first device that appears and the last network device
6327          * that disappears.
6328          */
6329         if (register_pernet_device(&loopback_net_ops))
6330                 goto out;
6331
6332         if (register_pernet_device(&default_device_ops))
6333                 goto out;
6334
6335         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6336         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6337
6338         hotcpu_notifier(dev_cpu_callback, 0);
6339         dst_init();
6340         rc = 0;
6341 out:
6342         return rc;
6343 }
6344
6345 subsys_initcall(net_dev_init);