Pileus Git - ~andy/linux/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <linux/hashtable.h>
 133 #include <linux/vmalloc.h>
 134 #include <linux/if_macvlan.h>
 135
 136 #include "net-sysfs.h"
 137
 138 /* Instead of increasing this, you should create a hash table. */
 139 #define MAX_GRO_SKBS 8
 140
 141 /* This should be increased if a protocol with a bigger head is added. */
 142 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 143
 144 static DEFINE_SPINLOCK(ptype_lock);
 145 static DEFINE_SPINLOCK(offload_lock);
 146 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 147 struct list_head ptype_all __read_mostly;       /* Taps */
 148 static struct list_head offload_base __read_mostly;
 149
 150 /*
 151  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 152  * semaphore.
 153  *
 154  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 155  *
 156  * Writers must hold the rtnl semaphore while they loop through the
 157  * dev_base_head list, and hold dev_base_lock for writing when they do the
 158  * actual updates.  This allows pure readers to access the list even
 159  * while a writer is preparing to update it.
 160  *
 161  * To put it another way, dev_base_lock is held for writing only to
 162  * protect against pure readers; the rtnl semaphore provides the
 163  * protection against other writers.
 164  *
 165  * See, for example usages, register_netdevice() and
 166  * unregister_netdevice(), which must be called with the rtnl
 167  * semaphore held.
 168  */
 169 DEFINE_RWLOCK(dev_base_lock);
 170 EXPORT_SYMBOL(dev_base_lock);
 171
 172 /* protects napi_hash addition/deletion and napi_gen_id */
 173 static DEFINE_SPINLOCK(napi_hash_lock);
 174
 175 static unsigned int napi_gen_id;
 176 static DEFINE_HASHTABLE(napi_hash, 8);
 177
 178 static seqcount_t devnet_rename_seq;
 179
 180 static inline void dev_base_seq_inc(struct net *net)
 181 {
 182         while (++net->dev_base_seq == 0);
 183 }
 184
 185 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 186 {
 187         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 188
 189         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 190 }
 191
 192 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 193 {
 194         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 195 }
 196
 197 static inline void rps_lock(struct softnet_data *sd)
 198 {
 199 #ifdef CONFIG_RPS
 200         spin_lock(&sd->input_pkt_queue.lock);
 201 #endif
 202 }
 203
 204 static inline void rps_unlock(struct softnet_data *sd)
 205 {
 206 #ifdef CONFIG_RPS
 207         spin_unlock(&sd->input_pkt_queue.lock);
 208 #endif
 209 }
 210
 211 /* Device list insertion */
 212 static void list_netdevice(struct net_device *dev)
 213 {
 214         struct net *net = dev_net(dev);
 215
 216         ASSERT_RTNL();
 217
 218         write_lock_bh(&dev_base_lock);
 219         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 220         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 221         hlist_add_head_rcu(&dev->index_hlist,
 222                            dev_index_hash(net, dev->ifindex));
 223         write_unlock_bh(&dev_base_lock);
 224
 225         dev_base_seq_inc(net);
 226 }
 227
 228 /* Device list removal
 229  * caller must respect a RCU grace period before freeing/reusing dev
 230  */
 231 static void unlist_netdevice(struct net_device *dev)
 232 {
 233         ASSERT_RTNL();
 234
 235         /* Unlink dev from the device chain */
 236         write_lock_bh(&dev_base_lock);
 237         list_del_rcu(&dev->dev_list);
 238         hlist_del_rcu(&dev->name_hlist);
 239         hlist_del_rcu(&dev->index_hlist);
 240         write_unlock_bh(&dev_base_lock);
 241
 242         dev_base_seq_inc(dev_net(dev));
 243 }
 244
 245 /*
 246  *      Our notifier list
 247  */
 248
 249 static RAW_NOTIFIER_HEAD(netdev_chain);
 250
 251 /*
 252  *      Device drivers call our routines to queue packets here. We empty the
 253  *      queue in the local softnet handler.
 254  */
 255
 256 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 257 EXPORT_PER_CPU_SYMBOL(softnet_data);
 258
 259 #ifdef CONFIG_LOCKDEP
 260 /*
 261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 262  * according to dev->type
 263  */
 264 static const unsigned short netdev_lock_type[] =
 265         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 266          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 267          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 268          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 269          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 270          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 271          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 272          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 273          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 274          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 275          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 276          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 277          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 278          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 279          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 280
 281 static const char *const netdev_lock_name[] =
 282         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 283          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 284          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 285          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 286          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 287          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 288          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 289          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 290          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 291          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 292          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 293          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 294          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 295          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 296          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 297
 298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 300
 301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 302 {
 303         int i;
 304
 305         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 306                 if (netdev_lock_type[i] == dev_type)
 307                         return i;
 308         /* the last key is used by default */
 309         return ARRAY_SIZE(netdev_lock_type) - 1;
 310 }
 311
 312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 313                                                  unsigned short dev_type)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev_type);
 318         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 319                                    netdev_lock_name[i]);
 320 }
 321
 322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev->type);
 327         lockdep_set_class_and_name(&dev->addr_list_lock,
 328                                    &netdev_addr_lock_key[i],
 329                                    netdev_lock_name[i]);
 330 }
 331 #else
 332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 333                                                  unsigned short dev_type)
 334 {
 335 }
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338 }
 339 #endif
 340
 341 /*******************************************************************************
 342
 343                 Protocol management and registration routines
 344
 345 *******************************************************************************/
 346
 347 /*
 348  *      Add a protocol ID to the list. Now that the input handler is
 349  *      smarter we can dispense with all the messy stuff that used to be
 350  *      here.
 351  *
 352  *      BEWARE!!! Protocol handlers, mangling input packets,
 353  *      MUST BE last in hash buckets and checking protocol handlers
 354  *      MUST start from promiscuous ptype_all chain in net_bh.
 355  *      It is true now, do not change it.
 356  *      Explanation follows: if protocol handler, mangling packet, will
 357  *      be the first on list, it is not able to sense, that packet
 358  *      is cloned and should be copied-on-write, so that it will
 359  *      change it and subsequent readers will get broken packet.
 360  *                                                      --ANK (980803)
 361  */
 362
 363 static inline struct list_head *ptype_head(const struct packet_type *pt)
 364 {
 365         if (pt->type == htons(ETH_P_ALL))
 366                 return &ptype_all;
 367         else
 368                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 369 }
 370
 371 /**
 372  *      dev_add_pack - add packet handler
 373  *      @pt: packet type declaration
 374  *
 375  *      Add a protocol handler to the networking stack. The passed &packet_type
 376  *      is linked into kernel lists and may not be freed until it has been
 377  *      removed from the kernel lists.
 378  *
 379  *      This call does not sleep therefore it can not
 380  *      guarantee all CPU's that are in middle of receiving packets
 381  *      will see the new packet type (until the next received packet).
 382  */
 383
 384 void dev_add_pack(struct packet_type *pt)
 385 {
 386         struct list_head *head = ptype_head(pt);
 387
 388         spin_lock(&ptype_lock);
 389         list_add_rcu(&pt->list, head);
 390         spin_unlock(&ptype_lock);
 391 }
 392 EXPORT_SYMBOL(dev_add_pack);
 393
 394 /**
 395  *      __dev_remove_pack        - remove packet handler
 396  *      @pt: packet type declaration
 397  *
 398  *      Remove a protocol handler that was previously added to the kernel
 399  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 400  *      from the kernel lists and can be freed or reused once this function
 401  *      returns.
 402  *
 403  *      The packet type might still be in use by receivers
 404  *      and must not be freed until after all the CPU's have gone
 405  *      through a quiescent state.
 406  */
 407 void __dev_remove_pack(struct packet_type *pt)
 408 {
 409         struct list_head *head = ptype_head(pt);
 410         struct packet_type *pt1;
 411
 412         spin_lock(&ptype_lock);
 413
 414         list_for_each_entry(pt1, head, list) {
 415                 if (pt == pt1) {
 416                         list_del_rcu(&pt->list);
 417                         goto out;
 418                 }
 419         }
 420
 421         pr_warn("dev_remove_pack: %p not found\n", pt);
 422 out:
 423         spin_unlock(&ptype_lock);
 424 }
 425 EXPORT_SYMBOL(__dev_remove_pack);
 426
 427 /**
 428  *      dev_remove_pack  - remove packet handler
 429  *      @pt: packet type declaration
 430  *
 431  *      Remove a protocol handler that was previously added to the kernel
 432  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 433  *      from the kernel lists and can be freed or reused once this function
 434  *      returns.
 435  *
 436  *      This call sleeps to guarantee that no CPU is looking at the packet
 437  *      type after return.
 438  */
 439 void dev_remove_pack(struct packet_type *pt)
 440 {
 441         __dev_remove_pack(pt);
 442
 443         synchronize_net();
 444 }
 445 EXPORT_SYMBOL(dev_remove_pack);
 446
 447
 448 /**
 449  *      dev_add_offload - register offload handlers
 450  *      @po: protocol offload declaration
 451  *
 452  *      Add protocol offload handlers to the networking stack. The passed
 453  *      &proto_offload is linked into kernel lists and may not be freed until
 454  *      it has been removed from the kernel lists.
 455  *
 456  *      This call does not sleep therefore it can not
 457  *      guarantee all CPU's that are in middle of receiving packets
 458  *      will see the new offload handlers (until the next received packet).
 459  */
 460 void dev_add_offload(struct packet_offload *po)
 461 {
 462         struct list_head *head = &offload_base;
 463
 464         spin_lock(&offload_lock);
 465         list_add_rcu(&po->list, head);
 466         spin_unlock(&offload_lock);
 467 }
 468 EXPORT_SYMBOL(dev_add_offload);
 469
 470 /**
 471  *      __dev_remove_offload     - remove offload handler
 472  *      @po: packet offload declaration
 473  *
 474  *      Remove a protocol offload handler that was previously added to the
 475  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 476  *      is removed from the kernel lists and can be freed or reused once this
 477  *      function returns.
 478  *
 479  *      The packet type might still be in use by receivers
 480  *      and must not be freed until after all the CPU's have gone
 481  *      through a quiescent state.
 482  */
 483 void __dev_remove_offload(struct packet_offload *po)
 484 {
 485         struct list_head *head = &offload_base;
 486         struct packet_offload *po1;
 487
 488         spin_lock(&offload_lock);
 489
 490         list_for_each_entry(po1, head, list) {
 491                 if (po == po1) {
 492                         list_del_rcu(&po->list);
 493                         goto out;
 494                 }
 495         }
 496
 497         pr_warn("dev_remove_offload: %p not found\n", po);
 498 out:
 499         spin_unlock(&offload_lock);
 500 }
 501 EXPORT_SYMBOL(__dev_remove_offload);
 502
 503 /**
 504  *      dev_remove_offload       - remove packet offload handler
 505  *      @po: packet offload declaration
 506  *
 507  *      Remove a packet offload handler that was previously added to the kernel
 508  *      offload handlers by dev_add_offload(). The passed &offload_type is
 509  *      removed from the kernel lists and can be freed or reused once this
 510  *      function returns.
 511  *
 512  *      This call sleeps to guarantee that no CPU is looking at the packet
 513  *      type after return.
 514  */
 515 void dev_remove_offload(struct packet_offload *po)
 516 {
 517         __dev_remove_offload(po);
 518
 519         synchronize_net();
 520 }
 521 EXPORT_SYMBOL(dev_remove_offload);
 522
 523 /******************************************************************************
 524
 525                       Device Boot-time Settings Routines
 526
 527 *******************************************************************************/
 528
 529 /* Boot time configuration table */
 530 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 531
 532 /**
 533  *      netdev_boot_setup_add   - add new setup entry
 534  *      @name: name of the device
 535  *      @map: configured settings for the device
 536  *
 537  *      Adds new setup entry to the dev_boot_setup list.  The function
 538  *      returns 0 on error and 1 on success.  This is a generic routine to
 539  *      all netdevices.
 540  */
 541 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 542 {
 543         struct netdev_boot_setup *s;
 544         int i;
 545
 546         s = dev_boot_setup;
 547         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 548                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 549                         memset(s[i].name, 0, sizeof(s[i].name));
 550                         strlcpy(s[i].name, name, IFNAMSIZ);
 551                         memcpy(&s[i].map, map, sizeof(s[i].map));
 552                         break;
 553                 }
 554         }
 555
 556         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 557 }
 558
 559 /**
 560  *      netdev_boot_setup_check - check boot time settings
 561  *      @dev: the netdevice
 562  *
 563  *      Check boot time settings for the device.
 564  *      The found settings are set for the device to be used
 565  *      later in the device probing.
 566  *      Returns 0 if no settings found, 1 if they are.
 567  */
 568 int netdev_boot_setup_check(struct net_device *dev)
 569 {
 570         struct netdev_boot_setup *s = dev_boot_setup;
 571         int i;
 572
 573         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 574                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 575                     !strcmp(dev->name, s[i].name)) {
 576                         dev->irq        = s[i].map.irq;
 577                         dev->base_addr  = s[i].map.base_addr;
 578                         dev->mem_start  = s[i].map.mem_start;
 579                         dev->mem_end    = s[i].map.mem_end;
 580                         return 1;
 581                 }
 582         }
 583         return 0;
 584 }
 585 EXPORT_SYMBOL(netdev_boot_setup_check);
 586
 587
 588 /**
 589  *      netdev_boot_base        - get address from boot time settings
 590  *      @prefix: prefix for network device
 591  *      @unit: id for network device
 592  *
 593  *      Check boot time settings for the base address of device.
 594  *      The found settings are set for the device to be used
 595  *      later in the device probing.
 596  *      Returns 0 if no settings found.
 597  */
 598 unsigned long netdev_boot_base(const char *prefix, int unit)
 599 {
 600         const struct netdev_boot_setup *s = dev_boot_setup;
 601         char name[IFNAMSIZ];
 602         int i;
 603
 604         sprintf(name, "%s%d", prefix, unit);
 605
 606         /*
 607          * If device already registered then return base of 1
 608          * to indicate not to probe for this interface
 609          */
 610         if (__dev_get_by_name(&init_net, name))
 611                 return 1;
 612
 613         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 614                 if (!strcmp(name, s[i].name))
 615                         return s[i].map.base_addr;
 616         return 0;
 617 }
 618
 619 /*
 620  * Saves at boot time configured settings for any netdevice.
 621  */
 622 int __init netdev_boot_setup(char *str)
 623 {
 624         int ints[5];
 625         struct ifmap map;
 626
 627         str = get_options(str, ARRAY_SIZE(ints), ints);
 628         if (!str || !*str)
 629                 return 0;
 630
 631         /* Save settings */
 632         memset(&map, 0, sizeof(map));
 633         if (ints[0] > 0)
 634                 map.irq = ints[1];
 635         if (ints[0] > 1)
 636                 map.base_addr = ints[2];
 637         if (ints[0] > 2)
 638                 map.mem_start = ints[3];
 639         if (ints[0] > 3)
 640                 map.mem_end = ints[4];
 641
 642         /* Add new entry to the list */
 643         return netdev_boot_setup_add(str, &map);
 644 }
 645
 646 __setup("netdev=", netdev_boot_setup);
 647
 648 /*******************************************************************************
 649
 650                             Device Interface Subroutines
 651
 652 *******************************************************************************/
 653
 654 /**
 655  *      __dev_get_by_name       - find a device by its name
 656  *      @net: the applicable net namespace
 657  *      @name: name to find
 658  *
 659  *      Find an interface by name. Must be called under RTNL semaphore
 660  *      or @dev_base_lock. If the name is found a pointer to the device
 661  *      is returned. If the name is not found then %NULL is returned. The
 662  *      reference counters are not incremented so the caller must be
 663  *      careful with locks.
 664  */
 665
 666 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 667 {
 668         struct net_device *dev;
 669         struct hlist_head *head = dev_name_hash(net, name);
 670
 671         hlist_for_each_entry(dev, head, name_hlist)
 672                 if (!strncmp(dev->name, name, IFNAMSIZ))
 673                         return dev;
 674
 675         return NULL;
 676 }
 677 EXPORT_SYMBOL(__dev_get_by_name);
 678
 679 /**
 680  *      dev_get_by_name_rcu     - find a device by its name
 681  *      @net: the applicable net namespace
 682  *      @name: name to find
 683  *
 684  *      Find an interface by name.
 685  *      If the name is found a pointer to the device is returned.
 686  *      If the name is not found then %NULL is returned.
 687  *      The reference counters are not incremented so the caller must be
 688  *      careful with locks. The caller must hold RCU lock.
 689  */
 690
 691 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 692 {
 693         struct net_device *dev;
 694         struct hlist_head *head = dev_name_hash(net, name);
 695
 696         hlist_for_each_entry_rcu(dev, head, name_hlist)
 697                 if (!strncmp(dev->name, name, IFNAMSIZ))
 698                         return dev;
 699
 700         return NULL;
 701 }
 702 EXPORT_SYMBOL(dev_get_by_name_rcu);
 703
 704 /**
 705  *      dev_get_by_name         - find a device by its name
 706  *      @net: the applicable net namespace
 707  *      @name: name to find
 708  *
 709  *      Find an interface by name. This can be called from any
 710  *      context and does its own locking. The returned handle has
 711  *      the usage count incremented and the caller must use dev_put() to
 712  *      release it when it is no longer needed. %NULL is returned if no
 713  *      matching device is found.
 714  */
 715
 716 struct net_device *dev_get_by_name(struct net *net, const char *name)
 717 {
 718         struct net_device *dev;
 719
 720         rcu_read_lock();
 721         dev = dev_get_by_name_rcu(net, name);
 722         if (dev)
 723                 dev_hold(dev);
 724         rcu_read_unlock();
 725         return dev;
 726 }
 727 EXPORT_SYMBOL(dev_get_by_name);
 728
 729 /**
 730  *      __dev_get_by_index - find a device by its ifindex
 731  *      @net: the applicable net namespace
 732  *      @ifindex: index of device
 733  *
 734  *      Search for an interface by index. Returns %NULL if the device
 735  *      is not found or a pointer to the device. The device has not
 736  *      had its reference counter increased so the caller must be careful
 737  *      about locking. The caller must hold either the RTNL semaphore
 738  *      or @dev_base_lock.
 739  */
 740
 741 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 742 {
 743         struct net_device *dev;
 744         struct hlist_head *head = dev_index_hash(net, ifindex);
 745
 746         hlist_for_each_entry(dev, head, index_hlist)
 747                 if (dev->ifindex == ifindex)
 748                         return dev;
 749
 750         return NULL;
 751 }
 752 EXPORT_SYMBOL(__dev_get_by_index);
 753
 754 /**
 755  *      dev_get_by_index_rcu - find a device by its ifindex
 756  *      @net: the applicable net namespace
 757  *      @ifindex: index of device
 758  *
 759  *      Search for an interface by index. Returns %NULL if the device
 760  *      is not found or a pointer to the device. The device has not
 761  *      had its reference counter increased so the caller must be careful
 762  *      about locking. The caller must hold RCU lock.
 763  */
 764
 765 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 766 {
 767         struct net_device *dev;
 768         struct hlist_head *head = dev_index_hash(net, ifindex);
 769
 770         hlist_for_each_entry_rcu(dev, head, index_hlist)
 771                 if (dev->ifindex == ifindex)
 772                         return dev;
 773
 774         return NULL;
 775 }
 776 EXPORT_SYMBOL(dev_get_by_index_rcu);
 777
 778
 779 /**
 780  *      dev_get_by_index - find a device by its ifindex
 781  *      @net: the applicable net namespace
 782  *      @ifindex: index of device
 783  *
 784  *      Search for an interface by index. Returns NULL if the device
 785  *      is not found or a pointer to the device. The device returned has
 786  *      had a reference added and the pointer is safe until the user calls
 787  *      dev_put to indicate they have finished with it.
 788  */
 789
 790 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 791 {
 792         struct net_device *dev;
 793
 794         rcu_read_lock();
 795         dev = dev_get_by_index_rcu(net, ifindex);
 796         if (dev)
 797                 dev_hold(dev);
 798         rcu_read_unlock();
 799         return dev;
 800 }
 801 EXPORT_SYMBOL(dev_get_by_index);
 802
 803 /**
 804  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 805  *      @net: network namespace
 806  *      @name: a pointer to the buffer where the name will be stored.
 807  *      @ifindex: the ifindex of the interface to get the name from.
 808  *
 809  *      The use of raw_seqcount_begin() and cond_resched() before
 810  *      retrying is required as we want to give the writers a chance
 811  *      to complete when CONFIG_PREEMPT is not set.
 812  */
 813 int netdev_get_name(struct net *net, char *name, int ifindex)
 814 {
 815         struct net_device *dev;
 816         unsigned int seq;
 817
 818 retry:
 819         seq = raw_seqcount_begin(&devnet_rename_seq);
 820         rcu_read_lock();
 821         dev = dev_get_by_index_rcu(net, ifindex);
 822         if (!dev) {
 823                 rcu_read_unlock();
 824                 return -ENODEV;
 825         }
 826
 827         strcpy(name, dev->name);
 828         rcu_read_unlock();
 829         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 830                 cond_resched();
 831                 goto retry;
 832         }
 833
 834         return 0;
 835 }
 836
 837 /**
 838  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 839  *      @net: the applicable net namespace
 840  *      @type: media type of device
 841  *      @ha: hardware address
 842  *
 843  *      Search for an interface by MAC address. Returns NULL if the device
 844  *      is not found or a pointer to the device.
 845  *      The caller must hold RCU or RTNL.
 846  *      The returned device has not had its ref count increased
 847  *      and the caller must therefore be careful about locking
 848  *
 849  */
 850
 851 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 852                                        const char *ha)
 853 {
 854         struct net_device *dev;
 855
 856         for_each_netdev_rcu(net, dev)
 857                 if (dev->type == type &&
 858                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 859                         return dev;
 860
 861         return NULL;
 862 }
 863 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 864
 865 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 866 {
 867         struct net_device *dev;
 868
 869         ASSERT_RTNL();
 870         for_each_netdev(net, dev)
 871                 if (dev->type == type)
 872                         return dev;
 873
 874         return NULL;
 875 }
 876 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 877
 878 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 879 {
 880         struct net_device *dev, *ret = NULL;
 881
 882         rcu_read_lock();
 883         for_each_netdev_rcu(net, dev)
 884                 if (dev->type == type) {
 885                         dev_hold(dev);
 886                         ret = dev;
 887                         break;
 888                 }
 889         rcu_read_unlock();
 890         return ret;
 891 }
 892 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 893
 894 /**
 895  *      dev_get_by_flags_rcu - find any device with given flags
 896  *      @net: the applicable net namespace
 897  *      @if_flags: IFF_* values
 898  *      @mask: bitmask of bits in if_flags to check
 899  *
 900  *      Search for any interface with the given flags. Returns NULL if a device
 901  *      is not found or a pointer to the device. Must be called inside
 902  *      rcu_read_lock(), and result refcount is unchanged.
 903  */
 904
 905 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 906                                     unsigned short mask)
 907 {
 908         struct net_device *dev, *ret;
 909
 910         ret = NULL;
 911         for_each_netdev_rcu(net, dev) {
 912                 if (((dev->flags ^ if_flags) & mask) == 0) {
 913                         ret = dev;
 914                         break;
 915                 }
 916         }
 917         return ret;
 918 }
 919 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 920
 921 /**
 922  *      dev_valid_name - check if name is okay for network device
 923  *      @name: name string
 924  *
 925  *      Network device names need to be valid file names to
 926  *      to allow sysfs to work.  We also disallow any kind of
 927  *      whitespace.
 928  */
 929 bool dev_valid_name(const char *name)
 930 {
 931         if (*name == '\0')
 932                 return false;
 933         if (strlen(name) >= IFNAMSIZ)
 934                 return false;
 935         if (!strcmp(name, ".") || !strcmp(name, ".."))
 936                 return false;
 937
 938         while (*name) {
 939                 if (*name == '/' || isspace(*name))
 940                         return false;
 941                 name++;
 942         }
 943         return true;
 944 }
 945 EXPORT_SYMBOL(dev_valid_name);
 946
 947 /**
 948  *      __dev_alloc_name - allocate a name for a device
 949  *      @net: network namespace to allocate the device name in
 950  *      @name: name format string
 951  *      @buf:  scratch buffer and result name string
 952  *
 953  *      Passed a format string - eg "lt%d" it will try and find a suitable
 954  *      id. It scans list of devices to build up a free map, then chooses
 955  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 956  *      while allocating the name and adding the device in order to avoid
 957  *      duplicates.
 958  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 959  *      Returns the number of the unit assigned or a negative errno code.
 960  */
 961
 962 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 963 {
 964         int i = 0;
 965         const char *p;
 966         const int max_netdevices = 8*PAGE_SIZE;
 967         unsigned long *inuse;
 968         struct net_device *d;
 969
 970         p = strnchr(name, IFNAMSIZ-1, '%');
 971         if (p) {
 972                 /*
 973                  * Verify the string as this thing may have come from
 974                  * the user.  There must be either one "%d" and no other "%"
 975                  * characters.
 976                  */
 977                 if (p[1] != 'd' || strchr(p + 2, '%'))
 978                         return -EINVAL;
 979
 980                 /* Use one page as a bit array of possible slots */
 981                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 982                 if (!inuse)
 983                         return -ENOMEM;
 984
 985                 for_each_netdev(net, d) {
 986                         if (!sscanf(d->name, name, &i))
 987                                 continue;
 988                         if (i < 0 || i >= max_netdevices)
 989                                 continue;
 990
 991                         /*  avoid cases where sscanf is not exact inverse of printf */
 992                         snprintf(buf, IFNAMSIZ, name, i);
 993                         if (!strncmp(buf, d->name, IFNAMSIZ))
 994                                 set_bit(i, inuse);
 995                 }
 996
 997                 i = find_first_zero_bit(inuse, max_netdevices);
 998                 free_page((unsigned long) inuse);
 999         }
1000
1001         if (buf != name)
1002                 snprintf(buf, IFNAMSIZ, name, i);
1003         if (!__dev_get_by_name(net, buf))
1004                 return i;
1005
1006         /* It is possible to run out of possible slots
1007          * when the name is long and there isn't enough space left
1008          * for the digits, or if all bits are used.
1009          */
1010         return -ENFILE;
1011 }
1012
1013 /**
1014  *      dev_alloc_name - allocate a name for a device
1015  *      @dev: device
1016  *      @name: name format string
1017  *
1018  *      Passed a format string - eg "lt%d" it will try and find a suitable
1019  *      id. It scans list of devices to build up a free map, then chooses
1020  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1021  *      while allocating the name and adding the device in order to avoid
1022  *      duplicates.
1023  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1024  *      Returns the number of the unit assigned or a negative errno code.
1025  */
1026
1027 int dev_alloc_name(struct net_device *dev, const char *name)
1028 {
1029         char buf[IFNAMSIZ];
1030         struct net *net;
1031         int ret;
1032
1033         BUG_ON(!dev_net(dev));
1034         net = dev_net(dev);
1035         ret = __dev_alloc_name(net, name, buf);
1036         if (ret >= 0)
1037                 strlcpy(dev->name, buf, IFNAMSIZ);
1038         return ret;
1039 }
1040 EXPORT_SYMBOL(dev_alloc_name);
1041
1042 static int dev_alloc_name_ns(struct net *net,
1043                              struct net_device *dev,
1044                              const char *name)
1045 {
1046         char buf[IFNAMSIZ];
1047         int ret;
1048
1049         ret = __dev_alloc_name(net, name, buf);
1050         if (ret >= 0)
1051                 strlcpy(dev->name, buf, IFNAMSIZ);
1052         return ret;
1053 }
1054
1055 static int dev_get_valid_name(struct net *net,
1056                               struct net_device *dev,
1057                               const char *name)
1058 {
1059         BUG_ON(!net);
1060
1061         if (!dev_valid_name(name))
1062                 return -EINVAL;
1063
1064         if (strchr(name, '%'))
1065                 return dev_alloc_name_ns(net, dev, name);
1066         else if (__dev_get_by_name(net, name))
1067                 return -EEXIST;
1068         else if (dev->name != name)
1069                 strlcpy(dev->name, name, IFNAMSIZ);
1070
1071         return 0;
1072 }
1073
1074 /**
1075  *      dev_change_name - change name of a device
1076  *      @dev: device
1077  *      @newname: name (or format string) must be at least IFNAMSIZ
1078  *
1079  *      Change name of a device, can pass format strings "eth%d".
1080  *      for wildcarding.
1081  */
1082 int dev_change_name(struct net_device *dev, const char *newname)
1083 {
1084         char oldname[IFNAMSIZ];
1085         int err = 0;
1086         int ret;
1087         struct net *net;
1088
1089         ASSERT_RTNL();
1090         BUG_ON(!dev_net(dev));
1091
1092         net = dev_net(dev);
1093         if (dev->flags & IFF_UP)
1094                 return -EBUSY;
1095
1096         write_seqcount_begin(&devnet_rename_seq);
1097
1098         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1099                 write_seqcount_end(&devnet_rename_seq);
1100                 return 0;
1101         }
1102
1103         memcpy(oldname, dev->name, IFNAMSIZ);
1104
1105         err = dev_get_valid_name(net, dev, newname);
1106         if (err < 0) {
1107                 write_seqcount_end(&devnet_rename_seq);
1108                 return err;
1109         }
1110
1111 rollback:
1112         ret = device_rename(&dev->dev, dev->name);
1113         if (ret) {
1114                 memcpy(dev->name, oldname, IFNAMSIZ);
1115                 write_seqcount_end(&devnet_rename_seq);
1116                 return ret;
1117         }
1118
1119         write_seqcount_end(&devnet_rename_seq);
1120
1121         write_lock_bh(&dev_base_lock);
1122         hlist_del_rcu(&dev->name_hlist);
1123         write_unlock_bh(&dev_base_lock);
1124
1125         synchronize_rcu();
1126
1127         write_lock_bh(&dev_base_lock);
1128         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1129         write_unlock_bh(&dev_base_lock);
1130
1131         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1132         ret = notifier_to_errno(ret);
1133
1134         if (ret) {
1135                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1136                 if (err >= 0) {
1137                         err = ret;
1138                         write_seqcount_begin(&devnet_rename_seq);
1139                         memcpy(dev->name, oldname, IFNAMSIZ);
1140                         goto rollback;
1141                 } else {
1142                         pr_err("%s: name change rollback failed: %d\n",
1143                                dev->name, ret);
1144                 }
1145         }
1146
1147         return err;
1148 }
1149
1150 /**
1151  *      dev_set_alias - change ifalias of a device
1152  *      @dev: device
1153  *      @alias: name up to IFALIASZ
1154  *      @len: limit of bytes to copy from info
1155  *
1156  *      Set ifalias for a device,
1157  */
1158 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1159 {
1160         char *new_ifalias;
1161
1162         ASSERT_RTNL();
1163
1164         if (len >= IFALIASZ)
1165                 return -EINVAL;
1166
1167         if (!len) {
1168                 kfree(dev->ifalias);
1169                 dev->ifalias = NULL;
1170                 return 0;
1171         }
1172
1173         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1174         if (!new_ifalias)
1175                 return -ENOMEM;
1176         dev->ifalias = new_ifalias;
1177
1178         strlcpy(dev->ifalias, alias, len+1);
1179         return len;
1180 }
1181
1182
1183 /**
1184  *      netdev_features_change - device changes features
1185  *      @dev: device to cause notification
1186  *
1187  *      Called to indicate a device has changed features.
1188  */
1189 void netdev_features_change(struct net_device *dev)
1190 {
1191         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1192 }
1193 EXPORT_SYMBOL(netdev_features_change);
1194
1195 /**
1196  *      netdev_state_change - device changes state
1197  *      @dev: device to cause notification
1198  *
1199  *      Called to indicate a device has changed state. This function calls
1200  *      the notifier chains for netdev_chain and sends a NEWLINK message
1201  *      to the routing socket.
1202  */
1203 void netdev_state_change(struct net_device *dev)
1204 {
1205         if (dev->flags & IFF_UP) {
1206                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1207                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1208         }
1209 }
1210 EXPORT_SYMBOL(netdev_state_change);
1211
1212 /**
1213  *      netdev_notify_peers - notify network peers about existence of @dev
1214  *      @dev: network device
1215  *
1216  * Generate traffic such that interested network peers are aware of
1217  * @dev, such as by generating a gratuitous ARP. This may be used when
1218  * a device wants to inform the rest of the network about some sort of
1219  * reconfiguration such as a failover event or virtual machine
1220  * migration.
1221  */
1222 void netdev_notify_peers(struct net_device *dev)
1223 {
1224         rtnl_lock();
1225         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1226         rtnl_unlock();
1227 }
1228 EXPORT_SYMBOL(netdev_notify_peers);
1229
1230 static int __dev_open(struct net_device *dev)
1231 {
1232         const struct net_device_ops *ops = dev->netdev_ops;
1233         int ret;
1234
1235         ASSERT_RTNL();
1236
1237         if (!netif_device_present(dev))
1238                 return -ENODEV;
1239
1240         /* Block netpoll from trying to do any rx path servicing.
1241          * If we don't do this there is a chance ndo_poll_controller
1242          * or ndo_poll may be running while we open the device
1243          */
1244         netpoll_rx_disable(dev);
1245
1246         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1247         ret = notifier_to_errno(ret);
1248         if (ret)
1249                 return ret;
1250
1251         set_bit(__LINK_STATE_START, &dev->state);
1252
1253         if (ops->ndo_validate_addr)
1254                 ret = ops->ndo_validate_addr(dev);
1255
1256         if (!ret && ops->ndo_open)
1257                 ret = ops->ndo_open(dev);
1258
1259         netpoll_rx_enable(dev);
1260
1261         if (ret)
1262                 clear_bit(__LINK_STATE_START, &dev->state);
1263         else {
1264                 dev->flags |= IFF_UP;
1265                 net_dmaengine_get();
1266                 dev_set_rx_mode(dev);
1267                 dev_activate(dev);
1268                 add_device_randomness(dev->dev_addr, dev->addr_len);
1269         }
1270
1271         return ret;
1272 }
1273
1274 /**
1275  *      dev_open        - prepare an interface for use.
1276  *      @dev:   device to open
1277  *
1278  *      Takes a device from down to up state. The device's private open
1279  *      function is invoked and then the multicast lists are loaded. Finally
1280  *      the device is moved into the up state and a %NETDEV_UP message is
1281  *      sent to the netdev notifier chain.
1282  *
1283  *      Calling this function on an active interface is a nop. On a failure
1284  *      a negative errno code is returned.
1285  */
1286 int dev_open(struct net_device *dev)
1287 {
1288         int ret;
1289
1290         if (dev->flags & IFF_UP)
1291                 return 0;
1292
1293         ret = __dev_open(dev);
1294         if (ret < 0)
1295                 return ret;
1296
1297         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1298         call_netdevice_notifiers(NETDEV_UP, dev);
1299
1300         return ret;
1301 }
1302 EXPORT_SYMBOL(dev_open);
1303
1304 static int __dev_close_many(struct list_head *head)
1305 {
1306         struct net_device *dev;
1307
1308         ASSERT_RTNL();
1309         might_sleep();
1310
1311         list_for_each_entry(dev, head, close_list) {
1312                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1313
1314                 clear_bit(__LINK_STATE_START, &dev->state);
1315
1316                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1317                  * can be even on different cpu. So just clear netif_running().
1318                  *
1319                  * dev->stop() will invoke napi_disable() on all of it's
1320                  * napi_struct instances on this device.
1321                  */
1322                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1323         }
1324
1325         dev_deactivate_many(head);
1326
1327         list_for_each_entry(dev, head, close_list) {
1328                 const struct net_device_ops *ops = dev->netdev_ops;
1329
1330                 /*
1331                  *      Call the device specific close. This cannot fail.
1332                  *      Only if device is UP
1333                  *
1334                  *      We allow it to be called even after a DETACH hot-plug
1335                  *      event.
1336                  */
1337                 if (ops->ndo_stop)
1338                         ops->ndo_stop(dev);
1339
1340                 dev->flags &= ~IFF_UP;
1341                 net_dmaengine_put();
1342         }
1343
1344         return 0;
1345 }
1346
1347 static int __dev_close(struct net_device *dev)
1348 {
1349         int retval;
1350         LIST_HEAD(single);
1351
1352         /* Temporarily disable netpoll until the interface is down */
1353         netpoll_rx_disable(dev);
1354
1355         list_add(&dev->close_list, &single);
1356         retval = __dev_close_many(&single);
1357         list_del(&single);
1358
1359         netpoll_rx_enable(dev);
1360         return retval;
1361 }
1362
1363 static int dev_close_many(struct list_head *head)
1364 {
1365         struct net_device *dev, *tmp;
1366
1367         /* Remove the devices that don't need to be closed */
1368         list_for_each_entry_safe(dev, tmp, head, close_list)
1369                 if (!(dev->flags & IFF_UP))
1370                         list_del_init(&dev->close_list);
1371
1372         __dev_close_many(head);
1373
1374         list_for_each_entry_safe(dev, tmp, head, close_list) {
1375                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1377                 list_del_init(&dev->close_list);
1378         }
1379
1380         return 0;
1381 }
1382
1383 /**
1384  *      dev_close - shutdown an interface.
1385  *      @dev: device to shutdown
1386  *
1387  *      This function moves an active device into down state. A
1388  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *      chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394         if (dev->flags & IFF_UP) {
1395                 LIST_HEAD(single);
1396
1397                 /* Block netpoll rx while the interface is going down */
1398                 netpoll_rx_disable(dev);
1399
1400                 list_add(&dev->close_list, &single);
1401                 dev_close_many(&single);
1402                 list_del(&single);
1403
1404                 netpoll_rx_enable(dev);
1405         }
1406         return 0;
1407 }
1408 EXPORT_SYMBOL(dev_close);
1409
1410
1411 /**
1412  *      dev_disable_lro - disable Large Receive Offload on a device
1413  *      @dev: device
1414  *
1415  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1416  *      called under RTNL.  This is needed if received packets may be
1417  *      forwarded to another interface.
1418  */
1419 void dev_disable_lro(struct net_device *dev)
1420 {
1421         /*
1422          * If we're trying to disable lro on a vlan device
1423          * use the underlying physical device instead
1424          */
1425         if (is_vlan_dev(dev))
1426                 dev = vlan_dev_real_dev(dev);
1427
1428         /* the same for macvlan devices */
1429         if (netif_is_macvlan(dev))
1430                 dev = macvlan_dev_real_dev(dev);
1431
1432         dev->wanted_features &= ~NETIF_F_LRO;
1433         netdev_update_features(dev);
1434
1435         if (unlikely(dev->features & NETIF_F_LRO))
1436                 netdev_WARN(dev, "failed to disable LRO!\n");
1437 }
1438 EXPORT_SYMBOL(dev_disable_lro);
1439
1440 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1441                                    struct net_device *dev)
1442 {
1443         struct netdev_notifier_info info;
1444
1445         netdev_notifier_info_init(&info, dev);
1446         return nb->notifier_call(nb, val, &info);
1447 }
1448
1449 static int dev_boot_phase = 1;
1450
1451 /**
1452  *      register_netdevice_notifier - register a network notifier block
1453  *      @nb: notifier
1454  *
1455  *      Register a notifier to be called when network device events occur.
1456  *      The notifier passed is linked into the kernel structures and must
1457  *      not be reused until it has been unregistered. A negative errno code
1458  *      is returned on a failure.
1459  *
1460  *      When registered all registration and up events are replayed
1461  *      to the new notifier to allow device to have a race free
1462  *      view of the network device list.
1463  */
1464
1465 int register_netdevice_notifier(struct notifier_block *nb)
1466 {
1467         struct net_device *dev;
1468         struct net_device *last;
1469         struct net *net;
1470         int err;
1471
1472         rtnl_lock();
1473         err = raw_notifier_chain_register(&netdev_chain, nb);
1474         if (err)
1475                 goto unlock;
1476         if (dev_boot_phase)
1477                 goto unlock;
1478         for_each_net(net) {
1479                 for_each_netdev(net, dev) {
1480                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1481                         err = notifier_to_errno(err);
1482                         if (err)
1483                                 goto rollback;
1484
1485                         if (!(dev->flags & IFF_UP))
1486                                 continue;
1487
1488                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1489                 }
1490         }
1491
1492 unlock:
1493         rtnl_unlock();
1494         return err;
1495
1496 rollback:
1497         last = dev;
1498         for_each_net(net) {
1499                 for_each_netdev(net, dev) {
1500                         if (dev == last)
1501                                 goto outroll;
1502
1503                         if (dev->flags & IFF_UP) {
1504                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1505                                                         dev);
1506                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1507                         }
1508                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1509                 }
1510         }
1511
1512 outroll:
1513         raw_notifier_chain_unregister(&netdev_chain, nb);
1514         goto unlock;
1515 }
1516 EXPORT_SYMBOL(register_netdevice_notifier);
1517
1518 /**
1519  *      unregister_netdevice_notifier - unregister a network notifier block
1520  *      @nb: notifier
1521  *
1522  *      Unregister a notifier previously registered by
1523  *      register_netdevice_notifier(). The notifier is unlinked into the
1524  *      kernel structures and may then be reused. A negative errno code
1525  *      is returned on a failure.
1526  *
1527  *      After unregistering unregister and down device events are synthesized
1528  *      for all devices on the device list to the removed notifier to remove
1529  *      the need for special case cleanup code.
1530  */
1531
1532 int unregister_netdevice_notifier(struct notifier_block *nb)
1533 {
1534         struct net_device *dev;
1535         struct net *net;
1536         int err;
1537
1538         rtnl_lock();
1539         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1540         if (err)
1541                 goto unlock;
1542
1543         for_each_net(net) {
1544                 for_each_netdev(net, dev) {
1545                         if (dev->flags & IFF_UP) {
1546                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1547                                                         dev);
1548                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1549                         }
1550                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1551                 }
1552         }
1553 unlock:
1554         rtnl_unlock();
1555         return err;
1556 }
1557 EXPORT_SYMBOL(unregister_netdevice_notifier);
1558
1559 /**
1560  *      call_netdevice_notifiers_info - call all network notifier blocks
1561  *      @val: value passed unmodified to notifier function
1562  *      @dev: net_device pointer passed unmodified to notifier function
1563  *      @info: notifier information data
1564  *
1565  *      Call all network notifier blocks.  Parameters and return value
1566  *      are as for raw_notifier_call_chain().
1567  */
1568
1569 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1570                                   struct netdev_notifier_info *info)
1571 {
1572         ASSERT_RTNL();
1573         netdev_notifier_info_init(info, dev);
1574         return raw_notifier_call_chain(&netdev_chain, val, info);
1575 }
1576 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1577
1578 /**
1579  *      call_netdevice_notifiers - call all network notifier blocks
1580  *      @val: value passed unmodified to notifier function
1581  *      @dev: net_device pointer passed unmodified to notifier function
1582  *
1583  *      Call all network notifier blocks.  Parameters and return value
1584  *      are as for raw_notifier_call_chain().
1585  */
1586
1587 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1588 {
1589         struct netdev_notifier_info info;
1590
1591         return call_netdevice_notifiers_info(val, dev, &info);
1592 }
1593 EXPORT_SYMBOL(call_netdevice_notifiers);
1594
1595 static struct static_key netstamp_needed __read_mostly;
1596 #ifdef HAVE_JUMP_LABEL
1597 /* We are not allowed to call static_key_slow_dec() from irq context
1598  * If net_disable_timestamp() is called from irq context, defer the
1599  * static_key_slow_dec() calls.
1600  */
1601 static atomic_t netstamp_needed_deferred;
1602 #endif
1603
1604 void net_enable_timestamp(void)
1605 {
1606 #ifdef HAVE_JUMP_LABEL
1607         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1608
1609         if (deferred) {
1610                 while (--deferred)
1611                         static_key_slow_dec(&netstamp_needed);
1612                 return;
1613         }
1614 #endif
1615         static_key_slow_inc(&netstamp_needed);
1616 }
1617 EXPORT_SYMBOL(net_enable_timestamp);
1618
1619 void net_disable_timestamp(void)
1620 {
1621 #ifdef HAVE_JUMP_LABEL
1622         if (in_interrupt()) {
1623                 atomic_inc(&netstamp_needed_deferred);
1624                 return;
1625         }
1626 #endif
1627         static_key_slow_dec(&netstamp_needed);
1628 }
1629 EXPORT_SYMBOL(net_disable_timestamp);
1630
1631 static inline void net_timestamp_set(struct sk_buff *skb)
1632 {
1633         skb->tstamp.tv64 = 0;
1634         if (static_key_false(&netstamp_needed))
1635                 __net_timestamp(skb);
1636 }
1637
1638 #define net_timestamp_check(COND, SKB)                  \
1639         if (static_key_false(&netstamp_needed)) {               \
1640                 if ((COND) && !(SKB)->tstamp.tv64)      \
1641                         __net_timestamp(SKB);           \
1642         }                                               \
1643
1644 static inline bool is_skb_forwardable(struct net_device *dev,
1645                                       struct sk_buff *skb)
1646 {
1647         unsigned int len;
1648
1649         if (!(dev->flags & IFF_UP))
1650                 return false;
1651
1652         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1653         if (skb->len <= len)
1654                 return true;
1655
1656         /* if TSO is enabled, we don't care about the length as the packet
1657          * could be forwarded without being segmented before
1658          */
1659         if (skb_is_gso(skb))
1660                 return true;
1661
1662         return false;
1663 }
1664
1665 /**
1666  * dev_forward_skb - loopback an skb to another netif
1667  *
1668  * @dev: destination network device
1669  * @skb: buffer to forward
1670  *
1671  * return values:
1672  *      NET_RX_SUCCESS  (no congestion)
1673  *      NET_RX_DROP     (packet was dropped, but freed)
1674  *
1675  * dev_forward_skb can be used for injecting an skb from the
1676  * start_xmit function of one device into the receive queue
1677  * of another device.
1678  *
1679  * The receiving device may be in another namespace, so
1680  * we have to clear all information in the skb that could
1681  * impact namespace isolation.
1682  */
1683 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1684 {
1685         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1686                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1687                         atomic_long_inc(&dev->rx_dropped);
1688                         kfree_skb(skb);
1689                         return NET_RX_DROP;
1690                 }
1691         }
1692
1693         if (unlikely(!is_skb_forwardable(dev, skb))) {
1694                 atomic_long_inc(&dev->rx_dropped);
1695                 kfree_skb(skb);
1696                 return NET_RX_DROP;
1697         }
1698
1699         skb_scrub_packet(skb, true);
1700         skb->protocol = eth_type_trans(skb, dev);
1701
1702         return netif_rx(skb);
1703 }
1704 EXPORT_SYMBOL_GPL(dev_forward_skb);
1705
1706 static inline int deliver_skb(struct sk_buff *skb,
1707                               struct packet_type *pt_prev,
1708                               struct net_device *orig_dev)
1709 {
1710         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1711                 return -ENOMEM;
1712         atomic_inc(&skb->users);
1713         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1714 }
1715
1716 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1717 {
1718         if (!ptype->af_packet_priv || !skb->sk)
1719                 return false;
1720
1721         if (ptype->id_match)
1722                 return ptype->id_match(ptype, skb->sk);
1723         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1724                 return true;
1725
1726         return false;
1727 }
1728
1729 /*
1730  *      Support routine. Sends outgoing frames to any network
1731  *      taps currently in use.
1732  */
1733
1734 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1735 {
1736         struct packet_type *ptype;
1737         struct sk_buff *skb2 = NULL;
1738         struct packet_type *pt_prev = NULL;
1739
1740         rcu_read_lock();
1741         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1742                 /* Never send packets back to the socket
1743                  * they originated from - MvS (miquels@drinkel.ow.org)
1744                  */
1745                 if ((ptype->dev == dev || !ptype->dev) &&
1746                     (!skb_loop_sk(ptype, skb))) {
1747                         if (pt_prev) {
1748                                 deliver_skb(skb2, pt_prev, skb->dev);
1749                                 pt_prev = ptype;
1750                                 continue;
1751                         }
1752
1753                         skb2 = skb_clone(skb, GFP_ATOMIC);
1754                         if (!skb2)
1755                                 break;
1756
1757                         net_timestamp_set(skb2);
1758
1759                         /* skb->nh should be correctly
1760                            set by sender, so that the second statement is
1761                            just protection against buggy protocols.
1762                          */
1763                         skb_reset_mac_header(skb2);
1764
1765                         if (skb_network_header(skb2) < skb2->data ||
1766                             skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1767                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1768                                                      ntohs(skb2->protocol),
1769                                                      dev->name);
1770                                 skb_reset_network_header(skb2);
1771                         }
1772
1773                         skb2->transport_header = skb2->network_header;
1774                         skb2->pkt_type = PACKET_OUTGOING;
1775                         pt_prev = ptype;
1776                 }
1777         }
1778         if (pt_prev)
1779                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1780         rcu_read_unlock();
1781 }
1782
1783 /**
1784  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1785  * @dev: Network device
1786  * @txq: number of queues available
1787  *
1788  * If real_num_tx_queues is changed the tc mappings may no longer be
1789  * valid. To resolve this verify the tc mapping remains valid and if
1790  * not NULL the mapping. With no priorities mapping to this
1791  * offset/count pair it will no longer be used. In the worst case TC0
1792  * is invalid nothing can be done so disable priority mappings. If is
1793  * expected that drivers will fix this mapping if they can before
1794  * calling netif_set_real_num_tx_queues.
1795  */
1796 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1797 {
1798         int i;
1799         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1800
1801         /* If TC0 is invalidated disable TC mapping */
1802         if (tc->offset + tc->count > txq) {
1803                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1804                 dev->num_tc = 0;
1805                 return;
1806         }
1807
1808         /* Invalidated prio to tc mappings set to TC0 */
1809         for (i = 1; i < TC_BITMASK + 1; i++) {
1810                 int q = netdev_get_prio_tc_map(dev, i);
1811
1812                 tc = &dev->tc_to_txq[q];
1813                 if (tc->offset + tc->count > txq) {
1814                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1815                                 i, q);
1816                         netdev_set_prio_tc_map(dev, i, 0);
1817                 }
1818         }
1819 }
1820
1821 #ifdef CONFIG_XPS
1822 static DEFINE_MUTEX(xps_map_mutex);
1823 #define xmap_dereference(P)             \
1824         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1825
1826 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1827                                         int cpu, u16 index)
1828 {
1829         struct xps_map *map = NULL;
1830         int pos;
1831
1832         if (dev_maps)
1833                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1834
1835         for (pos = 0; map && pos < map->len; pos++) {
1836                 if (map->queues[pos] == index) {
1837                         if (map->len > 1) {
1838                                 map->queues[pos] = map->queues[--map->len];
1839                         } else {
1840                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1841                                 kfree_rcu(map, rcu);
1842                                 map = NULL;
1843                         }
1844                         break;
1845                 }
1846         }
1847
1848         return map;
1849 }
1850
1851 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1852 {
1853         struct xps_dev_maps *dev_maps;
1854         int cpu, i;
1855         bool active = false;
1856
1857         mutex_lock(&xps_map_mutex);
1858         dev_maps = xmap_dereference(dev->xps_maps);
1859
1860         if (!dev_maps)
1861                 goto out_no_maps;
1862
1863         for_each_possible_cpu(cpu) {
1864                 for (i = index; i < dev->num_tx_queues; i++) {
1865                         if (!remove_xps_queue(dev_maps, cpu, i))
1866                                 break;
1867                 }
1868                 if (i == dev->num_tx_queues)
1869                         active = true;
1870         }
1871
1872         if (!active) {
1873                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1874                 kfree_rcu(dev_maps, rcu);
1875         }
1876
1877         for (i = index; i < dev->num_tx_queues; i++)
1878                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1879                                              NUMA_NO_NODE);
1880
1881 out_no_maps:
1882         mutex_unlock(&xps_map_mutex);
1883 }
1884
1885 static struct xps_map *expand_xps_map(struct xps_map *map,
1886                                       int cpu, u16 index)
1887 {
1888         struct xps_map *new_map;
1889         int alloc_len = XPS_MIN_MAP_ALLOC;
1890         int i, pos;
1891
1892         for (pos = 0; map && pos < map->len; pos++) {
1893                 if (map->queues[pos] != index)
1894                         continue;
1895                 return map;
1896         }
1897
1898         /* Need to add queue to this CPU's existing map */
1899         if (map) {
1900                 if (pos < map->alloc_len)
1901                         return map;
1902
1903                 alloc_len = map->alloc_len * 2;
1904         }
1905
1906         /* Need to allocate new map to store queue on this CPU's map */
1907         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1908                                cpu_to_node(cpu));
1909         if (!new_map)
1910                 return NULL;
1911
1912         for (i = 0; i < pos; i++)
1913                 new_map->queues[i] = map->queues[i];
1914         new_map->alloc_len = alloc_len;
1915         new_map->len = pos;
1916
1917         return new_map;
1918 }
1919
1920 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
1921                         u16 index)
1922 {
1923         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1924         struct xps_map *map, *new_map;
1925         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1926         int cpu, numa_node_id = -2;
1927         bool active = false;
1928
1929         mutex_lock(&xps_map_mutex);
1930
1931         dev_maps = xmap_dereference(dev->xps_maps);
1932
1933         /* allocate memory for queue storage */
1934         for_each_online_cpu(cpu) {
1935                 if (!cpumask_test_cpu(cpu, mask))
1936                         continue;
1937
1938                 if (!new_dev_maps)
1939                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1940                 if (!new_dev_maps) {
1941                         mutex_unlock(&xps_map_mutex);
1942                         return -ENOMEM;
1943                 }
1944
1945                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1946                                  NULL;
1947
1948                 map = expand_xps_map(map, cpu, index);
1949                 if (!map)
1950                         goto error;
1951
1952                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1953         }
1954
1955         if (!new_dev_maps)
1956                 goto out_no_new_maps;
1957
1958         for_each_possible_cpu(cpu) {
1959                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1960                         /* add queue to CPU maps */
1961                         int pos = 0;
1962
1963                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1964                         while ((pos < map->len) && (map->queues[pos] != index))
1965                                 pos++;
1966
1967                         if (pos == map->len)
1968                                 map->queues[map->len++] = index;
1969 #ifdef CONFIG_NUMA
1970                         if (numa_node_id == -2)
1971                                 numa_node_id = cpu_to_node(cpu);
1972                         else if (numa_node_id != cpu_to_node(cpu))
1973                                 numa_node_id = -1;
1974 #endif
1975                 } else if (dev_maps) {
1976                         /* fill in the new device map from the old device map */
1977                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1978                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1979                 }
1980
1981         }
1982
1983         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1984
1985         /* Cleanup old maps */
1986         if (dev_maps) {
1987                 for_each_possible_cpu(cpu) {
1988                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1989                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1990                         if (map && map != new_map)
1991                                 kfree_rcu(map, rcu);
1992                 }
1993
1994                 kfree_rcu(dev_maps, rcu);
1995         }
1996
1997         dev_maps = new_dev_maps;
1998         active = true;
1999
2000 out_no_new_maps:
2001         /* update Tx queue numa node */
2002         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2003                                      (numa_node_id >= 0) ? numa_node_id :
2004                                      NUMA_NO_NODE);
2005
2006         if (!dev_maps)
2007                 goto out_no_maps;
2008
2009         /* removes queue from unused CPUs */
2010         for_each_possible_cpu(cpu) {
2011                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2012                         continue;
2013
2014                 if (remove_xps_queue(dev_maps, cpu, index))
2015                         active = true;
2016         }
2017
2018         /* free map if not active */
2019         if (!active) {
2020                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2021                 kfree_rcu(dev_maps, rcu);
2022         }
2023
2024 out_no_maps:
2025         mutex_unlock(&xps_map_mutex);
2026
2027         return 0;
2028 error:
2029         /* remove any maps that we added */
2030         for_each_possible_cpu(cpu) {
2031                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2032                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2033                                  NULL;
2034                 if (new_map && new_map != map)
2035                         kfree(new_map);
2036         }
2037
2038         mutex_unlock(&xps_map_mutex);
2039
2040         kfree(new_dev_maps);
2041         return -ENOMEM;
2042 }
2043 EXPORT_SYMBOL(netif_set_xps_queue);
2044
2045 #endif
2046 /*
2047  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2048  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2049  */
2050 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2051 {
2052         int rc;
2053
2054         if (txq < 1 || txq > dev->num_tx_queues)
2055                 return -EINVAL;
2056
2057         if (dev->reg_state == NETREG_REGISTERED ||
2058             dev->reg_state == NETREG_UNREGISTERING) {
2059                 ASSERT_RTNL();
2060
2061                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2062                                                   txq);
2063                 if (rc)
2064                         return rc;
2065
2066                 if (dev->num_tc)
2067                         netif_setup_tc(dev, txq);
2068
2069                 if (txq < dev->real_num_tx_queues) {
2070                         qdisc_reset_all_tx_gt(dev, txq);
2071 #ifdef CONFIG_XPS
2072                         netif_reset_xps_queues_gt(dev, txq);
2073 #endif
2074                 }
2075         }
2076
2077         dev->real_num_tx_queues = txq;
2078         return 0;
2079 }
2080 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2081
2082 #ifdef CONFIG_RPS
2083 /**
2084  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2085  *      @dev: Network device
2086  *      @rxq: Actual number of RX queues
2087  *
2088  *      This must be called either with the rtnl_lock held or before
2089  *      registration of the net device.  Returns 0 on success, or a
2090  *      negative error code.  If called before registration, it always
2091  *      succeeds.
2092  */
2093 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2094 {
2095         int rc;
2096
2097         if (rxq < 1 || rxq > dev->num_rx_queues)
2098                 return -EINVAL;
2099
2100         if (dev->reg_state == NETREG_REGISTERED) {
2101                 ASSERT_RTNL();
2102
2103                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2104                                                   rxq);
2105                 if (rc)
2106                         return rc;
2107         }
2108
2109         dev->real_num_rx_queues = rxq;
2110         return 0;
2111 }
2112 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2113 #endif
2114
2115 /**
2116  * netif_get_num_default_rss_queues - default number of RSS queues
2117  *
2118  * This routine should set an upper limit on the number of RSS queues
2119  * used by default by multiqueue devices.
2120  */
2121 int netif_get_num_default_rss_queues(void)
2122 {
2123         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2124 }
2125 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2126
2127 static inline void __netif_reschedule(struct Qdisc *q)
2128 {
2129         struct softnet_data *sd;
2130         unsigned long flags;
2131
2132         local_irq_save(flags);
2133         sd = &__get_cpu_var(softnet_data);
2134         q->next_sched = NULL;
2135         *sd->output_queue_tailp = q;
2136         sd->output_queue_tailp = &q->next_sched;
2137         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2138         local_irq_restore(flags);
2139 }
2140
2141 void __netif_schedule(struct Qdisc *q)
2142 {
2143         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2144                 __netif_reschedule(q);
2145 }
2146 EXPORT_SYMBOL(__netif_schedule);
2147
2148 struct dev_kfree_skb_cb {
2149         enum skb_free_reason reason;
2150 };
2151
2152 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2153 {
2154         return (struct dev_kfree_skb_cb *)skb->cb;
2155 }
2156
2157 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2158 {
2159         unsigned long flags;
2160
2161         if (likely(atomic_read(&skb->users) == 1)) {
2162                 smp_rmb();
2163                 atomic_set(&skb->users, 0);
2164         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2165                 return;
2166         }
2167         get_kfree_skb_cb(skb)->reason = reason;
2168         local_irq_save(flags);
2169         skb->next = __this_cpu_read(softnet_data.completion_queue);
2170         __this_cpu_write(softnet_data.completion_queue, skb);
2171         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2172         local_irq_restore(flags);
2173 }
2174 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2175
2176 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2177 {
2178         if (in_irq() || irqs_disabled())
2179                 __dev_kfree_skb_irq(skb, reason);
2180         else
2181                 dev_kfree_skb(skb);
2182 }
2183 EXPORT_SYMBOL(__dev_kfree_skb_any);
2184
2185
2186 /**
2187  * netif_device_detach - mark device as removed
2188  * @dev: network device
2189  *
2190  * Mark device as removed from system and therefore no longer available.
2191  */
2192 void netif_device_detach(struct net_device *dev)
2193 {
2194         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2195             netif_running(dev)) {
2196                 netif_tx_stop_all_queues(dev);
2197         }
2198 }
2199 EXPORT_SYMBOL(netif_device_detach);
2200
2201 /**
2202  * netif_device_attach - mark device as attached
2203  * @dev: network device
2204  *
2205  * Mark device as attached from system and restart if needed.
2206  */
2207 void netif_device_attach(struct net_device *dev)
2208 {
2209         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2210             netif_running(dev)) {
2211                 netif_tx_wake_all_queues(dev);
2212                 __netdev_watchdog_up(dev);
2213         }
2214 }
2215 EXPORT_SYMBOL(netif_device_attach);
2216
2217 static void skb_warn_bad_offload(const struct sk_buff *skb)
2218 {
2219         static const netdev_features_t null_features = 0;
2220         struct net_device *dev = skb->dev;
2221         const char *driver = "";
2222
2223         if (!net_ratelimit())
2224                 return;
2225
2226         if (dev && dev->dev.parent)
2227                 driver = dev_driver_string(dev->dev.parent);
2228
2229         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2230              "gso_type=%d ip_summed=%d\n",
2231              driver, dev ? &dev->features : &null_features,
2232              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2233              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2234              skb_shinfo(skb)->gso_type, skb->ip_summed);
2235 }
2236
2237 /*
2238  * Invalidate hardware checksum when packet is to be mangled, and
2239  * complete checksum manually on outgoing path.
2240  */
2241 int skb_checksum_help(struct sk_buff *skb)
2242 {
2243         __wsum csum;
2244         int ret = 0, offset;
2245
2246         if (skb->ip_summed == CHECKSUM_COMPLETE)
2247                 goto out_set_summed;
2248
2249         if (unlikely(skb_shinfo(skb)->gso_size)) {
2250                 skb_warn_bad_offload(skb);
2251                 return -EINVAL;
2252         }
2253
2254         /* Before computing a checksum, we should make sure no frag could
2255          * be modified by an external entity : checksum could be wrong.
2256          */
2257         if (skb_has_shared_frag(skb)) {
2258                 ret = __skb_linearize(skb);
2259                 if (ret)
2260                         goto out;
2261         }
2262
2263         offset = skb_checksum_start_offset(skb);
2264         BUG_ON(offset >= skb_headlen(skb));
2265         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2266
2267         offset += skb->csum_offset;
2268         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2269
2270         if (skb_cloned(skb) &&
2271             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2272                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2273                 if (ret)
2274                         goto out;
2275         }
2276
2277         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2278 out_set_summed:
2279         skb->ip_summed = CHECKSUM_NONE;
2280 out:
2281         return ret;
2282 }
2283 EXPORT_SYMBOL(skb_checksum_help);
2284
2285 __be16 skb_network_protocol(struct sk_buff *skb)
2286 {
2287         __be16 type = skb->protocol;
2288         int vlan_depth = ETH_HLEN;
2289
2290         /* Tunnel gso handlers can set protocol to ethernet. */
2291         if (type == htons(ETH_P_TEB)) {
2292                 struct ethhdr *eth;
2293
2294                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2295                         return 0;
2296
2297                 eth = (struct ethhdr *)skb_mac_header(skb);
2298                 type = eth->h_proto;
2299         }
2300
2301         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2302                 struct vlan_hdr *vh;
2303
2304                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2305                         return 0;
2306
2307                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2308                 type = vh->h_vlan_encapsulated_proto;
2309                 vlan_depth += VLAN_HLEN;
2310         }
2311
2312         return type;
2313 }
2314
2315 /**
2316  *      skb_mac_gso_segment - mac layer segmentation handler.
2317  *      @skb: buffer to segment
2318  *      @features: features for the output path (see dev->features)
2319  */
2320 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2321                                     netdev_features_t features)
2322 {
2323         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2324         struct packet_offload *ptype;
2325         __be16 type = skb_network_protocol(skb);
2326
2327         if (unlikely(!type))
2328                 return ERR_PTR(-EINVAL);
2329
2330         __skb_pull(skb, skb->mac_len);
2331
2332         rcu_read_lock();
2333         list_for_each_entry_rcu(ptype, &offload_base, list) {
2334                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2335                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2336                                 int err;
2337
2338                                 err = ptype->callbacks.gso_send_check(skb);
2339                                 segs = ERR_PTR(err);
2340                                 if (err || skb_gso_ok(skb, features))
2341                                         break;
2342                                 __skb_push(skb, (skb->data -
2343                                                  skb_network_header(skb)));
2344                         }
2345                         segs = ptype->callbacks.gso_segment(skb, features);
2346                         break;
2347                 }
2348         }
2349         rcu_read_unlock();
2350
2351         __skb_push(skb, skb->data - skb_mac_header(skb));
2352
2353         return segs;
2354 }
2355 EXPORT_SYMBOL(skb_mac_gso_segment);
2356
2357
2358 /* openvswitch calls this on rx path, so we need a different check.
2359  */
2360 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2361 {
2362         if (tx_path)
2363                 return skb->ip_summed != CHECKSUM_PARTIAL;
2364         else
2365                 return skb->ip_summed == CHECKSUM_NONE;
2366 }
2367
2368 /**
2369  *      __skb_gso_segment - Perform segmentation on skb.
2370  *      @skb: buffer to segment
2371  *      @features: features for the output path (see dev->features)
2372  *      @tx_path: whether it is called in TX path
2373  *
2374  *      This function segments the given skb and returns a list of segments.
2375  *
2376  *      It may return NULL if the skb requires no segmentation.  This is
2377  *      only possible when GSO is used for verifying header integrity.
2378  */
2379 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2380                                   netdev_features_t features, bool tx_path)
2381 {
2382         if (unlikely(skb_needs_check(skb, tx_path))) {
2383                 int err;
2384
2385                 skb_warn_bad_offload(skb);
2386
2387                 if (skb_header_cloned(skb) &&
2388                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2389                         return ERR_PTR(err);
2390         }
2391
2392         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2393         SKB_GSO_CB(skb)->encap_level = 0;
2394
2395         skb_reset_mac_header(skb);
2396         skb_reset_mac_len(skb);
2397
2398         return skb_mac_gso_segment(skb, features);
2399 }
2400 EXPORT_SYMBOL(__skb_gso_segment);
2401
2402 /* Take action when hardware reception checksum errors are detected. */
2403 #ifdef CONFIG_BUG
2404 void netdev_rx_csum_fault(struct net_device *dev)
2405 {
2406         if (net_ratelimit()) {
2407                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2408                 dump_stack();
2409         }
2410 }
2411 EXPORT_SYMBOL(netdev_rx_csum_fault);
2412 #endif
2413
2414 /* Actually, we should eliminate this check as soon as we know, that:
2415  * 1. IOMMU is present and allows to map all the memory.
2416  * 2. No high memory really exists on this machine.
2417  */
2418
2419 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2420 {
2421 #ifdef CONFIG_HIGHMEM
2422         int i;
2423         if (!(dev->features & NETIF_F_HIGHDMA)) {
2424                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2425                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2426                         if (PageHighMem(skb_frag_page(frag)))
2427                                 return 1;
2428                 }
2429         }
2430
2431         if (PCI_DMA_BUS_IS_PHYS) {
2432                 struct device *pdev = dev->dev.parent;
2433
2434                 if (!pdev)
2435                         return 0;
2436                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2437                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2438                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2439                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2440                                 return 1;
2441                 }
2442         }
2443 #endif
2444         return 0;
2445 }
2446
2447 struct dev_gso_cb {
2448         void (*destructor)(struct sk_buff *skb);
2449 };
2450
2451 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2452
2453 static void dev_gso_skb_destructor(struct sk_buff *skb)
2454 {
2455         struct dev_gso_cb *cb;
2456
2457         kfree_skb_list(skb->next);
2458         skb->next = NULL;
2459
2460         cb = DEV_GSO_CB(skb);
2461         if (cb->destructor)
2462                 cb->destructor(skb);
2463 }
2464
2465 /**
2466  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2467  *      @skb: buffer to segment
2468  *      @features: device features as applicable to this skb
2469  *
2470  *      This function segments the given skb and stores the list of segments
2471  *      in skb->next.
2472  */
2473 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2474 {
2475         struct sk_buff *segs;
2476
2477         segs = skb_gso_segment(skb, features);
2478
2479         /* Verifying header integrity only. */
2480         if (!segs)
2481                 return 0;
2482
2483         if (IS_ERR(segs))
2484                 return PTR_ERR(segs);
2485
2486         skb->next = segs;
2487         DEV_GSO_CB(skb)->destructor = skb->destructor;
2488         skb->destructor = dev_gso_skb_destructor;
2489
2490         return 0;
2491 }
2492
2493 static netdev_features_t harmonize_features(struct sk_buff *skb,
2494         netdev_features_t features)
2495 {
2496         if (skb->ip_summed != CHECKSUM_NONE &&
2497             !can_checksum_protocol(features, skb_network_protocol(skb))) {
2498                 features &= ~NETIF_F_ALL_CSUM;
2499         } else if (illegal_highdma(skb->dev, skb)) {
2500                 features &= ~NETIF_F_SG;
2501         }
2502
2503         return features;
2504 }
2505
2506 netdev_features_t netif_skb_features(struct sk_buff *skb)
2507 {
2508         __be16 protocol = skb->protocol;
2509         netdev_features_t features = skb->dev->features;
2510
2511         if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2512                 features &= ~NETIF_F_GSO_MASK;
2513
2514         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2515                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2516                 protocol = veh->h_vlan_encapsulated_proto;
2517         } else if (!vlan_tx_tag_present(skb)) {
2518                 return harmonize_features(skb, features);
2519         }
2520
2521         features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2522                                                NETIF_F_HW_VLAN_STAG_TX);
2523
2524         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD))
2525                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2526                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2527                                 NETIF_F_HW_VLAN_STAG_TX;
2528
2529         return harmonize_features(skb, features);
2530 }
2531 EXPORT_SYMBOL(netif_skb_features);
2532
2533 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2534                         struct netdev_queue *txq, void *accel_priv)
2535 {
2536         const struct net_device_ops *ops = dev->netdev_ops;
2537         int rc = NETDEV_TX_OK;
2538         unsigned int skb_len;
2539
2540         if (likely(!skb->next)) {
2541                 netdev_features_t features;
2542
2543                 /*
2544                  * If device doesn't need skb->dst, release it right now while
2545                  * its hot in this cpu cache
2546                  */
2547                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2548                         skb_dst_drop(skb);
2549
2550                 features = netif_skb_features(skb);
2551
2552                 if (vlan_tx_tag_present(skb) &&
2553                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2554                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2555                                              vlan_tx_tag_get(skb));
2556                         if (unlikely(!skb))
2557                                 goto out;
2558
2559                         skb->vlan_tci = 0;
2560                 }
2561
2562                 /* If encapsulation offload request, verify we are testing
2563                  * hardware encapsulation features instead of standard
2564                  * features for the netdev
2565                  */
2566                 if (skb->encapsulation)
2567                         features &= dev->hw_enc_features;
2568
2569                 if (netif_needs_gso(skb, features)) {
2570                         if (unlikely(dev_gso_segment(skb, features)))
2571                                 goto out_kfree_skb;
2572                         if (skb->next)
2573                                 goto gso;
2574                 } else {
2575                         if (skb_needs_linearize(skb, features) &&
2576                             __skb_linearize(skb))
2577                                 goto out_kfree_skb;
2578
2579                         /* If packet is not checksummed and device does not
2580                          * support checksumming for this protocol, complete
2581                          * checksumming here.
2582                          */
2583                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2584                                 if (skb->encapsulation)
2585                                         skb_set_inner_transport_header(skb,
2586                                                 skb_checksum_start_offset(skb));
2587                                 else
2588                                         skb_set_transport_header(skb,
2589                                                 skb_checksum_start_offset(skb));
2590                                 if (!(features & NETIF_F_ALL_CSUM) &&
2591                                      skb_checksum_help(skb))
2592                                         goto out_kfree_skb;
2593                         }
2594                 }
2595
2596                 if (!list_empty(&ptype_all))
2597                         dev_queue_xmit_nit(skb, dev);
2598
2599                 skb_len = skb->len;
2600                 if (accel_priv)
2601                         rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
2602                 else
2603                         rc = ops->ndo_start_xmit(skb, dev);
2604
2605                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2606                 if (rc == NETDEV_TX_OK && txq)
2607                         txq_trans_update(txq);
2608                 return rc;
2609         }
2610
2611 gso:
2612         do {
2613                 struct sk_buff *nskb = skb->next;
2614
2615                 skb->next = nskb->next;
2616                 nskb->next = NULL;
2617
2618                 if (!list_empty(&ptype_all))
2619                         dev_queue_xmit_nit(nskb, dev);
2620
2621                 skb_len = nskb->len;
2622                 if (accel_priv)
2623                         rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
2624                 else
2625                         rc = ops->ndo_start_xmit(nskb, dev);
2626                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2627                 if (unlikely(rc != NETDEV_TX_OK)) {
2628                         if (rc & ~NETDEV_TX_MASK)
2629                                 goto out_kfree_gso_skb;
2630                         nskb->next = skb->next;
2631                         skb->next = nskb;
2632                         return rc;
2633                 }
2634                 txq_trans_update(txq);
2635                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2636                         return NETDEV_TX_BUSY;
2637         } while (skb->next);
2638
2639 out_kfree_gso_skb:
2640         if (likely(skb->next == NULL)) {
2641                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2642                 consume_skb(skb);
2643                 return rc;
2644         }
2645 out_kfree_skb:
2646         kfree_skb(skb);
2647 out:
2648         return rc;
2649 }
2650 EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
2651
2652 static void qdisc_pkt_len_init(struct sk_buff *skb)
2653 {
2654         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2655
2656         qdisc_skb_cb(skb)->pkt_len = skb->len;
2657
2658         /* To get more precise estimation of bytes sent on wire,
2659          * we add to pkt_len the headers size of all segments
2660          */
2661         if (shinfo->gso_size)  {
2662                 unsigned int hdr_len;
2663                 u16 gso_segs = shinfo->gso_segs;
2664
2665                 /* mac layer + network layer */
2666                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2667
2668                 /* + transport layer */
2669                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2670                         hdr_len += tcp_hdrlen(skb);
2671                 else
2672                         hdr_len += sizeof(struct udphdr);
2673
2674                 if (shinfo->gso_type & SKB_GSO_DODGY)
2675                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2676                                                 shinfo->gso_size);
2677
2678                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2679         }
2680 }
2681
2682 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2683                                  struct net_device *dev,
2684                                  struct netdev_queue *txq)
2685 {
2686         spinlock_t *root_lock = qdisc_lock(q);
2687         bool contended;
2688         int rc;
2689
2690         qdisc_pkt_len_init(skb);
2691         qdisc_calculate_pkt_len(skb, q);
2692         /*
2693          * Heuristic to force contended enqueues to serialize on a
2694          * separate lock before trying to get qdisc main lock.
2695          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2696          * and dequeue packets faster.
2697          */
2698         contended = qdisc_is_running(q);
2699         if (unlikely(contended))
2700                 spin_lock(&q->busylock);
2701
2702         spin_lock(root_lock);
2703         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2704                 kfree_skb(skb);
2705                 rc = NET_XMIT_DROP;
2706         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2707                    qdisc_run_begin(q)) {
2708                 /*
2709                  * This is a work-conserving queue; there are no old skbs
2710                  * waiting to be sent out; and the qdisc is not running -
2711                  * xmit the skb directly.
2712                  */
2713                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2714                         skb_dst_force(skb);
2715
2716                 qdisc_bstats_update(q, skb);
2717
2718                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2719                         if (unlikely(contended)) {
2720                                 spin_unlock(&q->busylock);
2721                                 contended = false;
2722                         }
2723                         __qdisc_run(q);
2724                 } else
2725                         qdisc_run_end(q);
2726
2727                 rc = NET_XMIT_SUCCESS;
2728         } else {
2729                 skb_dst_force(skb);
2730                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2731                 if (qdisc_run_begin(q)) {
2732                         if (unlikely(contended)) {
2733                                 spin_unlock(&q->busylock);
2734                                 contended = false;
2735                         }
2736                         __qdisc_run(q);
2737                 }
2738         }
2739         spin_unlock(root_lock);
2740         if (unlikely(contended))
2741                 spin_unlock(&q->busylock);
2742         return rc;
2743 }
2744
2745 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2746 static void skb_update_prio(struct sk_buff *skb)
2747 {
2748         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2749
2750         if (!skb->priority && skb->sk && map) {
2751                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2752
2753                 if (prioidx < map->priomap_len)
2754                         skb->priority = map->priomap[prioidx];
2755         }
2756 }
2757 #else
2758 #define skb_update_prio(skb)
2759 #endif
2760
2761 static DEFINE_PER_CPU(int, xmit_recursion);
2762 #define RECURSION_LIMIT 10
2763
2764 /**
2765  *      dev_loopback_xmit - loop back @skb
2766  *      @skb: buffer to transmit
2767  */
2768 int dev_loopback_xmit(struct sk_buff *skb)
2769 {
2770         skb_reset_mac_header(skb);
2771         __skb_pull(skb, skb_network_offset(skb));
2772         skb->pkt_type = PACKET_LOOPBACK;
2773         skb->ip_summed = CHECKSUM_UNNECESSARY;
2774         WARN_ON(!skb_dst(skb));
2775         skb_dst_force(skb);
2776         netif_rx_ni(skb);
2777         return 0;
2778 }
2779 EXPORT_SYMBOL(dev_loopback_xmit);
2780
2781 /**
2782  *      dev_queue_xmit - transmit a buffer
2783  *      @skb: buffer to transmit
2784  *
2785  *      Queue a buffer for transmission to a network device. The caller must
2786  *      have set the device and priority and built the buffer before calling
2787  *      this function. The function can be called from an interrupt.
2788  *
2789  *      A negative errno code is returned on a failure. A success does not
2790  *      guarantee the frame will be transmitted as it may be dropped due
2791  *      to congestion or traffic shaping.
2792  *
2793  * -----------------------------------------------------------------------------------
2794  *      I notice this method can also return errors from the queue disciplines,
2795  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2796  *      be positive.
2797  *
2798  *      Regardless of the return value, the skb is consumed, so it is currently
2799  *      difficult to retry a send to this method.  (You can bump the ref count
2800  *      before sending to hold a reference for retry if you are careful.)
2801  *
2802  *      When calling this method, interrupts MUST be enabled.  This is because
2803  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2804  *          --BLG
2805  */
2806 int dev_queue_xmit(struct sk_buff *skb)
2807 {
2808         struct net_device *dev = skb->dev;
2809         struct netdev_queue *txq;
2810         struct Qdisc *q;
2811         int rc = -ENOMEM;
2812
2813         skb_reset_mac_header(skb);
2814
2815         /* Disable soft irqs for various locks below. Also
2816          * stops preemption for RCU.
2817          */
2818         rcu_read_lock_bh();
2819
2820         skb_update_prio(skb);
2821
2822         txq = netdev_pick_tx(dev, skb);
2823         q = rcu_dereference_bh(txq->qdisc);
2824
2825 #ifdef CONFIG_NET_CLS_ACT
2826         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2827 #endif
2828         trace_net_dev_queue(skb);
2829         if (q->enqueue) {
2830                 rc = __dev_xmit_skb(skb, q, dev, txq);
2831                 goto out;
2832         }
2833
2834         /* The device has no queue. Common case for software devices:
2835            loopback, all the sorts of tunnels...
2836
2837            Really, it is unlikely that netif_tx_lock protection is necessary
2838            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2839            counters.)
2840            However, it is possible, that they rely on protection
2841            made by us here.
2842
2843            Check this and shot the lock. It is not prone from deadlocks.
2844            Either shot noqueue qdisc, it is even simpler 8)
2845          */
2846         if (dev->flags & IFF_UP) {
2847                 int cpu = smp_processor_id(); /* ok because BHs are off */
2848
2849                 if (txq->xmit_lock_owner != cpu) {
2850
2851                         if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2852                                 goto recursion_alert;
2853
2854                         HARD_TX_LOCK(dev, txq, cpu);
2855
2856                         if (!netif_xmit_stopped(txq)) {
2857                                 __this_cpu_inc(xmit_recursion);
2858                                 rc = dev_hard_start_xmit(skb, dev, txq, NULL);
2859                                 __this_cpu_dec(xmit_recursion);
2860                                 if (dev_xmit_complete(rc)) {
2861                                         HARD_TX_UNLOCK(dev, txq);
2862                                         goto out;
2863                                 }
2864                         }
2865                         HARD_TX_UNLOCK(dev, txq);
2866                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2867                                              dev->name);
2868                 } else {
2869                         /* Recursion is detected! It is possible,
2870                          * unfortunately
2871                          */
2872 recursion_alert:
2873                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2874                                              dev->name);
2875                 }
2876         }
2877
2878         rc = -ENETDOWN;
2879         rcu_read_unlock_bh();
2880
2881         kfree_skb(skb);
2882         return rc;
2883 out:
2884         rcu_read_unlock_bh();
2885         return rc;
2886 }
2887 EXPORT_SYMBOL(dev_queue_xmit);
2888
2889
2890 /*=======================================================================
2891                         Receiver routines
2892   =======================================================================*/
2893
2894 int netdev_max_backlog __read_mostly = 1000;
2895 EXPORT_SYMBOL(netdev_max_backlog);
2896
2897 int netdev_tstamp_prequeue __read_mostly = 1;
2898 int netdev_budget __read_mostly = 300;
2899 int weight_p __read_mostly = 64;            /* old backlog weight */
2900
2901 /* Called with irq disabled */
2902 static inline void ____napi_schedule(struct softnet_data *sd,
2903                                      struct napi_struct *napi)
2904 {
2905         list_add_tail(&napi->poll_list, &sd->poll_list);
2906         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2907 }
2908
2909 #ifdef CONFIG_RPS
2910
2911 /* One global table that all flow-based protocols share. */
2912 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2913 EXPORT_SYMBOL(rps_sock_flow_table);
2914
2915 struct static_key rps_needed __read_mostly;
2916
2917 static struct rps_dev_flow *
2918 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2919             struct rps_dev_flow *rflow, u16 next_cpu)
2920 {
2921         if (next_cpu != RPS_NO_CPU) {
2922 #ifdef CONFIG_RFS_ACCEL
2923                 struct netdev_rx_queue *rxqueue;
2924                 struct rps_dev_flow_table *flow_table;
2925                 struct rps_dev_flow *old_rflow;
2926                 u32 flow_id;
2927                 u16 rxq_index;
2928                 int rc;
2929
2930                 /* Should we steer this flow to a different hardware queue? */
2931                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2932                     !(dev->features & NETIF_F_NTUPLE))
2933                         goto out;
2934                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2935                 if (rxq_index == skb_get_rx_queue(skb))
2936                         goto out;
2937
2938                 rxqueue = dev->_rx + rxq_index;
2939                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2940                 if (!flow_table)
2941                         goto out;
2942                 flow_id = skb->rxhash & flow_table->mask;
2943                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2944                                                         rxq_index, flow_id);
2945                 if (rc < 0)
2946                         goto out;
2947                 old_rflow = rflow;
2948                 rflow = &flow_table->flows[flow_id];
2949                 rflow->filter = rc;
2950                 if (old_rflow->filter == rflow->filter)
2951                         old_rflow->filter = RPS_NO_FILTER;
2952         out:
2953 #endif
2954                 rflow->last_qtail =
2955                         per_cpu(softnet_data, next_cpu).input_queue_head;
2956         }
2957
2958         rflow->cpu = next_cpu;
2959         return rflow;
2960 }
2961
2962 /*
2963  * get_rps_cpu is called from netif_receive_skb and returns the target
2964  * CPU from the RPS map of the receiving queue for a given skb.
2965  * rcu_read_lock must be held on entry.
2966  */
2967 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2968                        struct rps_dev_flow **rflowp)
2969 {
2970         struct netdev_rx_queue *rxqueue;
2971         struct rps_map *map;
2972         struct rps_dev_flow_table *flow_table;
2973         struct rps_sock_flow_table *sock_flow_table;
2974         int cpu = -1;
2975         u16 tcpu;
2976
2977         if (skb_rx_queue_recorded(skb)) {
2978                 u16 index = skb_get_rx_queue(skb);
2979                 if (unlikely(index >= dev->real_num_rx_queues)) {
2980                         WARN_ONCE(dev->real_num_rx_queues > 1,
2981                                   "%s received packet on queue %u, but number "
2982                                   "of RX queues is %u\n",
2983                                   dev->name, index, dev->real_num_rx_queues);
2984                         goto done;
2985                 }
2986                 rxqueue = dev->_rx + index;
2987         } else
2988                 rxqueue = dev->_rx;
2989
2990         map = rcu_dereference(rxqueue->rps_map);
2991         if (map) {
2992                 if (map->len == 1 &&
2993                     !rcu_access_pointer(rxqueue->rps_flow_table)) {
2994                         tcpu = map->cpus[0];
2995                         if (cpu_online(tcpu))
2996                                 cpu = tcpu;
2997                         goto done;
2998                 }
2999         } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3000                 goto done;
3001         }
3002
3003         skb_reset_network_header(skb);
3004         if (!skb_get_hash(skb))
3005                 goto done;
3006
3007         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3008         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3009         if (flow_table && sock_flow_table) {
3010                 u16 next_cpu;
3011                 struct rps_dev_flow *rflow;
3012
3013                 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3014                 tcpu = rflow->cpu;
3015
3016                 next_cpu = sock_flow_table->ents[skb->rxhash &
3017                     sock_flow_table->mask];
3018
3019                 /*
3020                  * If the desired CPU (where last recvmsg was done) is
3021                  * different from current CPU (one in the rx-queue flow
3022                  * table entry), switch if one of the following holds:
3023                  *   - Current CPU is unset (equal to RPS_NO_CPU).
3024                  *   - Current CPU is offline.
3025                  *   - The current CPU's queue tail has advanced beyond the
3026                  *     last packet that was enqueued using this table entry.
3027                  *     This guarantees that all previous packets for the flow
3028                  *     have been dequeued, thus preserving in order delivery.
3029                  */
3030                 if (unlikely(tcpu != next_cpu) &&
3031                     (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3032                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3033                       rflow->last_qtail)) >= 0)) {
3034                         tcpu = next_cpu;
3035                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3036                 }
3037
3038                 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3039                         *rflowp = rflow;
3040                         cpu = tcpu;
3041                         goto done;
3042                 }
3043         }
3044
3045         if (map) {
3046                 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3047
3048                 if (cpu_online(tcpu)) {
3049                         cpu = tcpu;
3050                         goto done;
3051                 }
3052         }
3053
3054 done:
3055         return cpu;
3056 }
3057
3058 #ifdef CONFIG_RFS_ACCEL
3059
3060 /**
3061  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3062  * @dev: Device on which the filter was set
3063  * @rxq_index: RX queue index
3064  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3065  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3066  *
3067  * Drivers that implement ndo_rx_flow_steer() should periodically call
3068  * this function for each installed filter and remove the filters for
3069  * which it returns %true.
3070  */
3071 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3072                          u32 flow_id, u16 filter_id)
3073 {
3074         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3075         struct rps_dev_flow_table *flow_table;
3076         struct rps_dev_flow *rflow;
3077         bool expire = true;
3078         int cpu;
3079
3080         rcu_read_lock();
3081         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3082         if (flow_table && flow_id <= flow_table->mask) {
3083                 rflow = &flow_table->flows[flow_id];
3084                 cpu = ACCESS_ONCE(rflow->cpu);
3085                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3086                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3087                            rflow->last_qtail) <
3088                      (int)(10 * flow_table->mask)))
3089                         expire = false;
3090         }
3091         rcu_read_unlock();
3092         return expire;
3093 }
3094 EXPORT_SYMBOL(rps_may_expire_flow);
3095
3096 #endif /* CONFIG_RFS_ACCEL */
3097
3098 /* Called from hardirq (IPI) context */
3099 static void rps_trigger_softirq(void *data)
3100 {
3101         struct softnet_data *sd = data;
3102
3103         ____napi_schedule(sd, &sd->backlog);
3104         sd->received_rps++;
3105 }
3106
3107 #endif /* CONFIG_RPS */
3108
3109 /*
3110  * Check if this softnet_data structure is another cpu one
3111  * If yes, queue it to our IPI list and return 1
3112  * If no, return 0
3113  */
3114 static int rps_ipi_queued(struct softnet_data *sd)
3115 {
3116 #ifdef CONFIG_RPS
3117         struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3118
3119         if (sd != mysd) {
3120                 sd->rps_ipi_next = mysd->rps_ipi_list;
3121                 mysd->rps_ipi_list = sd;
3122
3123                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3124                 return 1;
3125         }
3126 #endif /* CONFIG_RPS */
3127         return 0;
3128 }
3129
3130 #ifdef CONFIG_NET_FLOW_LIMIT
3131 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3132 #endif
3133
3134 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3135 {
3136 #ifdef CONFIG_NET_FLOW_LIMIT
3137         struct sd_flow_limit *fl;
3138         struct softnet_data *sd;
3139         unsigned int old_flow, new_flow;
3140
3141         if (qlen < (netdev_max_backlog >> 1))
3142                 return false;
3143
3144         sd = &__get_cpu_var(softnet_data);
3145
3146         rcu_read_lock();
3147         fl = rcu_dereference(sd->flow_limit);
3148         if (fl) {
3149                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3150                 old_flow = fl->history[fl->history_head];
3151                 fl->history[fl->history_head] = new_flow;
3152
3153                 fl->history_head++;
3154                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3155
3156                 if (likely(fl->buckets[old_flow]))
3157                         fl->buckets[old_flow]--;
3158
3159                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3160                         fl->count++;
3161                         rcu_read_unlock();
3162                         return true;
3163                 }
3164         }
3165         rcu_read_unlock();
3166 #endif
3167         return false;
3168 }
3169
3170 /*
3171  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3172  * queue (may be a remote CPU queue).
3173  */
3174 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3175                               unsigned int *qtail)
3176 {
3177         struct softnet_data *sd;
3178         unsigned long flags;
3179         unsigned int qlen;
3180
3181         sd = &per_cpu(softnet_data, cpu);
3182
3183         local_irq_save(flags);
3184
3185         rps_lock(sd);
3186         qlen = skb_queue_len(&sd->input_pkt_queue);
3187         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3188                 if (skb_queue_len(&sd->input_pkt_queue)) {
3189 enqueue:
3190                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3191                         input_queue_tail_incr_save(sd, qtail);
3192                         rps_unlock(sd);
3193                         local_irq_restore(flags);
3194                         return NET_RX_SUCCESS;
3195                 }
3196
3197                 /* Schedule NAPI for backlog device
3198                  * We can use non atomic operation since we own the queue lock
3199                  */
3200                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3201                         if (!rps_ipi_queued(sd))
3202                                 ____napi_schedule(sd, &sd->backlog);
3203                 }
3204                 goto enqueue;
3205         }
3206
3207         sd->dropped++;
3208         rps_unlock(sd);
3209
3210         local_irq_restore(flags);
3211
3212         atomic_long_inc(&skb->dev->rx_dropped);
3213         kfree_skb(skb);
3214         return NET_RX_DROP;
3215 }
3216
3217 /**
3218  *      netif_rx        -       post buffer to the network code
3219  *      @skb: buffer to post
3220  *
3221  *      This function receives a packet from a device driver and queues it for
3222  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3223  *      may be dropped during processing for congestion control or by the
3224  *      protocol layers.
3225  *
3226  *      return values:
3227  *      NET_RX_SUCCESS  (no congestion)
3228  *      NET_RX_DROP     (packet was dropped)
3229  *
3230  */
3231
3232 int netif_rx(struct sk_buff *skb)
3233 {
3234         int ret;
3235
3236         /* if netpoll wants it, pretend we never saw it */
3237         if (netpoll_rx(skb))
3238                 return NET_RX_DROP;
3239
3240         net_timestamp_check(netdev_tstamp_prequeue, skb);
3241
3242         trace_netif_rx(skb);
3243 #ifdef CONFIG_RPS
3244         if (static_key_false(&rps_needed)) {
3245                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3246                 int cpu;
3247
3248                 preempt_disable();
3249                 rcu_read_lock();
3250
3251                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3252                 if (cpu < 0)
3253                         cpu = smp_processor_id();
3254
3255                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3256
3257                 rcu_read_unlock();
3258                 preempt_enable();
3259         } else
3260 #endif
3261         {
3262                 unsigned int qtail;
3263                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3264                 put_cpu();
3265         }
3266         return ret;
3267 }
3268 EXPORT_SYMBOL(netif_rx);
3269
3270 int netif_rx_ni(struct sk_buff *skb)
3271 {
3272         int err;
3273
3274         preempt_disable();
3275         err = netif_rx(skb);
3276         if (local_softirq_pending())
3277                 do_softirq();
3278         preempt_enable();
3279
3280         return err;
3281 }
3282 EXPORT_SYMBOL(netif_rx_ni);
3283
3284 static void net_tx_action(struct softirq_action *h)
3285 {
3286         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3287
3288         if (sd->completion_queue) {
3289                 struct sk_buff *clist;
3290
3291                 local_irq_disable();
3292                 clist = sd->completion_queue;
3293                 sd->completion_queue = NULL;
3294                 local_irq_enable();
3295
3296                 while (clist) {
3297                         struct sk_buff *skb = clist;
3298                         clist = clist->next;
3299
3300                         WARN_ON(atomic_read(&skb->users));
3301                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3302                                 trace_consume_skb(skb);
3303                         else
3304                                 trace_kfree_skb(skb, net_tx_action);
3305                         __kfree_skb(skb);
3306                 }
3307         }
3308
3309         if (sd->output_queue) {
3310                 struct Qdisc *head;
3311
3312                 local_irq_disable();
3313                 head = sd->output_queue;
3314                 sd->output_queue = NULL;
3315                 sd->output_queue_tailp = &sd->output_queue;
3316                 local_irq_enable();
3317
3318                 while (head) {
3319                         struct Qdisc *q = head;
3320                         spinlock_t *root_lock;
3321
3322                         head = head->next_sched;
3323
3324                         root_lock = qdisc_lock(q);
3325                         if (spin_trylock(root_lock)) {
3326                                 smp_mb__before_clear_bit();
3327                                 clear_bit(__QDISC_STATE_SCHED,
3328                                           &q->state);
3329                                 qdisc_run(q);
3330                                 spin_unlock(root_lock);
3331                         } else {
3332                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3333                                               &q->state)) {
3334                                         __netif_reschedule(q);
3335                                 } else {
3336                                         smp_mb__before_clear_bit();
3337                                         clear_bit(__QDISC_STATE_SCHED,
3338                                                   &q->state);
3339                                 }
3340                         }
3341                 }
3342         }
3343 }
3344
3345 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3346     (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3347 /* This hook is defined here for ATM LANE */
3348 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3349                              unsigned char *addr) __read_mostly;
3350 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3351 #endif
3352
3353 #ifdef CONFIG_NET_CLS_ACT
3354 /* TODO: Maybe we should just force sch_ingress to be compiled in
3355  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3356  * a compare and 2 stores extra right now if we dont have it on
3357  * but have CONFIG_NET_CLS_ACT
3358  * NOTE: This doesn't stop any functionality; if you dont have
3359  * the ingress scheduler, you just can't add policies on ingress.
3360  *
3361  */
3362 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3363 {
3364         struct net_device *dev = skb->dev;
3365         u32 ttl = G_TC_RTTL(skb->tc_verd);
3366         int result = TC_ACT_OK;
3367         struct Qdisc *q;
3368
3369         if (unlikely(MAX_RED_LOOP < ttl++)) {
3370                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3371                                      skb->skb_iif, dev->ifindex);
3372                 return TC_ACT_SHOT;
3373         }
3374
3375         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3376         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3377
3378         q = rxq->qdisc;
3379         if (q != &noop_qdisc) {
3380                 spin_lock(qdisc_lock(q));
3381                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3382                         result = qdisc_enqueue_root(skb, q);
3383                 spin_unlock(qdisc_lock(q));
3384         }
3385
3386         return result;
3387 }
3388
3389 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3390                                          struct packet_type **pt_prev,
3391                                          int *ret, struct net_device *orig_dev)
3392 {
3393         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3394
3395         if (!rxq || rxq->qdisc == &noop_qdisc)
3396                 goto out;
3397
3398         if (*pt_prev) {
3399                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3400                 *pt_prev = NULL;
3401         }
3402
3403         switch (ing_filter(skb, rxq)) {
3404         case TC_ACT_SHOT:
3405         case TC_ACT_STOLEN:
3406                 kfree_skb(skb);
3407                 return NULL;
3408         }
3409
3410 out:
3411         skb->tc_verd = 0;
3412         return skb;
3413 }
3414 #endif
3415
3416 /**
3417  *      netdev_rx_handler_register - register receive handler
3418  *      @dev: device to register a handler for
3419  *      @rx_handler: receive handler to register
3420  *      @rx_handler_data: data pointer that is used by rx handler
3421  *
3422  *      Register a receive hander for a device. This handler will then be
3423  *      called from __netif_receive_skb. A negative errno code is returned
3424  *      on a failure.
3425  *
3426  *      The caller must hold the rtnl_mutex.
3427  *
3428  *      For a general description of rx_handler, see enum rx_handler_result.
3429  */
3430 int netdev_rx_handler_register(struct net_device *dev,
3431                                rx_handler_func_t *rx_handler,
3432                                void *rx_handler_data)
3433 {
3434         ASSERT_RTNL();
3435
3436         if (dev->rx_handler)
3437                 return -EBUSY;
3438
3439         /* Note: rx_handler_data must be set before rx_handler */
3440         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3441         rcu_assign_pointer(dev->rx_handler, rx_handler);
3442
3443         return 0;
3444 }
3445 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3446
3447 /**
3448  *      netdev_rx_handler_unregister - unregister receive handler
3449  *      @dev: device to unregister a handler from
3450  *
3451  *      Unregister a receive handler from a device.
3452  *
3453  *      The caller must hold the rtnl_mutex.
3454  */
3455 void netdev_rx_handler_unregister(struct net_device *dev)
3456 {
3457
3458         ASSERT_RTNL();
3459         RCU_INIT_POINTER(dev->rx_handler, NULL);
3460         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3461          * section has a guarantee to see a non NULL rx_handler_data
3462          * as well.
3463          */
3464         synchronize_net();
3465         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3466 }
3467 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3468
3469 /*
3470  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3471  * the special handling of PFMEMALLOC skbs.
3472  */
3473 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3474 {
3475         switch (skb->protocol) {
3476         case __constant_htons(ETH_P_ARP):
3477         case __constant_htons(ETH_P_IP):
3478         case __constant_htons(ETH_P_IPV6):
3479         case __constant_htons(ETH_P_8021Q):
3480         case __constant_htons(ETH_P_8021AD):
3481                 return true;
3482         default:
3483                 return false;
3484         }
3485 }
3486
3487 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3488 {
3489         struct packet_type *ptype, *pt_prev;
3490         rx_handler_func_t *rx_handler;
3491         struct net_device *orig_dev;
3492         struct net_device *null_or_dev;
3493         bool deliver_exact = false;
3494         int ret = NET_RX_DROP;
3495         __be16 type;
3496
3497         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3498
3499         trace_netif_receive_skb(skb);
3500
3501         /* if we've gotten here through NAPI, check netpoll */
3502         if (netpoll_receive_skb(skb))
3503                 goto out;
3504
3505         orig_dev = skb->dev;
3506
3507         skb_reset_network_header(skb);
3508         if (!skb_transport_header_was_set(skb))
3509                 skb_reset_transport_header(skb);
3510         skb_reset_mac_len(skb);
3511
3512         pt_prev = NULL;
3513
3514         rcu_read_lock();
3515
3516 another_round:
3517         skb->skb_iif = skb->dev->ifindex;
3518
3519         __this_cpu_inc(softnet_data.processed);
3520
3521         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3522             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3523                 skb = vlan_untag(skb);
3524                 if (unlikely(!skb))
3525                         goto unlock;
3526         }
3527
3528 #ifdef CONFIG_NET_CLS_ACT
3529         if (skb->tc_verd & TC_NCLS) {
3530                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3531                 goto ncls;
3532         }
3533 #endif
3534
3535         if (pfmemalloc)
3536                 goto skip_taps;
3537
3538         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3539                 if (!ptype->dev || ptype->dev == skb->dev) {
3540                         if (pt_prev)
3541                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3542                         pt_prev = ptype;
3543                 }
3544         }
3545
3546 skip_taps:
3547 #ifdef CONFIG_NET_CLS_ACT
3548         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3549         if (!skb)
3550                 goto unlock;
3551 ncls:
3552 #endif
3553
3554         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3555                 goto drop;
3556
3557         if (vlan_tx_tag_present(skb)) {
3558                 if (pt_prev) {
3559                         ret = deliver_skb(skb, pt_prev, orig_dev);
3560                         pt_prev = NULL;
3561                 }
3562                 if (vlan_do_receive(&skb))
3563                         goto another_round;
3564                 else if (unlikely(!skb))
3565                         goto unlock;
3566         }
3567
3568         rx_handler = rcu_dereference(skb->dev->rx_handler);
3569         if (rx_handler) {
3570                 if (pt_prev) {
3571                         ret = deliver_skb(skb, pt_prev, orig_dev);
3572                         pt_prev = NULL;
3573                 }
3574                 switch (rx_handler(&skb)) {
3575                 case RX_HANDLER_CONSUMED:
3576                         ret = NET_RX_SUCCESS;
3577                         goto unlock;
3578                 case RX_HANDLER_ANOTHER:
3579                         goto another_round;
3580                 case RX_HANDLER_EXACT:
3581                         deliver_exact = true;
3582                 case RX_HANDLER_PASS:
3583                         break;
3584                 default:
3585                         BUG();
3586                 }
3587         }
3588
3589         if (unlikely(vlan_tx_tag_present(skb))) {
3590                 if (vlan_tx_tag_get_id(skb))
3591                         skb->pkt_type = PACKET_OTHERHOST;
3592                 /* Note: we might in the future use prio bits
3593                  * and set skb->priority like in vlan_do_receive()
3594                  * For the time being, just ignore Priority Code Point
3595                  */
3596                 skb->vlan_tci = 0;
3597         }
3598
3599         /* deliver only exact match when indicated */
3600         null_or_dev = deliver_exact ? skb->dev : NULL;
3601
3602         type = skb->protocol;
3603         list_for_each_entry_rcu(ptype,
3604                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3605                 if (ptype->type == type &&
3606                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3607                      ptype->dev == orig_dev)) {
3608                         if (pt_prev)
3609                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3610                         pt_prev = ptype;
3611                 }
3612         }
3613
3614         if (pt_prev) {
3615                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3616                         goto drop;
3617                 else
3618                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3619         } else {
3620 drop:
3621                 atomic_long_inc(&skb->dev->rx_dropped);
3622                 kfree_skb(skb);
3623                 /* Jamal, now you will not able to escape explaining
3624                  * me how you were going to use this. :-)
3625                  */
3626                 ret = NET_RX_DROP;
3627         }
3628
3629 unlock:
3630         rcu_read_unlock();
3631 out:
3632         return ret;
3633 }
3634
3635 static int __netif_receive_skb(struct sk_buff *skb)
3636 {
3637         int ret;
3638
3639         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3640                 unsigned long pflags = current->flags;
3641
3642                 /*
3643                  * PFMEMALLOC skbs are special, they should
3644                  * - be delivered to SOCK_MEMALLOC sockets only
3645                  * - stay away from userspace
3646                  * - have bounded memory usage
3647                  *
3648                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3649                  * context down to all allocation sites.
3650                  */
3651                 current->flags |= PF_MEMALLOC;
3652                 ret = __netif_receive_skb_core(skb, true);
3653                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3654         } else
3655                 ret = __netif_receive_skb_core(skb, false);
3656
3657         return ret;
3658 }
3659
3660 /**
3661  *      netif_receive_skb - process receive buffer from network
3662  *      @skb: buffer to process
3663  *
3664  *      netif_receive_skb() is the main receive data processing function.
3665  *      It always succeeds. The buffer may be dropped during processing
3666  *      for congestion control or by the protocol layers.
3667  *
3668  *      This function may only be called from softirq context and interrupts
3669  *      should be enabled.
3670  *
3671  *      Return values (usually ignored):
3672  *      NET_RX_SUCCESS: no congestion
3673  *      NET_RX_DROP: packet was dropped
3674  */
3675 int netif_receive_skb(struct sk_buff *skb)
3676 {
3677         net_timestamp_check(netdev_tstamp_prequeue, skb);
3678
3679         if (skb_defer_rx_timestamp(skb))
3680                 return NET_RX_SUCCESS;
3681
3682 #ifdef CONFIG_RPS
3683         if (static_key_false(&rps_needed)) {
3684                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3685                 int cpu, ret;
3686
3687                 rcu_read_lock();
3688
3689                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3690
3691                 if (cpu >= 0) {
3692                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3693                         rcu_read_unlock();
3694                         return ret;
3695                 }
3696                 rcu_read_unlock();
3697         }
3698 #endif
3699         return __netif_receive_skb(skb);
3700 }
3701 EXPORT_SYMBOL(netif_receive_skb);
3702
3703 /* Network device is going away, flush any packets still pending
3704  * Called with irqs disabled.
3705  */
3706 static void flush_backlog(void *arg)
3707 {
3708         struct net_device *dev = arg;
3709         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3710         struct sk_buff *skb, *tmp;
3711
3712         rps_lock(sd);
3713         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3714                 if (skb->dev == dev) {
3715                         __skb_unlink(skb, &sd->input_pkt_queue);
3716                         kfree_skb(skb);
3717                         input_queue_head_incr(sd);
3718                 }
3719         }
3720         rps_unlock(sd);
3721
3722         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3723                 if (skb->dev == dev) {
3724                         __skb_unlink(skb, &sd->process_queue);
3725                         kfree_skb(skb);
3726                         input_queue_head_incr(sd);
3727                 }
3728         }
3729 }
3730
3731 static int napi_gro_complete(struct sk_buff *skb)
3732 {
3733         struct packet_offload *ptype;
3734         __be16 type = skb->protocol;
3735         struct list_head *head = &offload_base;
3736         int err = -ENOENT;
3737
3738         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3739
3740         if (NAPI_GRO_CB(skb)->count == 1) {
3741                 skb_shinfo(skb)->gso_size = 0;
3742                 goto out;
3743         }
3744
3745         rcu_read_lock();
3746         list_for_each_entry_rcu(ptype, head, list) {
3747                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3748                         continue;
3749
3750                 err = ptype->callbacks.gro_complete(skb, 0);
3751                 break;
3752         }
3753         rcu_read_unlock();
3754
3755         if (err) {
3756                 WARN_ON(&ptype->list == head);
3757                 kfree_skb(skb);
3758                 return NET_RX_SUCCESS;
3759         }
3760
3761 out:
3762         return netif_receive_skb(skb);
3763 }
3764
3765 /* napi->gro_list contains packets ordered by age.
3766  * youngest packets at the head of it.
3767  * Complete skbs in reverse order to reduce latencies.
3768  */
3769 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3770 {
3771         struct sk_buff *skb, *prev = NULL;
3772
3773         /* scan list and build reverse chain */
3774         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3775                 skb->prev = prev;
3776                 prev = skb;
3777         }
3778
3779         for (skb = prev; skb; skb = prev) {
3780                 skb->next = NULL;
3781
3782                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3783                         return;
3784
3785                 prev = skb->prev;
3786                 napi_gro_complete(skb);
3787                 napi->gro_count--;
3788         }
3789
3790         napi->gro_list = NULL;
3791 }
3792 EXPORT_SYMBOL(napi_gro_flush);
3793
3794 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3795 {
3796         struct sk_buff *p;
3797         unsigned int maclen = skb->dev->hard_header_len;
3798
3799         for (p = napi->gro_list; p; p = p->next) {
3800                 unsigned long diffs;
3801
3802                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3803                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3804                 if (maclen == ETH_HLEN)
3805                         diffs |= compare_ether_header(skb_mac_header(p),
3806                                                       skb_gro_mac_header(skb));
3807                 else if (!diffs)
3808                         diffs = memcmp(skb_mac_header(p),
3809                                        skb_gro_mac_header(skb),
3810                                        maclen);
3811                 NAPI_GRO_CB(p)->same_flow = !diffs;
3812                 NAPI_GRO_CB(p)->flush = 0;
3813         }
3814 }
3815
3816 static void skb_gro_reset_offset(struct sk_buff *skb)
3817 {
3818         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3819         const skb_frag_t *frag0 = &pinfo->frags[0];
3820
3821         NAPI_GRO_CB(skb)->data_offset = 0;
3822         NAPI_GRO_CB(skb)->frag0 = NULL;
3823         NAPI_GRO_CB(skb)->frag0_len = 0;
3824
3825         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3826             pinfo->nr_frags &&
3827             !PageHighMem(skb_frag_page(frag0))) {
3828                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3829                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3830         }
3831 }
3832
3833 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3834 {
3835         struct sk_buff **pp = NULL;
3836         struct packet_offload *ptype;
3837         __be16 type = skb->protocol;
3838         struct list_head *head = &offload_base;
3839         int same_flow;
3840         enum gro_result ret;
3841
3842         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3843                 goto normal;
3844
3845         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3846                 goto normal;
3847
3848         skb_gro_reset_offset(skb);
3849         gro_list_prepare(napi, skb);
3850
3851         rcu_read_lock();
3852         list_for_each_entry_rcu(ptype, head, list) {
3853                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3854                         continue;
3855
3856                 skb_set_network_header(skb, skb_gro_offset(skb));
3857                 skb_reset_mac_len(skb);
3858                 NAPI_GRO_CB(skb)->same_flow = 0;
3859                 NAPI_GRO_CB(skb)->flush = 0;
3860                 NAPI_GRO_CB(skb)->free = 0;
3861
3862                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3863                 break;
3864         }
3865         rcu_read_unlock();
3866
3867         if (&ptype->list == head)
3868                 goto normal;
3869
3870         same_flow = NAPI_GRO_CB(skb)->same_flow;
3871         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3872
3873         if (pp) {
3874                 struct sk_buff *nskb = *pp;
3875
3876                 *pp = nskb->next;
3877                 nskb->next = NULL;
3878                 napi_gro_complete(nskb);
3879                 napi->gro_count--;
3880         }
3881
3882         if (same_flow)
3883                 goto ok;
3884
3885         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3886                 goto normal;
3887
3888         napi->gro_count++;
3889         NAPI_GRO_CB(skb)->count = 1;
3890         NAPI_GRO_CB(skb)->age = jiffies;
3891         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3892         skb->next = napi->gro_list;
3893         napi->gro_list = skb;
3894         ret = GRO_HELD;
3895
3896 pull:
3897         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3898                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3899
3900                 BUG_ON(skb->end - skb->tail < grow);
3901
3902                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3903
3904                 skb->tail += grow;
3905                 skb->data_len -= grow;
3906
3907                 skb_shinfo(skb)->frags[0].page_offset += grow;
3908                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3909
3910                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3911                         skb_frag_unref(skb, 0);
3912                         memmove(skb_shinfo(skb)->frags,
3913                                 skb_shinfo(skb)->frags + 1,
3914                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3915                 }
3916         }
3917
3918 ok:
3919         return ret;
3920
3921 normal:
3922         ret = GRO_NORMAL;
3923         goto pull;
3924 }
3925
3926
3927 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3928 {
3929         switch (ret) {
3930         case GRO_NORMAL:
3931                 if (netif_receive_skb(skb))
3932                         ret = GRO_DROP;
3933                 break;
3934
3935         case GRO_DROP:
3936                 kfree_skb(skb);
3937                 break;
3938
3939         case GRO_MERGED_FREE:
3940                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3941                         kmem_cache_free(skbuff_head_cache, skb);
3942                 else
3943                         __kfree_skb(skb);
3944                 break;
3945
3946         case GRO_HELD:
3947         case GRO_MERGED:
3948                 break;
3949         }
3950
3951         return ret;
3952 }
3953
3954 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3955 {
3956         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3957 }
3958 EXPORT_SYMBOL(napi_gro_receive);
3959
3960 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3961 {
3962         __skb_pull(skb, skb_headlen(skb));
3963         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3964         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3965         skb->vlan_tci = 0;
3966         skb->dev = napi->dev;
3967         skb->skb_iif = 0;
3968
3969         napi->skb = skb;
3970 }
3971
3972 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3973 {
3974         struct sk_buff *skb = napi->skb;
3975
3976         if (!skb) {
3977                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3978                 napi->skb = skb;
3979         }
3980         return skb;
3981 }
3982 EXPORT_SYMBOL(napi_get_frags);
3983
3984 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3985                                gro_result_t ret)
3986 {
3987         switch (ret) {
3988         case GRO_NORMAL:
3989                 if (netif_receive_skb(skb))
3990                         ret = GRO_DROP;
3991                 break;
3992
3993         case GRO_DROP:
3994         case GRO_MERGED_FREE:
3995                 napi_reuse_skb(napi, skb);
3996                 break;
3997
3998         case GRO_HELD:
3999         case GRO_MERGED:
4000                 break;
4001         }
4002
4003         return ret;
4004 }
4005
4006 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4007 {
4008         struct sk_buff *skb = napi->skb;
4009
4010         napi->skb = NULL;
4011
4012         if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) {
4013                 napi_reuse_skb(napi, skb);
4014                 return NULL;
4015         }
4016         skb->protocol = eth_type_trans(skb, skb->dev);
4017
4018         return skb;
4019 }
4020
4021 gro_result_t napi_gro_frags(struct napi_struct *napi)
4022 {
4023         struct sk_buff *skb = napi_frags_skb(napi);
4024
4025         if (!skb)
4026                 return GRO_DROP;
4027
4028         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4029 }
4030 EXPORT_SYMBOL(napi_gro_frags);
4031
4032 /*
4033  * net_rps_action sends any pending IPI's for rps.
4034  * Note: called with local irq disabled, but exits with local irq enabled.
4035  */
4036 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4037 {
4038 #ifdef CONFIG_RPS
4039         struct softnet_data *remsd = sd->rps_ipi_list;
4040
4041         if (remsd) {
4042                 sd->rps_ipi_list = NULL;
4043
4044                 local_irq_enable();
4045
4046                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4047                 while (remsd) {
4048                         struct softnet_data *next = remsd->rps_ipi_next;
4049
4050                         if (cpu_online(remsd->cpu))
4051                                 __smp_call_function_single(remsd->cpu,
4052                                                            &remsd->csd, 0);
4053                         remsd = next;
4054                 }
4055         } else
4056 #endif
4057                 local_irq_enable();
4058 }
4059
4060 static int process_backlog(struct napi_struct *napi, int quota)
4061 {
4062         int work = 0;
4063         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4064
4065 #ifdef CONFIG_RPS
4066         /* Check if we have pending ipi, its better to send them now,
4067          * not waiting net_rx_action() end.
4068          */
4069         if (sd->rps_ipi_list) {
4070                 local_irq_disable();
4071                 net_rps_action_and_irq_enable(sd);
4072         }
4073 #endif
4074         napi->weight = weight_p;
4075         local_irq_disable();
4076         while (work < quota) {
4077                 struct sk_buff *skb;
4078                 unsigned int qlen;
4079
4080                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4081                         local_irq_enable();
4082                         __netif_receive_skb(skb);
4083                         local_irq_disable();
4084                         input_queue_head_incr(sd);
4085                         if (++work >= quota) {
4086                                 local_irq_enable();
4087                                 return work;
4088                         }
4089                 }
4090
4091                 rps_lock(sd);
4092                 qlen = skb_queue_len(&sd->input_pkt_queue);
4093                 if (qlen)
4094                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4095                                                    &sd->process_queue);
4096
4097                 if (qlen < quota - work) {
4098                         /*
4099                          * Inline a custom version of __napi_complete().
4100                          * only current cpu owns and manipulates this napi,
4101                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4102                          * we can use a plain write instead of clear_bit(),
4103                          * and we dont need an smp_mb() memory barrier.
4104                          */
4105                         list_del(&napi->poll_list);
4106                         napi->state = 0;
4107
4108                         quota = work + qlen;
4109                 }
4110                 rps_unlock(sd);
4111         }
4112         local_irq_enable();
4113
4114         return work;
4115 }
4116
4117 /**
4118  * __napi_schedule - schedule for receive
4119  * @n: entry to schedule
4120  *
4121  * The entry's receive function will be scheduled to run
4122  */
4123 void __napi_schedule(struct napi_struct *n)
4124 {
4125         unsigned long flags;
4126
4127         local_irq_save(flags);
4128         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4129         local_irq_restore(flags);
4130 }
4131 EXPORT_SYMBOL(__napi_schedule);
4132
4133 void __napi_complete(struct napi_struct *n)
4134 {
4135         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4136         BUG_ON(n->gro_list);
4137
4138         list_del(&n->poll_list);
4139         smp_mb__before_clear_bit();
4140         clear_bit(NAPI_STATE_SCHED, &n->state);
4141 }
4142 EXPORT_SYMBOL(__napi_complete);
4143
4144 void napi_complete(struct napi_struct *n)
4145 {
4146         unsigned long flags;
4147
4148         /*
4149          * don't let napi dequeue from the cpu poll list
4150          * just in case its running on a different cpu
4151          */
4152         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4153                 return;
4154
4155         napi_gro_flush(n, false);
4156         local_irq_save(flags);
4157         __napi_complete(n);
4158         local_irq_restore(flags);
4159 }
4160 EXPORT_SYMBOL(napi_complete);
4161
4162 /* must be called under rcu_read_lock(), as we dont take a reference */
4163 struct napi_struct *napi_by_id(unsigned int napi_id)
4164 {
4165         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4166         struct napi_struct *napi;
4167
4168         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4169                 if (napi->napi_id == napi_id)
4170                         return napi;
4171
4172         return NULL;
4173 }
4174 EXPORT_SYMBOL_GPL(napi_by_id);
4175
4176 void napi_hash_add(struct napi_struct *napi)
4177 {
4178         if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4179
4180                 spin_lock(&napi_hash_lock);
4181
4182                 /* 0 is not a valid id, we also skip an id that is taken
4183                  * we expect both events to be extremely rare
4184                  */
4185                 napi->napi_id = 0;
4186                 while (!napi->napi_id) {
4187                         napi->napi_id = ++napi_gen_id;
4188                         if (napi_by_id(napi->napi_id))
4189                                 napi->napi_id = 0;
4190                 }
4191
4192                 hlist_add_head_rcu(&napi->napi_hash_node,
4193                         &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4194
4195                 spin_unlock(&napi_hash_lock);
4196         }
4197 }
4198 EXPORT_SYMBOL_GPL(napi_hash_add);
4199
4200 /* Warning : caller is responsible to make sure rcu grace period
4201  * is respected before freeing memory containing @napi
4202  */
4203 void napi_hash_del(struct napi_struct *napi)
4204 {
4205         spin_lock(&napi_hash_lock);
4206
4207         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4208                 hlist_del_rcu(&napi->napi_hash_node);
4209
4210         spin_unlock(&napi_hash_lock);
4211 }
4212 EXPORT_SYMBOL_GPL(napi_hash_del);
4213
4214 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4215                     int (*poll)(struct napi_struct *, int), int weight)
4216 {
4217         INIT_LIST_HEAD(&napi->poll_list);
4218         napi->gro_count = 0;
4219         napi->gro_list = NULL;
4220         napi->skb = NULL;
4221         napi->poll = poll;
4222         if (weight > NAPI_POLL_WEIGHT)
4223                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4224                             weight, dev->name);
4225         napi->weight = weight;
4226         list_add(&napi->dev_list, &dev->napi_list);
4227         napi->dev = dev;
4228 #ifdef CONFIG_NETPOLL
4229         spin_lock_init(&napi->poll_lock);
4230         napi->poll_owner = -1;
4231 #endif
4232         set_bit(NAPI_STATE_SCHED, &napi->state);
4233 }
4234 EXPORT_SYMBOL(netif_napi_add);
4235
4236 void netif_napi_del(struct napi_struct *napi)
4237 {
4238         list_del_init(&napi->dev_list);
4239         napi_free_frags(napi);
4240
4241         kfree_skb_list(napi->gro_list);
4242         napi->gro_list = NULL;
4243         napi->gro_count = 0;
4244 }
4245 EXPORT_SYMBOL(netif_napi_del);
4246
4247 static void net_rx_action(struct softirq_action *h)
4248 {
4249         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4250         unsigned long time_limit = jiffies + 2;
4251         int budget = netdev_budget;
4252         void *have;
4253
4254         local_irq_disable();
4255
4256         while (!list_empty(&sd->poll_list)) {
4257                 struct napi_struct *n;
4258                 int work, weight;
4259
4260                 /* If softirq window is exhuasted then punt.
4261                  * Allow this to run for 2 jiffies since which will allow
4262                  * an average latency of 1.5/HZ.
4263                  */
4264                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4265                         goto softnet_break;
4266
4267                 local_irq_enable();
4268
4269                 /* Even though interrupts have been re-enabled, this
4270                  * access is safe because interrupts can only add new
4271                  * entries to the tail of this list, and only ->poll()
4272                  * calls can remove this head entry from the list.
4273                  */
4274                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4275
4276                 have = netpoll_poll_lock(n);
4277
4278                 weight = n->weight;
4279
4280                 /* This NAPI_STATE_SCHED test is for avoiding a race
4281                  * with netpoll's poll_napi().  Only the entity which
4282                  * obtains the lock and sees NAPI_STATE_SCHED set will
4283                  * actually make the ->poll() call.  Therefore we avoid
4284                  * accidentally calling ->poll() when NAPI is not scheduled.
4285                  */
4286                 work = 0;
4287                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4288                         work = n->poll(n, weight);
4289                         trace_napi_poll(n);
4290                 }
4291
4292                 WARN_ON_ONCE(work > weight);
4293
4294                 budget -= work;
4295
4296                 local_irq_disable();
4297
4298                 /* Drivers must not modify the NAPI state if they
4299                  * consume the entire weight.  In such cases this code
4300                  * still "owns" the NAPI instance and therefore can
4301                  * move the instance around on the list at-will.
4302                  */
4303                 if (unlikely(work == weight)) {
4304                         if (unlikely(napi_disable_pending(n))) {
4305                                 local_irq_enable();
4306                                 napi_complete(n);
4307                                 local_irq_disable();
4308                         } else {
4309                                 if (n->gro_list) {
4310                                         /* flush too old packets
4311                                          * If HZ < 1000, flush all packets.
4312                                          */
4313                                         local_irq_enable();
4314                                         napi_gro_flush(n, HZ >= 1000);
4315                                         local_irq_disable();
4316                                 }
4317                                 list_move_tail(&n->poll_list, &sd->poll_list);
4318                         }
4319                 }
4320
4321                 netpoll_poll_unlock(have);
4322         }
4323 out:
4324         net_rps_action_and_irq_enable(sd);
4325
4326 #ifdef CONFIG_NET_DMA
4327         /*
4328          * There may not be any more sk_buffs coming right now, so push
4329          * any pending DMA copies to hardware
4330          */
4331         dma_issue_pending_all();
4332 #endif
4333
4334         return;
4335
4336 softnet_break:
4337         sd->time_squeeze++;
4338         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4339         goto out;
4340 }
4341
4342 struct netdev_adjacent {
4343         struct net_device *dev;
4344
4345         /* upper master flag, there can only be one master device per list */
4346         bool master;
4347
4348         /* counter for the number of times this device was added to us */
4349         u16 ref_nr;
4350
4351         /* private field for the users */
4352         void *private;
4353
4354         struct list_head list;
4355         struct rcu_head rcu;
4356 };
4357
4358 static struct netdev_adjacent *__netdev_find_adj_rcu(struct net_device *dev,
4359                                                      struct net_device *adj_dev,
4360                                                      struct list_head *adj_list)
4361 {
4362         struct netdev_adjacent *adj;
4363
4364         list_for_each_entry_rcu(adj, adj_list, list) {
4365                 if (adj->dev == adj_dev)
4366                         return adj;
4367         }
4368         return NULL;
4369 }
4370
4371 static struct netdev_adjacent *__netdev_find_adj(struct net_device *dev,
4372                                                  struct net_device *adj_dev,
4373                                                  struct list_head *adj_list)
4374 {
4375         struct netdev_adjacent *adj;
4376
4377         list_for_each_entry(adj, adj_list, list) {
4378                 if (adj->dev == adj_dev)
4379                         return adj;
4380         }
4381         return NULL;
4382 }
4383
4384 /**
4385  * netdev_has_upper_dev - Check if device is linked to an upper device
4386  * @dev: device
4387  * @upper_dev: upper device to check
4388  *
4389  * Find out if a device is linked to specified upper device and return true
4390  * in case it is. Note that this checks only immediate upper device,
4391  * not through a complete stack of devices. The caller must hold the RTNL lock.
4392  */
4393 bool netdev_has_upper_dev(struct net_device *dev,
4394                           struct net_device *upper_dev)
4395 {
4396         ASSERT_RTNL();
4397
4398         return __netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper);
4399 }
4400 EXPORT_SYMBOL(netdev_has_upper_dev);
4401
4402 /**
4403  * netdev_has_any_upper_dev - Check if device is linked to some device
4404  * @dev: device
4405  *
4406  * Find out if a device is linked to an upper device and return true in case
4407  * it is. The caller must hold the RTNL lock.
4408  */
4409 bool netdev_has_any_upper_dev(struct net_device *dev)
4410 {
4411         ASSERT_RTNL();
4412
4413         return !list_empty(&dev->all_adj_list.upper);
4414 }
4415 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4416
4417 /**
4418  * netdev_master_upper_dev_get - Get master upper device
4419  * @dev: device
4420  *
4421  * Find a master upper device and return pointer to it or NULL in case
4422  * it's not there. The caller must hold the RTNL lock.
4423  */
4424 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4425 {
4426         struct netdev_adjacent *upper;
4427
4428         ASSERT_RTNL();
4429
4430         if (list_empty(&dev->adj_list.upper))
4431                 return NULL;
4432
4433         upper = list_first_entry(&dev->adj_list.upper,
4434                                  struct netdev_adjacent, list);
4435         if (likely(upper->master))
4436                 return upper->dev;
4437         return NULL;
4438 }
4439 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4440
4441 void *netdev_adjacent_get_private(struct list_head *adj_list)
4442 {
4443         struct netdev_adjacent *adj;
4444
4445         adj = list_entry(adj_list, struct netdev_adjacent, list);
4446
4447         return adj->private;
4448 }
4449 EXPORT_SYMBOL(netdev_adjacent_get_private);
4450
4451 /**
4452  * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
4453  * @dev: device
4454  * @iter: list_head ** of the current position
4455  *
4456  * Gets the next device from the dev's upper list, starting from iter
4457  * position. The caller must hold RCU read lock.
4458  */
4459 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
4460                                                      struct list_head **iter)
4461 {
4462         struct netdev_adjacent *upper;
4463
4464         WARN_ON_ONCE(!rcu_read_lock_held());
4465
4466         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4467
4468         if (&upper->list == &dev->all_adj_list.upper)
4469                 return NULL;
4470
4471         *iter = &upper->list;
4472
4473         return upper->dev;
4474 }
4475 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
4476
4477 /**
4478  * netdev_lower_get_next_private - Get the next ->private from the
4479  *                                 lower neighbour list
4480  * @dev: device
4481  * @iter: list_head ** of the current position
4482  *
4483  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4484  * list, starting from iter position. The caller must hold either hold the
4485  * RTNL lock or its own locking that guarantees that the neighbour lower
4486  * list will remain unchainged.
4487  */
4488 void *netdev_lower_get_next_private(struct net_device *dev,
4489                                     struct list_head **iter)
4490 {
4491         struct netdev_adjacent *lower;
4492
4493         lower = list_entry(*iter, struct netdev_adjacent, list);
4494
4495         if (&lower->list == &dev->adj_list.lower)
4496                 return NULL;
4497
4498         if (iter)
4499                 *iter = lower->list.next;
4500
4501         return lower->private;
4502 }
4503 EXPORT_SYMBOL(netdev_lower_get_next_private);
4504
4505 /**
4506  * netdev_lower_get_next_private_rcu - Get the next ->private from the
4507  *                                     lower neighbour list, RCU
4508  *                                     variant
4509  * @dev: device
4510  * @iter: list_head ** of the current position
4511  *
4512  * Gets the next netdev_adjacent->private from the dev's lower neighbour
4513  * list, starting from iter position. The caller must hold RCU read lock.
4514  */
4515 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4516                                         struct list_head **iter)
4517 {
4518         struct netdev_adjacent *lower;
4519
4520         WARN_ON_ONCE(!rcu_read_lock_held());
4521
4522         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
4523
4524         if (&lower->list == &dev->adj_list.lower)
4525                 return NULL;
4526
4527         if (iter)
4528                 *iter = &lower->list;
4529
4530         return lower->private;
4531 }
4532 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
4533
4534 /**
4535  * netdev_lower_get_first_private_rcu - Get the first ->private from the
4536  *                                     lower neighbour list, RCU
4537  *                                     variant
4538  * @dev: device
4539  *
4540  * Gets the first netdev_adjacent->private from the dev's lower neighbour
4541  * list. The caller must hold RCU read lock.
4542  */
4543 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
4544 {
4545         struct netdev_adjacent *lower;
4546
4547         lower = list_first_or_null_rcu(&dev->adj_list.lower,
4548                         struct netdev_adjacent, list);
4549         if (lower)
4550                 return lower->private;
4551         return NULL;
4552 }
4553 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
4554
4555 /**
4556  * netdev_master_upper_dev_get_rcu - Get master upper device
4557  * @dev: device
4558  *
4559  * Find a master upper device and return pointer to it or NULL in case
4560  * it's not there. The caller must hold the RCU read lock.
4561  */
4562 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4563 {
4564         struct netdev_adjacent *upper;
4565
4566         upper = list_first_or_null_rcu(&dev->adj_list.upper,
4567                                        struct netdev_adjacent, list);
4568         if (upper && likely(upper->master))
4569                 return upper->dev;
4570         return NULL;
4571 }
4572 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4573
4574 static int __netdev_adjacent_dev_insert(struct net_device *dev,
4575                                         struct net_device *adj_dev,
4576                                         struct list_head *dev_list,
4577                                         void *private, bool master)
4578 {
4579         struct netdev_adjacent *adj;
4580         char linkname[IFNAMSIZ+7];
4581         int ret;
4582
4583         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4584
4585         if (adj) {
4586                 adj->ref_nr++;
4587                 return 0;
4588         }
4589
4590         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
4591         if (!adj)
4592                 return -ENOMEM;
4593
4594         adj->dev = adj_dev;
4595         adj->master = master;
4596         adj->ref_nr = 1;
4597         adj->private = private;
4598         dev_hold(adj_dev);
4599
4600         pr_debug("dev_hold for %s, because of link added from %s to %s\n",
4601                  adj_dev->name, dev->name, adj_dev->name);
4602
4603         if (dev_list == &dev->adj_list.lower) {
4604                 sprintf(linkname, "lower_%s", adj_dev->name);
4605                 ret = sysfs_create_link(&(dev->dev.kobj),
4606                                         &(adj_dev->dev.kobj), linkname);
4607                 if (ret)
4608                         goto free_adj;
4609         } else if (dev_list == &dev->adj_list.upper) {
4610                 sprintf(linkname, "upper_%s", adj_dev->name);
4611                 ret = sysfs_create_link(&(dev->dev.kobj),
4612                                         &(adj_dev->dev.kobj), linkname);
4613                 if (ret)
4614                         goto free_adj;
4615         }
4616
4617         /* Ensure that master link is always the first item in list. */
4618         if (master) {
4619                 ret = sysfs_create_link(&(dev->dev.kobj),
4620                                         &(adj_dev->dev.kobj), "master");
4621                 if (ret)
4622                         goto remove_symlinks;
4623
4624                 list_add_rcu(&adj->list, dev_list);
4625         } else {
4626                 list_add_tail_rcu(&adj->list, dev_list);
4627         }
4628
4629         return 0;
4630
4631 remove_symlinks:
4632         if (dev_list == &dev->adj_list.lower) {
4633                 sprintf(linkname, "lower_%s", adj_dev->name);
4634                 sysfs_remove_link(&(dev->dev.kobj), linkname);
4635         } else if (dev_list == &dev->adj_list.upper) {
4636                 sprintf(linkname, "upper_%s", adj_dev->name);
4637                 sysfs_remove_link(&(dev->dev.kobj), linkname);
4638         }
4639
4640 free_adj:
4641         kfree(adj);
4642         dev_put(adj_dev);
4643
4644         return ret;
4645 }
4646
4647 void __netdev_adjacent_dev_remove(struct net_device *dev,
4648                                   struct net_device *adj_dev,
4649                                   struct list_head *dev_list)
4650 {
4651         struct netdev_adjacent *adj;
4652         char linkname[IFNAMSIZ+7];
4653
4654         adj = __netdev_find_adj(dev, adj_dev, dev_list);
4655
4656         if (!adj) {
4657                 pr_err("tried to remove device %s from %s\n",
4658                        dev->name, adj_dev->name);
4659                 BUG();
4660         }
4661
4662         if (adj->ref_nr > 1) {
4663                 pr_debug("%s to %s ref_nr-- = %d\n", dev->name, adj_dev->name,
4664                          adj->ref_nr-1);
4665                 adj->ref_nr--;
4666                 return;
4667         }
4668
4669         if (adj->master)
4670                 sysfs_remove_link(&(dev->dev.kobj), "master");
4671
4672         if (dev_list == &dev->adj_list.lower) {
4673                 sprintf(linkname, "lower_%s", adj_dev->name);
4674                 sysfs_remove_link(&(dev->dev.kobj), linkname);
4675         } else if (dev_list == &dev->adj_list.upper) {
4676                 sprintf(linkname, "upper_%s", adj_dev->name);
4677                 sysfs_remove_link(&(dev->dev.kobj), linkname);
4678         }
4679
4680         list_del_rcu(&adj->list);
4681         pr_debug("dev_put for %s, because link removed from %s to %s\n",
4682                  adj_dev->name, dev->name, adj_dev->name);
4683         dev_put(adj_dev);
4684         kfree_rcu(adj, rcu);
4685 }
4686
4687 int __netdev_adjacent_dev_link_lists(struct net_device *dev,
4688                                      struct net_device *upper_dev,
4689                                      struct list_head *up_list,
4690                                      struct list_head *down_list,
4691                                      void *private, bool master)
4692 {
4693         int ret;
4694
4695         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list, private,
4696                                            master);
4697         if (ret)
4698                 return ret;
4699
4700         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list, private,
4701                                            false);
4702         if (ret) {
4703                 __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4704                 return ret;
4705         }
4706
4707         return 0;
4708 }
4709
4710 int __netdev_adjacent_dev_link(struct net_device *dev,
4711                                struct net_device *upper_dev)
4712 {
4713         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
4714                                                 &dev->all_adj_list.upper,
4715                                                 &upper_dev->all_adj_list.lower,
4716                                                 NULL, false);
4717 }
4718
4719 void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
4720                                         struct net_device *upper_dev,
4721                                         struct list_head *up_list,
4722                                         struct list_head *down_list)
4723 {
4724         __netdev_adjacent_dev_remove(dev, upper_dev, up_list);
4725         __netdev_adjacent_dev_remove(upper_dev, dev, down_list);
4726 }
4727
4728 void __netdev_adjacent_dev_unlink(struct net_device *dev,
4729                                   struct net_device *upper_dev)
4730 {
4731         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4732                                            &dev->all_adj_list.upper,
4733                                            &upper_dev->all_adj_list.lower);
4734 }
4735
4736 int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
4737                                          struct net_device *upper_dev,
4738                                          void *private, bool master)
4739 {
4740         int ret = __netdev_adjacent_dev_link(dev, upper_dev);
4741
4742         if (ret)
4743                 return ret;
4744
4745         ret = __netdev_adjacent_dev_link_lists(dev, upper_dev,
4746                                                &dev->adj_list.upper,
4747                                                &upper_dev->adj_list.lower,
4748                                                private, master);
4749         if (ret) {
4750                 __netdev_adjacent_dev_unlink(dev, upper_dev);
4751                 return ret;
4752         }
4753
4754         return 0;
4755 }
4756
4757 void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
4758                                             struct net_device *upper_dev)
4759 {
4760         __netdev_adjacent_dev_unlink(dev, upper_dev);
4761         __netdev_adjacent_dev_unlink_lists(dev, upper_dev,
4762                                            &dev->adj_list.upper,
4763                                            &upper_dev->adj_list.lower);
4764 }
4765
4766 static int __netdev_upper_dev_link(struct net_device *dev,
4767                                    struct net_device *upper_dev, bool master,
4768                                    void *private)
4769 {
4770         struct netdev_adjacent *i, *j, *to_i, *to_j;
4771         int ret = 0;
4772
4773         ASSERT_RTNL();
4774
4775         if (dev == upper_dev)
4776                 return -EBUSY;
4777
4778         /* To prevent loops, check if dev is not upper device to upper_dev. */
4779         if (__netdev_find_adj(upper_dev, dev, &upper_dev->all_adj_list.upper))
4780                 return -EBUSY;
4781
4782         if (__netdev_find_adj(dev, upper_dev, &dev->all_adj_list.upper))
4783                 return -EEXIST;
4784
4785         if (master && netdev_master_upper_dev_get(dev))
4786                 return -EBUSY;
4787
4788         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
4789                                                    master);
4790         if (ret)
4791                 return ret;
4792
4793         /* Now that we linked these devs, make all the upper_dev's
4794          * all_adj_list.upper visible to every dev's all_adj_list.lower an
4795          * versa, and don't forget the devices itself. All of these
4796          * links are non-neighbours.
4797          */
4798         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4799                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4800                         pr_debug("Interlinking %s with %s, non-neighbour\n",
4801                                  i->dev->name, j->dev->name);
4802                         ret = __netdev_adjacent_dev_link(i->dev, j->dev);
4803                         if (ret)
4804                                 goto rollback_mesh;
4805                 }
4806         }
4807
4808         /* add dev to every upper_dev's upper device */
4809         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4810                 pr_debug("linking %s's upper device %s with %s\n",
4811                          upper_dev->name, i->dev->name, dev->name);
4812                 ret = __netdev_adjacent_dev_link(dev, i->dev);
4813                 if (ret)
4814                         goto rollback_upper_mesh;
4815         }
4816
4817         /* add upper_dev to every dev's lower device */
4818         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4819                 pr_debug("linking %s's lower device %s with %s\n", dev->name,
4820                          i->dev->name, upper_dev->name);
4821                 ret = __netdev_adjacent_dev_link(i->dev, upper_dev);
4822                 if (ret)
4823                         goto rollback_lower_mesh;
4824         }
4825
4826         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4827         return 0;
4828
4829 rollback_lower_mesh:
4830         to_i = i;
4831         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4832                 if (i == to_i)
4833                         break;
4834                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
4835         }
4836
4837         i = NULL;
4838
4839 rollback_upper_mesh:
4840         to_i = i;
4841         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
4842                 if (i == to_i)
4843                         break;
4844                 __netdev_adjacent_dev_unlink(dev, i->dev);
4845         }
4846
4847         i = j = NULL;
4848
4849 rollback_mesh:
4850         to_i = i;
4851         to_j = j;
4852         list_for_each_entry(i, &dev->all_adj_list.lower, list) {
4853                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
4854                         if (i == to_i && j == to_j)
4855                                 break;
4856                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
4857                 }
4858                 if (i == to_i)
4859                         break;
4860         }
4861
4862         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4863
4864         return ret;
4865 }
4866
4867 /**
4868  * netdev_upper_dev_link - Add a link to the upper device
4869  * @dev: device
4870  * @upper_dev: new upper device
4871  *
4872  * Adds a link to device which is upper to this one. The caller must hold
4873  * the RTNL lock. On a failure a negative errno code is returned.
4874  * On success the reference counts are adjusted and the function
4875  * returns zero.
4876  */
4877 int netdev_upper_dev_link(struct net_device *dev,
4878                           struct net_device *upper_dev)
4879 {
4880         return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
4881 }
4882 EXPORT_SYMBOL(netdev_upper_dev_link);
4883
4884 /**
4885  * netdev_master_upper_dev_link - Add a master link to the upper device
4886  * @dev: device
4887  * @upper_dev: new upper device
4888  *
4889  * Adds a link to device which is upper to this one. In this case, only
4890  * one master upper device can be linked, although other non-master devices
4891  * might be linked as well. The caller must hold the RTNL lock.
4892  * On a failure a negative errno code is returned. On success the reference
4893  * counts are adjusted and the function returns zero.
4894  */
4895 int netdev_master_upper_dev_link(struct net_device *dev,
4896                                  struct net_device *upper_dev)
4897 {
4898         return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
4899 }
4900 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4901
4902 int netdev_master_upper_dev_link_private(struct net_device *dev,
4903                                          struct net_device *upper_dev,
4904                                          void *private)
4905 {
4906         return __netdev_upper_dev_link(dev, upper_dev, true, private);
4907 }
4908 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
4909
4910 /**
4911  * netdev_upper_dev_unlink - Removes a link to upper device
4912  * @dev: device
4913  * @upper_dev: new upper device
4914  *
4915  * Removes a link to device which is upper to this one. The caller must hold
4916  * the RTNL lock.
4917  */
4918 void netdev_upper_dev_unlink(struct net_device *dev,
4919                              struct net_device *upper_dev)
4920 {
4921         struct netdev_adjacent *i, *j;
4922         ASSERT_RTNL();
4923
4924         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
4925
4926         /* Here is the tricky part. We must remove all dev's lower
4927          * devices from all upper_dev's upper devices and vice
4928          * versa, to maintain the graph relationship.
4929          */
4930         list_for_each_entry(i, &dev->all_adj_list.lower, list)
4931                 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
4932                         __netdev_adjacent_dev_unlink(i->dev, j->dev);
4933
4934         /* remove also the devices itself from lower/upper device
4935          * list
4936          */
4937         list_for_each_entry(i, &dev->all_adj_list.lower, list)
4938                 __netdev_adjacent_dev_unlink(i->dev, upper_dev);
4939
4940         list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
4941                 __netdev_adjacent_dev_unlink(dev, i->dev);
4942
4943         call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4944 }
4945 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4946
4947 void *netdev_lower_dev_get_private_rcu(struct net_device *dev,
4948                                        struct net_device *lower_dev)
4949 {
4950         struct netdev_adjacent *lower;
4951
4952         if (!lower_dev)
4953                 return NULL;
4954         lower = __netdev_find_adj_rcu(dev, lower_dev, &dev->adj_list.lower);
4955         if (!lower)
4956                 return NULL;
4957
4958         return lower->private;
4959 }
4960 EXPORT_SYMBOL(netdev_lower_dev_get_private_rcu);
4961
4962 void *netdev_lower_dev_get_private(struct net_device *dev,
4963                                    struct net_device *lower_dev)
4964 {
4965         struct netdev_adjacent *lower;
4966
4967         if (!lower_dev)
4968                 return NULL;
4969         lower = __netdev_find_adj(dev, lower_dev, &dev->adj_list.lower);
4970         if (!lower)
4971                 return NULL;
4972
4973         return lower->private;
4974 }
4975 EXPORT_SYMBOL(netdev_lower_dev_get_private);
4976
4977 static void dev_change_rx_flags(struct net_device *dev, int flags)
4978 {
4979         const struct net_device_ops *ops = dev->netdev_ops;
4980
4981         if (ops->ndo_change_rx_flags)
4982                 ops->ndo_change_rx_flags(dev, flags);
4983 }
4984
4985 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
4986 {
4987         unsigned int old_flags = dev->flags;
4988         kuid_t uid;
4989         kgid_t gid;
4990
4991         ASSERT_RTNL();
4992
4993         dev->flags |= IFF_PROMISC;
4994         dev->promiscuity += inc;
4995         if (dev->promiscuity == 0) {
4996                 /*
4997                  * Avoid overflow.
4998                  * If inc causes overflow, untouch promisc and return error.
4999                  */
5000                 if (inc < 0)
5001                         dev->flags &= ~IFF_PROMISC;
5002                 else {
5003                         dev->promiscuity -= inc;
5004                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5005                                 dev->name);
5006                         return -EOVERFLOW;
5007                 }
5008         }
5009         if (dev->flags != old_flags) {
5010                 pr_info("device %s %s promiscuous mode\n",
5011                         dev->name,
5012                         dev->flags & IFF_PROMISC ? "entered" : "left");
5013                 if (audit_enabled) {
5014                         current_uid_gid(&uid, &gid);
5015                         audit_log(current->audit_context, GFP_ATOMIC,
5016                                 AUDIT_ANOM_PROMISCUOUS,
5017                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5018                                 dev->name, (dev->flags & IFF_PROMISC),
5019                                 (old_flags & IFF_PROMISC),
5020                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5021                                 from_kuid(&init_user_ns, uid),
5022                                 from_kgid(&init_user_ns, gid),
5023                                 audit_get_sessionid(current));
5024                 }
5025
5026                 dev_change_rx_flags(dev, IFF_PROMISC);
5027         }
5028         if (notify)
5029                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5030         return 0;
5031 }
5032
5033 /**
5034  *      dev_set_promiscuity     - update promiscuity count on a device
5035  *      @dev: device
5036  *      @inc: modifier
5037  *
5038  *      Add or remove promiscuity from a device. While the count in the device
5039  *      remains above zero the interface remains promiscuous. Once it hits zero
5040  *      the device reverts back to normal filtering operation. A negative inc
5041  *      value is used to drop promiscuity on the device.
5042  *      Return 0 if successful or a negative errno code on error.
5043  */
5044 int dev_set_promiscuity(struct net_device *dev, int inc)
5045 {
5046         unsigned int old_flags = dev->flags;
5047         int err;
5048
5049         err = __dev_set_promiscuity(dev, inc, true);
5050         if (err < 0)
5051                 return err;
5052         if (dev->flags != old_flags)
5053                 dev_set_rx_mode(dev);
5054         return err;
5055 }
5056 EXPORT_SYMBOL(dev_set_promiscuity);
5057
5058 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5059 {
5060         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5061
5062         ASSERT_RTNL();
5063
5064         dev->flags |= IFF_ALLMULTI;
5065         dev->allmulti += inc;
5066         if (dev->allmulti == 0) {
5067                 /*
5068                  * Avoid overflow.
5069                  * If inc causes overflow, untouch allmulti and return error.
5070                  */
5071                 if (inc < 0)
5072                         dev->flags &= ~IFF_ALLMULTI;
5073                 else {
5074                         dev->allmulti -= inc;
5075                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5076                                 dev->name);
5077                         return -EOVERFLOW;
5078                 }
5079         }
5080         if (dev->flags ^ old_flags) {
5081                 dev_change_rx_flags(dev, IFF_ALLMULTI);
5082                 dev_set_rx_mode(dev);
5083                 if (notify)
5084                         __dev_notify_flags(dev, old_flags,
5085                                            dev->gflags ^ old_gflags);
5086         }
5087         return 0;
5088 }
5089
5090 /**
5091  *      dev_set_allmulti        - update allmulti count on a device
5092  *      @dev: device
5093  *      @inc: modifier
5094  *
5095  *      Add or remove reception of all multicast frames to a device. While the
5096  *      count in the device remains above zero the interface remains listening
5097  *      to all interfaces. Once it hits zero the device reverts back to normal
5098  *      filtering operation. A negative @inc value is used to drop the counter
5099  *      when releasing a resource needing all multicasts.
5100  *      Return 0 if successful or a negative errno code on error.
5101  */
5102
5103 int dev_set_allmulti(struct net_device *dev, int inc)
5104 {
5105         return __dev_set_allmulti(dev, inc, true);
5106 }
5107 EXPORT_SYMBOL(dev_set_allmulti);
5108
5109 /*
5110  *      Upload unicast and multicast address lists to device and
5111  *      configure RX filtering. When the device doesn't support unicast
5112  *      filtering it is put in promiscuous mode while unicast addresses
5113  *      are present.
5114  */
5115 void __dev_set_rx_mode(struct net_device *dev)
5116 {
5117         const struct net_device_ops *ops = dev->netdev_ops;
5118
5119         /* dev_open will call this function so the list will stay sane. */
5120         if (!(dev->flags&IFF_UP))
5121                 return;
5122
5123         if (!netif_device_present(dev))
5124                 return;
5125
5126         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5127                 /* Unicast addresses changes may only happen under the rtnl,
5128                  * therefore calling __dev_set_promiscuity here is safe.
5129                  */
5130                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5131                         __dev_set_promiscuity(dev, 1, false);
5132                         dev->uc_promisc = true;
5133                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5134                         __dev_set_promiscuity(dev, -1, false);
5135                         dev->uc_promisc = false;
5136                 }
5137         }
5138
5139         if (ops->ndo_set_rx_mode)
5140                 ops->ndo_set_rx_mode(dev);
5141 }
5142
5143 void dev_set_rx_mode(struct net_device *dev)
5144 {
5145         netif_addr_lock_bh(dev);
5146         __dev_set_rx_mode(dev);
5147         netif_addr_unlock_bh(dev);
5148 }
5149
5150 /**
5151  *      dev_get_flags - get flags reported to userspace
5152  *      @dev: device
5153  *
5154  *      Get the combination of flag bits exported through APIs to userspace.
5155  */
5156 unsigned int dev_get_flags(const struct net_device *dev)
5157 {
5158         unsigned int flags;
5159
5160         flags = (dev->flags & ~(IFF_PROMISC |
5161                                 IFF_ALLMULTI |
5162                                 IFF_RUNNING |
5163                                 IFF_LOWER_UP |
5164                                 IFF_DORMANT)) |
5165                 (dev->gflags & (IFF_PROMISC |
5166                                 IFF_ALLMULTI));
5167
5168         if (netif_running(dev)) {
5169                 if (netif_oper_up(dev))
5170                         flags |= IFF_RUNNING;
5171                 if (netif_carrier_ok(dev))
5172                         flags |= IFF_LOWER_UP;
5173                 if (netif_dormant(dev))
5174                         flags |= IFF_DORMANT;
5175         }
5176
5177         return flags;
5178 }
5179 EXPORT_SYMBOL(dev_get_flags);
5180
5181 int __dev_change_flags(struct net_device *dev, unsigned int flags)
5182 {
5183         unsigned int old_flags = dev->flags;
5184         int ret;
5185
5186         ASSERT_RTNL();
5187
5188         /*
5189          *      Set the flags on our device.
5190          */
5191
5192         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
5193                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
5194                                IFF_AUTOMEDIA)) |
5195                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5196                                     IFF_ALLMULTI));
5197
5198         /*
5199          *      Load in the correct multicast list now the flags have changed.
5200          */
5201
5202         if ((old_flags ^ flags) & IFF_MULTICAST)
5203                 dev_change_rx_flags(dev, IFF_MULTICAST);
5204
5205         dev_set_rx_mode(dev);
5206
5207         /*
5208          *      Have we downed the interface. We handle IFF_UP ourselves
5209          *      according to user attempts to set it, rather than blindly
5210          *      setting it.
5211          */
5212
5213         ret = 0;
5214         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
5215                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
5216
5217                 if (!ret)
5218                         dev_set_rx_mode(dev);
5219         }
5220
5221         if ((flags ^ dev->gflags) & IFF_PROMISC) {
5222                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5223                 unsigned int old_flags = dev->flags;
5224
5225                 dev->gflags ^= IFF_PROMISC;
5226
5227                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
5228                         if (dev->flags != old_flags)
5229                                 dev_set_rx_mode(dev);
5230         }
5231
5232         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
5233            is important. Some (broken) drivers set IFF_PROMISC, when
5234            IFF_ALLMULTI is requested not asking us and not reporting.
5235          */
5236         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
5237                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5238
5239                 dev->gflags ^= IFF_ALLMULTI;
5240                 __dev_set_allmulti(dev, inc, false);
5241         }
5242
5243         return ret;
5244 }
5245
5246 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
5247                         unsigned int gchanges)
5248 {
5249         unsigned int changes = dev->flags ^ old_flags;
5250
5251         if (gchanges)
5252                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
5253
5254         if (changes & IFF_UP) {
5255                 if (dev->flags & IFF_UP)
5256                         call_netdevice_notifiers(NETDEV_UP, dev);
5257                 else
5258                         call_netdevice_notifiers(NETDEV_DOWN, dev);
5259         }
5260
5261         if (dev->flags & IFF_UP &&
5262             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
5263                 struct netdev_notifier_change_info change_info;
5264
5265                 change_info.flags_changed = changes;
5266                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
5267                                               &change_info.info);
5268         }
5269 }
5270
5271 /**
5272  *      dev_change_flags - change device settings
5273  *      @dev: device
5274  *      @flags: device state flags
5275  *
5276  *      Change settings on device based state flags. The flags are
5277  *      in the userspace exported format.
5278  */
5279 int dev_change_flags(struct net_device *dev, unsigned int flags)
5280 {
5281         int ret;
5282         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
5283
5284         ret = __dev_change_flags(dev, flags);
5285         if (ret < 0)
5286                 return ret;
5287
5288         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
5289         __dev_notify_flags(dev, old_flags, changes);
5290         return ret;
5291 }
5292 EXPORT_SYMBOL(dev_change_flags);
5293
5294 /**
5295  *      dev_set_mtu - Change maximum transfer unit
5296  *      @dev: device
5297  *      @new_mtu: new transfer unit
5298  *
5299  *      Change the maximum transfer size of the network device.
5300  */
5301 int dev_set_mtu(struct net_device *dev, int new_mtu)
5302 {
5303         const struct net_device_ops *ops = dev->netdev_ops;
5304         int err;
5305
5306         if (new_mtu == dev->mtu)
5307                 return 0;
5308
5309         /*      MTU must be positive.    */
5310         if (new_mtu < 0)
5311                 return -EINVAL;
5312
5313         if (!netif_device_present(dev))
5314                 return -ENODEV;
5315
5316         err = 0;
5317         if (ops->ndo_change_mtu)
5318                 err = ops->ndo_change_mtu(dev, new_mtu);
5319         else
5320                 dev->mtu = new_mtu;
5321
5322         if (!err)
5323                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
5324         return err;
5325 }
5326 EXPORT_SYMBOL(dev_set_mtu);
5327
5328 /**
5329  *      dev_set_group - Change group this device belongs to
5330  *      @dev: device
5331  *      @new_group: group this device should belong to
5332  */
5333 void dev_set_group(struct net_device *dev, int new_group)
5334 {
5335         dev->group = new_group;
5336 }
5337 EXPORT_SYMBOL(dev_set_group);
5338
5339 /**
5340  *      dev_set_mac_address - Change Media Access Control Address
5341  *      @dev: device
5342  *      @sa: new address
5343  *
5344  *      Change the hardware (MAC) address of the device
5345  */
5346 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5347 {
5348         const struct net_device_ops *ops = dev->netdev_ops;
5349         int err;
5350
5351         if (!ops->ndo_set_mac_address)
5352                 return -EOPNOTSUPP;
5353         if (sa->sa_family != dev->type)
5354                 return -EINVAL;
5355         if (!netif_device_present(dev))
5356                 return -ENODEV;
5357         err = ops->ndo_set_mac_address(dev, sa);
5358         if (err)
5359                 return err;
5360         dev->addr_assign_type = NET_ADDR_SET;
5361         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
5362         add_device_randomness(dev->dev_addr, dev->addr_len);
5363         return 0;
5364 }
5365 EXPORT_SYMBOL(dev_set_mac_address);
5366
5367 /**
5368  *      dev_change_carrier - Change device carrier
5369  *      @dev: device
5370  *      @new_carrier: new value
5371  *
5372  *      Change device carrier
5373  */
5374 int dev_change_carrier(struct net_device *dev, bool new_carrier)
5375 {
5376         const struct net_device_ops *ops = dev->netdev_ops;
5377
5378         if (!ops->ndo_change_carrier)
5379                 return -EOPNOTSUPP;
5380         if (!netif_device_present(dev))
5381                 return -ENODEV;
5382         return ops->ndo_change_carrier(dev, new_carrier);
5383 }
5384 EXPORT_SYMBOL(dev_change_carrier);
5385
5386 /**
5387  *      dev_get_phys_port_id - Get device physical port ID
5388  *      @dev: device
5389  *      @ppid: port ID
5390  *
5391  *      Get device physical port ID
5392  */
5393 int dev_get_phys_port_id(struct net_device *dev,
5394                          struct netdev_phys_port_id *ppid)
5395 {
5396         const struct net_device_ops *ops = dev->netdev_ops;
5397
5398         if (!ops->ndo_get_phys_port_id)
5399                 return -EOPNOTSUPP;
5400         return ops->ndo_get_phys_port_id(dev, ppid);
5401 }
5402 EXPORT_SYMBOL(dev_get_phys_port_id);
5403
5404 /**
5405  *      dev_new_index   -       allocate an ifindex
5406  *      @net: the applicable net namespace
5407  *
5408  *      Returns a suitable unique value for a new device interface
5409  *      number.  The caller must hold the rtnl semaphore or the
5410  *      dev_base_lock to be sure it remains unique.
5411  */
5412 static int dev_new_index(struct net *net)
5413 {
5414         int ifindex = net->ifindex;
5415         for (;;) {
5416                 if (++ifindex <= 0)
5417                         ifindex = 1;
5418                 if (!__dev_get_by_index(net, ifindex))
5419                         return net->ifindex = ifindex;
5420         }
5421 }
5422
5423 /* Delayed registration/unregisteration */
5424 static LIST_HEAD(net_todo_list);
5425 static DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
5426
5427 static void net_set_todo(struct net_device *dev)
5428 {
5429         list_add_tail(&dev->todo_list, &net_todo_list);
5430         dev_net(dev)->dev_unreg_count++;
5431 }
5432
5433 static void rollback_registered_many(struct list_head *head)
5434 {
5435         struct net_device *dev, *tmp;
5436         LIST_HEAD(close_head);
5437
5438         BUG_ON(dev_boot_phase);
5439         ASSERT_RTNL();
5440
5441         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5442                 /* Some devices call without registering
5443                  * for initialization unwind. Remove those
5444                  * devices and proceed with the remaining.
5445                  */
5446                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5447                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5448                                  dev->name, dev);
5449
5450                         WARN_ON(1);
5451                         list_del(&dev->unreg_list);
5452                         continue;
5453                 }
5454                 dev->dismantle = true;
5455                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5456         }
5457
5458         /* If device is running, close it first. */
5459         list_for_each_entry(dev, head, unreg_list)
5460                 list_add_tail(&dev->close_list, &close_head);
5461         dev_close_many(&close_head);
5462
5463         list_for_each_entry(dev, head, unreg_list) {
5464                 /* And unlink it from device chain. */
5465                 unlist_netdevice(dev);
5466
5467                 dev->reg_state = NETREG_UNREGISTERING;
5468         }
5469
5470         synchronize_net();
5471
5472         list_for_each_entry(dev, head, unreg_list) {
5473                 /* Shutdown queueing discipline. */
5474                 dev_shutdown(dev);
5475
5476
5477                 /* Notify protocols, that we are about to destroy
5478                    this device. They should clean all the things.
5479                 */
5480                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5481
5482                 if (!dev->rtnl_link_ops ||
5483                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5484                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
5485
5486                 /*
5487                  *      Flush the unicast and multicast chains
5488                  */
5489                 dev_uc_flush(dev);
5490                 dev_mc_flush(dev);
5491
5492                 if (dev->netdev_ops->ndo_uninit)
5493                         dev->netdev_ops->ndo_uninit(dev);
5494
5495                 /* Notifier chain MUST detach us all upper devices. */
5496                 WARN_ON(netdev_has_any_upper_dev(dev));
5497
5498                 /* Remove entries from kobject tree */
5499                 netdev_unregister_kobject(dev);
5500 #ifdef CONFIG_XPS
5501                 /* Remove XPS queueing entries */
5502                 netif_reset_xps_queues_gt(dev, 0);
5503 #endif
5504         }
5505
5506         synchronize_net();
5507
5508         list_for_each_entry(dev, head, unreg_list)
5509                 dev_put(dev);
5510 }
5511
5512 static void rollback_registered(struct net_device *dev)
5513 {
5514         LIST_HEAD(single);
5515
5516         list_add(&dev->unreg_list, &single);
5517         rollback_registered_many(&single);
5518         list_del(&single);
5519 }
5520
5521 static netdev_features_t netdev_fix_features(struct net_device *dev,
5522         netdev_features_t features)
5523 {
5524         /* Fix illegal checksum combinations */
5525         if ((features & NETIF_F_HW_CSUM) &&
5526             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5527                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5528                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5529         }
5530
5531         /* TSO requires that SG is present as well. */
5532         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5533                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5534                 features &= ~NETIF_F_ALL_TSO;
5535         }
5536
5537         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5538                                         !(features & NETIF_F_IP_CSUM)) {
5539                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5540                 features &= ~NETIF_F_TSO;
5541                 features &= ~NETIF_F_TSO_ECN;
5542         }
5543
5544         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5545                                          !(features & NETIF_F_IPV6_CSUM)) {
5546                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5547                 features &= ~NETIF_F_TSO6;
5548         }
5549
5550         /* TSO ECN requires that TSO is present as well. */
5551         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5552                 features &= ~NETIF_F_TSO_ECN;
5553
5554         /* Software GSO depends on SG. */
5555         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5556                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5557                 features &= ~NETIF_F_GSO;
5558         }
5559
5560         /* UFO needs SG and checksumming */
5561         if (features & NETIF_F_UFO) {
5562                 /* maybe split UFO into V4 and V6? */
5563                 if (!((features & NETIF_F_GEN_CSUM) ||
5564                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5565                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5566                         netdev_dbg(dev,
5567                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5568                         features &= ~NETIF_F_UFO;
5569                 }
5570
5571                 if (!(features & NETIF_F_SG)) {
5572                         netdev_dbg(dev,
5573                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5574                         features &= ~NETIF_F_UFO;
5575                 }
5576         }
5577
5578         return features;
5579 }
5580
5581 int __netdev_update_features(struct net_device *dev)
5582 {
5583         netdev_features_t features;
5584         int err = 0;
5585
5586         ASSERT_RTNL();
5587
5588         features = netdev_get_wanted_features(dev);
5589
5590         if (dev->netdev_ops->ndo_fix_features)
5591                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5592
5593         /* driver might be less strict about feature dependencies */
5594         features = netdev_fix_features(dev, features);
5595
5596         if (dev->features == features)
5597                 return 0;
5598
5599         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5600                 &dev->features, &features);
5601
5602         if (dev->netdev_ops->ndo_set_features)
5603                 err = dev->netdev_ops->ndo_set_features(dev, features);
5604
5605         if (unlikely(err < 0)) {
5606                 netdev_err(dev,
5607                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5608                         err, &features, &dev->features);
5609                 return -1;
5610         }
5611
5612         if (!err)
5613                 dev->features = features;
5614
5615         return 1;
5616 }
5617
5618 /**
5619  *      netdev_update_features - recalculate device features
5620  *      @dev: the device to check
5621  *
5622  *      Recalculate dev->features set and send notifications if it
5623  *      has changed. Should be called after driver or hardware dependent
5624  *      conditions might have changed that influence the features.
5625  */
5626 void netdev_update_features(struct net_device *dev)
5627 {
5628         if (__netdev_update_features(dev))
5629                 netdev_features_change(dev);
5630 }
5631 EXPORT_SYMBOL(netdev_update_features);
5632
5633 /**
5634  *      netdev_change_features - recalculate device features
5635  *      @dev: the device to check
5636  *
5637  *      Recalculate dev->features set and send notifications even
5638  *      if they have not changed. Should be called instead of
5639  *      netdev_update_features() if also dev->vlan_features might
5640  *      have changed to allow the changes to be propagated to stacked
5641  *      VLAN devices.
5642  */
5643 void netdev_change_features(struct net_device *dev)
5644 {
5645         __netdev_update_features(dev);
5646         netdev_features_change(dev);
5647 }
5648 EXPORT_SYMBOL(netdev_change_features);
5649
5650 /**
5651  *      netif_stacked_transfer_operstate -      transfer operstate
5652  *      @rootdev: the root or lower level device to transfer state from
5653  *      @dev: the device to transfer operstate to
5654  *
5655  *      Transfer operational state from root to device. This is normally
5656  *      called when a stacking relationship exists between the root
5657  *      device and the device(a leaf device).
5658  */
5659 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5660                                         struct net_device *dev)
5661 {
5662         if (rootdev->operstate == IF_OPER_DORMANT)
5663                 netif_dormant_on(dev);
5664         else
5665                 netif_dormant_off(dev);
5666
5667         if (netif_carrier_ok(rootdev)) {
5668                 if (!netif_carrier_ok(dev))
5669                         netif_carrier_on(dev);
5670         } else {
5671                 if (netif_carrier_ok(dev))
5672                         netif_carrier_off(dev);
5673         }
5674 }
5675 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5676
5677 #ifdef CONFIG_RPS
5678 static int netif_alloc_rx_queues(struct net_device *dev)
5679 {
5680         unsigned int i, count = dev->num_rx_queues;
5681         struct netdev_rx_queue *rx;
5682
5683         BUG_ON(count < 1);
5684
5685         rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5686         if (!rx)
5687                 return -ENOMEM;
5688
5689         dev->_rx = rx;
5690
5691         for (i = 0; i < count; i++)
5692                 rx[i].dev = dev;
5693         return 0;
5694 }
5695 #endif
5696
5697 static void netdev_init_one_queue(struct net_device *dev,
5698                                   struct netdev_queue *queue, void *_unused)
5699 {
5700         /* Initialize queue lock */
5701         spin_lock_init(&queue->_xmit_lock);
5702         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5703         queue->xmit_lock_owner = -1;
5704         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5705         queue->dev = dev;
5706 #ifdef CONFIG_BQL
5707         dql_init(&queue->dql, HZ);
5708 #endif
5709 }
5710
5711 static void netif_free_tx_queues(struct net_device *dev)
5712 {
5713         if (is_vmalloc_addr(dev->_tx))
5714                 vfree(dev->_tx);
5715         else
5716                 kfree(dev->_tx);
5717 }
5718
5719 static int netif_alloc_netdev_queues(struct net_device *dev)
5720 {
5721         unsigned int count = dev->num_tx_queues;
5722         struct netdev_queue *tx;
5723         size_t sz = count * sizeof(*tx);
5724
5725         BUG_ON(count < 1 || count > 0xffff);
5726
5727         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5728         if (!tx) {
5729                 tx = vzalloc(sz);
5730                 if (!tx)
5731                         return -ENOMEM;
5732         }
5733         dev->_tx = tx;
5734
5735         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5736         spin_lock_init(&dev->tx_global_lock);
5737
5738         return 0;
5739 }
5740
5741 /**
5742  *      register_netdevice      - register a network device
5743  *      @dev: device to register
5744  *
5745  *      Take a completed network device structure and add it to the kernel
5746  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5747  *      chain. 0 is returned on success. A negative errno code is returned
5748  *      on a failure to set up the device, or if the name is a duplicate.
5749  *
5750  *      Callers must hold the rtnl semaphore. You may want
5751  *      register_netdev() instead of this.
5752  *
5753  *      BUGS:
5754  *      The locking appears insufficient to guarantee two parallel registers
5755  *      will not get the same name.
5756  */
5757
5758 int register_netdevice(struct net_device *dev)
5759 {
5760         int ret;
5761         struct net *net = dev_net(dev);
5762
5763         BUG_ON(dev_boot_phase);
5764         ASSERT_RTNL();
5765
5766         might_sleep();
5767
5768         /* When net_device's are persistent, this will be fatal. */
5769         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5770         BUG_ON(!net);
5771
5772         spin_lock_init(&dev->addr_list_lock);
5773         netdev_set_addr_lockdep_class(dev);
5774
5775         dev->iflink = -1;
5776
5777         ret = dev_get_valid_name(net, dev, dev->name);
5778         if (ret < 0)
5779                 goto out;
5780
5781         /* Init, if this function is available */
5782         if (dev->netdev_ops->ndo_init) {
5783                 ret = dev->netdev_ops->ndo_init(dev);
5784                 if (ret) {
5785                         if (ret > 0)
5786                                 ret = -EIO;
5787                         goto out;
5788                 }
5789         }
5790
5791         if (((dev->hw_features | dev->features) &
5792              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5793             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5794              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5795                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5796                 ret = -EINVAL;
5797                 goto err_uninit;
5798         }
5799
5800         ret = -EBUSY;
5801         if (!dev->ifindex)
5802                 dev->ifindex = dev_new_index(net);
5803         else if (__dev_get_by_index(net, dev->ifindex))
5804                 goto err_uninit;
5805
5806         if (dev->iflink == -1)
5807                 dev->iflink = dev->ifindex;
5808
5809         /* Transfer changeable features to wanted_features and enable
5810          * software offloads (GSO and GRO).
5811          */
5812         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5813         dev->features |= NETIF_F_SOFT_FEATURES;
5814         dev->wanted_features = dev->features & dev->hw_features;
5815
5816         /* Turn on no cache copy if HW is doing checksum */
5817         if (!(dev->flags & IFF_LOOPBACK)) {
5818                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5819                 if (dev->features & NETIF_F_ALL_CSUM) {
5820                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5821                         dev->features |= NETIF_F_NOCACHE_COPY;
5822                 }
5823         }
5824
5825         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5826          */
5827         dev->vlan_features |= NETIF_F_HIGHDMA;
5828
5829         /* Make NETIF_F_SG inheritable to tunnel devices.
5830          */
5831         dev->hw_enc_features |= NETIF_F_SG;
5832
5833         /* Make NETIF_F_SG inheritable to MPLS.
5834          */
5835         dev->mpls_features |= NETIF_F_SG;
5836
5837         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5838         ret = notifier_to_errno(ret);
5839         if (ret)
5840                 goto err_uninit;
5841
5842         ret = netdev_register_kobject(dev);
5843         if (ret)
5844                 goto err_uninit;
5845         dev->reg_state = NETREG_REGISTERED;
5846
5847         __netdev_update_features(dev);
5848
5849         /*
5850          *      Default initial state at registry is that the
5851          *      device is present.
5852          */
5853
5854         set_bit(__LINK_STATE_PRESENT, &dev->state);
5855
5856         linkwatch_init_dev(dev);
5857
5858         dev_init_scheduler(dev);
5859         dev_hold(dev);
5860         list_netdevice(dev);
5861         add_device_randomness(dev->dev_addr, dev->addr_len);
5862
5863         /* If the device has permanent device address, driver should
5864          * set dev_addr and also addr_assign_type should be set to
5865          * NET_ADDR_PERM (default value).
5866          */
5867         if (dev->addr_assign_type == NET_ADDR_PERM)
5868                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5869
5870         /* Notify protocols, that a new device appeared. */
5871         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5872         ret = notifier_to_errno(ret);
5873         if (ret) {
5874                 rollback_registered(dev);
5875                 dev->reg_state = NETREG_UNREGISTERED;
5876         }
5877         /*
5878          *      Prevent userspace races by waiting until the network
5879          *      device is fully setup before sending notifications.
5880          */
5881         if (!dev->rtnl_link_ops ||
5882             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5883                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
5884
5885 out:
5886         return ret;
5887
5888 err_uninit:
5889         if (dev->netdev_ops->ndo_uninit)
5890                 dev->netdev_ops->ndo_uninit(dev);
5891         goto out;
5892 }
5893 EXPORT_SYMBOL(register_netdevice);
5894
5895 /**
5896  *      init_dummy_netdev       - init a dummy network device for NAPI
5897  *      @dev: device to init
5898  *
5899  *      This takes a network device structure and initialize the minimum
5900  *      amount of fields so it can be used to schedule NAPI polls without
5901  *      registering a full blown interface. This is to be used by drivers
5902  *      that need to tie several hardware interfaces to a single NAPI
5903  *      poll scheduler due to HW limitations.
5904  */
5905 int init_dummy_netdev(struct net_device *dev)
5906 {
5907         /* Clear everything. Note we don't initialize spinlocks
5908          * are they aren't supposed to be taken by any of the
5909          * NAPI code and this dummy netdev is supposed to be
5910          * only ever used for NAPI polls
5911          */
5912         memset(dev, 0, sizeof(struct net_device));
5913
5914         /* make sure we BUG if trying to hit standard
5915          * register/unregister code path
5916          */
5917         dev->reg_state = NETREG_DUMMY;
5918
5919         /* NAPI wants this */
5920         INIT_LIST_HEAD(&dev->napi_list);
5921
5922         /* a dummy interface is started by default */
5923         set_bit(__LINK_STATE_PRESENT, &dev->state);
5924         set_bit(__LINK_STATE_START, &dev->state);
5925
5926         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5927          * because users of this 'device' dont need to change
5928          * its refcount.
5929          */
5930
5931         return 0;
5932 }
5933 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5934
5935
5936 /**
5937  *      register_netdev - register a network device
5938  *      @dev: device to register
5939  *
5940  *      Take a completed network device structure and add it to the kernel
5941  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5942  *      chain. 0 is returned on success. A negative errno code is returned
5943  *      on a failure to set up the device, or if the name is a duplicate.
5944  *
5945  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5946  *      and expands the device name if you passed a format string to
5947  *      alloc_netdev.
5948  */
5949 int register_netdev(struct net_device *dev)
5950 {
5951         int err;
5952
5953         rtnl_lock();
5954         err = register_netdevice(dev);
5955         rtnl_unlock();
5956         return err;
5957 }
5958 EXPORT_SYMBOL(register_netdev);
5959
5960 int netdev_refcnt_read(const struct net_device *dev)
5961 {
5962         int i, refcnt = 0;
5963
5964         for_each_possible_cpu(i)
5965                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5966         return refcnt;
5967 }
5968 EXPORT_SYMBOL(netdev_refcnt_read);
5969
5970 /**
5971  * netdev_wait_allrefs - wait until all references are gone.
5972  * @dev: target net_device
5973  *
5974  * This is called when unregistering network devices.
5975  *
5976  * Any protocol or device that holds a reference should register
5977  * for netdevice notification, and cleanup and put back the
5978  * reference if they receive an UNREGISTER event.
5979  * We can get stuck here if buggy protocols don't correctly
5980  * call dev_put.
5981  */
5982 static void netdev_wait_allrefs(struct net_device *dev)
5983 {
5984         unsigned long rebroadcast_time, warning_time;
5985         int refcnt;
5986
5987         linkwatch_forget_dev(dev);
5988
5989         rebroadcast_time = warning_time = jiffies;
5990         refcnt = netdev_refcnt_read(dev);
5991
5992         while (refcnt != 0) {
5993                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5994                         rtnl_lock();
5995
5996                         /* Rebroadcast unregister notification */
5997                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5998
5999                         __rtnl_unlock();
6000                         rcu_barrier();
6001                         rtnl_lock();
6002
6003                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6004                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6005                                      &dev->state)) {
6006                                 /* We must not have linkwatch events
6007                                  * pending on unregister. If this
6008                                  * happens, we simply run the queue
6009                                  * unscheduled, resulting in a noop
6010                                  * for this device.
6011                                  */
6012                                 linkwatch_run_queue();
6013                         }
6014
6015                         __rtnl_unlock();
6016
6017                         rebroadcast_time = jiffies;
6018                 }
6019
6020                 msleep(250);
6021
6022                 refcnt = netdev_refcnt_read(dev);
6023
6024                 if (time_after(jiffies, warning_time + 10 * HZ)) {
6025                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6026                                  dev->name, refcnt);
6027                         warning_time = jiffies;
6028                 }
6029         }
6030 }
6031
6032 /* The sequence is:
6033  *
6034  *      rtnl_lock();
6035  *      ...
6036  *      register_netdevice(x1);
6037  *      register_netdevice(x2);
6038  *      ...
6039  *      unregister_netdevice(y1);
6040  *      unregister_netdevice(y2);
6041  *      ...
6042  *      rtnl_unlock();
6043  *      free_netdev(y1);
6044  *      free_netdev(y2);
6045  *
6046  * We are invoked by rtnl_unlock().
6047  * This allows us to deal with problems:
6048  * 1) We can delete sysfs objects which invoke hotplug
6049  *    without deadlocking with linkwatch via keventd.
6050  * 2) Since we run with the RTNL semaphore not held, we can sleep
6051  *    safely in order to wait for the netdev refcnt to drop to zero.
6052  *
6053  * We must not return until all unregister events added during
6054  * the interval the lock was held have been completed.
6055  */
6056 void netdev_run_todo(void)
6057 {
6058         struct list_head list;
6059
6060         /* Snapshot list, allow later requests */
6061         list_replace_init(&net_todo_list, &list);
6062
6063         __rtnl_unlock();
6064
6065
6066         /* Wait for rcu callbacks to finish before next phase */
6067         if (!list_empty(&list))
6068                 rcu_barrier();
6069
6070         while (!list_empty(&list)) {
6071                 struct net_device *dev
6072                         = list_first_entry(&list, struct net_device, todo_list);
6073                 list_del(&dev->todo_list);
6074
6075                 rtnl_lock();
6076                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6077                 __rtnl_unlock();
6078
6079                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
6080                         pr_err("network todo '%s' but state %d\n",
6081                                dev->name, dev->reg_state);
6082                         dump_stack();
6083                         continue;
6084                 }
6085
6086                 dev->reg_state = NETREG_UNREGISTERED;
6087
6088                 on_each_cpu(flush_backlog, dev, 1);
6089
6090                 netdev_wait_allrefs(dev);
6091
6092                 /* paranoia */
6093                 BUG_ON(netdev_refcnt_read(dev));
6094                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
6095                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
6096                 WARN_ON(dev->dn_ptr);
6097
6098                 if (dev->destructor)
6099                         dev->destructor(dev);
6100
6101                 /* Report a network device has been unregistered */
6102                 rtnl_lock();
6103                 dev_net(dev)->dev_unreg_count--;
6104                 __rtnl_unlock();
6105                 wake_up(&netdev_unregistering_wq);
6106
6107                 /* Free network device */
6108                 kobject_put(&dev->dev.kobj);
6109         }
6110 }
6111
6112 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
6113  * fields in the same order, with only the type differing.
6114  */
6115 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
6116                              const struct net_device_stats *netdev_stats)
6117 {
6118 #if BITS_PER_LONG == 64
6119         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
6120         memcpy(stats64, netdev_stats, sizeof(*stats64));
6121 #else
6122         size_t i, n = sizeof(*stats64) / sizeof(u64);
6123         const unsigned long *src = (const unsigned long *)netdev_stats;
6124         u64 *dst = (u64 *)stats64;
6125
6126         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
6127                      sizeof(*stats64) / sizeof(u64));
6128         for (i = 0; i < n; i++)
6129                 dst[i] = src[i];
6130 #endif
6131 }
6132 EXPORT_SYMBOL(netdev_stats_to_stats64);
6133
6134 /**
6135  *      dev_get_stats   - get network device statistics
6136  *      @dev: device to get statistics from
6137  *      @storage: place to store stats
6138  *
6139  *      Get network statistics from device. Return @storage.
6140  *      The device driver may provide its own method by setting
6141  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
6142  *      otherwise the internal statistics structure is used.
6143  */
6144 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
6145                                         struct rtnl_link_stats64 *storage)
6146 {
6147         const struct net_device_ops *ops = dev->netdev_ops;
6148
6149         if (ops->ndo_get_stats64) {
6150                 memset(storage, 0, sizeof(*storage));
6151                 ops->ndo_get_stats64(dev, storage);
6152         } else if (ops->ndo_get_stats) {
6153                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
6154         } else {
6155                 netdev_stats_to_stats64(storage, &dev->stats);
6156         }
6157         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
6158         return storage;
6159 }
6160 EXPORT_SYMBOL(dev_get_stats);
6161
6162 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
6163 {
6164         struct netdev_queue *queue = dev_ingress_queue(dev);
6165
6166 #ifdef CONFIG_NET_CLS_ACT
6167         if (queue)
6168                 return queue;
6169         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
6170         if (!queue)
6171                 return NULL;
6172         netdev_init_one_queue(dev, queue, NULL);
6173         queue->qdisc = &noop_qdisc;
6174         queue->qdisc_sleeping = &noop_qdisc;
6175         rcu_assign_pointer(dev->ingress_queue, queue);
6176 #endif
6177         return queue;
6178 }
6179
6180 static const struct ethtool_ops default_ethtool_ops;
6181
6182 void netdev_set_default_ethtool_ops(struct net_device *dev,
6183                                     const struct ethtool_ops *ops)
6184 {
6185         if (dev->ethtool_ops == &default_ethtool_ops)
6186                 dev->ethtool_ops = ops;
6187 }
6188 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
6189
6190 void netdev_freemem(struct net_device *dev)
6191 {
6192         char *addr = (char *)dev - dev->padded;
6193
6194         if (is_vmalloc_addr(addr))
6195                 vfree(addr);
6196         else
6197                 kfree(addr);
6198 }
6199
6200 /**
6201  *      alloc_netdev_mqs - allocate network device
6202  *      @sizeof_priv:   size of private data to allocate space for
6203  *      @name:          device name format string
6204  *      @setup:         callback to initialize device
6205  *      @txqs:          the number of TX subqueues to allocate
6206  *      @rxqs:          the number of RX subqueues to allocate
6207  *
6208  *      Allocates a struct net_device with private data area for driver use
6209  *      and performs basic initialization.  Also allocates subquue structs
6210  *      for each queue on the device.
6211  */
6212 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
6213                 void (*setup)(struct net_device *),
6214                 unsigned int txqs, unsigned int rxqs)
6215 {
6216         struct net_device *dev;
6217         size_t alloc_size;
6218         struct net_device *p;
6219
6220         BUG_ON(strlen(name) >= sizeof(dev->name));
6221
6222         if (txqs < 1) {
6223                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
6224                 return NULL;
6225         }
6226
6227 #ifdef CONFIG_RPS
6228         if (rxqs < 1) {
6229                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
6230                 return NULL;
6231         }
6232 #endif
6233
6234         alloc_size = sizeof(struct net_device);
6235         if (sizeof_priv) {
6236                 /* ensure 32-byte alignment of private area */
6237                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
6238                 alloc_size += sizeof_priv;
6239         }
6240         /* ensure 32-byte alignment of whole construct */
6241         alloc_size += NETDEV_ALIGN - 1;
6242
6243         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6244         if (!p)
6245                 p = vzalloc(alloc_size);
6246         if (!p)
6247                 return NULL;
6248
6249         dev = PTR_ALIGN(p, NETDEV_ALIGN);
6250         dev->padded = (char *)dev - (char *)p;
6251
6252         dev->pcpu_refcnt = alloc_percpu(int);
6253         if (!dev->pcpu_refcnt)
6254                 goto free_dev;
6255
6256         if (dev_addr_init(dev))
6257                 goto free_pcpu;
6258
6259         dev_mc_init(dev);
6260         dev_uc_init(dev);
6261
6262         dev_net_set(dev, &init_net);
6263
6264         dev->gso_max_size = GSO_MAX_SIZE;
6265         dev->gso_max_segs = GSO_MAX_SEGS;
6266
6267         INIT_LIST_HEAD(&dev->napi_list);
6268         INIT_LIST_HEAD(&dev->unreg_list);
6269         INIT_LIST_HEAD(&dev->close_list);
6270         INIT_LIST_HEAD(&dev->link_watch_list);
6271         INIT_LIST_HEAD(&dev->adj_list.upper);
6272         INIT_LIST_HEAD(&dev->adj_list.lower);
6273         INIT_LIST_HEAD(&dev->all_adj_list.upper);
6274         INIT_LIST_HEAD(&dev->all_adj_list.lower);
6275         dev->priv_flags = IFF_XMIT_DST_RELEASE;
6276         setup(dev);
6277
6278         dev->num_tx_queues = txqs;
6279         dev->real_num_tx_queues = txqs;
6280         if (netif_alloc_netdev_queues(dev))
6281                 goto free_all;
6282
6283 #ifdef CONFIG_RPS
6284         dev->num_rx_queues = rxqs;
6285         dev->real_num_rx_queues = rxqs;
6286         if (netif_alloc_rx_queues(dev))
6287                 goto free_all;
6288 #endif
6289
6290         strcpy(dev->name, name);
6291         dev->group = INIT_NETDEV_GROUP;
6292         if (!dev->ethtool_ops)
6293                 dev->ethtool_ops = &default_ethtool_ops;
6294         return dev;
6295
6296 free_all:
6297         free_netdev(dev);
6298         return NULL;
6299
6300 free_pcpu:
6301         free_percpu(dev->pcpu_refcnt);
6302         netif_free_tx_queues(dev);
6303 #ifdef CONFIG_RPS
6304         kfree(dev->_rx);
6305 #endif
6306
6307 free_dev:
6308         netdev_freemem(dev);
6309         return NULL;
6310 }
6311 EXPORT_SYMBOL(alloc_netdev_mqs);
6312
6313 /**
6314  *      free_netdev - free network device
6315  *      @dev: device
6316  *
6317  *      This function does the last stage of destroying an allocated device
6318  *      interface. The reference to the device object is released.
6319  *      If this is the last reference then it will be freed.
6320  */
6321 void free_netdev(struct net_device *dev)
6322 {
6323         struct napi_struct *p, *n;
6324
6325         release_net(dev_net(dev));
6326
6327         netif_free_tx_queues(dev);
6328 #ifdef CONFIG_RPS
6329         kfree(dev->_rx);
6330 #endif
6331
6332         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
6333
6334         /* Flush device addresses */
6335         dev_addr_flush(dev);
6336
6337         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6338                 netif_napi_del(p);
6339
6340         free_percpu(dev->pcpu_refcnt);
6341         dev->pcpu_refcnt = NULL;
6342
6343         /*  Compatibility with error handling in drivers */
6344         if (dev->reg_state == NETREG_UNINITIALIZED) {
6345                 netdev_freemem(dev);
6346                 return;
6347         }
6348
6349         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6350         dev->reg_state = NETREG_RELEASED;
6351
6352         /* will free via device release */
6353         put_device(&dev->dev);
6354 }
6355 EXPORT_SYMBOL(free_netdev);
6356
6357 /**
6358  *      synchronize_net -  Synchronize with packet receive processing
6359  *
6360  *      Wait for packets currently being received to be done.
6361  *      Does not block later packets from starting.
6362  */
6363 void synchronize_net(void)
6364 {
6365         might_sleep();
6366         if (rtnl_is_locked())
6367                 synchronize_rcu_expedited();
6368         else
6369                 synchronize_rcu();
6370 }
6371 EXPORT_SYMBOL(synchronize_net);
6372
6373 /**
6374  *      unregister_netdevice_queue - remove device from the kernel
6375  *      @dev: device
6376  *      @head: list
6377  *
6378  *      This function shuts down a device interface and removes it
6379  *      from the kernel tables.
6380  *      If head not NULL, device is queued to be unregistered later.
6381  *
6382  *      Callers must hold the rtnl semaphore.  You may want
6383  *      unregister_netdev() instead of this.
6384  */
6385
6386 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
6387 {
6388         ASSERT_RTNL();
6389
6390         if (head) {
6391                 list_move_tail(&dev->unreg_list, head);
6392         } else {
6393                 rollback_registered(dev);
6394                 /* Finish processing unregister after unlock */
6395                 net_set_todo(dev);
6396         }
6397 }
6398 EXPORT_SYMBOL(unregister_netdevice_queue);
6399
6400 /**
6401  *      unregister_netdevice_many - unregister many devices
6402  *      @head: list of devices
6403  */
6404 void unregister_netdevice_many(struct list_head *head)
6405 {
6406         struct net_device *dev;
6407
6408         if (!list_empty(head)) {
6409                 rollback_registered_many(head);
6410                 list_for_each_entry(dev, head, unreg_list)
6411                         net_set_todo(dev);
6412         }
6413 }
6414 EXPORT_SYMBOL(unregister_netdevice_many);
6415
6416 /**
6417  *      unregister_netdev - remove device from the kernel
6418  *      @dev: device
6419  *
6420  *      This function shuts down a device interface and removes it
6421  *      from the kernel tables.
6422  *
6423  *      This is just a wrapper for unregister_netdevice that takes
6424  *      the rtnl semaphore.  In general you want to use this and not
6425  *      unregister_netdevice.
6426  */
6427 void unregister_netdev(struct net_device *dev)
6428 {
6429         rtnl_lock();
6430         unregister_netdevice(dev);
6431         rtnl_unlock();
6432 }
6433 EXPORT_SYMBOL(unregister_netdev);
6434
6435 /**
6436  *      dev_change_net_namespace - move device to different nethost namespace
6437  *      @dev: device
6438  *      @net: network namespace
6439  *      @pat: If not NULL name pattern to try if the current device name
6440  *            is already taken in the destination network namespace.
6441  *
6442  *      This function shuts down a device interface and moves it
6443  *      to a new network namespace. On success 0 is returned, on
6444  *      a failure a netagive errno code is returned.
6445  *
6446  *      Callers must hold the rtnl semaphore.
6447  */
6448
6449 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6450 {
6451         int err;
6452
6453         ASSERT_RTNL();
6454
6455         /* Don't allow namespace local devices to be moved. */
6456         err = -EINVAL;
6457         if (dev->features & NETIF_F_NETNS_LOCAL)
6458                 goto out;
6459
6460         /* Ensure the device has been registrered */
6461         if (dev->reg_state != NETREG_REGISTERED)
6462                 goto out;
6463
6464         /* Get out if there is nothing todo */
6465         err = 0;
6466         if (net_eq(dev_net(dev), net))
6467                 goto out;
6468
6469         /* Pick the destination device name, and ensure
6470          * we can use it in the destination network namespace.
6471          */
6472         err = -EEXIST;
6473         if (__dev_get_by_name(net, dev->name)) {
6474                 /* We get here if we can't use the current device name */
6475                 if (!pat)
6476                         goto out;
6477                 if (dev_get_valid_name(net, dev, pat) < 0)
6478                         goto out;
6479         }
6480
6481         /*
6482          * And now a mini version of register_netdevice unregister_netdevice.
6483          */
6484
6485         /* If device is running close it first. */
6486         dev_close(dev);
6487
6488         /* And unlink it from device chain */
6489         err = -ENODEV;
6490         unlist_netdevice(dev);
6491
6492         synchronize_net();
6493
6494         /* Shutdown queueing discipline. */
6495         dev_shutdown(dev);
6496
6497         /* Notify protocols, that we are about to destroy
6498            this device. They should clean all the things.
6499
6500            Note that dev->reg_state stays at NETREG_REGISTERED.
6501            This is wanted because this way 8021q and macvlan know
6502            the device is just moving and can keep their slaves up.
6503         */
6504         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6505         rcu_barrier();
6506         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6507         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
6508
6509         /*
6510          *      Flush the unicast and multicast chains
6511          */
6512         dev_uc_flush(dev);
6513         dev_mc_flush(dev);
6514
6515         /* Send a netdev-removed uevent to the old namespace */
6516         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6517
6518         /* Actually switch the network namespace */
6519         dev_net_set(dev, net);
6520
6521         /* If there is an ifindex conflict assign a new one */
6522         if (__dev_get_by_index(net, dev->ifindex)) {
6523                 int iflink = (dev->iflink == dev->ifindex);
6524                 dev->ifindex = dev_new_index(net);
6525                 if (iflink)
6526                         dev->iflink = dev->ifindex;
6527         }
6528
6529         /* Send a netdev-add uevent to the new namespace */
6530         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6531
6532         /* Fixup kobjects */
6533         err = device_rename(&dev->dev, dev->name);
6534         WARN_ON(err);
6535
6536         /* Add the device back in the hashes */
6537         list_netdevice(dev);
6538
6539         /* Notify protocols, that a new device appeared. */
6540         call_netdevice_notifiers(NETDEV_REGISTER, dev);
6541
6542         /*
6543          *      Prevent userspace races by waiting until the network
6544          *      device is fully setup before sending notifications.
6545          */
6546         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6547
6548         synchronize_net();
6549         err = 0;
6550 out:
6551         return err;
6552 }
6553 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6554
6555 static int dev_cpu_callback(struct notifier_block *nfb,
6556                             unsigned long action,
6557                             void *ocpu)
6558 {
6559         struct sk_buff **list_skb;
6560         struct sk_buff *skb;
6561         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6562         struct softnet_data *sd, *oldsd;
6563
6564         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6565                 return NOTIFY_OK;
6566
6567         local_irq_disable();
6568         cpu = smp_processor_id();
6569         sd = &per_cpu(softnet_data, cpu);
6570         oldsd = &per_cpu(softnet_data, oldcpu);
6571
6572         /* Find end of our completion_queue. */
6573         list_skb = &sd->completion_queue;
6574         while (*list_skb)
6575                 list_skb = &(*list_skb)->next;
6576         /* Append completion queue from offline CPU. */
6577         *list_skb = oldsd->completion_queue;
6578         oldsd->completion_queue = NULL;
6579
6580         /* Append output queue from offline CPU. */
6581         if (oldsd->output_queue) {
6582                 *sd->output_queue_tailp = oldsd->output_queue;
6583                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6584                 oldsd->output_queue = NULL;
6585                 oldsd->output_queue_tailp = &oldsd->output_queue;
6586         }
6587         /* Append NAPI poll list from offline CPU. */
6588         if (!list_empty(&oldsd->poll_list)) {
6589                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6590                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6591         }
6592
6593         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6594         local_irq_enable();
6595
6596         /* Process offline CPU's input_pkt_queue */
6597         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6598                 netif_rx(skb);
6599                 input_queue_head_incr(oldsd);
6600         }
6601         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6602                 netif_rx(skb);
6603                 input_queue_head_incr(oldsd);
6604         }
6605
6606         return NOTIFY_OK;
6607 }
6608
6609
6610 /**
6611  *      netdev_increment_features - increment feature set by one
6612  *      @all: current feature set
6613  *      @one: new feature set
6614  *      @mask: mask feature set
6615  *
6616  *      Computes a new feature set after adding a device with feature set
6617  *      @one to the master device with current feature set @all.  Will not
6618  *      enable anything that is off in @mask. Returns the new feature set.
6619  */
6620 netdev_features_t netdev_increment_features(netdev_features_t all,
6621         netdev_features_t one, netdev_features_t mask)
6622 {
6623         if (mask & NETIF_F_GEN_CSUM)
6624                 mask |= NETIF_F_ALL_CSUM;
6625         mask |= NETIF_F_VLAN_CHALLENGED;
6626
6627         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6628         all &= one | ~NETIF_F_ALL_FOR_ALL;
6629
6630         /* If one device supports hw checksumming, set for all. */
6631         if (all & NETIF_F_GEN_CSUM)
6632                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6633
6634         return all;
6635 }
6636 EXPORT_SYMBOL(netdev_increment_features);
6637
6638 static struct hlist_head * __net_init netdev_create_hash(void)
6639 {
6640         int i;
6641         struct hlist_head *hash;
6642
6643         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6644         if (hash != NULL)
6645                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6646                         INIT_HLIST_HEAD(&hash[i]);
6647
6648         return hash;
6649 }
6650
6651 /* Initialize per network namespace state */
6652 static int __net_init netdev_init(struct net *net)
6653 {
6654         if (net != &init_net)
6655                 INIT_LIST_HEAD(&net->dev_base_head);
6656
6657         net->dev_name_head = netdev_create_hash();
6658         if (net->dev_name_head == NULL)
6659                 goto err_name;
6660
6661         net->dev_index_head = netdev_create_hash();
6662         if (net->dev_index_head == NULL)
6663                 goto err_idx;
6664
6665         return 0;
6666
6667 err_idx:
6668         kfree(net->dev_name_head);
6669 err_name:
6670         return -ENOMEM;
6671 }
6672
6673 /**
6674  *      netdev_drivername - network driver for the device
6675  *      @dev: network device
6676  *
6677  *      Determine network driver for device.
6678  */
6679 const char *netdev_drivername(const struct net_device *dev)
6680 {
6681         const struct device_driver *driver;
6682         const struct device *parent;
6683         const char *empty = "";
6684
6685         parent = dev->dev.parent;
6686         if (!parent)
6687                 return empty;
6688
6689         driver = parent->driver;
6690         if (driver && driver->name)
6691                 return driver->name;
6692         return empty;
6693 }
6694
6695 static int __netdev_printk(const char *level, const struct net_device *dev,
6696                            struct va_format *vaf)
6697 {
6698         int r;
6699
6700         if (dev && dev->dev.parent) {
6701                 r = dev_printk_emit(level[1] - '0',
6702                                     dev->dev.parent,
6703                                     "%s %s %s: %pV",
6704                                     dev_driver_string(dev->dev.parent),
6705                                     dev_name(dev->dev.parent),
6706                                     netdev_name(dev), vaf);
6707         } else if (dev) {
6708                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6709         } else {
6710                 r = printk("%s(NULL net_device): %pV", level, vaf);
6711         }
6712
6713         return r;
6714 }
6715
6716 int netdev_printk(const char *level, const struct net_device *dev,
6717                   const char *format, ...)
6718 {
6719         struct va_format vaf;
6720         va_list args;
6721         int r;
6722
6723         va_start(args, format);
6724
6725         vaf.fmt = format;
6726         vaf.va = &args;
6727
6728         r = __netdev_printk(level, dev, &vaf);
6729
6730         va_end(args);
6731
6732         return r;
6733 }
6734 EXPORT_SYMBOL(netdev_printk);
6735
6736 #define define_netdev_printk_level(func, level)                 \
6737 int func(const struct net_device *dev, const char *fmt, ...)    \
6738 {                                                               \
6739         int r;                                                  \
6740         struct va_format vaf;                                   \
6741         va_list args;                                           \
6742                                                                 \
6743         va_start(args, fmt);                                    \
6744                                                                 \
6745         vaf.fmt = fmt;                                          \
6746         vaf.va = &args;                                         \
6747                                                                 \
6748         r = __netdev_printk(level, dev, &vaf);                  \
6749                                                                 \
6750         va_end(args);                                           \
6751                                                                 \
6752         return r;                                               \
6753 }                                                               \
6754 EXPORT_SYMBOL(func);
6755
6756 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6757 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6758 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6759 define_netdev_printk_level(netdev_err, KERN_ERR);
6760 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6761 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6762 define_netdev_printk_level(netdev_info, KERN_INFO);
6763
6764 static void __net_exit netdev_exit(struct net *net)
6765 {
6766         kfree(net->dev_name_head);
6767         kfree(net->dev_index_head);
6768 }
6769
6770 static struct pernet_operations __net_initdata netdev_net_ops = {
6771         .init = netdev_init,
6772         .exit = netdev_exit,
6773 };
6774
6775 static void __net_exit default_device_exit(struct net *net)
6776 {
6777         struct net_device *dev, *aux;
6778         /*
6779          * Push all migratable network devices back to the
6780          * initial network namespace
6781          */
6782         rtnl_lock();
6783         for_each_netdev_safe(net, dev, aux) {
6784                 int err;
6785                 char fb_name[IFNAMSIZ];
6786
6787                 /* Ignore unmoveable devices (i.e. loopback) */
6788                 if (dev->features & NETIF_F_NETNS_LOCAL)
6789                         continue;
6790
6791                 /* Leave virtual devices for the generic cleanup */
6792                 if (dev->rtnl_link_ops)
6793                         continue;
6794
6795                 /* Push remaining network devices to init_net */
6796                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6797                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6798                 if (err) {
6799                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6800                                  __func__, dev->name, err);
6801                         BUG();
6802                 }
6803         }
6804         rtnl_unlock();
6805 }
6806
6807 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
6808 {
6809         /* Return with the rtnl_lock held when there are no network
6810          * devices unregistering in any network namespace in net_list.
6811          */
6812         struct net *net;
6813         bool unregistering;
6814         DEFINE_WAIT(wait);
6815
6816         for (;;) {
6817                 prepare_to_wait(&netdev_unregistering_wq, &wait,
6818                                 TASK_UNINTERRUPTIBLE);
6819                 unregistering = false;
6820                 rtnl_lock();
6821                 list_for_each_entry(net, net_list, exit_list) {
6822                         if (net->dev_unreg_count > 0) {
6823                                 unregistering = true;
6824                                 break;
6825                         }
6826                 }
6827                 if (!unregistering)
6828                         break;
6829                 __rtnl_unlock();
6830                 schedule();
6831         }
6832         finish_wait(&netdev_unregistering_wq, &wait);
6833 }
6834
6835 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6836 {
6837         /* At exit all network devices most be removed from a network
6838          * namespace.  Do this in the reverse order of registration.
6839          * Do this across as many network namespaces as possible to
6840          * improve batching efficiency.
6841          */
6842         struct net_device *dev;
6843         struct net *net;
6844         LIST_HEAD(dev_kill_list);
6845
6846         /* To prevent network device cleanup code from dereferencing
6847          * loopback devices or network devices that have been freed
6848          * wait here for all pending unregistrations to complete,
6849          * before unregistring the loopback device and allowing the
6850          * network namespace be freed.
6851          *
6852          * The netdev todo list containing all network devices
6853          * unregistrations that happen in default_device_exit_batch
6854          * will run in the rtnl_unlock() at the end of
6855          * default_device_exit_batch.
6856          */
6857         rtnl_lock_unregistering(net_list);
6858         list_for_each_entry(net, net_list, exit_list) {
6859                 for_each_netdev_reverse(net, dev) {
6860                         if (dev->rtnl_link_ops)
6861                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6862                         else
6863                                 unregister_netdevice_queue(dev, &dev_kill_list);
6864                 }
6865         }
6866         unregister_netdevice_many(&dev_kill_list);
6867         list_del(&dev_kill_list);
6868         rtnl_unlock();
6869 }
6870
6871 static struct pernet_operations __net_initdata default_device_ops = {
6872         .exit = default_device_exit,
6873         .exit_batch = default_device_exit_batch,
6874 };
6875
6876 /*
6877  *      Initialize the DEV module. At boot time this walks the device list and
6878  *      unhooks any devices that fail to initialise (normally hardware not
6879  *      present) and leaves us with a valid list of present and active devices.
6880  *
6881  */
6882
6883 /*
6884  *       This is called single threaded during boot, so no need
6885  *       to take the rtnl semaphore.
6886  */
6887 static int __init net_dev_init(void)
6888 {
6889         int i, rc = -ENOMEM;
6890
6891         BUG_ON(!dev_boot_phase);
6892
6893         if (dev_proc_init())
6894                 goto out;
6895
6896         if (netdev_kobject_init())
6897                 goto out;
6898
6899         INIT_LIST_HEAD(&ptype_all);
6900         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6901                 INIT_LIST_HEAD(&ptype_base[i]);
6902
6903         INIT_LIST_HEAD(&offload_base);
6904
6905         if (register_pernet_subsys(&netdev_net_ops))
6906                 goto out;
6907
6908         /*
6909          *      Initialise the packet receive queues.
6910          */
6911
6912         for_each_possible_cpu(i) {
6913                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6914
6915                 memset(sd, 0, sizeof(*sd));
6916                 skb_queue_head_init(&sd->input_pkt_queue);
6917                 skb_queue_head_init(&sd->process_queue);
6918                 sd->completion_queue = NULL;
6919                 INIT_LIST_HEAD(&sd->poll_list);
6920                 sd->output_queue = NULL;
6921                 sd->output_queue_tailp = &sd->output_queue;
6922 #ifdef CONFIG_RPS
6923                 sd->csd.func = rps_trigger_softirq;
6924                 sd->csd.info = sd;
6925                 sd->csd.flags = 0;
6926                 sd->cpu = i;
6927 #endif
6928
6929                 sd->backlog.poll = process_backlog;
6930                 sd->backlog.weight = weight_p;
6931                 sd->backlog.gro_list = NULL;
6932                 sd->backlog.gro_count = 0;
6933
6934 #ifdef CONFIG_NET_FLOW_LIMIT
6935                 sd->flow_limit = NULL;
6936 #endif
6937         }
6938
6939         dev_boot_phase = 0;
6940
6941         /* The loopback device is special if any other network devices
6942          * is present in a network namespace the loopback device must
6943          * be present. Since we now dynamically allocate and free the
6944          * loopback device ensure this invariant is maintained by
6945          * keeping the loopback device as the first device on the
6946          * list of network devices.  Ensuring the loopback devices
6947          * is the first device that appears and the last network device
6948          * that disappears.
6949          */
6950         if (register_pernet_device(&loopback_net_ops))
6951                 goto out;
6952
6953         if (register_pernet_device(&default_device_ops))
6954                 goto out;
6955
6956         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6957         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6958
6959         hotcpu_notifier(dev_cpu_callback, 0);
6960         dst_init();
6961         rc = 0;
6962 out:
6963         return rc;
6964 }
6965
6966 subsys_initcall(net_dev_init);