2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <linux/hashtable.h>
133 #include <linux/vmalloc.h>
135 #include "net-sysfs.h"
137 /* Instead of increasing this, you should create a hash table. */
138 #define MAX_GRO_SKBS 8
140 /* This should be increased if a protocol with a bigger head is added. */
141 #define GRO_MAX_HEAD (MAX_HEADER + 128)
143 static DEFINE_SPINLOCK(ptype_lock);
144 static DEFINE_SPINLOCK(offload_lock);
145 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
146 struct list_head ptype_all __read_mostly; /* Taps */
147 static struct list_head offload_base __read_mostly;
150 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
153 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
155 * Writers must hold the rtnl semaphore while they loop through the
156 * dev_base_head list, and hold dev_base_lock for writing when they do the
157 * actual updates. This allows pure readers to access the list even
158 * while a writer is preparing to update it.
160 * To put it another way, dev_base_lock is held for writing only to
161 * protect against pure readers; the rtnl semaphore provides the
162 * protection against other writers.
164 * See, for example usages, register_netdevice() and
165 * unregister_netdevice(), which must be called with the rtnl
168 DEFINE_RWLOCK(dev_base_lock);
169 EXPORT_SYMBOL(dev_base_lock);
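/* Illustrative sketch (not part of this file): the reader/writer
 * discipline described above.  A pure reader may walk the device list
 * under RCU without taking any reference; a writer must hold the RTNL
 * lock.  The wrapper function below is a hypothetical example.
 *
 *	static void example_walk_devices(struct net *net)
 *	{
 *		struct net_device *dev;
 *
 *		rcu_read_lock();
 *		for_each_netdev_rcu(net, dev)
 *			pr_debug("saw device %s\n", dev->name);
 *		rcu_read_unlock();
 *	}
 *
 * Writers instead bracket list changes with rtnl_lock()/rtnl_unlock(),
 * e.g. around register_netdevice() or unregister_netdevice().
 */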
171 /* protects napi_hash addition/deletion and napi_gen_id */
172 static DEFINE_SPINLOCK(napi_hash_lock);
174 static unsigned int napi_gen_id;
175 static DEFINE_HASHTABLE(napi_hash, 8);
177 seqcount_t devnet_rename_seq;
179 static inline void dev_base_seq_inc(struct net *net)
181 while (++net->dev_base_seq == 0);
184 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
186 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
188 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
191 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
193 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
196 static inline void rps_lock(struct softnet_data *sd)
199 spin_lock(&sd->input_pkt_queue.lock);
203 static inline void rps_unlock(struct softnet_data *sd)
206 spin_unlock(&sd->input_pkt_queue.lock);
210 /* Device list insertion */
211 static void list_netdevice(struct net_device *dev)
213 struct net *net = dev_net(dev);
217 write_lock_bh(&dev_base_lock);
218 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
219 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
220 hlist_add_head_rcu(&dev->index_hlist,
221 dev_index_hash(net, dev->ifindex));
222 write_unlock_bh(&dev_base_lock);
224 dev_base_seq_inc(net);
227 /* Device list removal
228 * caller must respect an RCU grace period before freeing/reusing dev
230 static void unlist_netdevice(struct net_device *dev)
234 /* Unlink dev from the device chain */
235 write_lock_bh(&dev_base_lock);
236 list_del_rcu(&dev->dev_list);
237 hlist_del_rcu(&dev->name_hlist);
238 hlist_del_rcu(&dev->index_hlist);
239 write_unlock_bh(&dev_base_lock);
241 dev_base_seq_inc(dev_net(dev));
248 static RAW_NOTIFIER_HEAD(netdev_chain);
251 * Device drivers call our routines to queue packets here. We empty the
252 * queue in the local softnet handler.
255 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
256 EXPORT_PER_CPU_SYMBOL(softnet_data);
258 #ifdef CONFIG_LOCKDEP
260 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
261 * according to dev->type
263 static const unsigned short netdev_lock_type[] =
264 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
265 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
266 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
267 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
268 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
269 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
270 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
271 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
272 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
273 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
274 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
275 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
276 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
277 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
278 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
280 static const char *const netdev_lock_name[] =
281 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
282 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
283 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
284 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
285 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
286 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
287 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
288 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
289 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
290 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
291 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
292 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
293 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
294 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
295 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
297 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
298 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
300 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
304 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
305 if (netdev_lock_type[i] == dev_type)
307 /* the last key is used by default */
308 return ARRAY_SIZE(netdev_lock_type) - 1;
311 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
312 unsigned short dev_type)
316 i = netdev_lock_pos(dev_type);
317 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
318 netdev_lock_name[i]);
321 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
325 i = netdev_lock_pos(dev->type);
326 lockdep_set_class_and_name(&dev->addr_list_lock,
327 &netdev_addr_lock_key[i],
328 netdev_lock_name[i]);
331 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
332 unsigned short dev_type)
335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
340 /*******************************************************************************
342 Protocol management and registration routines
344 *******************************************************************************/
347 * Add a protocol ID to the list. Now that the input handler is
348 * smarter we can dispense with all the messy stuff that used to be
349 * here.
351 * BEWARE!!! Protocol handlers, mangling input packets,
352 * MUST BE last in hash buckets and checking protocol handlers
353 * MUST start from promiscuous ptype_all chain in net_bh.
354 * It is true now, do not change it.
355 * Explanation follows: if a protocol handler that mangles packets were
356 * first on the list, it would not be able to sense that the packet
357 * is cloned and should be copied-on-write, so it would
358 * change it and subsequent readers would get a broken packet.
362 static inline struct list_head *ptype_head(const struct packet_type *pt)
364 if (pt->type == htons(ETH_P_ALL))
367 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
371 * dev_add_pack - add packet handler
372 * @pt: packet type declaration
374 * Add a protocol handler to the networking stack. The passed &packet_type
375 * is linked into kernel lists and may not be freed until it has been
376 * removed from the kernel lists.
378 * This call does not sleep, therefore it cannot
379 * guarantee that all CPUs that are in the middle of receiving packets
380 * will see the new packet type (until the next received packet).
383 void dev_add_pack(struct packet_type *pt)
385 struct list_head *head = ptype_head(pt);
387 spin_lock(&ptype_lock);
388 list_add_rcu(&pt->list, head);
389 spin_unlock(&ptype_lock);
391 EXPORT_SYMBOL(dev_add_pack);
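/* Illustrative sketch (assumption, not code from this file): a minimal
 * tap registered with dev_add_pack().  Using ETH_P_ALL places it on the
 * ptype_all list so it sees every packet; the handler and variable names
 * are hypothetical.
 *
 *	static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
 *				   struct packet_type *pt,
 *				   struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type example_tap __read_mostly = {
 *		.type	= htons(ETH_P_ALL),
 *		.func	= example_tap_rcv,
 *	};
 *
 *	dev_add_pack(&example_tap);
 *	...
 *	dev_remove_pack(&example_tap);
 *
 * The handler owns the skb and must free or consume it.
 */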
394 * __dev_remove_pack - remove packet handler
395 * @pt: packet type declaration
397 * Remove a protocol handler that was previously added to the kernel
398 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
399 * from the kernel lists and can be freed or reused once this function
402 * The packet type might still be in use by receivers
403 * and must not be freed until after all the CPU's have gone
404 * through a quiescent state.
406 void __dev_remove_pack(struct packet_type *pt)
408 struct list_head *head = ptype_head(pt);
409 struct packet_type *pt1;
411 spin_lock(&ptype_lock);
413 list_for_each_entry(pt1, head, list) {
415 list_del_rcu(&pt->list);
420 pr_warn("dev_remove_pack: %p not found\n", pt);
422 spin_unlock(&ptype_lock);
424 EXPORT_SYMBOL(__dev_remove_pack);
427 * dev_remove_pack - remove packet handler
428 * @pt: packet type declaration
430 * Remove a protocol handler that was previously added to the kernel
431 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
432 * from the kernel lists and can be freed or reused once this function
435 * This call sleeps to guarantee that no CPU is looking at the packet
438 void dev_remove_pack(struct packet_type *pt)
440 __dev_remove_pack(pt);
444 EXPORT_SYMBOL(dev_remove_pack);
448 * dev_add_offload - register offload handlers
449 * @po: protocol offload declaration
451 * Add protocol offload handlers to the networking stack. The passed
452 * &proto_offload is linked into kernel lists and may not be freed until
453 * it has been removed from the kernel lists.
455 * This call does not sleep, therefore it cannot
456 * guarantee that all CPUs that are in the middle of receiving packets
457 * will see the new offload handlers (until the next received packet).
459 void dev_add_offload(struct packet_offload *po)
461 struct list_head *head = &offload_base;
463 spin_lock(&offload_lock);
464 list_add_rcu(&po->list, head);
465 spin_unlock(&offload_lock);
467 EXPORT_SYMBOL(dev_add_offload);
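/* Illustrative sketch (assumption): a protocol registering GRO/GSO
 * callbacks with dev_add_offload().  The callback bodies are omitted and
 * the names are hypothetical; the exact offload_callbacks members depend
 * on the kernel version.
 *
 *	static struct packet_offload example_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment  = example_gso_segment,
 *			.gro_receive  = example_gro_receive,
 *			.gro_complete = example_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&example_offload);
 */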
470 * __dev_remove_offload - remove offload handler
471 * @po: packet offload declaration
473 * Remove a protocol offload handler that was previously added to the
474 * kernel offload handlers by dev_add_offload(). The passed &offload_type
475 * is removed from the kernel lists and can be freed or reused once this
478 * The packet type might still be in use by receivers
479 * and must not be freed until after all the CPU's have gone
480 * through a quiescent state.
482 void __dev_remove_offload(struct packet_offload *po)
484 struct list_head *head = &offload_base;
485 struct packet_offload *po1;
487 spin_lock(&offload_lock);
489 list_for_each_entry(po1, head, list) {
491 list_del_rcu(&po->list);
496 pr_warn("dev_remove_offload: %p not found\n", po);
498 spin_unlock(&offload_lock);
500 EXPORT_SYMBOL(__dev_remove_offload);
503 * dev_remove_offload - remove packet offload handler
504 * @po: packet offload declaration
506 * Remove a packet offload handler that was previously added to the kernel
507 * offload handlers by dev_add_offload(). The passed &offload_type is
508 * removed from the kernel lists and can be freed or reused once this
511 * This call sleeps to guarantee that no CPU is looking at the packet
514 void dev_remove_offload(struct packet_offload *po)
516 __dev_remove_offload(po);
520 EXPORT_SYMBOL(dev_remove_offload);
522 /******************************************************************************
524 Device Boot-time Settings Routines
526 *******************************************************************************/
528 /* Boot time configuration table */
529 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
532 * netdev_boot_setup_add - add new setup entry
533 * @name: name of the device
534 * @map: configured settings for the device
536 * Adds new setup entry to the dev_boot_setup list. The function
537 * returns 0 on error and 1 on success. This is a generic routine to
538 * all netdevices.
540 static int netdev_boot_setup_add(char *name, struct ifmap *map)
542 struct netdev_boot_setup *s;
546 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
547 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
548 memset(s[i].name, 0, sizeof(s[i].name));
549 strlcpy(s[i].name, name, IFNAMSIZ);
550 memcpy(&s[i].map, map, sizeof(s[i].map));
555 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
559 * netdev_boot_setup_check - check boot time settings
560 * @dev: the netdevice
562 * Check boot time settings for the device.
563 * The found settings are set for the device to be used
564 * later in the device probing.
565 * Returns 0 if no settings found, 1 if they are.
567 int netdev_boot_setup_check(struct net_device *dev)
569 struct netdev_boot_setup *s = dev_boot_setup;
572 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
573 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
574 !strcmp(dev->name, s[i].name)) {
575 dev->irq = s[i].map.irq;
576 dev->base_addr = s[i].map.base_addr;
577 dev->mem_start = s[i].map.mem_start;
578 dev->mem_end = s[i].map.mem_end;
584 EXPORT_SYMBOL(netdev_boot_setup_check);
588 * netdev_boot_base - get address from boot time settings
589 * @prefix: prefix for network device
590 * @unit: id for network device
592 * Check boot time settings for the base address of device.
593 * The found settings are set for the device to be used
594 * later in the device probing.
595 * Returns 0 if no settings found.
597 unsigned long netdev_boot_base(const char *prefix, int unit)
599 const struct netdev_boot_setup *s = dev_boot_setup;
603 sprintf(name, "%s%d", prefix, unit);
606 * If device already registered then return base of 1
607 * to indicate not to probe for this interface
609 if (__dev_get_by_name(&init_net, name))
612 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
613 if (!strcmp(name, s[i].name))
614 return s[i].map.base_addr;
619 * Saves at boot time configured settings for any netdevice.
621 int __init netdev_boot_setup(char *str)
626 str = get_options(str, ARRAY_SIZE(ints), ints);
631 memset(&map, 0, sizeof(map));
635 map.base_addr = ints[2];
637 map.mem_start = ints[3];
639 map.mem_end = ints[4];
641 /* Add new entry to the list */
642 return netdev_boot_setup_add(str, &map);
645 __setup("netdev=", netdev_boot_setup);
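/* Example (assumption, derived from the parsing above): the boot
 * parameter is a comma-separated list of up to four integers followed by
 * the device name, e.g.
 *
 *	netdev=9,0x300,0xd0000,0xd4000,eth0
 *
 * which records IRQ 9, I/O base 0x300 and a shared-memory window of
 * 0xd0000-0xd4000 for the device later probed as eth0.
 */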
647 /*******************************************************************************
649 Device Interface Subroutines
651 *******************************************************************************/
654 * __dev_get_by_name - find a device by its name
655 * @net: the applicable net namespace
656 * @name: name to find
658 * Find an interface by name. Must be called under RTNL semaphore
659 * or @dev_base_lock. If the name is found a pointer to the device
660 * is returned. If the name is not found then %NULL is returned. The
661 * reference counters are not incremented so the caller must be
662 * careful with locks.
665 struct net_device *__dev_get_by_name(struct net *net, const char *name)
667 struct net_device *dev;
668 struct hlist_head *head = dev_name_hash(net, name);
670 hlist_for_each_entry(dev, head, name_hlist)
671 if (!strncmp(dev->name, name, IFNAMSIZ))
676 EXPORT_SYMBOL(__dev_get_by_name);
679 * dev_get_by_name_rcu - find a device by its name
680 * @net: the applicable net namespace
681 * @name: name to find
683 * Find an interface by name.
684 * If the name is found a pointer to the device is returned.
685 * If the name is not found then %NULL is returned.
686 * The reference counters are not incremented so the caller must be
687 * careful with locks. The caller must hold RCU lock.
690 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
692 struct net_device *dev;
693 struct hlist_head *head = dev_name_hash(net, name);
695 hlist_for_each_entry_rcu(dev, head, name_hlist)
696 if (!strncmp(dev->name, name, IFNAMSIZ))
701 EXPORT_SYMBOL(dev_get_by_name_rcu);
704 * dev_get_by_name - find a device by its name
705 * @net: the applicable net namespace
706 * @name: name to find
708 * Find an interface by name. This can be called from any
709 * context and does its own locking. The returned handle has
710 * the usage count incremented and the caller must use dev_put() to
711 * release it when it is no longer needed. %NULL is returned if no
712 * matching device is found.
715 struct net_device *dev_get_by_name(struct net *net, const char *name)
717 struct net_device *dev;
720 dev = dev_get_by_name_rcu(net, name);
726 EXPORT_SYMBOL(dev_get_by_name);
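/* Illustrative sketch (not part of this file): the two common lookup
 * patterns for the helpers above.
 *
 * Short-lived access under RCU, no reference taken:
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev)
 *		mtu = dev->mtu;
 *	rcu_read_unlock();
 *
 * Longer-lived access with a reference that must be dropped:
 *
 *	dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		...
 *		dev_put(dev);
 *	}
 */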
729 * __dev_get_by_index - find a device by its ifindex
730 * @net: the applicable net namespace
731 * @ifindex: index of device
733 * Search for an interface by index. Returns %NULL if the device
734 * is not found or a pointer to the device. The device has not
735 * had its reference counter increased so the caller must be careful
736 * about locking. The caller must hold either the RTNL semaphore
740 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
742 struct net_device *dev;
743 struct hlist_head *head = dev_index_hash(net, ifindex);
745 hlist_for_each_entry(dev, head, index_hlist)
746 if (dev->ifindex == ifindex)
751 EXPORT_SYMBOL(__dev_get_by_index);
754 * dev_get_by_index_rcu - find a device by its ifindex
755 * @net: the applicable net namespace
756 * @ifindex: index of device
758 * Search for an interface by index. Returns %NULL if the device
759 * is not found or a pointer to the device. The device has not
760 * had its reference counter increased so the caller must be careful
761 * about locking. The caller must hold RCU lock.
764 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
766 struct net_device *dev;
767 struct hlist_head *head = dev_index_hash(net, ifindex);
769 hlist_for_each_entry_rcu(dev, head, index_hlist)
770 if (dev->ifindex == ifindex)
775 EXPORT_SYMBOL(dev_get_by_index_rcu);
779 * dev_get_by_index - find a device by its ifindex
780 * @net: the applicable net namespace
781 * @ifindex: index of device
783 * Search for an interface by index. Returns NULL if the device
784 * is not found or a pointer to the device. The device returned has
785 * had a reference added and the pointer is safe until the user calls
786 * dev_put to indicate they have finished with it.
789 struct net_device *dev_get_by_index(struct net *net, int ifindex)
791 struct net_device *dev;
794 dev = dev_get_by_index_rcu(net, ifindex);
800 EXPORT_SYMBOL(dev_get_by_index);
803 * dev_getbyhwaddr_rcu - find a device by its hardware address
804 * @net: the applicable net namespace
805 * @type: media type of device
806 * @ha: hardware address
808 * Search for an interface by MAC address. Returns NULL if the device
809 * is not found or a pointer to the device.
810 * The caller must hold RCU or RTNL.
811 * The returned device has not had its ref count increased
812 * and the caller must therefore be careful about locking
816 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
819 struct net_device *dev;
821 for_each_netdev_rcu(net, dev)
822 if (dev->type == type &&
823 !memcmp(dev->dev_addr, ha, dev->addr_len))
828 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
830 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
832 struct net_device *dev;
835 for_each_netdev(net, dev)
836 if (dev->type == type)
841 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
843 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
845 struct net_device *dev, *ret = NULL;
848 for_each_netdev_rcu(net, dev)
849 if (dev->type == type) {
857 EXPORT_SYMBOL(dev_getfirstbyhwtype);
860 * dev_get_by_flags_rcu - find any device with given flags
861 * @net: the applicable net namespace
862 * @if_flags: IFF_* values
863 * @mask: bitmask of bits in if_flags to check
865 * Search for any interface with the given flags. Returns NULL if a device
866 * is not found or a pointer to the device. Must be called inside
867 * rcu_read_lock(), and result refcount is unchanged.
870 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
873 struct net_device *dev, *ret;
876 for_each_netdev_rcu(net, dev) {
877 if (((dev->flags ^ if_flags) & mask) == 0) {
884 EXPORT_SYMBOL(dev_get_by_flags_rcu);
887 * dev_valid_name - check if name is okay for network device
890 * Network device names need to be valid file names to
891 * allow sysfs to work. We also disallow any kind of
892 * whitespace.
894 bool dev_valid_name(const char *name)
898 if (strlen(name) >= IFNAMSIZ)
900 if (!strcmp(name, ".") || !strcmp(name, ".."))
904 if (*name == '/' || isspace(*name))
910 EXPORT_SYMBOL(dev_valid_name);
913 * __dev_alloc_name - allocate a name for a device
914 * @net: network namespace to allocate the device name in
915 * @name: name format string
916 * @buf: scratch buffer and result name string
918 * Passed a format string - eg "lt%d" it will try and find a suitable
919 * id. It scans list of devices to build up a free map, then chooses
920 * the first empty slot. The caller must hold the dev_base or rtnl lock
921 * while allocating the name and adding the device in order to avoid
922 * duplicates.
923 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
924 * Returns the number of the unit assigned or a negative errno code.
927 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
931 const int max_netdevices = 8*PAGE_SIZE;
932 unsigned long *inuse;
933 struct net_device *d;
935 p = strnchr(name, IFNAMSIZ-1, '%');
938 * Verify the string as this thing may have come from
939 * the user. There must be exactly one "%d" and no other "%"
940 * characters.
942 if (p[1] != 'd' || strchr(p + 2, '%'))
945 /* Use one page as a bit array of possible slots */
946 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
950 for_each_netdev(net, d) {
951 if (!sscanf(d->name, name, &i))
953 if (i < 0 || i >= max_netdevices)
956 /* avoid cases where sscanf is not exact inverse of printf */
957 snprintf(buf, IFNAMSIZ, name, i);
958 if (!strncmp(buf, d->name, IFNAMSIZ))
962 i = find_first_zero_bit(inuse, max_netdevices);
963 free_page((unsigned long) inuse);
967 snprintf(buf, IFNAMSIZ, name, i);
968 if (!__dev_get_by_name(net, buf))
971 /* It is possible to run out of possible slots
972 * when the name is long and there isn't enough space left
973 * for the digits, or if all bits are used.
979 * dev_alloc_name - allocate a name for a device
981 * @name: name format string
983 * Passed a format string - eg "lt%d" it will try and find a suitable
984 * id. It scans list of devices to build up a free map, then chooses
985 * the first empty slot. The caller must hold the dev_base or rtnl lock
986 * while allocating the name and adding the device in order to avoid
987 * duplicates.
988 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
989 * Returns the number of the unit assigned or a negative errno code.
992 int dev_alloc_name(struct net_device *dev, const char *name)
998 BUG_ON(!dev_net(dev));
1000 ret = __dev_alloc_name(net, name, buf);
1002 strlcpy(dev->name, buf, IFNAMSIZ);
1005 EXPORT_SYMBOL(dev_alloc_name);
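/* Illustrative sketch (assumption): a driver that wants conventional
 * "ethN" naming passes a format string before registration.  Error
 * handling is abbreviated and the label is hypothetical.
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto out_free;
 *
 * On success dev->name holds e.g. "eth0" and err is the unit number that
 * was assigned.
 */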
1007 static int dev_alloc_name_ns(struct net *net,
1008 struct net_device *dev,
1014 ret = __dev_alloc_name(net, name, buf);
1016 strlcpy(dev->name, buf, IFNAMSIZ);
1020 static int dev_get_valid_name(struct net *net,
1021 struct net_device *dev,
1026 if (!dev_valid_name(name))
1029 if (strchr(name, '%'))
1030 return dev_alloc_name_ns(net, dev, name);
1031 else if (__dev_get_by_name(net, name))
1033 else if (dev->name != name)
1034 strlcpy(dev->name, name, IFNAMSIZ);
1040 * dev_change_name - change name of a device
1042 * @newname: name (or format string) must be at least IFNAMSIZ
1044 * Change name of a device, can pass format strings "eth%d".
1047 int dev_change_name(struct net_device *dev, const char *newname)
1049 char oldname[IFNAMSIZ];
1055 BUG_ON(!dev_net(dev));
1058 if (dev->flags & IFF_UP)
1061 write_seqcount_begin(&devnet_rename_seq);
1063 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1064 write_seqcount_end(&devnet_rename_seq);
1068 memcpy(oldname, dev->name, IFNAMSIZ);
1070 err = dev_get_valid_name(net, dev, newname);
1072 write_seqcount_end(&devnet_rename_seq);
1077 ret = device_rename(&dev->dev, dev->name);
1079 memcpy(dev->name, oldname, IFNAMSIZ);
1080 write_seqcount_end(&devnet_rename_seq);
1084 write_seqcount_end(&devnet_rename_seq);
1086 write_lock_bh(&dev_base_lock);
1087 hlist_del_rcu(&dev->name_hlist);
1088 write_unlock_bh(&dev_base_lock);
1092 write_lock_bh(&dev_base_lock);
1093 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1094 write_unlock_bh(&dev_base_lock);
1096 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1097 ret = notifier_to_errno(ret);
1100 /* err >= 0 after dev_alloc_name() or stores the first errno */
1103 write_seqcount_begin(&devnet_rename_seq);
1104 memcpy(dev->name, oldname, IFNAMSIZ);
1107 pr_err("%s: name change rollback failed: %d\n",
1116 * dev_set_alias - change ifalias of a device
1118 * @alias: name up to IFALIASZ
1119 * @len: limit of bytes to copy from info
1121 * Set ifalias for a device,
1123 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1129 if (len >= IFALIASZ)
1133 kfree(dev->ifalias);
1134 dev->ifalias = NULL;
1138 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1141 dev->ifalias = new_ifalias;
1143 strlcpy(dev->ifalias, alias, len+1);
1149 * netdev_features_change - device changes features
1150 * @dev: device to cause notification
1152 * Called to indicate a device has changed features.
1154 void netdev_features_change(struct net_device *dev)
1156 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1158 EXPORT_SYMBOL(netdev_features_change);
1161 * netdev_state_change - device changes state
1162 * @dev: device to cause notification
1164 * Called to indicate a device has changed state. This function calls
1165 * the notifier chains for netdev_chain and sends a NEWLINK message
1166 * to the routing socket.
1168 void netdev_state_change(struct net_device *dev)
1170 if (dev->flags & IFF_UP) {
1171 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1172 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1175 EXPORT_SYMBOL(netdev_state_change);
1178 * netdev_notify_peers - notify network peers about existence of @dev
1179 * @dev: network device
1181 * Generate traffic such that interested network peers are aware of
1182 * @dev, such as by generating a gratuitous ARP. This may be used when
1183 * a device wants to inform the rest of the network about some sort of
1184 * reconfiguration such as a failover event or virtual machine
1185 * migration.
1187 void netdev_notify_peers(struct net_device *dev)
1190 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1193 EXPORT_SYMBOL(netdev_notify_peers);
1195 static int __dev_open(struct net_device *dev)
1197 const struct net_device_ops *ops = dev->netdev_ops;
1202 if (!netif_device_present(dev))
1205 /* Block netpoll from trying to do any rx path servicing.
1206 * If we don't do this there is a chance ndo_poll_controller
1207 * or ndo_poll may be running while we open the device
1209 netpoll_rx_disable(dev);
1211 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1212 ret = notifier_to_errno(ret);
1216 set_bit(__LINK_STATE_START, &dev->state);
1218 if (ops->ndo_validate_addr)
1219 ret = ops->ndo_validate_addr(dev);
1221 if (!ret && ops->ndo_open)
1222 ret = ops->ndo_open(dev);
1224 netpoll_rx_enable(dev);
1227 clear_bit(__LINK_STATE_START, &dev->state);
1229 dev->flags |= IFF_UP;
1230 net_dmaengine_get();
1231 dev_set_rx_mode(dev);
1233 add_device_randomness(dev->dev_addr, dev->addr_len);
1240 * dev_open - prepare an interface for use.
1241 * @dev: device to open
1243 * Takes a device from down to up state. The device's private open
1244 * function is invoked and then the multicast lists are loaded. Finally
1245 * the device is moved into the up state and a %NETDEV_UP message is
1246 * sent to the netdev notifier chain.
1248 * Calling this function on an active interface is a nop. On a failure
1249 * a negative errno code is returned.
1251 int dev_open(struct net_device *dev)
1255 if (dev->flags & IFF_UP)
1258 ret = __dev_open(dev);
1262 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1263 call_netdevice_notifiers(NETDEV_UP, dev);
1267 EXPORT_SYMBOL(dev_open);
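/* Illustrative sketch (not part of this file): bringing an interface up
 * from process context is done under the RTNL lock, mirroring the ioctl
 * and rtnetlink paths that call dev_open().
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */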
1269 static int __dev_close_many(struct list_head *head)
1271 struct net_device *dev;
1276 list_for_each_entry(dev, head, unreg_list) {
1277 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1279 clear_bit(__LINK_STATE_START, &dev->state);
1281 /* Synchronize to scheduled poll. We cannot touch poll list, it
1282 * can be even on different cpu. So just clear netif_running().
1284 * dev->stop() will invoke napi_disable() on all of its
1285 * napi_struct instances on this device.
1287 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1290 dev_deactivate_many(head);
1292 list_for_each_entry(dev, head, unreg_list) {
1293 const struct net_device_ops *ops = dev->netdev_ops;
1296 * Call the device specific close. This cannot fail.
1297 * Only if device is UP
1299 * We allow it to be called even after a DETACH hot-plug
1300 * event.
1305 dev->flags &= ~IFF_UP;
1306 net_dmaengine_put();
1312 static int __dev_close(struct net_device *dev)
1317 /* Temporarily disable netpoll until the interface is down */
1318 netpoll_rx_disable(dev);
1320 list_add(&dev->unreg_list, &single);
1321 retval = __dev_close_many(&single);
1324 netpoll_rx_enable(dev);
1328 static int dev_close_many(struct list_head *head)
1330 struct net_device *dev, *tmp;
1331 LIST_HEAD(tmp_list);
1333 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1334 if (!(dev->flags & IFF_UP))
1335 list_move(&dev->unreg_list, &tmp_list);
1337 __dev_close_many(head);
1339 list_for_each_entry(dev, head, unreg_list) {
1340 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1341 call_netdevice_notifiers(NETDEV_DOWN, dev);
1344 /* rollback_registered_many needs the complete original list */
1345 list_splice(&tmp_list, head);
1350 * dev_close - shutdown an interface.
1351 * @dev: device to shutdown
1353 * This function moves an active device into down state. A
1354 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1355 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1358 int dev_close(struct net_device *dev)
1360 if (dev->flags & IFF_UP) {
1363 /* Block netpoll rx while the interface is going down */
1364 netpoll_rx_disable(dev);
1366 list_add(&dev->unreg_list, &single);
1367 dev_close_many(&single);
1370 netpoll_rx_enable(dev);
1374 EXPORT_SYMBOL(dev_close);
1378 * dev_disable_lro - disable Large Receive Offload on a device
1381 * Disable Large Receive Offload (LRO) on a net device. Must be
1382 * called under RTNL. This is needed if received packets may be
1383 * forwarded to another interface.
1385 void dev_disable_lro(struct net_device *dev)
1388 * If we're trying to disable lro on a vlan device
1389 * use the underlying physical device instead
1391 if (is_vlan_dev(dev))
1392 dev = vlan_dev_real_dev(dev);
1394 dev->wanted_features &= ~NETIF_F_LRO;
1395 netdev_update_features(dev);
1397 if (unlikely(dev->features & NETIF_F_LRO))
1398 netdev_WARN(dev, "failed to disable LRO!\n");
1400 EXPORT_SYMBOL(dev_disable_lro);
1402 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1403 struct net_device *dev)
1405 struct netdev_notifier_info info;
1407 netdev_notifier_info_init(&info, dev);
1408 return nb->notifier_call(nb, val, &info);
1411 static int dev_boot_phase = 1;
1414 * register_netdevice_notifier - register a network notifier block
1417 * Register a notifier to be called when network device events occur.
1418 * The notifier passed is linked into the kernel structures and must
1419 * not be reused until it has been unregistered. A negative errno code
1420 * is returned on a failure.
1422 * When registered, all registration and up events are replayed
1423 * to the new notifier to allow the device to have a race-free
1424 * view of the network device list.
1427 int register_netdevice_notifier(struct notifier_block *nb)
1429 struct net_device *dev;
1430 struct net_device *last;
1435 err = raw_notifier_chain_register(&netdev_chain, nb);
1441 for_each_netdev(net, dev) {
1442 err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1443 err = notifier_to_errno(err);
1447 if (!(dev->flags & IFF_UP))
1450 call_netdevice_notifier(nb, NETDEV_UP, dev);
1461 for_each_netdev(net, dev) {
1465 if (dev->flags & IFF_UP) {
1466 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1468 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1470 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1475 raw_notifier_chain_unregister(&netdev_chain, nb);
1478 EXPORT_SYMBOL(register_netdevice_notifier);
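/* Illustrative sketch (assumption): a minimal notifier that logs devices
 * coming up.  netdev_notifier_info_to_dev() recovers the device from the
 * opaque pointer passed by the chain; the handler and block names are
 * hypothetical.
 *
 *	static int example_netdev_event(struct notifier_block *nb,
 *					unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block example_netdev_nb = {
 *		.notifier_call = example_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&example_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&example_netdev_nb);
 */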
1481 * unregister_netdevice_notifier - unregister a network notifier block
1484 * Unregister a notifier previously registered by
1485 * register_netdevice_notifier(). The notifier is unlinked from the
1486 * kernel structures and may then be reused. A negative errno code
1487 * is returned on a failure.
1489 * After unregistering unregister and down device events are synthesized
1490 * for all devices on the device list to the removed notifier to remove
1491 * the need for special case cleanup code.
1494 int unregister_netdevice_notifier(struct notifier_block *nb)
1496 struct net_device *dev;
1501 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1506 for_each_netdev(net, dev) {
1507 if (dev->flags & IFF_UP) {
1508 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1510 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1512 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1519 EXPORT_SYMBOL(unregister_netdevice_notifier);
1522 * call_netdevice_notifiers_info - call all network notifier blocks
1523 * @val: value passed unmodified to notifier function
1524 * @dev: net_device pointer passed unmodified to notifier function
1525 * @info: notifier information data
1527 * Call all network notifier blocks. Parameters and return value
1528 * are as for raw_notifier_call_chain().
1531 int call_netdevice_notifiers_info(unsigned long val, struct net_device *dev,
1532 struct netdev_notifier_info *info)
1535 netdev_notifier_info_init(info, dev);
1536 return raw_notifier_call_chain(&netdev_chain, val, info);
1538 EXPORT_SYMBOL(call_netdevice_notifiers_info);
1541 * call_netdevice_notifiers - call all network notifier blocks
1542 * @val: value passed unmodified to notifier function
1543 * @dev: net_device pointer passed unmodified to notifier function
1545 * Call all network notifier blocks. Parameters and return value
1546 * are as for raw_notifier_call_chain().
1549 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1551 struct netdev_notifier_info info;
1553 return call_netdevice_notifiers_info(val, dev, &info);
1555 EXPORT_SYMBOL(call_netdevice_notifiers);
1557 static struct static_key netstamp_needed __read_mostly;
1558 #ifdef HAVE_JUMP_LABEL
1559 /* We are not allowed to call static_key_slow_dec() from irq context
1560 * If net_disable_timestamp() is called from irq context, defer the
1561 * static_key_slow_dec() calls.
1563 static atomic_t netstamp_needed_deferred;
1566 void net_enable_timestamp(void)
1568 #ifdef HAVE_JUMP_LABEL
1569 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1573 static_key_slow_dec(&netstamp_needed);
1577 static_key_slow_inc(&netstamp_needed);
1579 EXPORT_SYMBOL(net_enable_timestamp);
1581 void net_disable_timestamp(void)
1583 #ifdef HAVE_JUMP_LABEL
1584 if (in_interrupt()) {
1585 atomic_inc(&netstamp_needed_deferred);
1589 static_key_slow_dec(&netstamp_needed);
1591 EXPORT_SYMBOL(net_disable_timestamp);
1593 static inline void net_timestamp_set(struct sk_buff *skb)
1595 skb->tstamp.tv64 = 0;
1596 if (static_key_false(&netstamp_needed))
1597 __net_timestamp(skb);
1600 #define net_timestamp_check(COND, SKB) \
1601 if (static_key_false(&netstamp_needed)) { \
1602 if ((COND) && !(SKB)->tstamp.tv64) \
1603 __net_timestamp(SKB); \
1606 static inline bool is_skb_forwardable(struct net_device *dev,
1607 struct sk_buff *skb)
1611 if (!(dev->flags & IFF_UP))
1614 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1615 if (skb->len <= len)
1618 /* if TSO is enabled, we don't care about the length as the packet
1619 * could be forwarded without being segmented before
1621 if (skb_is_gso(skb))
1628 * dev_forward_skb - loopback an skb to another netif
1630 * @dev: destination network device
1631 * @skb: buffer to forward
1634 * NET_RX_SUCCESS (no congestion)
1635 * NET_RX_DROP (packet was dropped, but freed)
1637 * dev_forward_skb can be used for injecting an skb from the
1638 * start_xmit function of one device into the receive queue
1639 * of another device.
1641 * The receiving device may be in another namespace, so
1642 * we have to clear all information in the skb that could
1643 * impact namespace isolation.
1645 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1647 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1648 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1649 atomic_long_inc(&dev->rx_dropped);
1655 if (unlikely(!is_skb_forwardable(dev, skb))) {
1656 atomic_long_inc(&dev->rx_dropped);
1660 skb_scrub_packet(skb);
1661 skb->protocol = eth_type_trans(skb, dev);
1662 return netif_rx(skb);
1664 EXPORT_SYMBOL_GPL(dev_forward_skb);
1666 static inline int deliver_skb(struct sk_buff *skb,
1667 struct packet_type *pt_prev,
1668 struct net_device *orig_dev)
1670 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1672 atomic_inc(&skb->users);
1673 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1676 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1678 if (!ptype->af_packet_priv || !skb->sk)
1681 if (ptype->id_match)
1682 return ptype->id_match(ptype, skb->sk);
1683 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1690 * Support routine. Sends outgoing frames to any network
1691 * taps currently in use.
1694 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1696 struct packet_type *ptype;
1697 struct sk_buff *skb2 = NULL;
1698 struct packet_type *pt_prev = NULL;
1701 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1702 /* Never send packets back to the socket
1703 * they originated from - MvS (miquels@drinkel.ow.org)
1705 if ((ptype->dev == dev || !ptype->dev) &&
1706 (!skb_loop_sk(ptype, skb))) {
1708 deliver_skb(skb2, pt_prev, skb->dev);
1713 skb2 = skb_clone(skb, GFP_ATOMIC);
1717 net_timestamp_set(skb2);
1719 /* skb->nh should be correctly
1720 set by sender, so that the second statement is
1721 just protection against buggy protocols.
1723 skb_reset_mac_header(skb2);
1725 if (skb_network_header(skb2) < skb2->data ||
1726 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1727 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1728 ntohs(skb2->protocol),
1730 skb_reset_network_header(skb2);
1733 skb2->transport_header = skb2->network_header;
1734 skb2->pkt_type = PACKET_OUTGOING;
1739 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1744 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1745 * @dev: Network device
1746 * @txq: number of queues available
1748 * If real_num_tx_queues is changed the tc mappings may no longer be
1749 * valid. To resolve this, verify the tc mapping remains valid and, if
1750 * not, NULL the mapping. With no priorities mapping to this
1751 * offset/count pair it will no longer be used. In the worst case, if TC0
1752 * is invalid, nothing can be done, so priority mappings are disabled. It is
1753 * expected that drivers will fix this mapping if they can before
1754 * calling netif_set_real_num_tx_queues.
1756 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1759 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1761 /* If TC0 is invalidated disable TC mapping */
1762 if (tc->offset + tc->count > txq) {
1763 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1768 /* Invalidated prio to tc mappings set to TC0 */
1769 for (i = 1; i < TC_BITMASK + 1; i++) {
1770 int q = netdev_get_prio_tc_map(dev, i);
1772 tc = &dev->tc_to_txq[q];
1773 if (tc->offset + tc->count > txq) {
1774 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1776 netdev_set_prio_tc_map(dev, i, 0);
1782 static DEFINE_MUTEX(xps_map_mutex);
1783 #define xmap_dereference(P) \
1784 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1786 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1789 struct xps_map *map = NULL;
1793 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1795 for (pos = 0; map && pos < map->len; pos++) {
1796 if (map->queues[pos] == index) {
1798 map->queues[pos] = map->queues[--map->len];
1800 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1801 kfree_rcu(map, rcu);
1811 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1813 struct xps_dev_maps *dev_maps;
1815 bool active = false;
1817 mutex_lock(&xps_map_mutex);
1818 dev_maps = xmap_dereference(dev->xps_maps);
1823 for_each_possible_cpu(cpu) {
1824 for (i = index; i < dev->num_tx_queues; i++) {
1825 if (!remove_xps_queue(dev_maps, cpu, i))
1828 if (i == dev->num_tx_queues)
1833 RCU_INIT_POINTER(dev->xps_maps, NULL);
1834 kfree_rcu(dev_maps, rcu);
1837 for (i = index; i < dev->num_tx_queues; i++)
1838 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1842 mutex_unlock(&xps_map_mutex);
1845 static struct xps_map *expand_xps_map(struct xps_map *map,
1848 struct xps_map *new_map;
1849 int alloc_len = XPS_MIN_MAP_ALLOC;
1852 for (pos = 0; map && pos < map->len; pos++) {
1853 if (map->queues[pos] != index)
1858 /* Need to add queue to this CPU's existing map */
1860 if (pos < map->alloc_len)
1863 alloc_len = map->alloc_len * 2;
1866 /* Need to allocate new map to store queue on this CPU's map */
1867 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1872 for (i = 0; i < pos; i++)
1873 new_map->queues[i] = map->queues[i];
1874 new_map->alloc_len = alloc_len;
1880 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1882 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1883 struct xps_map *map, *new_map;
1884 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1885 int cpu, numa_node_id = -2;
1886 bool active = false;
1888 mutex_lock(&xps_map_mutex);
1890 dev_maps = xmap_dereference(dev->xps_maps);
1892 /* allocate memory for queue storage */
1893 for_each_online_cpu(cpu) {
1894 if (!cpumask_test_cpu(cpu, mask))
1898 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1899 if (!new_dev_maps) {
1900 mutex_unlock(&xps_map_mutex);
1904 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1907 map = expand_xps_map(map, cpu, index);
1911 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1915 goto out_no_new_maps;
1917 for_each_possible_cpu(cpu) {
1918 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1919 /* add queue to CPU maps */
1922 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1923 while ((pos < map->len) && (map->queues[pos] != index))
1926 if (pos == map->len)
1927 map->queues[map->len++] = index;
1929 if (numa_node_id == -2)
1930 numa_node_id = cpu_to_node(cpu);
1931 else if (numa_node_id != cpu_to_node(cpu))
1934 } else if (dev_maps) {
1935 /* fill in the new device map from the old device map */
1936 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1937 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1942 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1944 /* Cleanup old maps */
1946 for_each_possible_cpu(cpu) {
1947 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1948 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1949 if (map && map != new_map)
1950 kfree_rcu(map, rcu);
1953 kfree_rcu(dev_maps, rcu);
1956 dev_maps = new_dev_maps;
1960 /* update Tx queue numa node */
1961 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1962 (numa_node_id >= 0) ? numa_node_id :
1968 /* removes queue from unused CPUs */
1969 for_each_possible_cpu(cpu) {
1970 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1973 if (remove_xps_queue(dev_maps, cpu, index))
1977 /* free map if not active */
1979 RCU_INIT_POINTER(dev->xps_maps, NULL);
1980 kfree_rcu(dev_maps, rcu);
1984 mutex_unlock(&xps_map_mutex);
1988 /* remove any maps that we added */
1989 for_each_possible_cpu(cpu) {
1990 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1991 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1993 if (new_map && new_map != map)
1997 mutex_unlock(&xps_map_mutex);
1999 kfree(new_dev_maps);
2002 EXPORT_SYMBOL(netif_set_xps_queue);
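/* Illustrative sketch (assumption): a driver pinning transmit queue 0 to
 * CPUs 0 and 1 via XPS.  Error handling is abbreviated.
 *
 *	cpumask_var_t mask;
 *
 *	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
 *		return -ENOMEM;
 *	cpumask_set_cpu(0, mask);
 *	cpumask_set_cpu(1, mask);
 *	err = netif_set_xps_queue(dev, mask, 0);
 *	free_cpumask_var(mask);
 */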
2006 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2007 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2009 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2013 if (txq < 1 || txq > dev->num_tx_queues)
2016 if (dev->reg_state == NETREG_REGISTERED ||
2017 dev->reg_state == NETREG_UNREGISTERING) {
2020 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2026 netif_setup_tc(dev, txq);
2028 if (txq < dev->real_num_tx_queues) {
2029 qdisc_reset_all_tx_gt(dev, txq);
2031 netif_reset_xps_queues_gt(dev, txq);
2036 dev->real_num_tx_queues = txq;
2039 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2043 * netif_set_real_num_rx_queues - set actual number of RX queues used
2044 * @dev: Network device
2045 * @rxq: Actual number of RX queues
2047 * This must be called either with the rtnl_lock held or before
2048 * registration of the net device. Returns 0 on success, or a
2049 * negative error code. If called before registration, it always
2052 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2056 if (rxq < 1 || rxq > dev->num_rx_queues)
2059 if (dev->reg_state == NETREG_REGISTERED) {
2062 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2068 dev->real_num_rx_queues = rxq;
2071 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2075 * netif_get_num_default_rss_queues - default number of RSS queues
2077 * This routine should set an upper limit on the number of RSS queues
2078 * used by default by multiqueue devices.
2080 int netif_get_num_default_rss_queues(void)
2082 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2084 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2086 static inline void __netif_reschedule(struct Qdisc *q)
2088 struct softnet_data *sd;
2089 unsigned long flags;
2091 local_irq_save(flags);
2092 sd = &__get_cpu_var(softnet_data);
2093 q->next_sched = NULL;
2094 *sd->output_queue_tailp = q;
2095 sd->output_queue_tailp = &q->next_sched;
2096 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2097 local_irq_restore(flags);
2100 void __netif_schedule(struct Qdisc *q)
2102 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2103 __netif_reschedule(q);
2105 EXPORT_SYMBOL(__netif_schedule);
2107 void dev_kfree_skb_irq(struct sk_buff *skb)
2109 if (atomic_dec_and_test(&skb->users)) {
2110 struct softnet_data *sd;
2111 unsigned long flags;
2113 local_irq_save(flags);
2114 sd = &__get_cpu_var(softnet_data);
2115 skb->next = sd->completion_queue;
2116 sd->completion_queue = skb;
2117 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2118 local_irq_restore(flags);
2121 EXPORT_SYMBOL(dev_kfree_skb_irq);
2123 void dev_kfree_skb_any(struct sk_buff *skb)
2125 if (in_irq() || irqs_disabled())
2126 dev_kfree_skb_irq(skb);
2130 EXPORT_SYMBOL(dev_kfree_skb_any);
2134 * netif_device_detach - mark device as removed
2135 * @dev: network device
2137 * Mark device as removed from system and therefore no longer available.
2139 void netif_device_detach(struct net_device *dev)
2141 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2142 netif_running(dev)) {
2143 netif_tx_stop_all_queues(dev);
2146 EXPORT_SYMBOL(netif_device_detach);
2149 * netif_device_attach - mark device as attached
2150 * @dev: network device
2152 * Mark device as attached from system and restart if needed.
2154 void netif_device_attach(struct net_device *dev)
2156 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2157 netif_running(dev)) {
2158 netif_tx_wake_all_queues(dev);
2159 __netdev_watchdog_up(dev);
2162 EXPORT_SYMBOL(netif_device_attach);
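/* Illustrative sketch (assumption): the typical detach/attach pairing in
 * a driver's suspend/resume handlers, so the stack stops queueing packets
 * while the hardware is away.  The function names are hypothetical.
 *
 *	static int example_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		return 0;
 *	}
 *
 *	static int example_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */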
2164 static void skb_warn_bad_offload(const struct sk_buff *skb)
2166 static const netdev_features_t null_features = 0;
2167 struct net_device *dev = skb->dev;
2168 const char *driver = "";
2170 if (!net_ratelimit())
2173 if (dev && dev->dev.parent)
2174 driver = dev_driver_string(dev->dev.parent);
2176 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2177 "gso_type=%d ip_summed=%d\n",
2178 driver, dev ? &dev->features : &null_features,
2179 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2180 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2181 skb_shinfo(skb)->gso_type, skb->ip_summed);
2185 * Invalidate hardware checksum when packet is to be mangled, and
2186 * complete checksum manually on outgoing path.
2188 int skb_checksum_help(struct sk_buff *skb)
2191 int ret = 0, offset;
2193 if (skb->ip_summed == CHECKSUM_COMPLETE)
2194 goto out_set_summed;
2196 if (unlikely(skb_shinfo(skb)->gso_size)) {
2197 skb_warn_bad_offload(skb);
2201 /* Before computing a checksum, we should make sure no frag could
2202 * be modified by an external entity : checksum could be wrong.
2204 if (skb_has_shared_frag(skb)) {
2205 ret = __skb_linearize(skb);
2210 offset = skb_checksum_start_offset(skb);
2211 BUG_ON(offset >= skb_headlen(skb));
2212 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2214 offset += skb->csum_offset;
2215 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2217 if (skb_cloned(skb) &&
2218 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2219 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2224 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2226 skb->ip_summed = CHECKSUM_NONE;
2230 EXPORT_SYMBOL(skb_checksum_help);
2232 __be16 skb_network_protocol(struct sk_buff *skb)
2234 __be16 type = skb->protocol;
2235 int vlan_depth = ETH_HLEN;
2237 /* Tunnel gso handlers can set protocol to ethernet. */
2238 if (type == htons(ETH_P_TEB)) {
2241 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2244 eth = (struct ethhdr *)skb_mac_header(skb);
2245 type = eth->h_proto;
2248 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2249 struct vlan_hdr *vh;
2251 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2254 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2255 type = vh->h_vlan_encapsulated_proto;
2256 vlan_depth += VLAN_HLEN;
2263 * skb_mac_gso_segment - mac layer segmentation handler.
2264 * @skb: buffer to segment
2265 * @features: features for the output path (see dev->features)
2267 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2268 netdev_features_t features)
2270 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2271 struct packet_offload *ptype;
2272 __be16 type = skb_network_protocol(skb);
2274 if (unlikely(!type))
2275 return ERR_PTR(-EINVAL);
2277 __skb_pull(skb, skb->mac_len);
2280 list_for_each_entry_rcu(ptype, &offload_base, list) {
2281 if (ptype->type == type && ptype->callbacks.gso_segment) {
2282 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2285 err = ptype->callbacks.gso_send_check(skb);
2286 segs = ERR_PTR(err);
2287 if (err || skb_gso_ok(skb, features))
2289 __skb_push(skb, (skb->data -
2290 skb_network_header(skb)));
2292 segs = ptype->callbacks.gso_segment(skb, features);
2298 __skb_push(skb, skb->data - skb_mac_header(skb));
2302 EXPORT_SYMBOL(skb_mac_gso_segment);
2305 /* openvswitch calls this on rx path, so we need a different check.
2307 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2310 return skb->ip_summed != CHECKSUM_PARTIAL;
2312 return skb->ip_summed == CHECKSUM_NONE;
2316 * __skb_gso_segment - Perform segmentation on skb.
2317 * @skb: buffer to segment
2318 * @features: features for the output path (see dev->features)
2319 * @tx_path: whether it is called in TX path
2321 * This function segments the given skb and returns a list of segments.
2323 * It may return NULL if the skb requires no segmentation. This is
2324 * only possible when GSO is used for verifying header integrity.
2326 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2327 netdev_features_t features, bool tx_path)
2329 if (unlikely(skb_needs_check(skb, tx_path))) {
2332 skb_warn_bad_offload(skb);
2334 if (skb_header_cloned(skb) &&
2335 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2336 return ERR_PTR(err);
2339 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2340 skb_reset_mac_header(skb);
2341 skb_reset_mac_len(skb);
2343 return skb_mac_gso_segment(skb, features);
2345 EXPORT_SYMBOL(__skb_gso_segment);
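
/*
 * Illustrative sketch only: how a caller might consume the list returned by
 * __skb_gso_segment().  A real user (see dev_gso_segment() below) would
 * transmit each segment; here they are simply counted and released.
 */
static int example_segment_and_count(struct sk_buff *skb,
				     netdev_features_t features)
{
	struct sk_buff *segs, *nskb;
	int count = 0;

	segs = __skb_gso_segment(skb, features, true);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)	/* header verification only, skb left untouched */
		return 0;

	/* Segments are chained through skb->next. */
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		count++;
		consume_skb(nskb);	/* a real caller would transmit it */
	}
	return count;
}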
2347 /* Take action when hardware reception checksum errors are detected. */
2349 void netdev_rx_csum_fault(struct net_device *dev)
2351 if (net_ratelimit()) {
2352 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2356 EXPORT_SYMBOL(netdev_rx_csum_fault);
2359 /* Actually, we should eliminate this check as soon as we know that:
2360 * 1. An IOMMU is present and allows mapping of all the memory.
2361 * 2. No high memory really exists on this machine.
2364 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2366 #ifdef CONFIG_HIGHMEM
2368 if (!(dev->features & NETIF_F_HIGHDMA)) {
2369 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2370 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2371 if (PageHighMem(skb_frag_page(frag)))
2376 if (PCI_DMA_BUS_IS_PHYS) {
2377 struct device *pdev = dev->dev.parent;
2381 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2382 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2383 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2384 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2393 void (*destructor)(struct sk_buff *skb);
2396 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2398 static void dev_gso_skb_destructor(struct sk_buff *skb)
2400 struct dev_gso_cb *cb;
2403 struct sk_buff *nskb = skb->next;
2405 skb->next = nskb->next;
2408 } while (skb->next);
2410 cb = DEV_GSO_CB(skb);
2412 cb->destructor(skb);
2416 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2417 * @skb: buffer to segment
2418 * @features: device features as applicable to this skb
2420 * This function segments the given skb and stores the list of segments in skb->next.
2423 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2425 struct sk_buff *segs;
2427 segs = skb_gso_segment(skb, features);
2429 /* Verifying header integrity only. */
2434 return PTR_ERR(segs);
2437 DEV_GSO_CB(skb)->destructor = skb->destructor;
2438 skb->destructor = dev_gso_skb_destructor;
2443 static netdev_features_t harmonize_features(struct sk_buff *skb,
2444 __be16 protocol, netdev_features_t features)
2446 if (skb->ip_summed != CHECKSUM_NONE &&
2447 !can_checksum_protocol(features, protocol)) {
2448 features &= ~NETIF_F_ALL_CSUM;
2449 } else if (illegal_highdma(skb->dev, skb)) {
2450 features &= ~NETIF_F_SG;
2456 netdev_features_t netif_skb_features(struct sk_buff *skb)
2458 __be16 protocol = skb->protocol;
2459 netdev_features_t features = skb->dev->features;
2461 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2462 features &= ~NETIF_F_GSO_MASK;
2464 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2465 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2466 protocol = veh->h_vlan_encapsulated_proto;
2467 } else if (!vlan_tx_tag_present(skb)) {
2468 return harmonize_features(skb, protocol, features);
2471 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2472 NETIF_F_HW_VLAN_STAG_TX);
2474 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2475 return harmonize_features(skb, protocol, features);
2477 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2478 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2479 NETIF_F_HW_VLAN_STAG_TX;
2480 return harmonize_features(skb, protocol, features);
2483 EXPORT_SYMBOL(netif_skb_features);
2486 * Returns true if either:
2487 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2488 * 2. skb is fragmented and the device does not support SG.
2490 static inline int skb_needs_linearize(struct sk_buff *skb,
2491 netdev_features_t features)
2493 return skb_is_nonlinear(skb) &&
2494 ((skb_has_frag_list(skb) &&
2495 !(features & NETIF_F_FRAGLIST)) ||
2496 (skb_shinfo(skb)->nr_frags &&
2497 !(features & NETIF_F_SG)));
2500 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2501 struct netdev_queue *txq)
2503 const struct net_device_ops *ops = dev->netdev_ops;
2504 int rc = NETDEV_TX_OK;
2505 unsigned int skb_len;
2507 if (likely(!skb->next)) {
2508 netdev_features_t features;
2511 * If the device doesn't need skb->dst, release it right now while
2512 * it's still hot in this CPU's cache.
2514 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2517 features = netif_skb_features(skb);
2519 if (vlan_tx_tag_present(skb) &&
2520 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2521 skb = __vlan_put_tag(skb, skb->vlan_proto,
2522 vlan_tx_tag_get(skb));
2529 /* If this is an encapsulation offload request, verify that we are
2530 * testing the hardware encapsulation features instead of the
2531 * standard features for the netdev.
2533 if (skb->encapsulation)
2534 features &= dev->hw_enc_features;
2536 if (netif_needs_gso(skb, features)) {
2537 if (unlikely(dev_gso_segment(skb, features)))
2542 if (skb_needs_linearize(skb, features) &&
2543 __skb_linearize(skb))
2546 /* If the packet is not checksummed and the device does not
2547 * support checksumming for this protocol, complete the
2548 * checksum here.
2550 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2551 if (skb->encapsulation)
2552 skb_set_inner_transport_header(skb,
2553 skb_checksum_start_offset(skb));
2555 skb_set_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2557 if (!(features & NETIF_F_ALL_CSUM) &&
2558 skb_checksum_help(skb))
2563 if (!list_empty(&ptype_all))
2564 dev_queue_xmit_nit(skb, dev);
2567 rc = ops->ndo_start_xmit(skb, dev);
2568 trace_net_dev_xmit(skb, rc, dev, skb_len);
2569 if (rc == NETDEV_TX_OK)
2570 txq_trans_update(txq);
2576 struct sk_buff *nskb = skb->next;
2578 skb->next = nskb->next;
2581 if (!list_empty(&ptype_all))
2582 dev_queue_xmit_nit(nskb, dev);
2584 skb_len = nskb->len;
2585 rc = ops->ndo_start_xmit(nskb, dev);
2586 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2587 if (unlikely(rc != NETDEV_TX_OK)) {
2588 if (rc & ~NETDEV_TX_MASK)
2589 goto out_kfree_gso_skb;
2590 nskb->next = skb->next;
2594 txq_trans_update(txq);
2595 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2596 return NETDEV_TX_BUSY;
2597 } while (skb->next);
2600 if (likely(skb->next == NULL)) {
2601 skb->destructor = DEV_GSO_CB(skb)->destructor;
2611 static void qdisc_pkt_len_init(struct sk_buff *skb)
2613 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2615 qdisc_skb_cb(skb)->pkt_len = skb->len;
2617 /* To get a more precise estimate of the bytes sent on the wire,
2618 * we add the header size of every segment to pkt_len.
2620 if (shinfo->gso_size) {
2621 unsigned int hdr_len;
2622 u16 gso_segs = shinfo->gso_segs;
2624 /* mac layer + network layer */
2625 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2627 /* + transport layer */
2628 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2629 hdr_len += tcp_hdrlen(skb);
2631 hdr_len += sizeof(struct udphdr);
2633 if (shinfo->gso_type & SKB_GSO_DODGY)
2634 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2637 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
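
/*
 * Worked example for the estimate above (illustrative numbers only): a TSO
 * skb carrying 65000 bytes of TCP payload behind 14 + 20 + 20 = 54 bytes of
 * Ethernet/IPv4/TCP headers, with gso_size 1448, has skb->len = 65054 and
 * gso_segs = DIV_ROUND_UP(65000, 1448) = 45, so
 *
 *	pkt_len = 65054 + (45 - 1) * 54 = 67430
 *
 * which matches the 45 * 54 + 65000 bytes that actually reach the wire once
 * every segment carries its own headers.
 */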
2641 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2642 struct net_device *dev,
2643 struct netdev_queue *txq)
2645 spinlock_t *root_lock = qdisc_lock(q);
2649 qdisc_pkt_len_init(skb);
2650 qdisc_calculate_pkt_len(skb, q);
2652 * Heuristic to force contended enqueues to serialize on a
2653 * separate lock before trying to get the qdisc main lock.
2654 * This permits the __QDISC_STATE_RUNNING owner to get the lock more
2655 * often and dequeue packets faster.
2657 contended = qdisc_is_running(q);
2658 if (unlikely(contended))
2659 spin_lock(&q->busylock);
2661 spin_lock(root_lock);
2662 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2665 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2666 qdisc_run_begin(q)) {
2668 * This is a work-conserving queue; there are no old skbs
2669 * waiting to be sent out; and the qdisc is not running -
2670 * xmit the skb directly.
2672 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2675 qdisc_bstats_update(q, skb);
2677 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2678 if (unlikely(contended)) {
2679 spin_unlock(&q->busylock);
2686 rc = NET_XMIT_SUCCESS;
2689 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2690 if (qdisc_run_begin(q)) {
2691 if (unlikely(contended)) {
2692 spin_unlock(&q->busylock);
2698 spin_unlock(root_lock);
2699 if (unlikely(contended))
2700 spin_unlock(&q->busylock);
2704 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2705 static void skb_update_prio(struct sk_buff *skb)
2707 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2709 if (!skb->priority && skb->sk && map) {
2710 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2712 if (prioidx < map->priomap_len)
2713 skb->priority = map->priomap[prioidx];
2717 #define skb_update_prio(skb)
2720 static DEFINE_PER_CPU(int, xmit_recursion);
2721 #define RECURSION_LIMIT 10
2724 * dev_loopback_xmit - loop back @skb
2725 * @skb: buffer to transmit
2727 int dev_loopback_xmit(struct sk_buff *skb)
2729 skb_reset_mac_header(skb);
2730 __skb_pull(skb, skb_network_offset(skb));
2731 skb->pkt_type = PACKET_LOOPBACK;
2732 skb->ip_summed = CHECKSUM_UNNECESSARY;
2733 WARN_ON(!skb_dst(skb));
2738 EXPORT_SYMBOL(dev_loopback_xmit);
2741 * dev_queue_xmit - transmit a buffer
2742 * @skb: buffer to transmit
2744 * Queue a buffer for transmission to a network device. The caller must
2745 * have set the device and priority and built the buffer before calling
2746 * this function. The function can be called from an interrupt.
2748 * A negative errno code is returned on a failure. A success does not
2749 * guarantee the frame will be transmitted as it may be dropped due
2750 * to congestion or traffic shaping.
2752 * -----------------------------------------------------------------------------------
2753 * I notice this method can also return errors from the queue disciplines,
2754 * including NET_XMIT_DROP, which is a positive value. So, errors can also be positive.
2757 * Regardless of the return value, the skb is consumed, so it is currently
2758 * difficult to retry a send to this method. (You can bump the ref count
2759 * before sending to hold a reference for retry if you are careful.)
2761 * When calling this method, interrupts MUST be enabled. This is because
2762 * the BH enable code must have IRQs enabled so that it will not deadlock.
2765 int dev_queue_xmit(struct sk_buff *skb)
2767 struct net_device *dev = skb->dev;
2768 struct netdev_queue *txq;
2772 skb_reset_mac_header(skb);
2774 /* Disable soft irqs for various locks below. Also
2775 * stops preemption for RCU.
2779 skb_update_prio(skb);
2781 txq = netdev_pick_tx(dev, skb);
2782 q = rcu_dereference_bh(txq->qdisc);
2784 #ifdef CONFIG_NET_CLS_ACT
2785 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2787 trace_net_dev_queue(skb);
2789 rc = __dev_xmit_skb(skb, q, dev, txq);
2793 /* The device has no queue. Common case for software devices:
2794 loopback, all sorts of tunnels...
2796 Really, it is unlikely that netif_tx_lock protection is necessary
2797 here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2799 counters.) However, it is possible that they rely on protection
2802 made by us here. Check this and take the lock; it is not prone to
2803 deadlocks. Or simply use the noqueue qdisc, it is even simpler 8)
2805 if (dev->flags & IFF_UP) {
2806 int cpu = smp_processor_id(); /* ok because BHs are off */
2808 if (txq->xmit_lock_owner != cpu) {
2810 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2811 goto recursion_alert;
2813 HARD_TX_LOCK(dev, txq, cpu);
2815 if (!netif_xmit_stopped(txq)) {
2816 __this_cpu_inc(xmit_recursion);
2817 rc = dev_hard_start_xmit(skb, dev, txq);
2818 __this_cpu_dec(xmit_recursion);
2819 if (dev_xmit_complete(rc)) {
2820 HARD_TX_UNLOCK(dev, txq);
2824 HARD_TX_UNLOCK(dev, txq);
2825 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2828 /* Recursion is detected! It is possible,
2832 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2838 rcu_read_unlock_bh();
2843 rcu_read_unlock_bh();
2846 EXPORT_SYMBOL(dev_queue_xmit);
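
/*
 * Illustrative sketch only: the minimal calling convention for
 * dev_queue_xmit().  The helper name is made up; the important points are
 * that the caller fills in skb->dev beforehand and never touches the skb
 * again afterwards, since it is consumed on both success and failure.
 */
static int example_xmit_on(struct net_device *dev, struct sk_buff *skb)
{
	skb->dev = dev;
	/* skb->priority may also be set here if the caller cares about
	 * queueing class; zero (the default) is fine for this sketch.
	 */
	return dev_queue_xmit(skb);
}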
2849 /*=======================================================================
2850 			Receiver routines
2851 =======================================================================*/
2853 int netdev_max_backlog __read_mostly = 1000;
2854 EXPORT_SYMBOL(netdev_max_backlog);
2856 int netdev_tstamp_prequeue __read_mostly = 1;
2857 int netdev_budget __read_mostly = 300;
2858 int weight_p __read_mostly = 64; /* old backlog weight */
2860 /* Called with irq disabled */
2861 static inline void ____napi_schedule(struct softnet_data *sd,
2862 struct napi_struct *napi)
2864 list_add_tail(&napi->poll_list, &sd->poll_list);
2865 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2870 /* One global table that all flow-based protocols share. */
2871 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2872 EXPORT_SYMBOL(rps_sock_flow_table);
2874 struct static_key rps_needed __read_mostly;
2876 static struct rps_dev_flow *
2877 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2878 struct rps_dev_flow *rflow, u16 next_cpu)
2880 if (next_cpu != RPS_NO_CPU) {
2881 #ifdef CONFIG_RFS_ACCEL
2882 struct netdev_rx_queue *rxqueue;
2883 struct rps_dev_flow_table *flow_table;
2884 struct rps_dev_flow *old_rflow;
2889 /* Should we steer this flow to a different hardware queue? */
2890 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2891 !(dev->features & NETIF_F_NTUPLE))
2893 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2894 if (rxq_index == skb_get_rx_queue(skb))
2897 rxqueue = dev->_rx + rxq_index;
2898 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2901 flow_id = skb->rxhash & flow_table->mask;
2902 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2903 rxq_index, flow_id);
2907 rflow = &flow_table->flows[flow_id];
2909 if (old_rflow->filter == rflow->filter)
2910 old_rflow->filter = RPS_NO_FILTER;
2914 per_cpu(softnet_data, next_cpu).input_queue_head;
2917 rflow->cpu = next_cpu;
2922 * get_rps_cpu is called from netif_receive_skb and returns the target
2923 * CPU from the RPS map of the receiving queue for a given skb.
2924 * rcu_read_lock must be held on entry.
2926 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2927 struct rps_dev_flow **rflowp)
2929 struct netdev_rx_queue *rxqueue;
2930 struct rps_map *map;
2931 struct rps_dev_flow_table *flow_table;
2932 struct rps_sock_flow_table *sock_flow_table;
2936 if (skb_rx_queue_recorded(skb)) {
2937 u16 index = skb_get_rx_queue(skb);
2938 if (unlikely(index >= dev->real_num_rx_queues)) {
2939 WARN_ONCE(dev->real_num_rx_queues > 1,
2940 "%s received packet on queue %u, but number "
2941 "of RX queues is %u\n",
2942 dev->name, index, dev->real_num_rx_queues);
2945 rxqueue = dev->_rx + index;
2949 map = rcu_dereference(rxqueue->rps_map);
2951 if (map->len == 1 &&
2952 !rcu_access_pointer(rxqueue->rps_flow_table)) {
2953 tcpu = map->cpus[0];
2954 if (cpu_online(tcpu))
2958 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
2962 skb_reset_network_header(skb);
2963 if (!skb_get_rxhash(skb))
2966 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2968 if (flow_table && sock_flow_table) {
2970 struct rps_dev_flow *rflow;
2972 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2975 next_cpu = sock_flow_table->ents[skb->rxhash &
2976 sock_flow_table->mask];
2979 * If the desired CPU (where last recvmsg was done) is
2980 * different from current CPU (one in the rx-queue flow
2981 * table entry), switch if one of the following holds:
2982 * - Current CPU is unset (equal to RPS_NO_CPU).
2983 * - Current CPU is offline.
2984 * - The current CPU's queue tail has advanced beyond the
2985 * last packet that was enqueued using this table entry.
2986 * This guarantees that all previous packets for the flow
2987 * have been dequeued, thus preserving in-order delivery.
2989 if (unlikely(tcpu != next_cpu) &&
2990 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2991 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
2992 rflow->last_qtail)) >= 0)) {
2994 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2997 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3005 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3007 if (cpu_online(tcpu)) {
3017 #ifdef CONFIG_RFS_ACCEL
3020 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3021 * @dev: Device on which the filter was set
3022 * @rxq_index: RX queue index
3023 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3024 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3026 * Drivers that implement ndo_rx_flow_steer() should periodically call
3027 * this function for each installed filter and remove the filters for
3028 * which it returns %true.
3030 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3031 u32 flow_id, u16 filter_id)
3033 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3034 struct rps_dev_flow_table *flow_table;
3035 struct rps_dev_flow *rflow;
3040 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3041 if (flow_table && flow_id <= flow_table->mask) {
3042 rflow = &flow_table->flows[flow_id];
3043 cpu = ACCESS_ONCE(rflow->cpu);
3044 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3045 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3046 rflow->last_qtail) <
3047 (int)(10 * flow_table->mask)))
3053 EXPORT_SYMBOL(rps_may_expire_flow);
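
/*
 * Illustrative sketch only: the periodic scan a driver implementing
 * ndo_rx_flow_steer() might run.  "struct example_rfs_filter" and the table
 * layout are hypothetical bookkeeping, not a real driver structure; the only
 * real API used is rps_may_expire_flow() above.
 */
struct example_rfs_filter {
	u32	flow_id;	/* flow_id passed to ndo_rx_flow_steer() */
	u16	filter_id;	/* id the driver returned from it */
	bool	in_use;
};

static void example_expire_rfs_filters(struct net_device *dev, u16 rxq_index,
					struct example_rfs_filter *tbl,
					unsigned int n)
{
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (!tbl[i].in_use)
			continue;
		if (rps_may_expire_flow(dev, rxq_index, tbl[i].flow_id,
					tbl[i].filter_id)) {
			/* ... remove the hardware filter here ... */
			tbl[i].in_use = false;
		}
	}
}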
3055 #endif /* CONFIG_RFS_ACCEL */
3057 /* Called from hardirq (IPI) context */
3058 static void rps_trigger_softirq(void *data)
3060 struct softnet_data *sd = data;
3062 ____napi_schedule(sd, &sd->backlog);
3066 #endif /* CONFIG_RPS */
3069 * Check if this softnet_data structure belongs to another CPU.
3070 * If it does, queue it to our IPI list and return 1; otherwise return 0.
3073 static int rps_ipi_queued(struct softnet_data *sd)
3076 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3079 sd->rps_ipi_next = mysd->rps_ipi_list;
3080 mysd->rps_ipi_list = sd;
3082 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3085 #endif /* CONFIG_RPS */
3089 #ifdef CONFIG_NET_FLOW_LIMIT
3090 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3093 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3095 #ifdef CONFIG_NET_FLOW_LIMIT
3096 struct sd_flow_limit *fl;
3097 struct softnet_data *sd;
3098 unsigned int old_flow, new_flow;
3100 if (qlen < (netdev_max_backlog >> 1))
3103 sd = &__get_cpu_var(softnet_data);
3106 fl = rcu_dereference(sd->flow_limit);
3108 new_flow = skb_get_rxhash(skb) & (fl->num_buckets - 1);
3109 old_flow = fl->history[fl->history_head];
3110 fl->history[fl->history_head] = new_flow;
3113 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3115 if (likely(fl->buckets[old_flow]))
3116 fl->buckets[old_flow]--;
3118 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3130 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3131 * queue (may be a remote CPU queue).
3133 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3134 unsigned int *qtail)
3136 struct softnet_data *sd;
3137 unsigned long flags;
3140 sd = &per_cpu(softnet_data, cpu);
3142 local_irq_save(flags);
3145 qlen = skb_queue_len(&sd->input_pkt_queue);
3146 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3147 if (skb_queue_len(&sd->input_pkt_queue)) {
3149 __skb_queue_tail(&sd->input_pkt_queue, skb);
3150 input_queue_tail_incr_save(sd, qtail);
3152 local_irq_restore(flags);
3153 return NET_RX_SUCCESS;
3156 /* Schedule NAPI for the backlog device.
3157 * We can use a non-atomic operation since we own the queue lock.
3159 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3160 if (!rps_ipi_queued(sd))
3161 ____napi_schedule(sd, &sd->backlog);
3169 local_irq_restore(flags);
3171 atomic_long_inc(&skb->dev->rx_dropped);
3177 * netif_rx - post buffer to the network code
3178 * @skb: buffer to post
3180 * This function receives a packet from a device driver and queues it for
3181 * the upper (protocol) levels to process. It always succeeds. The buffer
3182 * may be dropped during processing for congestion control or by the
3183 * protocol layers.
3185 * return values:
3186 * NET_RX_SUCCESS (no congestion)
3187 * NET_RX_DROP (packet was dropped)
3191 int netif_rx(struct sk_buff *skb)
3195 /* if netpoll wants it, pretend we never saw it */
3196 if (netpoll_rx(skb))
3199 net_timestamp_check(netdev_tstamp_prequeue, skb);
3201 trace_netif_rx(skb);
3203 if (static_key_false(&rps_needed)) {
3204 struct rps_dev_flow voidflow, *rflow = &voidflow;
3210 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3212 cpu = smp_processor_id();
3214 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3222 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3227 EXPORT_SYMBOL(netif_rx);
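
/*
 * Illustrative sketch only: the classic non-NAPI receive path that
 * netif_rx() is meant for.  The buffer handling is made up; only the
 * netdev_alloc_skb_ip_align()/eth_type_trans()/netif_rx() usage reflects
 * the real API.
 */
static void example_legacy_rx(struct net_device *dev, const void *buf,
			      unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, len), buf, len);	/* copy out of the RX buffer */
	skb->protocol = eth_type_trans(skb, dev);

	/* Hand off to the stack; the skb is always consumed, whatever the
	 * return value, so it must not be touched afterwards.
	 */
	netif_rx(skb);
}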
3229 int netif_rx_ni(struct sk_buff *skb)
3234 err = netif_rx(skb);
3235 if (local_softirq_pending())
3241 EXPORT_SYMBOL(netif_rx_ni);
3243 static void net_tx_action(struct softirq_action *h)
3245 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3247 if (sd->completion_queue) {
3248 struct sk_buff *clist;
3250 local_irq_disable();
3251 clist = sd->completion_queue;
3252 sd->completion_queue = NULL;
3256 struct sk_buff *skb = clist;
3257 clist = clist->next;
3259 WARN_ON(atomic_read(&skb->users));
3260 trace_kfree_skb(skb, net_tx_action);
3265 if (sd->output_queue) {
3268 local_irq_disable();
3269 head = sd->output_queue;
3270 sd->output_queue = NULL;
3271 sd->output_queue_tailp = &sd->output_queue;
3275 struct Qdisc *q = head;
3276 spinlock_t *root_lock;
3278 head = head->next_sched;
3280 root_lock = qdisc_lock(q);
3281 if (spin_trylock(root_lock)) {
3282 smp_mb__before_clear_bit();
3283 clear_bit(__QDISC_STATE_SCHED,
3286 spin_unlock(root_lock);
3288 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3290 __netif_reschedule(q);
3292 smp_mb__before_clear_bit();
3293 clear_bit(__QDISC_STATE_SCHED,
3301 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3302 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3303 /* This hook is defined here for ATM LANE */
3304 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3305 unsigned char *addr) __read_mostly;
3306 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3309 #ifdef CONFIG_NET_CLS_ACT
3310 /* TODO: Maybe we should just force sch_ingress to be compiled in
3311 * when CONFIG_NET_CLS_ACT is? Otherwise we waste a compare and two
3312 * stores of useless instructions right now if we don't have it on
3313 * but do have CONFIG_NET_CLS_ACT.
3314 * NOTE: This doesn't stop any functionality; if you don't have
3315 * the ingress scheduler, you just can't add policies on ingress.
3318 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3320 struct net_device *dev = skb->dev;
3321 u32 ttl = G_TC_RTTL(skb->tc_verd);
3322 int result = TC_ACT_OK;
3325 if (unlikely(MAX_RED_LOOP < ttl++)) {
3326 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3327 skb->skb_iif, dev->ifindex);
3331 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3332 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3335 if (q != &noop_qdisc) {
3336 spin_lock(qdisc_lock(q));
3337 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3338 result = qdisc_enqueue_root(skb, q);
3339 spin_unlock(qdisc_lock(q));
3345 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3346 struct packet_type **pt_prev,
3347 int *ret, struct net_device *orig_dev)
3349 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3351 if (!rxq || rxq->qdisc == &noop_qdisc)
3355 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3359 switch (ing_filter(skb, rxq)) {
3373 * netdev_rx_handler_register - register receive handler
3374 * @dev: device to register a handler for
3375 * @rx_handler: receive handler to register
3376 * @rx_handler_data: data pointer that is used by rx handler
3378 * Register a receive handler for a device. This handler will then be
3379 * called from __netif_receive_skb. A negative errno code is returned
3380 * on a failure.
3382 * The caller must hold the rtnl_mutex.
3384 * For a general description of rx_handler, see enum rx_handler_result.
3386 int netdev_rx_handler_register(struct net_device *dev,
3387 rx_handler_func_t *rx_handler,
3388 void *rx_handler_data)
3392 if (dev->rx_handler)
3395 /* Note: rx_handler_data must be set before rx_handler */
3396 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3397 rcu_assign_pointer(dev->rx_handler, rx_handler);
3401 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
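
/*
 * Illustrative sketch only: how a stacking driver (bridge/bond/team style)
 * might claim the frames of a lower device.  The handler and the enslave
 * helper are hypothetical; the registration call and the
 * rx_handler_result_t contract are the real API documented above.
 */
static rx_handler_result_t example_port_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Let link-local control frames continue up the normal stack. */
	if (is_link_local_ether_addr(eth_hdr(skb)->h_dest))
		return RX_HANDLER_PASS;

	/* Otherwise the stacking device owns the frame from here on; a real
	 * driver would forward it instead of freeing it.
	 */
	kfree_skb(skb);
	return RX_HANDLER_CONSUMED;
}

static int example_enslave_port(struct net_device *port_dev, void *port_priv)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(port_dev, example_port_handle_frame,
					  port_priv);
}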
3404 * netdev_rx_handler_unregister - unregister receive handler
3405 * @dev: device to unregister a handler from
3407 * Unregister a receive handler from a device.
3409 * The caller must hold the rtnl_mutex.
3411 void netdev_rx_handler_unregister(struct net_device *dev)
3415 RCU_INIT_POINTER(dev->rx_handler, NULL);
3416 /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
3417 * section is guaranteed to also see a non-NULL rx_handler_data
3421 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3423 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3426 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3427 * the special handling of PFMEMALLOC skbs.
3429 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3431 switch (skb->protocol) {
3432 case __constant_htons(ETH_P_ARP):
3433 case __constant_htons(ETH_P_IP):
3434 case __constant_htons(ETH_P_IPV6):
3435 case __constant_htons(ETH_P_8021Q):
3436 case __constant_htons(ETH_P_8021AD):
3443 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3445 struct packet_type *ptype, *pt_prev;
3446 rx_handler_func_t *rx_handler;
3447 struct net_device *orig_dev;
3448 struct net_device *null_or_dev;
3449 bool deliver_exact = false;
3450 int ret = NET_RX_DROP;
3453 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3455 trace_netif_receive_skb(skb);
3457 /* if we've gotten here through NAPI, check netpoll */
3458 if (netpoll_receive_skb(skb))
3461 orig_dev = skb->dev;
3463 skb_reset_network_header(skb);
3464 if (!skb_transport_header_was_set(skb))
3465 skb_reset_transport_header(skb);
3466 skb_reset_mac_len(skb);
3473 skb->skb_iif = skb->dev->ifindex;
3475 __this_cpu_inc(softnet_data.processed);
3477 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3478 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3479 skb = vlan_untag(skb);
3484 #ifdef CONFIG_NET_CLS_ACT
3485 if (skb->tc_verd & TC_NCLS) {
3486 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3494 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3495 if (!ptype->dev || ptype->dev == skb->dev) {
3497 ret = deliver_skb(skb, pt_prev, orig_dev);
3503 #ifdef CONFIG_NET_CLS_ACT
3504 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3510 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3513 if (vlan_tx_tag_present(skb)) {
3515 ret = deliver_skb(skb, pt_prev, orig_dev);
3518 if (vlan_do_receive(&skb))
3520 else if (unlikely(!skb))
3524 rx_handler = rcu_dereference(skb->dev->rx_handler);
3527 ret = deliver_skb(skb, pt_prev, orig_dev);
3530 switch (rx_handler(&skb)) {
3531 case RX_HANDLER_CONSUMED:
3532 ret = NET_RX_SUCCESS;
3534 case RX_HANDLER_ANOTHER:
3536 case RX_HANDLER_EXACT:
3537 deliver_exact = true;
3538 case RX_HANDLER_PASS:
3545 if (vlan_tx_nonzero_tag_present(skb))
3546 skb->pkt_type = PACKET_OTHERHOST;
3548 /* deliver only exact match when indicated */
3549 null_or_dev = deliver_exact ? skb->dev : NULL;
3551 type = skb->protocol;
3552 list_for_each_entry_rcu(ptype,
3553 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3554 if (ptype->type == type &&
3555 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3556 ptype->dev == orig_dev)) {
3558 ret = deliver_skb(skb, pt_prev, orig_dev);
3564 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3567 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3570 atomic_long_inc(&skb->dev->rx_dropped);
3572 /* Jamal, now you will not be able to escape explaining
3573 * to me how you were going to use this. :-)
3584 static int __netif_receive_skb(struct sk_buff *skb)
3588 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3589 unsigned long pflags = current->flags;
3592 * PFMEMALLOC skbs are special, they should
3593 * - be delivered to SOCK_MEMALLOC sockets only
3594 * - stay away from userspace
3595 * - have bounded memory usage
3597 * Use PF_MEMALLOC as this saves us from propagating the allocation
3598 * context down to all allocation sites.
3600 current->flags |= PF_MEMALLOC;
3601 ret = __netif_receive_skb_core(skb, true);
3602 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3604 ret = __netif_receive_skb_core(skb, false);
3610 * netif_receive_skb - process receive buffer from network
3611 * @skb: buffer to process
3613 * netif_receive_skb() is the main receive data processing function.
3614 * It always succeeds. The buffer may be dropped during processing
3615 * for congestion control or by the protocol layers.
3617 * This function may only be called from softirq context and interrupts
3618 * should be enabled.
3620 * Return values (usually ignored):
3621 * NET_RX_SUCCESS: no congestion
3622 * NET_RX_DROP: packet was dropped
3624 int netif_receive_skb(struct sk_buff *skb)
3626 net_timestamp_check(netdev_tstamp_prequeue, skb);
3628 if (skb_defer_rx_timestamp(skb))
3629 return NET_RX_SUCCESS;
3632 if (static_key_false(&rps_needed)) {
3633 struct rps_dev_flow voidflow, *rflow = &voidflow;
3638 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3641 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3648 return __netif_receive_skb(skb);
3650 EXPORT_SYMBOL(netif_receive_skb);
3652 /* Network device is going away, flush any packets still pending.
3653 * Called with irqs disabled.
3655 static void flush_backlog(void *arg)
3657 struct net_device *dev = arg;
3658 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3659 struct sk_buff *skb, *tmp;
3662 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3663 if (skb->dev == dev) {
3664 __skb_unlink(skb, &sd->input_pkt_queue);
3666 input_queue_head_incr(sd);
3671 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3672 if (skb->dev == dev) {
3673 __skb_unlink(skb, &sd->process_queue);
3675 input_queue_head_incr(sd);
3680 static int napi_gro_complete(struct sk_buff *skb)
3682 struct packet_offload *ptype;
3683 __be16 type = skb->protocol;
3684 struct list_head *head = &offload_base;
3687 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3689 if (NAPI_GRO_CB(skb)->count == 1) {
3690 skb_shinfo(skb)->gso_size = 0;
3695 list_for_each_entry_rcu(ptype, head, list) {
3696 if (ptype->type != type || !ptype->callbacks.gro_complete)
3699 err = ptype->callbacks.gro_complete(skb);
3705 WARN_ON(&ptype->list == head);
3707 return NET_RX_SUCCESS;
3711 return netif_receive_skb(skb);
3714 /* napi->gro_list contains packets ordered by age, with the
3715 * youngest packets at the head of it.
3716 * Complete skbs in reverse order to reduce latencies.
3718 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3720 struct sk_buff *skb, *prev = NULL;
3722 /* scan list and build reverse chain */
3723 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3728 for (skb = prev; skb; skb = prev) {
3731 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3735 napi_gro_complete(skb);
3739 napi->gro_list = NULL;
3741 EXPORT_SYMBOL(napi_gro_flush);
3743 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3746 unsigned int maclen = skb->dev->hard_header_len;
3748 for (p = napi->gro_list; p; p = p->next) {
3749 unsigned long diffs;
3751 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3752 diffs |= p->vlan_tci ^ skb->vlan_tci;
3753 if (maclen == ETH_HLEN)
3754 diffs |= compare_ether_header(skb_mac_header(p),
3755 skb_gro_mac_header(skb));
3757 diffs = memcmp(skb_mac_header(p),
3758 skb_gro_mac_header(skb),
3760 NAPI_GRO_CB(p)->same_flow = !diffs;
3761 NAPI_GRO_CB(p)->flush = 0;
3765 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3767 struct sk_buff **pp = NULL;
3768 struct packet_offload *ptype;
3769 __be16 type = skb->protocol;
3770 struct list_head *head = &offload_base;
3772 enum gro_result ret;
3774 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3777 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3780 gro_list_prepare(napi, skb);
3783 list_for_each_entry_rcu(ptype, head, list) {
3784 if (ptype->type != type || !ptype->callbacks.gro_receive)
3787 skb_set_network_header(skb, skb_gro_offset(skb));
3788 skb_reset_mac_len(skb);
3789 NAPI_GRO_CB(skb)->same_flow = 0;
3790 NAPI_GRO_CB(skb)->flush = 0;
3791 NAPI_GRO_CB(skb)->free = 0;
3793 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3798 if (&ptype->list == head)
3801 same_flow = NAPI_GRO_CB(skb)->same_flow;
3802 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3805 struct sk_buff *nskb = *pp;
3809 napi_gro_complete(nskb);
3816 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3820 NAPI_GRO_CB(skb)->count = 1;
3821 NAPI_GRO_CB(skb)->age = jiffies;
3822 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3823 skb->next = napi->gro_list;
3824 napi->gro_list = skb;
3828 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3829 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3831 BUG_ON(skb->end - skb->tail < grow);
3833 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3836 skb->data_len -= grow;
3838 skb_shinfo(skb)->frags[0].page_offset += grow;
3839 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3841 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3842 skb_frag_unref(skb, 0);
3843 memmove(skb_shinfo(skb)->frags,
3844 skb_shinfo(skb)->frags + 1,
3845 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3858 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3862 if (netif_receive_skb(skb))
3870 case GRO_MERGED_FREE:
3871 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3872 kmem_cache_free(skbuff_head_cache, skb);
3885 static void skb_gro_reset_offset(struct sk_buff *skb)
3887 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3888 const skb_frag_t *frag0 = &pinfo->frags[0];
3890 NAPI_GRO_CB(skb)->data_offset = 0;
3891 NAPI_GRO_CB(skb)->frag0 = NULL;
3892 NAPI_GRO_CB(skb)->frag0_len = 0;
3894 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
3896 !PageHighMem(skb_frag_page(frag0))) {
3897 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3898 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3902 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3904 skb_gro_reset_offset(skb);
3906 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3908 EXPORT_SYMBOL(napi_gro_receive);
3910 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3912 __skb_pull(skb, skb_headlen(skb));
3913 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3914 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3916 skb->dev = napi->dev;
3922 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3924 struct sk_buff *skb = napi->skb;
3927 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3933 EXPORT_SYMBOL(napi_get_frags);
3935 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3941 skb->protocol = eth_type_trans(skb, skb->dev);
3943 if (ret == GRO_HELD)
3944 skb_gro_pull(skb, -ETH_HLEN);
3945 else if (netif_receive_skb(skb))
3950 case GRO_MERGED_FREE:
3951 napi_reuse_skb(napi, skb);
3961 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3963 struct sk_buff *skb = napi->skb;
3970 skb_reset_mac_header(skb);
3971 skb_gro_reset_offset(skb);
3973 off = skb_gro_offset(skb);
3974 hlen = off + sizeof(*eth);
3975 eth = skb_gro_header_fast(skb, off);
3976 if (skb_gro_header_hard(skb, hlen)) {
3977 eth = skb_gro_header_slow(skb, hlen, off);
3978 if (unlikely(!eth)) {
3979 napi_reuse_skb(napi, skb);
3985 skb_gro_pull(skb, sizeof(*eth));
3988 * This works because the only protocols we care about don't require
3989 * special handling. We'll fix it up properly at the end.
3991 skb->protocol = eth->h_proto;
3997 gro_result_t napi_gro_frags(struct napi_struct *napi)
3999 struct sk_buff *skb = napi_frags_skb(napi);
4004 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4006 EXPORT_SYMBOL(napi_gro_frags);
4009 * net_rps_action sends any pending IPIs for RPS.
4010 * Note: called with local irq disabled, but exits with local irq enabled.
4012 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4015 struct softnet_data *remsd = sd->rps_ipi_list;
4018 sd->rps_ipi_list = NULL;
4022 /* Send pending IPI's to kick RPS processing on remote cpus. */
4024 struct softnet_data *next = remsd->rps_ipi_next;
4026 if (cpu_online(remsd->cpu))
4027 __smp_call_function_single(remsd->cpu,
4036 static int process_backlog(struct napi_struct *napi, int quota)
4039 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4042 /* Check if we have pending IPIs; it's better to send them now
4043 * rather than waiting for net_rx_action() to end.
4045 if (sd->rps_ipi_list) {
4046 local_irq_disable();
4047 net_rps_action_and_irq_enable(sd);
4050 napi->weight = weight_p;
4051 local_irq_disable();
4052 while (work < quota) {
4053 struct sk_buff *skb;
4056 while ((skb = __skb_dequeue(&sd->process_queue))) {
4058 __netif_receive_skb(skb);
4059 local_irq_disable();
4060 input_queue_head_incr(sd);
4061 if (++work >= quota) {
4068 qlen = skb_queue_len(&sd->input_pkt_queue);
4070 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4071 &sd->process_queue);
4073 if (qlen < quota - work) {
4075 * Inline a custom version of __napi_complete().
4076 * Only the current CPU owns and manipulates this napi,
4077 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4078 * We can use a plain write instead of clear_bit(),
4079 * and we don't need an smp_mb() memory barrier.
4081 list_del(&napi->poll_list);
4084 quota = work + qlen;
4094 * __napi_schedule - schedule for receive
4095 * @n: entry to schedule
4097 * The entry's receive function will be scheduled to run
4099 void __napi_schedule(struct napi_struct *n)
4101 unsigned long flags;
4103 local_irq_save(flags);
4104 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4105 local_irq_restore(flags);
4107 EXPORT_SYMBOL(__napi_schedule);
4109 void __napi_complete(struct napi_struct *n)
4111 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4112 BUG_ON(n->gro_list);
4114 list_del(&n->poll_list);
4115 smp_mb__before_clear_bit();
4116 clear_bit(NAPI_STATE_SCHED, &n->state);
4118 EXPORT_SYMBOL(__napi_complete);
4120 void napi_complete(struct napi_struct *n)
4122 unsigned long flags;
4125 * don't let napi dequeue from the CPU poll list
4126 * just in case it's running on a different CPU
4128 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4131 napi_gro_flush(n, false);
4132 local_irq_save(flags);
4134 local_irq_restore(flags);
4136 EXPORT_SYMBOL(napi_complete);
4138 /* must be called under rcu_read_lock(), as we dont take a reference */
4139 struct napi_struct *napi_by_id(unsigned int napi_id)
4141 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4142 struct napi_struct *napi;
4144 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4145 if (napi->napi_id == napi_id)
4150 EXPORT_SYMBOL_GPL(napi_by_id);
4152 void napi_hash_add(struct napi_struct *napi)
4154 if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
4156 spin_lock(&napi_hash_lock);
4158 /* 0 is not a valid id, and we also skip an id that is already taken;
4159 * we expect both events to be extremely rare
4162 while (!napi->napi_id) {
4163 napi->napi_id = ++napi_gen_id;
4164 if (napi_by_id(napi->napi_id))
4168 hlist_add_head_rcu(&napi->napi_hash_node,
4169 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4171 spin_unlock(&napi_hash_lock);
4174 EXPORT_SYMBOL_GPL(napi_hash_add);
4176 /* Warning: the caller is responsible for making sure an RCU grace period
4177 * is respected before freeing the memory containing @napi
4179 void napi_hash_del(struct napi_struct *napi)
4181 spin_lock(&napi_hash_lock);
4183 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4184 hlist_del_rcu(&napi->napi_hash_node);
4186 spin_unlock(&napi_hash_lock);
4188 EXPORT_SYMBOL_GPL(napi_hash_del);
4190 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4191 int (*poll)(struct napi_struct *, int), int weight)
4193 INIT_LIST_HEAD(&napi->poll_list);
4194 napi->gro_count = 0;
4195 napi->gro_list = NULL;
4198 if (weight > NAPI_POLL_WEIGHT)
4199 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4201 napi->weight = weight;
4202 list_add(&napi->dev_list, &dev->napi_list);
4204 #ifdef CONFIG_NETPOLL
4205 spin_lock_init(&napi->poll_lock);
4206 napi->poll_owner = -1;
4208 set_bit(NAPI_STATE_SCHED, &napi->state);
4210 EXPORT_SYMBOL(netif_napi_add);
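
/*
 * Illustrative sketch only: the driver-side pattern that netif_napi_add(),
 * napi_gro_receive() and napi_complete() are designed for.  "struct
 * example_nic" and the commented-out ring helpers are hypothetical; the
 * NAPI calls themselves are the real API.
 */
struct example_nic {
	struct net_device *netdev;
	struct napi_struct napi;
};

static int example_nic_poll(struct napi_struct *napi, int budget)
{
	struct example_nic *nic = container_of(napi, struct example_nic, napi);
	int done = 0;

	while (done < budget) {
		struct sk_buff *skb = NULL;	/* = example_nic_next_rx(nic); */

		if (!skb)	/* RX ring drained */
			break;
		skb->protocol = eth_type_trans(skb, nic->netdev);
		napi_gro_receive(napi, skb);
		done++;
	}

	/* Finishing under budget means the ring is empty: leave polled mode
	 * and re-enable the device interrupt.
	 */
	if (done < budget) {
		napi_complete(napi);
		/* example_nic_enable_irq(nic); */
	}
	return done;
}

static void example_nic_setup_napi(struct example_nic *nic)
{
	netif_napi_add(nic->netdev, &nic->napi, example_nic_poll,
		       NAPI_POLL_WEIGHT);
}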
4212 void netif_napi_del(struct napi_struct *napi)
4214 struct sk_buff *skb, *next;
4216 list_del_init(&napi->dev_list);
4217 napi_free_frags(napi);
4219 for (skb = napi->gro_list; skb; skb = next) {
4225 napi->gro_list = NULL;
4226 napi->gro_count = 0;
4228 EXPORT_SYMBOL(netif_napi_del);
4230 static void net_rx_action(struct softirq_action *h)
4232 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4233 unsigned long time_limit = jiffies + 2;
4234 int budget = netdev_budget;
4237 local_irq_disable();
4239 while (!list_empty(&sd->poll_list)) {
4240 struct napi_struct *n;
4243 /* If the softirq window is exhausted then punt.
4244 * Allow this to run for 2 jiffies, which will allow
4245 * an average latency of 1.5/HZ.
4247 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4252 /* Even though interrupts have been re-enabled, this
4253 * access is safe because interrupts can only add new
4254 * entries to the tail of this list, and only ->poll()
4255 * calls can remove this head entry from the list.
4257 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4259 have = netpoll_poll_lock(n);
4263 /* This NAPI_STATE_SCHED test is for avoiding a race
4264 * with netpoll's poll_napi(). Only the entity which
4265 * obtains the lock and sees NAPI_STATE_SCHED set will
4266 * actually make the ->poll() call. Therefore we avoid
4267 * accidentally calling ->poll() when NAPI is not scheduled.
4270 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4271 work = n->poll(n, weight);
4275 WARN_ON_ONCE(work > weight);
4279 local_irq_disable();
4281 /* Drivers must not modify the NAPI state if they
4282 * consume the entire weight. In such cases this code
4283 * still "owns" the NAPI instance and therefore can
4284 * move the instance around on the list at-will.
4286 if (unlikely(work == weight)) {
4287 if (unlikely(napi_disable_pending(n))) {
4290 local_irq_disable();
4293 /* Flush packets that are too old.
4294 * If HZ < 1000, flush all packets.
4297 napi_gro_flush(n, HZ >= 1000);
4298 local_irq_disable();
4300 list_move_tail(&n->poll_list, &sd->poll_list);
4304 netpoll_poll_unlock(have);
4307 net_rps_action_and_irq_enable(sd);
4309 #ifdef CONFIG_NET_DMA
4311 * There may not be any more sk_buffs coming right now, so push
4312 * any pending DMA copies to hardware
4314 dma_issue_pending_all();
4321 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4325 struct netdev_upper {
4326 struct net_device *dev;
4328 struct list_head list;
4329 struct rcu_head rcu;
4330 struct list_head search_list;
4333 static void __append_search_uppers(struct list_head *search_list,
4334 struct net_device *dev)
4336 struct netdev_upper *upper;
4338 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4339 /* check if this upper is not already in search list */
4340 if (list_empty(&upper->search_list))
4341 list_add_tail(&upper->search_list, search_list);
4345 static bool __netdev_search_upper_dev(struct net_device *dev,
4346 struct net_device *upper_dev)
4348 LIST_HEAD(search_list);
4349 struct netdev_upper *upper;
4350 struct netdev_upper *tmp;
4353 __append_search_uppers(&search_list, dev);
4354 list_for_each_entry(upper, &search_list, search_list) {
4355 if (upper->dev == upper_dev) {
4359 __append_search_uppers(&search_list, upper->dev);
4361 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4362 INIT_LIST_HEAD(&upper->search_list);
4366 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4367 struct net_device *upper_dev)
4369 struct netdev_upper *upper;
4371 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4372 if (upper->dev == upper_dev)
4379 * netdev_has_upper_dev - Check if device is linked to an upper device
4381 * @upper_dev: upper device to check
4383 * Find out if a device is linked to the specified upper device and return true
4384 * in case it is. Note that this checks only the immediate upper device,
4385 * not the complete stack of devices. The caller must hold the RTNL lock.
4387 bool netdev_has_upper_dev(struct net_device *dev,
4388 struct net_device *upper_dev)
4392 return __netdev_find_upper(dev, upper_dev);
4394 EXPORT_SYMBOL(netdev_has_upper_dev);
4397 * netdev_has_any_upper_dev - Check if device is linked to some device
4400 * Find out if a device is linked to an upper device and return true in case
4401 * it is. The caller must hold the RTNL lock.
4403 bool netdev_has_any_upper_dev(struct net_device *dev)
4407 return !list_empty(&dev->upper_dev_list);
4409 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4412 * netdev_master_upper_dev_get - Get master upper device
4415 * Find a master upper device and return pointer to it or NULL in case
4416 * it's not there. The caller must hold the RTNL lock.
4418 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4420 struct netdev_upper *upper;
4424 if (list_empty(&dev->upper_dev_list))
4427 upper = list_first_entry(&dev->upper_dev_list,
4428 struct netdev_upper, list);
4429 if (likely(upper->master))
4433 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4436 * netdev_master_upper_dev_get_rcu - Get master upper device
4439 * Find a master upper device and return pointer to it or NULL in case
4440 * it's not there. The caller must hold the RCU read lock.
4442 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4444 struct netdev_upper *upper;
4446 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4447 struct netdev_upper, list);
4448 if (upper && likely(upper->master))
4452 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4454 static int __netdev_upper_dev_link(struct net_device *dev,
4455 struct net_device *upper_dev, bool master)
4457 struct netdev_upper *upper;
4461 if (dev == upper_dev)
4464 /* To prevent loops, check that dev is not an upper device of upper_dev. */
4465 if (__netdev_search_upper_dev(upper_dev, dev))
4468 if (__netdev_find_upper(dev, upper_dev))
4471 if (master && netdev_master_upper_dev_get(dev))
4474 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4478 upper->dev = upper_dev;
4479 upper->master = master;
4480 INIT_LIST_HEAD(&upper->search_list);
4482 /* Ensure that master upper link is always the first item in list. */
4484 list_add_rcu(&upper->list, &dev->upper_dev_list);
4486 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4487 dev_hold(upper_dev);
4488 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4493 * netdev_upper_dev_link - Add a link to the upper device
4495 * @upper_dev: new upper device
4497 * Adds a link to a device which is upper to this one. The caller must hold
4498 * the RTNL lock. On a failure a negative errno code is returned.
4499 * On success the reference counts are adjusted and the function returns zero.
4502 int netdev_upper_dev_link(struct net_device *dev,
4503 struct net_device *upper_dev)
4505 return __netdev_upper_dev_link(dev, upper_dev, false);
4507 EXPORT_SYMBOL(netdev_upper_dev_link);
4510 * netdev_master_upper_dev_link - Add a master link to the upper device
4512 * @upper_dev: new upper device
4514 * Adds a link to device which is upper to this one. In this case, only
4515 * one master upper device can be linked, although other non-master devices
4516 * might be linked as well. The caller must hold the RTNL lock.
4517 * On a failure a negative errno code is returned. On success the reference
4518 * counts are adjusted and the function returns zero.
4520 int netdev_master_upper_dev_link(struct net_device *dev,
4521 struct net_device *upper_dev)
4523 return __netdev_upper_dev_link(dev, upper_dev, true);
4525 EXPORT_SYMBOL(netdev_master_upper_dev_link);
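
/*
 * Illustrative sketch only: how a bonding/team style driver would use the
 * upper-device helpers when taking over a slave.  The function name and the
 * "driver specific setup" step are hypothetical; the RTNL requirement and
 * the link/unlink calls follow the documentation above.
 */
static int example_enslave(struct net_device *master_dev,
			   struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();

	/* A port can only sit under a single master. */
	if (netdev_master_upper_dev_get(slave_dev))
		return -EBUSY;

	err = netdev_master_upper_dev_link(slave_dev, master_dev);
	if (err)
		return err;

	/* ... driver specific slave setup would go here; if it fails the
	 * driver must undo the link with
	 * netdev_upper_dev_unlink(slave_dev, master_dev).
	 */
	return 0;
}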
4528 * netdev_upper_dev_unlink - Removes a link to upper device
4530 * @upper_dev: upper device to unlink
4532 * Removes a link to a device which is upper to this one. The caller must hold
4533 * the RTNL lock.
4535 void netdev_upper_dev_unlink(struct net_device *dev,
4536 struct net_device *upper_dev)
4538 struct netdev_upper *upper;
4542 upper = __netdev_find_upper(dev, upper_dev);
4545 list_del_rcu(&upper->list);
4547 kfree_rcu(upper, rcu);
4548 call_netdevice_notifiers(NETDEV_CHANGEUPPER, dev);
4550 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4552 static void dev_change_rx_flags(struct net_device *dev, int flags)
4554 const struct net_device_ops *ops = dev->netdev_ops;
4556 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4557 ops->ndo_change_rx_flags(dev, flags);
4560 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4562 unsigned int old_flags = dev->flags;
4568 dev->flags |= IFF_PROMISC;
4569 dev->promiscuity += inc;
4570 if (dev->promiscuity == 0) {
4573 * If inc causes an overflow, leave promiscuity untouched and return an error.
4576 dev->flags &= ~IFF_PROMISC;
4578 dev->promiscuity -= inc;
4579 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4584 if (dev->flags != old_flags) {
4585 pr_info("device %s %s promiscuous mode\n",
4587 dev->flags & IFF_PROMISC ? "entered" : "left");
4588 if (audit_enabled) {
4589 current_uid_gid(&uid, &gid);
4590 audit_log(current->audit_context, GFP_ATOMIC,
4591 AUDIT_ANOM_PROMISCUOUS,
4592 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4593 dev->name, (dev->flags & IFF_PROMISC),
4594 (old_flags & IFF_PROMISC),
4595 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4596 from_kuid(&init_user_ns, uid),
4597 from_kgid(&init_user_ns, gid),
4598 audit_get_sessionid(current));
4601 dev_change_rx_flags(dev, IFF_PROMISC);
4607 * dev_set_promiscuity - update promiscuity count on a device
4611 * Add or remove promiscuity from a device. While the count in the device
4612 * remains above zero the interface remains promiscuous. Once it hits zero
4613 * the device reverts back to normal filtering operation. A negative inc
4614 * value is used to drop promiscuity on the device.
4615 * Return 0 if successful or a negative errno code on error.
4617 int dev_set_promiscuity(struct net_device *dev, int inc)
4619 unsigned int old_flags = dev->flags;
4622 err = __dev_set_promiscuity(dev, inc);
4625 if (dev->flags != old_flags)
4626 dev_set_rx_mode(dev);
4629 EXPORT_SYMBOL(dev_set_promiscuity);
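
/*
 * Illustrative sketch only: a packet-tap style user keeping the interface
 * promiscuous for as long as it is attached.  The attach/detach pairing is
 * the point: every +1 must eventually be balanced by a -1, otherwise the
 * device never leaves promiscuous mode.  Callers normally run under RTNL.
 */
static int example_tap_attach(struct net_device *dev)
{
	return dev_set_promiscuity(dev, 1);
}

static void example_tap_detach(struct net_device *dev)
{
	dev_set_promiscuity(dev, -1);
}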
4632 * dev_set_allmulti - update allmulti count on a device
4636 * Add or remove reception of all multicast frames to a device. While the
4637 * count in the device remains above zero the interface remains listening
4638 * for all multicast frames. Once it hits zero the device reverts back to normal
4639 * filtering operation. A negative @inc value is used to drop the counter
4640 * when releasing a resource needing all multicasts.
4641 * Return 0 if successful or a negative errno code on error.
4644 int dev_set_allmulti(struct net_device *dev, int inc)
4646 unsigned int old_flags = dev->flags;
4650 dev->flags |= IFF_ALLMULTI;
4651 dev->allmulti += inc;
4652 if (dev->allmulti == 0) {
4655 * If inc causes an overflow, leave allmulti untouched and return an error.
4658 dev->flags &= ~IFF_ALLMULTI;
4660 dev->allmulti -= inc;
4661 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4666 if (dev->flags ^ old_flags) {
4667 dev_change_rx_flags(dev, IFF_ALLMULTI);
4668 dev_set_rx_mode(dev);
4672 EXPORT_SYMBOL(dev_set_allmulti);
4675 * Upload unicast and multicast address lists to the device and
4676 * configure RX filtering. When the device doesn't support unicast
4677 * filtering it is put in promiscuous mode while unicast addresses are present.
4680 void __dev_set_rx_mode(struct net_device *dev)
4682 const struct net_device_ops *ops = dev->netdev_ops;
4684 /* dev_open will call this function so the list will stay sane. */
4685 if (!(dev->flags&IFF_UP))
4688 if (!netif_device_present(dev))
4691 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4692 /* Unicast address changes may only happen under the rtnl,
4693 * therefore calling __dev_set_promiscuity here is safe.
4695 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4696 __dev_set_promiscuity(dev, 1);
4697 dev->uc_promisc = true;
4698 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4699 __dev_set_promiscuity(dev, -1);
4700 dev->uc_promisc = false;
4704 if (ops->ndo_set_rx_mode)
4705 ops->ndo_set_rx_mode(dev);
4708 void dev_set_rx_mode(struct net_device *dev)
4710 netif_addr_lock_bh(dev);
4711 __dev_set_rx_mode(dev);
4712 netif_addr_unlock_bh(dev);
4716 * dev_get_flags - get flags reported to userspace
4719 * Get the combination of flag bits exported through APIs to userspace.
4721 unsigned int dev_get_flags(const struct net_device *dev)
4725 flags = (dev->flags & ~(IFF_PROMISC |
4730 (dev->gflags & (IFF_PROMISC |
4733 if (netif_running(dev)) {
4734 if (netif_oper_up(dev))
4735 flags |= IFF_RUNNING;
4736 if (netif_carrier_ok(dev))
4737 flags |= IFF_LOWER_UP;
4738 if (netif_dormant(dev))
4739 flags |= IFF_DORMANT;
4744 EXPORT_SYMBOL(dev_get_flags);
4746 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4748 unsigned int old_flags = dev->flags;
4754 * Set the flags on our device.
4757 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4758 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4760 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4764 * Load in the correct multicast list now that the flags have changed.
4767 if ((old_flags ^ flags) & IFF_MULTICAST)
4768 dev_change_rx_flags(dev, IFF_MULTICAST);
4770 dev_set_rx_mode(dev);
4773 * Have we downed the interface? We handle IFF_UP ourselves
4774 * according to user attempts to set it, rather than blindly setting it.
4779 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4780 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4783 dev_set_rx_mode(dev);
4786 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4787 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4789 dev->gflags ^= IFF_PROMISC;
4790 dev_set_promiscuity(dev, inc);
4793 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4794 is important. Some (broken) drivers set IFF_PROMISC when
4795 IFF_ALLMULTI is requested, without asking us and without reporting it.
4797 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4798 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4800 dev->gflags ^= IFF_ALLMULTI;
4801 dev_set_allmulti(dev, inc);
4807 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4809 unsigned int changes = dev->flags ^ old_flags;
4811 if (changes & IFF_UP) {
4812 if (dev->flags & IFF_UP)
4813 call_netdevice_notifiers(NETDEV_UP, dev);
4815 call_netdevice_notifiers(NETDEV_DOWN, dev);
4818 if (dev->flags & IFF_UP &&
4819 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
4820 struct netdev_notifier_change_info change_info;
4822 change_info.flags_changed = changes;
4823 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
4829 * dev_change_flags - change device settings
4831 * @flags: device state flags
4833 * Change settings on a device based on state flags. The flags are
4834 * in the format exported to userspace.
4836 int dev_change_flags(struct net_device *dev, unsigned int flags)
4839 unsigned int changes, old_flags = dev->flags;
4841 ret = __dev_change_flags(dev, flags);
4845 changes = old_flags ^ dev->flags;
4847 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4849 __dev_notify_flags(dev, old_flags);
4852 EXPORT_SYMBOL(dev_change_flags);
4855 * dev_set_mtu - Change maximum transfer unit
4857 * @new_mtu: new transfer unit
4859 * Change the maximum transfer size of the network device.
4861 int dev_set_mtu(struct net_device *dev, int new_mtu)
4863 const struct net_device_ops *ops = dev->netdev_ops;
4866 if (new_mtu == dev->mtu)
4869 /* MTU must be positive. */
4873 if (!netif_device_present(dev))
4877 if (ops->ndo_change_mtu)
4878 err = ops->ndo_change_mtu(dev, new_mtu);
4883 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4886 EXPORT_SYMBOL(dev_set_mtu);
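
/*
 * Illustrative sketch only: an in-kernel caller (a tunnel shrinking the MTU
 * for its encapsulation overhead, say) adjusting the MTU through the proper
 * helper so that driver validation and the NETDEV_CHANGEMTU notifier both
 * run.  The helper name and overhead handling are made up.
 */
static int example_shrink_mtu(struct net_device *dev, unsigned int overhead)
{
	ASSERT_RTNL();

	if (dev->mtu <= overhead)
		return -EINVAL;

	return dev_set_mtu(dev, dev->mtu - overhead);
}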
4889 * dev_set_group - Change group this device belongs to
4891 * @new_group: group this device should belong to
4893 void dev_set_group(struct net_device *dev, int new_group)
4895 dev->group = new_group;
4897 EXPORT_SYMBOL(dev_set_group);
4900 * dev_set_mac_address - Change Media Access Control Address
4904 * Change the hardware (MAC) address of the device
4906 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4908 const struct net_device_ops *ops = dev->netdev_ops;
4911 if (!ops->ndo_set_mac_address)
4913 if (sa->sa_family != dev->type)
4915 if (!netif_device_present(dev))
4917 err = ops->ndo_set_mac_address(dev, sa);
4920 dev->addr_assign_type = NET_ADDR_SET;
4921 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4922 add_device_randomness(dev->dev_addr, dev->addr_len);
4925 EXPORT_SYMBOL(dev_set_mac_address);
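/*
 * Illustrative sketch: programming a new hardware address from kernel code
 * under RTNL.  "new_mac" is a hypothetical ETH_ALEN-byte buffer; the address
 * family must match dev->type or -EINVAL is returned.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */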
4928 * dev_change_carrier - Change device carrier
4930 * @new_carrier: new value
4932 * Change device carrier
4934 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4936 const struct net_device_ops *ops = dev->netdev_ops;
4938 if (!ops->ndo_change_carrier)
4940 if (!netif_device_present(dev))
4942 return ops->ndo_change_carrier(dev, new_carrier);
4944 EXPORT_SYMBOL(dev_change_carrier);
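/*
 * Illustrative sketch: a minimal ndo_change_carrier implementation for a
 * purely software device, simply mirroring the requested state.  The "foo_"
 * name is hypothetical.
 *
 *	static int foo_change_carrier(struct net_device *dev, bool new_carrier)
 *	{
 *		if (new_carrier)
 *			netif_carrier_on(dev);
 *		else
 *			netif_carrier_off(dev);
 *		return 0;
 *	}
 */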
4947 * dev_new_index - allocate an ifindex
4948 * @net: the applicable net namespace
4950 * Returns a suitable unique value for a new device interface
4951 * number. The caller must hold the rtnl semaphore or the
4952 * dev_base_lock to be sure it remains unique.
4954 static int dev_new_index(struct net *net)
4956 int ifindex = net->ifindex;
4960 if (!__dev_get_by_index(net, ifindex))
4961 return net->ifindex = ifindex;
4965 /* Delayed registration/unregisteration */
4966 static LIST_HEAD(net_todo_list);
4968 static void net_set_todo(struct net_device *dev)
4970 list_add_tail(&dev->todo_list, &net_todo_list);
4973 static void rollback_registered_many(struct list_head *head)
4975 struct net_device *dev, *tmp;
4977 BUG_ON(dev_boot_phase);
4980 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4981 /* Some devices call this without ever having been registered,
4982 * as part of initialization unwind. Remove those
4983 * devices and proceed with the remaining ones.
4985 if (dev->reg_state == NETREG_UNINITIALIZED) {
4986 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4990 list_del(&dev->unreg_list);
4993 dev->dismantle = true;
4994 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4997 /* If device is running, close it first. */
4998 dev_close_many(head);
5000 list_for_each_entry(dev, head, unreg_list) {
5001 /* And unlink it from device chain. */
5002 unlist_netdevice(dev);
5004 dev->reg_state = NETREG_UNREGISTERING;
5009 list_for_each_entry(dev, head, unreg_list) {
5010 /* Shutdown queueing discipline. */
5014 /* Notify protocols that we are about to destroy
5015 this device. They should clean up all of their state.
5017 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5019 if (!dev->rtnl_link_ops ||
5020 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5021 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5024 * Flush the unicast and multicast chains
5029 if (dev->netdev_ops->ndo_uninit)
5030 dev->netdev_ops->ndo_uninit(dev);
5032 /* Notifier chain MUST detach us from all upper devices. */
5033 WARN_ON(netdev_has_any_upper_dev(dev));
5035 /* Remove entries from kobject tree */
5036 netdev_unregister_kobject(dev);
5038 /* Remove XPS queueing entries */
5039 netif_reset_xps_queues_gt(dev, 0);
5045 list_for_each_entry(dev, head, unreg_list)
5049 static void rollback_registered(struct net_device *dev)
5053 list_add(&dev->unreg_list, &single);
5054 rollback_registered_many(&single);
5058 static netdev_features_t netdev_fix_features(struct net_device *dev,
5059 netdev_features_t features)
5061 /* Fix illegal checksum combinations */
5062 if ((features & NETIF_F_HW_CSUM) &&
5063 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5064 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5065 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5068 /* TSO requires that SG is present as well. */
5069 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5070 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5071 features &= ~NETIF_F_ALL_TSO;
5074 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5075 !(features & NETIF_F_IP_CSUM)) {
5076 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5077 features &= ~NETIF_F_TSO;
5078 features &= ~NETIF_F_TSO_ECN;
5081 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5082 !(features & NETIF_F_IPV6_CSUM)) {
5083 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5084 features &= ~NETIF_F_TSO6;
5087 /* TSO ECN requires that TSO is present as well. */
5088 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5089 features &= ~NETIF_F_TSO_ECN;
5091 /* Software GSO depends on SG. */
5092 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5093 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5094 features &= ~NETIF_F_GSO;
5097 /* UFO needs SG and checksumming */
5098 if (features & NETIF_F_UFO) {
5099 /* maybe split UFO into V4 and V6? */
5100 if (!((features & NETIF_F_GEN_CSUM) ||
5101 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5102 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5104 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5105 features &= ~NETIF_F_UFO;
5108 if (!(features & NETIF_F_SG)) {
5110 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5111 features &= ~NETIF_F_UFO;
5118 int __netdev_update_features(struct net_device *dev)
5120 netdev_features_t features;
5125 features = netdev_get_wanted_features(dev);
5127 if (dev->netdev_ops->ndo_fix_features)
5128 features = dev->netdev_ops->ndo_fix_features(dev, features);
5130 /* driver might be less strict about feature dependencies */
5131 features = netdev_fix_features(dev, features);
5133 if (dev->features == features)
5136 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5137 &dev->features, &features);
5139 if (dev->netdev_ops->ndo_set_features)
5140 err = dev->netdev_ops->ndo_set_features(dev, features);
5142 if (unlikely(err < 0)) {
5144 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5145 err, &features, &dev->features);
5150 dev->features = features;
5156 * netdev_update_features - recalculate device features
5157 * @dev: the device to check
5159 * Recalculate the dev->features set and send notifications if it
5160 * has changed. Should be called after driver- or hardware-dependent
5161 * conditions that influence the features might have changed.
5163 void netdev_update_features(struct net_device *dev)
5165 if (__netdev_update_features(dev))
5166 netdev_features_change(dev);
5168 EXPORT_SYMBOL(netdev_update_features);
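/*
 * Illustrative sketch: a driver whose ndo_fix_features result depends on the
 * MTU can re-run feature negotiation from its ndo_change_mtu, which is
 * already called with RTNL held.  All "foo_" names are hypothetical.
 *
 *	static int foo_change_mtu(struct net_device *dev, int new_mtu)
 *	{
 *		dev->mtu = new_mtu;
 *		netdev_update_features(dev);
 *		return 0;
 *	}
 */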
5171 * netdev_change_features - recalculate device features
5172 * @dev: the device to check
5174 * Recalculate the dev->features set and send notifications even
5175 * if they have not changed. Should be called instead of
5176 * netdev_update_features() if dev->vlan_features might also have
5177 * changed, to allow the changes to be propagated to stacked VLAN devices.
5180 void netdev_change_features(struct net_device *dev)
5182 __netdev_update_features(dev);
5183 netdev_features_change(dev);
5185 EXPORT_SYMBOL(netdev_change_features);
5188 * netif_stacked_transfer_operstate - transfer operstate
5189 * @rootdev: the root or lower level device to transfer state from
5190 * @dev: the device to transfer operstate to
5192 * Transfer operational state from root to device. This is normally
5193 * called when a stacking relationship exists between the root
5194 * device and the device (a leaf device).
5196 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5197 struct net_device *dev)
5199 if (rootdev->operstate == IF_OPER_DORMANT)
5200 netif_dormant_on(dev);
5202 netif_dormant_off(dev);
5204 if (netif_carrier_ok(rootdev)) {
5205 if (!netif_carrier_ok(dev))
5206 netif_carrier_on(dev);
5208 if (netif_carrier_ok(dev))
5209 netif_carrier_off(dev);
5212 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
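/*
 * Illustrative sketch: a stacking driver (in the spirit of 8021q or macvlan)
 * mirroring its lower device's state from a netdevice notifier.  "lower_dev"
 * and "upper_dev" are hypothetical pointers.
 *
 *	case NETDEV_CHANGE:
 *		netif_stacked_transfer_operstate(lower_dev, upper_dev);
 *		break;
 */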
5215 static int netif_alloc_rx_queues(struct net_device *dev)
5217 unsigned int i, count = dev->num_rx_queues;
5218 struct netdev_rx_queue *rx;
5222 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5228 for (i = 0; i < count; i++)
5234 static void netdev_init_one_queue(struct net_device *dev,
5235 struct netdev_queue *queue, void *_unused)
5237 /* Initialize queue lock */
5238 spin_lock_init(&queue->_xmit_lock);
5239 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5240 queue->xmit_lock_owner = -1;
5241 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5244 dql_init(&queue->dql, HZ);
5248 static void netif_free_tx_queues(struct net_device *dev)
5250 if (is_vmalloc_addr(dev->_tx))
5256 static int netif_alloc_netdev_queues(struct net_device *dev)
5258 unsigned int count = dev->num_tx_queues;
5259 struct netdev_queue *tx;
5260 size_t sz = count * sizeof(*tx);
5262 BUG_ON(count < 1 || count > 0xffff);
5264 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
5272 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5273 spin_lock_init(&dev->tx_global_lock);
5279 * register_netdevice - register a network device
5280 * @dev: device to register
5282 * Take a completed network device structure and add it to the kernel
5283 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5284 * chain. 0 is returned on success. A negative errno code is returned
5285 * on a failure to set up the device, or if the name is a duplicate.
5287 * Callers must hold the rtnl semaphore. You may want
5288 * register_netdev() instead of this.
5291 * The locking appears insufficient to guarantee two parallel registers
5292 * will not get the same name.
5295 int register_netdevice(struct net_device *dev)
5298 struct net *net = dev_net(dev);
5300 BUG_ON(dev_boot_phase);
5305 /* When net_device's are persistent, this will be fatal. */
5306 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5309 spin_lock_init(&dev->addr_list_lock);
5310 netdev_set_addr_lockdep_class(dev);
5314 ret = dev_get_valid_name(net, dev, dev->name);
5318 /* Init, if this function is available */
5319 if (dev->netdev_ops->ndo_init) {
5320 ret = dev->netdev_ops->ndo_init(dev);
5328 if (((dev->hw_features | dev->features) &
5329 NETIF_F_HW_VLAN_CTAG_FILTER) &&
5330 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5331 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5332 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5339 dev->ifindex = dev_new_index(net);
5340 else if (__dev_get_by_index(net, dev->ifindex))
5343 if (dev->iflink == -1)
5344 dev->iflink = dev->ifindex;
5346 /* Transfer changeable features to wanted_features and enable
5347 * software offloads (GSO and GRO).
5349 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5350 dev->features |= NETIF_F_SOFT_FEATURES;
5351 dev->wanted_features = dev->features & dev->hw_features;
5353 /* Turn on no cache copy if HW is doing checksum */
5354 if (!(dev->flags & IFF_LOOPBACK)) {
5355 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5356 if (dev->features & NETIF_F_ALL_CSUM) {
5357 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5358 dev->features |= NETIF_F_NOCACHE_COPY;
5362 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5364 dev->vlan_features |= NETIF_F_HIGHDMA;
5366 /* Make NETIF_F_SG inheritable to tunnel devices.
5368 dev->hw_enc_features |= NETIF_F_SG;
5370 /* Make NETIF_F_SG inheritable to MPLS.
5372 dev->mpls_features |= NETIF_F_SG;
5374 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5375 ret = notifier_to_errno(ret);
5379 ret = netdev_register_kobject(dev);
5382 dev->reg_state = NETREG_REGISTERED;
5384 __netdev_update_features(dev);
5387 * Default initial state at registration is that the
5388 * device is present.
5391 set_bit(__LINK_STATE_PRESENT, &dev->state);
5393 linkwatch_init_dev(dev);
5395 dev_init_scheduler(dev);
5397 list_netdevice(dev);
5398 add_device_randomness(dev->dev_addr, dev->addr_len);
5400 /* If the device has a permanent device address, the driver should
5401 * set dev_addr and also leave addr_assign_type set to
5402 * NET_ADDR_PERM (the default value).
5404 if (dev->addr_assign_type == NET_ADDR_PERM)
5405 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5407 /* Notify protocols that a new device appeared. */
5408 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5409 ret = notifier_to_errno(ret);
5411 rollback_registered(dev);
5412 dev->reg_state = NETREG_UNREGISTERED;
5415 * Prevent userspace races by waiting until the network
5416 * device is fully set up before sending notifications.
5418 if (!dev->rtnl_link_ops ||
5419 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5420 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5426 if (dev->netdev_ops->ndo_uninit)
5427 dev->netdev_ops->ndo_uninit(dev);
5430 EXPORT_SYMBOL(register_netdevice);
5433 * init_dummy_netdev - init a dummy network device for NAPI
5434 * @dev: device to init
5436 * This takes a network device structure and initializes the minimum
5437 * number of fields so it can be used to schedule NAPI polls without
5438 * registering a full-blown interface. This is to be used by drivers
5439 * that need to tie several hardware interfaces to a single NAPI
5440 * poll scheduler due to HW limitations.
5442 int init_dummy_netdev(struct net_device *dev)
5444 /* Clear everything. Note we don't initialize spinlocks
5445 * as they aren't supposed to be taken by any of the
5446 * NAPI code and this dummy netdev is supposed to be
5447 * only ever used for NAPI polls
5449 memset(dev, 0, sizeof(struct net_device));
5451 /* make sure we BUG if trying to hit standard
5452 * register/unregister code path
5454 dev->reg_state = NETREG_DUMMY;
5456 /* NAPI wants this */
5457 INIT_LIST_HEAD(&dev->napi_list);
5459 /* a dummy interface is started by default */
5460 set_bit(__LINK_STATE_PRESENT, &dev->state);
5461 set_bit(__LINK_STATE_START, &dev->state);
5463 /* Note : We don't allocate pcpu_refcnt for dummy devices,
5464 * because users of this 'device' don't need to change its refcount.
5470 EXPORT_SYMBOL_GPL(init_dummy_netdev);
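/*
 * Illustrative sketch: a driver with one poll context shared by several
 * hardware ports can hang its NAPI instance off an embedded dummy netdev.
 * "priv" and "foo_poll" are hypothetical.
 *
 *	init_dummy_netdev(&priv->napi_dev);
 *	netif_napi_add(&priv->napi_dev, &priv->napi, foo_poll, 64);
 *	napi_enable(&priv->napi);
 */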
5474 * register_netdev - register a network device
5475 * @dev: device to register
5477 * Take a completed network device structure and add it to the kernel
5478 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5479 * chain. 0 is returned on success. A negative errno code is returned
5480 * on a failure to set up the device, or if the name is a duplicate.
5482 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5483 * and expands the device name if you passed a format string to alloc_netdev.
5486 int register_netdev(struct net_device *dev)
5491 err = register_netdevice(dev);
5495 EXPORT_SYMBOL(register_netdev);
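/*
 * Illustrative sketch: the usual probe-time sequence for an Ethernet driver.
 * "struct foo_priv" and "foo_netdev_ops" are hypothetical.
 *
 *	dev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!dev)
 *		return -ENOMEM;
 *	dev->netdev_ops = &foo_netdev_ops;
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */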
5497 int netdev_refcnt_read(const struct net_device *dev)
5501 for_each_possible_cpu(i)
5502 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5505 EXPORT_SYMBOL(netdev_refcnt_read);
5508 * netdev_wait_allrefs - wait until all references are gone.
5509 * @dev: target net_device
5511 * This is called when unregistering network devices.
5513 * Any protocol or device that holds a reference should register
5514 * for netdevice notification, and clean up and put back the
5515 * reference if they receive an UNREGISTER event.
5516 * We can get stuck here if buggy protocols don't correctly call dev_put.
5519 static void netdev_wait_allrefs(struct net_device *dev)
5521 unsigned long rebroadcast_time, warning_time;
5524 linkwatch_forget_dev(dev);
5526 rebroadcast_time = warning_time = jiffies;
5527 refcnt = netdev_refcnt_read(dev);
5529 while (refcnt != 0) {
5530 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5533 /* Rebroadcast unregister notification */
5534 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5540 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5541 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5543 /* We must not have linkwatch events
5544 * pending on unregister. If this
5545 * happens, we simply run the queue
5546 * unscheduled, resulting in a noop for this device.
5549 linkwatch_run_queue();
5554 rebroadcast_time = jiffies;
5559 refcnt = netdev_refcnt_read(dev);
5561 if (time_after(jiffies, warning_time + 10 * HZ)) {
5562 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5564 warning_time = jiffies;
5573 * register_netdevice(x1);
5574 * register_netdevice(x2);
5576 * unregister_netdevice(y1);
5577 * unregister_netdevice(y2);
5583 * We are invoked by rtnl_unlock().
5584 * This allows us to deal with problems:
5585 * 1) We can delete sysfs objects which invoke hotplug
5586 * without deadlocking with linkwatch via keventd.
5587 * 2) Since we run with the RTNL semaphore not held, we can sleep
5588 * safely in order to wait for the netdev refcnt to drop to zero.
5590 * We must not return until all unregister events added during
5591 * the interval the lock was held have been completed.
5593 void netdev_run_todo(void)
5595 struct list_head list;
5597 /* Snapshot list, allow later requests */
5598 list_replace_init(&net_todo_list, &list);
5603 /* Wait for rcu callbacks to finish before next phase */
5604 if (!list_empty(&list))
5607 while (!list_empty(&list)) {
5608 struct net_device *dev
5609 = list_first_entry(&list, struct net_device, todo_list);
5610 list_del(&dev->todo_list);
5613 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5616 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5617 pr_err("network todo '%s' but state %d\n",
5618 dev->name, dev->reg_state);
5623 dev->reg_state = NETREG_UNREGISTERED;
5625 on_each_cpu(flush_backlog, dev, 1);
5627 netdev_wait_allrefs(dev);
5630 BUG_ON(netdev_refcnt_read(dev));
5631 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5632 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5633 WARN_ON(dev->dn_ptr);
5635 if (dev->destructor)
5636 dev->destructor(dev);
5638 /* Free network device */
5639 kobject_put(&dev->dev.kobj);
5643 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5644 * fields in the same order, with only the type differing.
5646 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5647 const struct net_device_stats *netdev_stats)
5649 #if BITS_PER_LONG == 64
5650 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5651 memcpy(stats64, netdev_stats, sizeof(*stats64));
5653 size_t i, n = sizeof(*stats64) / sizeof(u64);
5654 const unsigned long *src = (const unsigned long *)netdev_stats;
5655 u64 *dst = (u64 *)stats64;
5657 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5658 sizeof(*stats64) / sizeof(u64));
5659 for (i = 0; i < n; i++)
5663 EXPORT_SYMBOL(netdev_stats_to_stats64);
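/*
 * Illustrative sketch: a driver that only maintains the legacy dev->stats
 * counters can still expose 64-bit statistics this way.  "foo_get_stats64"
 * is a hypothetical ndo_get_stats64 implementation.
 *
 *	static struct rtnl_link_stats64 *
 *	foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats)
 *	{
 *		netdev_stats_to_stats64(stats, &dev->stats);
 *		return stats;
 *	}
 */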
5666 * dev_get_stats - get network device statistics
5667 * @dev: device to get statistics from
5668 * @storage: place to store stats
5670 * Get network statistics from device. Return @storage.
5671 * The device driver may provide its own method by setting
5672 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
5673 * otherwise the internal statistics structure is used.
5675 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5676 struct rtnl_link_stats64 *storage)
5678 const struct net_device_ops *ops = dev->netdev_ops;
5680 if (ops->ndo_get_stats64) {
5681 memset(storage, 0, sizeof(*storage));
5682 ops->ndo_get_stats64(dev, storage);
5683 } else if (ops->ndo_get_stats) {
5684 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5686 netdev_stats_to_stats64(storage, &dev->stats);
5688 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5691 EXPORT_SYMBOL(dev_get_stats);
5693 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5695 struct netdev_queue *queue = dev_ingress_queue(dev);
5697 #ifdef CONFIG_NET_CLS_ACT
5700 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5703 netdev_init_one_queue(dev, queue, NULL);
5704 queue->qdisc = &noop_qdisc;
5705 queue->qdisc_sleeping = &noop_qdisc;
5706 rcu_assign_pointer(dev->ingress_queue, queue);
5711 static const struct ethtool_ops default_ethtool_ops;
5713 void netdev_set_default_ethtool_ops(struct net_device *dev,
5714 const struct ethtool_ops *ops)
5716 if (dev->ethtool_ops == &default_ethtool_ops)
5717 dev->ethtool_ops = ops;
5719 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5722 * alloc_netdev_mqs - allocate network device
5723 * @sizeof_priv: size of private data to allocate space for
5724 * @name: device name format string
5725 * @setup: callback to initialize device
5726 * @txqs: the number of TX subqueues to allocate
5727 * @rxqs: the number of RX subqueues to allocate
5729 * Allocates a struct net_device with private data area for driver use
5730 * and performs basic initialization. Also allocates subqueue structs
5731 * for each queue on the device.
5733 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5734 void (*setup)(struct net_device *),
5735 unsigned int txqs, unsigned int rxqs)
5737 struct net_device *dev;
5739 struct net_device *p;
5741 BUG_ON(strlen(name) >= sizeof(dev->name));
5744 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5750 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5755 alloc_size = sizeof(struct net_device);
5757 /* ensure 32-byte alignment of private area */
5758 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5759 alloc_size += sizeof_priv;
5761 /* ensure 32-byte alignment of whole construct */
5762 alloc_size += NETDEV_ALIGN - 1;
5764 p = kzalloc(alloc_size, GFP_KERNEL);
5768 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5769 dev->padded = (char *)dev - (char *)p;
5771 dev->pcpu_refcnt = alloc_percpu(int);
5772 if (!dev->pcpu_refcnt)
5775 if (dev_addr_init(dev))
5781 dev_net_set(dev, &init_net);
5783 dev->gso_max_size = GSO_MAX_SIZE;
5784 dev->gso_max_segs = GSO_MAX_SEGS;
5786 INIT_LIST_HEAD(&dev->napi_list);
5787 INIT_LIST_HEAD(&dev->unreg_list);
5788 INIT_LIST_HEAD(&dev->link_watch_list);
5789 INIT_LIST_HEAD(&dev->upper_dev_list);
5790 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5793 dev->num_tx_queues = txqs;
5794 dev->real_num_tx_queues = txqs;
5795 if (netif_alloc_netdev_queues(dev))
5799 dev->num_rx_queues = rxqs;
5800 dev->real_num_rx_queues = rxqs;
5801 if (netif_alloc_rx_queues(dev))
5805 strcpy(dev->name, name);
5806 dev->group = INIT_NETDEV_GROUP;
5807 if (!dev->ethtool_ops)
5808 dev->ethtool_ops = &default_ethtool_ops;
5816 free_percpu(dev->pcpu_refcnt);
5817 netif_free_tx_queues(dev);
5826 EXPORT_SYMBOL(alloc_netdev_mqs);
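/*
 * Illustrative sketch: allocating a multiqueue device with 8 TX and 8 RX
 * queues.  "foo_setup" is a hypothetical setup callback that fills in
 * dev->netdev_ops and friends; "struct foo_priv" is the driver's private data.
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d", foo_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */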
5829 * free_netdev - free network device
5832 * This function does the last stage of destroying an allocated device
5833 * interface. The reference to the device object is released.
5834 * If this is the last reference then it will be freed.
5836 void free_netdev(struct net_device *dev)
5838 struct napi_struct *p, *n;
5840 release_net(dev_net(dev));
5842 netif_free_tx_queues(dev);
5847 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5849 /* Flush device addresses */
5850 dev_addr_flush(dev);
5852 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5855 free_percpu(dev->pcpu_refcnt);
5856 dev->pcpu_refcnt = NULL;
5858 /* Compatibility with error handling in drivers */
5859 if (dev->reg_state == NETREG_UNINITIALIZED) {
5860 kfree((char *)dev - dev->padded);
5864 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5865 dev->reg_state = NETREG_RELEASED;
5867 /* will free via device release */
5868 put_device(&dev->dev);
5870 EXPORT_SYMBOL(free_netdev);
5873 * synchronize_net - Synchronize with packet receive processing
5875 * Wait for packets currently being received to be done.
5876 * Does not block later packets from starting.
5878 void synchronize_net(void)
5881 if (rtnl_is_locked())
5882 synchronize_rcu_expedited();
5886 EXPORT_SYMBOL(synchronize_net);
5889 * unregister_netdevice_queue - remove device from the kernel
5893 * This function shuts down a device interface and removes it
5894 * from the kernel tables.
5895 * If head is not NULL, the device is queued to be unregistered later.
5897 * Callers must hold the rtnl semaphore. You may want
5898 * unregister_netdev() instead of this.
5901 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5906 list_move_tail(&dev->unreg_list, head);
5908 rollback_registered(dev);
5909 /* Finish processing unregister after unlock */
5913 EXPORT_SYMBOL(unregister_netdevice_queue);
5916 * unregister_netdevice_many - unregister many devices
5917 * @head: list of devices
5919 void unregister_netdevice_many(struct list_head *head)
5921 struct net_device *dev;
5923 if (!list_empty(head)) {
5924 rollback_registered_many(head);
5925 list_for_each_entry(dev, head, unreg_list)
5929 EXPORT_SYMBOL(unregister_netdevice_many);
5932 * unregister_netdev - remove device from the kernel
5935 * This function shuts down a device interface and removes it
5936 * from the kernel tables.
5938 * This is just a wrapper for unregister_netdevice that takes
5939 * the rtnl semaphore. In general you want to use this and not
5940 * unregister_netdevice.
5942 void unregister_netdev(struct net_device *dev)
5945 unregister_netdevice(dev);
5948 EXPORT_SYMBOL(unregister_netdev);
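/*
 * Illustrative sketch: the usual driver remove path, mirroring the probe
 * sequence sketched earlier.  unregister_netdev() takes RTNL itself, so it
 * must not be called with RTNL already held.
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */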
5951 * dev_change_net_namespace - move device to a different network namespace
5953 * @net: network namespace
5954 * @pat: If not NULL, name pattern to try if the current device name
5955 * is already taken in the destination network namespace.
5957 * This function shuts down a device interface and moves it
5958 * to a new network namespace. On success 0 is returned, on
5959 * a failure a negative errno code is returned.
5961 * Callers must hold the rtnl semaphore.
5964 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5970 /* Don't allow namespace local devices to be moved. */
5972 if (dev->features & NETIF_F_NETNS_LOCAL)
5975 /* Ensure the device has been registered */
5976 if (dev->reg_state != NETREG_REGISTERED)
5979 /* Get out if there is nothing to do */
5981 if (net_eq(dev_net(dev), net))
5984 /* Pick the destination device name, and ensure
5985 * we can use it in the destination network namespace.
5988 if (__dev_get_by_name(net, dev->name)) {
5989 /* We get here if we can't use the current device name */
5992 if (dev_get_valid_name(net, dev, pat) < 0)
5997 * And now do a mini version of register_netdevice and unregister_netdevice.
6000 /* If device is running close it first. */
6003 /* And unlink it from device chain */
6005 unlist_netdevice(dev);
6009 /* Shutdown queueing discipline. */
6012 /* Notify protocols that we are about to destroy
6013 this device. They should clean up all of their state.
6015 Note that dev->reg_state stays at NETREG_REGISTERED.
6016 This is wanted so that 8021q and macvlan know
6017 the device is just moving and can keep their slaves up.
6019 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6021 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6022 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6025 * Flush the unicast and multicast chains
6030 /* Send a netdev-removed uevent to the old namespace */
6031 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6033 /* Actually switch the network namespace */
6034 dev_net_set(dev, net);
6036 /* If there is an ifindex conflict assign a new one */
6037 if (__dev_get_by_index(net, dev->ifindex)) {
6038 int iflink = (dev->iflink == dev->ifindex);
6039 dev->ifindex = dev_new_index(net);
6041 dev->iflink = dev->ifindex;
6044 /* Send a netdev-add uevent to the new namespace */
6045 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6047 /* Fixup kobjects */
6048 err = device_rename(&dev->dev, dev->name);
6051 /* Add the device back in the hashes */
6052 list_netdevice(dev);
6054 /* Notify protocols that a new device appeared. */
6055 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6058 * Prevent userspace races by waiting until the network
6059 * device is fully set up before sending notifications.
6061 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6068 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6070 static int dev_cpu_callback(struct notifier_block *nfb,
6071 unsigned long action,
6074 struct sk_buff **list_skb;
6075 struct sk_buff *skb;
6076 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6077 struct softnet_data *sd, *oldsd;
6079 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6082 local_irq_disable();
6083 cpu = smp_processor_id();
6084 sd = &per_cpu(softnet_data, cpu);
6085 oldsd = &per_cpu(softnet_data, oldcpu);
6087 /* Find end of our completion_queue. */
6088 list_skb = &sd->completion_queue;
6090 list_skb = &(*list_skb)->next;
6091 /* Append completion queue from offline CPU. */
6092 *list_skb = oldsd->completion_queue;
6093 oldsd->completion_queue = NULL;
6095 /* Append output queue from offline CPU. */
6096 if (oldsd->output_queue) {
6097 *sd->output_queue_tailp = oldsd->output_queue;
6098 sd->output_queue_tailp = oldsd->output_queue_tailp;
6099 oldsd->output_queue = NULL;
6100 oldsd->output_queue_tailp = &oldsd->output_queue;
6102 /* Append NAPI poll list from offline CPU. */
6103 if (!list_empty(&oldsd->poll_list)) {
6104 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6105 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6108 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6111 /* Process offline CPU's input_pkt_queue */
6112 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6114 input_queue_head_incr(oldsd);
6116 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6118 input_queue_head_incr(oldsd);
6126 * netdev_increment_features - increment feature set by one
6127 * @all: current feature set
6128 * @one: new feature set
6129 * @mask: mask feature set
6131 * Computes a new feature set after adding a device with feature set
6132 * @one to the master device with current feature set @all. Will not
6133 * enable anything that is off in @mask. Returns the new feature set.
6135 netdev_features_t netdev_increment_features(netdev_features_t all,
6136 netdev_features_t one, netdev_features_t mask)
6138 if (mask & NETIF_F_GEN_CSUM)
6139 mask |= NETIF_F_ALL_CSUM;
6140 mask |= NETIF_F_VLAN_CHALLENGED;
6142 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6143 all &= one | ~NETIF_F_ALL_FOR_ALL;
6145 /* If one device supports hw checksumming, set for all. */
6146 if (all & NETIF_F_GEN_CSUM)
6147 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6151 EXPORT_SYMBOL(netdev_increment_features);
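/*
 * Illustrative sketch: a bonding-style master recomputing its feature set
 * from its slaves.  The "bond"/"slave" structures, the list iteration, the
 * starting value and the NETIF_F_ONE_FOR_ALL mask choice are simplified
 * assumptions, not taken from any particular driver.
 *
 *	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
 *
 *	list_for_each_entry(slave, &bond->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     NETIF_F_ONE_FOR_ALL);
 */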
6153 static struct hlist_head * __net_init netdev_create_hash(void)
6156 struct hlist_head *hash;
6158 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6160 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6161 INIT_HLIST_HEAD(&hash[i]);
6166 /* Initialize per network namespace state */
6167 static int __net_init netdev_init(struct net *net)
6169 if (net != &init_net)
6170 INIT_LIST_HEAD(&net->dev_base_head);
6172 net->dev_name_head = netdev_create_hash();
6173 if (net->dev_name_head == NULL)
6176 net->dev_index_head = netdev_create_hash();
6177 if (net->dev_index_head == NULL)
6183 kfree(net->dev_name_head);
6189 * netdev_drivername - network driver for the device
6190 * @dev: network device
6192 * Determine network driver for device.
6194 const char *netdev_drivername(const struct net_device *dev)
6196 const struct device_driver *driver;
6197 const struct device *parent;
6198 const char *empty = "";
6200 parent = dev->dev.parent;
6204 driver = parent->driver;
6205 if (driver && driver->name)
6206 return driver->name;
6210 static int __netdev_printk(const char *level, const struct net_device *dev,
6211 struct va_format *vaf)
6215 if (dev && dev->dev.parent) {
6216 r = dev_printk_emit(level[1] - '0',
6219 dev_driver_string(dev->dev.parent),
6220 dev_name(dev->dev.parent),
6221 netdev_name(dev), vaf);
6223 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6225 r = printk("%s(NULL net_device): %pV", level, vaf);
6231 int netdev_printk(const char *level, const struct net_device *dev,
6232 const char *format, ...)
6234 struct va_format vaf;
6238 va_start(args, format);
6243 r = __netdev_printk(level, dev, &vaf);
6249 EXPORT_SYMBOL(netdev_printk);
6251 #define define_netdev_printk_level(func, level) \
6252 int func(const struct net_device *dev, const char *fmt, ...) \
6255 struct va_format vaf; \
6258 va_start(args, fmt); \
6263 r = __netdev_printk(level, dev, &vaf); \
6269 EXPORT_SYMBOL(func);
6271 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6272 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6273 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6274 define_netdev_printk_level(netdev_err, KERN_ERR);
6275 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6276 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6277 define_netdev_printk_level(netdev_info, KERN_INFO);
6279 static void __net_exit netdev_exit(struct net *net)
6281 kfree(net->dev_name_head);
6282 kfree(net->dev_index_head);
6285 static struct pernet_operations __net_initdata netdev_net_ops = {
6286 .init = netdev_init,
6287 .exit = netdev_exit,
6290 static void __net_exit default_device_exit(struct net *net)
6292 struct net_device *dev, *aux;
6294 * Push all migratable network devices back to the
6295 * initial network namespace
6298 for_each_netdev_safe(net, dev, aux) {
6300 char fb_name[IFNAMSIZ];
6302 /* Ignore unmoveable devices (i.e. loopback) */
6303 if (dev->features & NETIF_F_NETNS_LOCAL)
6306 /* Leave virtual devices for the generic cleanup */
6307 if (dev->rtnl_link_ops)
6310 /* Push remaining network devices to init_net */
6311 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6312 err = dev_change_net_namespace(dev, &init_net, fb_name);
6314 pr_emerg("%s: failed to move %s to init_net: %d\n",
6315 __func__, dev->name, err);
6322 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6324 /* At exit all network devices must be removed from a network
6325 * namespace. Do this in the reverse order of registration.
6326 * Do this across as many network namespaces as possible to
6327 * improve batching efficiency.
6329 struct net_device *dev;
6331 LIST_HEAD(dev_kill_list);
6334 list_for_each_entry(net, net_list, exit_list) {
6335 for_each_netdev_reverse(net, dev) {
6336 if (dev->rtnl_link_ops)
6337 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6339 unregister_netdevice_queue(dev, &dev_kill_list);
6342 unregister_netdevice_many(&dev_kill_list);
6343 list_del(&dev_kill_list);
6347 static struct pernet_operations __net_initdata default_device_ops = {
6348 .exit = default_device_exit,
6349 .exit_batch = default_device_exit_batch,
6353 * Initialize the DEV module. At boot time this walks the device list and
6354 * unhooks any devices that fail to initialise (normally hardware not
6355 * present) and leaves us with a valid list of present and active devices.
6360 * This is called single threaded during boot, so no need
6361 * to take the rtnl semaphore.
6363 static int __init net_dev_init(void)
6365 int i, rc = -ENOMEM;
6367 BUG_ON(!dev_boot_phase);
6369 if (dev_proc_init())
6372 if (netdev_kobject_init())
6375 INIT_LIST_HEAD(&ptype_all);
6376 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6377 INIT_LIST_HEAD(&ptype_base[i]);
6379 INIT_LIST_HEAD(&offload_base);
6381 if (register_pernet_subsys(&netdev_net_ops))
6385 * Initialise the packet receive queues.
6388 for_each_possible_cpu(i) {
6389 struct softnet_data *sd = &per_cpu(softnet_data, i);
6391 memset(sd, 0, sizeof(*sd));
6392 skb_queue_head_init(&sd->input_pkt_queue);
6393 skb_queue_head_init(&sd->process_queue);
6394 sd->completion_queue = NULL;
6395 INIT_LIST_HEAD(&sd->poll_list);
6396 sd->output_queue = NULL;
6397 sd->output_queue_tailp = &sd->output_queue;
6399 sd->csd.func = rps_trigger_softirq;
6405 sd->backlog.poll = process_backlog;
6406 sd->backlog.weight = weight_p;
6407 sd->backlog.gro_list = NULL;
6408 sd->backlog.gro_count = 0;
6410 #ifdef CONFIG_NET_FLOW_LIMIT
6411 sd->flow_limit = NULL;
6417 /* The loopback device is special: if any other network device
6418 * is present in a network namespace, the loopback device must
6419 * be present. Since we now dynamically allocate and free the
6420 * loopback device, ensure this invariant is maintained by
6421 * keeping the loopback device as the first device on the
6422 * list of network devices. This ensures the loopback device
6423 * is the first device that appears and the last network device that disappears.
6426 if (register_pernet_device(&loopback_net_ops))
6429 if (register_pernet_device(&default_device_ops))
6432 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6433 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6435 hotcpu_notifier(dev_cpu_callback, 0);
6442 subsys_initcall(net_dev_init);