/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>
#include <linux/err.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                       IP_TNL_HASH_BITS);
}

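/* Per-CPU cached-route handling (descriptive note, added for clarity):
 * each tunnel keeps one struct ip_tunnel_dst per CPU.  Writers take a
 * reference with dst_clone() and publish the new entry with xchg(),
 * releasing the old one; readers fetch the entry under RCU and
 * revalidate it through dst->ops->check() before use.  Uncacheable
 * routes (DST_NOCACHE) are never stored.
 */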
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
                             struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        if (dst) {
                if (dst->flags & DST_NOCACHE)
                        dst = NULL;
                else
                        dst_clone(dst);
        }
        old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
        dst_release(old_dst);
}

static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
        __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static void tunnel_dst_reset(struct ip_tunnel *t)
{
        tunnel_dst_set(t, NULL);
}

static void tunnel_dst_reset_all(struct ip_tunnel *t)
{
        int i;

        for_each_possible_cpu(i)
                __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}

static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
        if (dst) {
                if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                        rcu_read_unlock();
                        tunnel_dst_reset(t);
                        return NULL;
                }
                dst_hold(dst);
        }
        rcu_read_unlock();
        return (struct rtable *)dst;
}

/* Often-modified stats are per-CPU; others are shared (netdev->stats). */
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
                                                struct rtnl_link_stats64 *tot)
{
        int i;

        for_each_possible_cpu(i) {
                const struct pcpu_sw_netstats *tstats =
                                                   per_cpu_ptr(dev->tstats, i);
                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
                unsigned int start;

                do {
                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
                        rx_packets = tstats->rx_packets;
                        tx_packets = tstats->tx_packets;
                        rx_bytes = tstats->rx_bytes;
                        tx_bytes = tstats->tx_bytes;
                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

                tot->rx_packets += rx_packets;
                tot->tx_packets += tx_packets;
                tot->rx_bytes   += rx_bytes;
                tot->tx_bytes   += tx_bytes;
        }

        tot->multicast = dev->stats.multicast;

        tot->rx_crc_errors = dev->stats.rx_crc_errors;
        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
        tot->rx_length_errors = dev->stats.rx_length_errors;
        tot->rx_frame_errors = dev->stats.rx_frame_errors;
        tot->rx_errors = dev->stats.rx_errors;

        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
        tot->tx_dropped = dev->stats.tx_dropped;
        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
        tot->tx_errors = dev->stats.tx_errors;

        tot->collisions  = dev->stats.collisions;

        return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);
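/* Hedged usage sketch (added; not part of this file): a driver built on
 * these helpers typically plugs the function above straight into its
 * netdev ops, following the ip_gre.c pattern; ipgre_xmit here stands in
 * for the driver's own transmit handler.
 */
#if 0	/* illustrative only */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_start_xmit	 = ipgre_xmit,
	.ndo_get_stats64 = ip_tunnel_get_stats64,
};
#endif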

static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}
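/* Added note: the match above requires both sides to agree on keying --
 * a TUNNEL_KEY tunnel accepts only packets carrying an equal i_key, and
 * a keyless tunnel accepts only keyless packets.
 */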

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matched by a configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for input.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr &&
                     (local != t->parms.iph.daddr ||
                      !ipv4_is_multicast(local))) ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        if (cand)
                return cand;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
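/* Hedged usage sketch (added; not part of this file): a receive path such
 * as GRE's would typically resolve the tunnel like this; the names itn,
 * tpi and iph follow the conventions of callers like ip_gre.c.
 */
#if 0	/* illustrative only */
	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
				  iph->saddr, iph->daddr, tpi->key);
	if (tunnel)
		return ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
#endif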

static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        h = ip_tunnel_hash(parms->i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
        hlist_del_init_rcu(&t->hash_node);
}

static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    key == t->parms.i_key &&
                    link == t->parms.link &&
                    type == t->dev->type)
                        break;
        }
        return t;
}
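/* Added note: unlike ip_tunnel_lookup() above, which serves the RX fast
 * path and falls back to wildcard and fallback-device matches,
 * ip_tunnel_find() demands an exact (saddr, daddr, key, link, type)
 * match and is used by the management paths (ioctl and netlink) below.
 */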

static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        if (parms->name[0])
                strlcpy(name, parms->name, IFNAMSIZ);
        else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
                        err = -E2BIG;
                        goto failed;
                }
                strlcpy(name, ops->kind, IFNAMSIZ);
                strncat(name, "%d", 2);
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}
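/* Added note: when no name is given, the "<kind>%d" template built above
 * is expanded to a free index (e.g. "gre0", "gre1") by dev_alloc_name()
 * when the device is registered via register_netdevice().
 */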

static inline void init_tunnel_flow(struct flowi4 *fl4,
                                    int proto,
                                    __be32 daddr, __be32 saddr,
                                    __be32 key, __u8 tos, int oif)
{
        memset(fl4, 0, sizeof(*fl4));
        fl4->flowi4_oif = oif;
        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->flowi4_tos = tos;
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
                                 iph->saddr, tunnel->parms.o_key,
                                 RT_TOS(iph->tos), tunnel->parms.link);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        tunnel_dst_set(tunnel, &rt->dst);
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < 68)
                mtu = 68;

        return mtu;
}
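/* Worked example (added; assumes a keyless GRE tunnel over plain
 * 1500-byte Ethernet): tunnel->hlen is 4 (base GRE header), so
 * t_hlen = 4 + 20 = 24 and the device MTU becomes 1500 - 24 = 1476,
 * the classic GRE MTU.  The result is clamped to at least 68, the
 * minimum IPv4 MTU.
 */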

static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt, *fbt;
        struct net_device *dev;

        BUG_ON(!itn->fb_tunnel_dev);
        fbt = netdev_priv(itn->fb_tunnel_dev);
        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return NULL;

        dev->mtu = ip_tunnel_bind_dev(dev);

        nt = netdev_priv(dev);
        ip_tunnel_add(itn, nt);
        return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
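/* Added notes on the checks above: the sequence test uses serial-number
 * arithmetic -- (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0 -- so it
 * stays correct across 32-bit wraparound.  IP_ECN_decapsulate() returns
 * a value greater than 1 only when the packet must be dropped (outer
 * header marked CE while the inner packet is not ECN-capable).
 */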

static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (df & htons(IP_DF)) && mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}
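/* Added note: with DF set, the usable payload is the path MTU minus the
 * outer IP header and tunnel header; oversized non-GSO packets are
 * bounced back with ICMP Fragmentation Needed (or ICMPv6 Packet Too Big)
 * carrying the computed MTU, and -E2BIG tells the caller to drop.
 */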

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, const u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt;              /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected = true;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (skb_dst(skb) == NULL) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (neigh == NULL)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        tunnel_dst_set(tunnel, &rt->dst);
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len;
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                dev->stats.tx_dropped++;
                kfree_skb(skb);
                return;
        }

        err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
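/* Added summary of the transmit path above: resolve the outer destination
 * (using the inner headers for NBMA tunnels), fetch or build the cached
 * route, enforce PMTU, inherit tos/ttl/df from the inner packet where the
 * tunnel is configured to do so, ensure headroom, then hand the packet to
 * iptunnel_xmit() for encapsulation.
 */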

static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        ip_tunnel_del(t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        tunnel_dst_reset_all(t);
        netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                if (dev == itn->fb_tunnel_dev)
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags&TUNNEL_KEY))
                        p->i_key = 0;
                if (!(p->o_flags&TUNNEL_KEY))
                        p->o_key = 0;

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (!t && (cmd == SIOCADDTUNNEL))
                        t = ip_tunnel_create(net, itn, p);

                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (t == NULL)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
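/* Hedged userspace sketch (added; not from this file): tools like
 * iproute2 drive this handler by passing a struct ip_tunnel_parm through
 * the ifreq data pointer, roughly as follows.
 */
#if 0	/* illustrative only */
	struct ip_tunnel_parm p = { };
	struct ifreq ifr = { };

	strncpy(ifr.ifr_name, "gre1", IFNAMSIZ);
	ifr.ifr_ifru.ifru_data = (void *)&p;
	if (ioctl(fd, SIOCGETTUNNEL, &ifr) == 0)
		;	/* p.iph.saddr / p.iph.daddr now hold the endpoints */
#endif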

int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        if (new_mtu < 68 ||
            new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
                return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
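/* Added note on the bounds above: 68 is the minimum IPv4 MTU (RFC 791),
 * and 0xFFF8 (65528, the largest multiple of 8 below 64 KiB) keeps the
 * encapsulated datagram within the maximum IPv4 packet size.
 */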

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        free_percpu(tunnel->dst_cache);
        free_percpu(dev->tstats);
        free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
                                  struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to be moved to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
        }
        rtnl_unlock();

        return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
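/* Hedged registration sketch (added): a tunnel module would typically
 * call this from its pernet init, along the lines of the ip_gre.c
 * pattern; ipgre_net_id, ipgre_link_ops and ipgre_exit_net are the
 * driver's own symbols, assumed here for illustration.
 */
#if 0	/* illustrative only */
static int __net_init ipgre_init_net(struct net *net)
{
	return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL);
}

static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ip_tunnel_net),
};
#endif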

static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (ip_tunnel_find(itn, p, dev->type))
                return -EEXIST;

        nt->net = net;
        nt->parms = *p;
        err = register_netdevice(dev);
        if (err)
                goto out;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        ip_tunnel_add(itn, nt);

out:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
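/* Hedged wiring sketch (added): drivers expose the two helpers above
 * through their rtnl_link_ops; abridged from the ip_gre.c pattern, where
 * ipgre_newlink/ipgre_changelink parse the netlink attributes into a
 * struct ip_tunnel_parm and then call ip_tunnel_newlink() and
 * ip_tunnel_changelink() respectively.
 */
#if 0	/* illustrative only */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.dellink	= ip_tunnel_dellink,
};
#endif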

int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int i, err;

        dev->destructor = ip_tunnel_dev_free;
        dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                struct pcpu_sw_netstats *ipt_stats;
                ipt_stats = per_cpu_ptr(dev->tstats, i);
                u64_stats_init(&ipt_stats->syncp);
        }

        tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
        if (!tunnel->dst_cache) {
                free_percpu(dev->tstats);
                return -ENOMEM;
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                free_percpu(tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(netdev_priv(dev));

        tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the tunnel_init call */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");