/*
 * Copyright (c) 2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/rculist.h>

#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>

#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#endif

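/* Hash tunnels on (key, remote address) into the per-netns lookup table. */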
static unsigned int ip_tunnel_hash(struct ip_tunnel_net *itn,
                                   __be32 key, __be32 remote)
{
        return hash_32((__force u32)key ^ (__force u32)remote,
                         IP_TNL_HASH_BITS);
}

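/* Replace the cached route under idst->lock; uncacheable (DST_NOCACHE)
 * entries are stored as NULL so they are never handed out again.
 */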
static inline void __tunnel_dst_set(struct ip_tunnel_dst *idst,
                                    struct dst_entry *dst)
{
        struct dst_entry *old_dst;

        if (dst && (dst->flags & DST_NOCACHE))
                dst = NULL;

        spin_lock_bh(&idst->lock);
        old_dst = rcu_dereference(idst->dst);
        rcu_assign_pointer(idst->dst, dst);
        dst_release(old_dst);
        spin_unlock_bh(&idst->lock);
}

static inline void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
{
        __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
}

static inline void tunnel_dst_reset(struct ip_tunnel *t)
{
        tunnel_dst_set(t, NULL);
}

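/* Drop the cached route on every possible CPU, e.g. when the tunnel
 * parameters change and the cached routes may have become stale.
 */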
static void tunnel_dst_reset_all(struct ip_tunnel *t)
{
        int i;

        for_each_possible_cpu(i)
                __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
}

static inline struct dst_entry *tunnel_dst_get(struct ip_tunnel *t)
{
        struct dst_entry *dst;

        rcu_read_lock();
        dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
        if (dst)
                dst_hold(dst);
        rcu_read_unlock();
        return dst;
}

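/* Return the cached route with a reference held, or NULL (resetting the
 * cache) if the entry is obsolete and fails dst->ops->check().
 */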
static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie)
{
        struct dst_entry *dst = tunnel_dst_get(t);

        if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
                tunnel_dst_reset(t);
                return NULL;
        }

        return dst;
}

/* Often modified stats are per-cpu; others are shared (netdev->stats). */
struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
                                                struct rtnl_link_stats64 *tot)
{
        int i;

        for_each_possible_cpu(i) {
                const struct pcpu_sw_netstats *tstats =
                                                   per_cpu_ptr(dev->tstats, i);
                u64 rx_packets, rx_bytes, tx_packets, tx_bytes;
                unsigned int start;

                do {
                        start = u64_stats_fetch_begin_bh(&tstats->syncp);
                        rx_packets = tstats->rx_packets;
                        tx_packets = tstats->tx_packets;
                        rx_bytes = tstats->rx_bytes;
                        tx_bytes = tstats->tx_bytes;
                } while (u64_stats_fetch_retry_bh(&tstats->syncp, start));

                tot->rx_packets += rx_packets;
                tot->tx_packets += tx_packets;
                tot->rx_bytes   += rx_bytes;
                tot->tx_bytes   += tx_bytes;
        }

        tot->multicast = dev->stats.multicast;

        tot->rx_crc_errors = dev->stats.rx_crc_errors;
        tot->rx_fifo_errors = dev->stats.rx_fifo_errors;
        tot->rx_length_errors = dev->stats.rx_length_errors;
        tot->rx_frame_errors = dev->stats.rx_frame_errors;
        tot->rx_errors = dev->stats.rx_errors;

        tot->tx_fifo_errors = dev->stats.tx_fifo_errors;
        tot->tx_carrier_errors = dev->stats.tx_carrier_errors;
        tot->tx_dropped = dev->stats.tx_dropped;
        tot->tx_aborted_errors = dev->stats.tx_aborted_errors;
        tot->tx_errors = dev->stats.tx_errors;

        tot->collisions  = dev->stats.collisions;

        return tot;
}
EXPORT_SYMBOL_GPL(ip_tunnel_get_stats64);

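/* A keyed tunnel matches only packets carrying the same key; a keyless
 * tunnel matches only packets carrying no key at all.
 */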
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
                                __be16 flags, __be32 key)
{
        if (p->i_flags & TUNNEL_KEY) {
                if (flags & TUNNEL_KEY)
                        return key == p->i_key;
                else
                        /* key expected, none present */
                        return false;
        } else
                return !(flags & TUNNEL_KEY);
}

/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match, i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only keyless tunnels.

   All keyless packets that do not match a configured keyless tunnel
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the incoming
   packet.
*/
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
                                   int link, __be16 flags,
                                   __be32 remote, __be32 local,
                                   __be32 key)
{
        unsigned int hash;
        struct ip_tunnel *t, *cand = NULL;
        struct hlist_head *head;

        hash = ip_tunnel_hash(itn, key, remote);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else
                        cand = t;
        }

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (remote != t->parms.iph.daddr ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        hash = ip_tunnel_hash(itn, key, 0);
        head = &itn->tunnels[hash];

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if ((local != t->parms.iph.saddr &&
                     (local != t->parms.iph.daddr ||
                      !ipv4_is_multicast(local))) ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (!ip_tunnel_key_match(&t->parms, flags, key))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

        if (flags & TUNNEL_NO_KEY)
                goto skip_key_lookup;

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (t->parms.i_key != key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->parms.link == link)
                        return t;
                else if (!cand)
                        cand = t;
        }

skip_key_lookup:
        if (cand)
                return cand;

        if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
                return netdev_priv(itn->fb_tunnel_dev);

        return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);

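/* Pick the hash bucket for a tunnel's parameters; multicast and wildcard
 * destinations hash with remote == 0, matching the lookup fallback above.
 */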
static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
                                    struct ip_tunnel_parm *parms)
{
        unsigned int h;
        __be32 remote;

        if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
                remote = parms->iph.daddr;
        else
                remote = 0;

        h = ip_tunnel_hash(itn, parms->i_key, remote);
        return &itn->tunnels[h];
}

static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
{
        struct hlist_head *head = ip_bucket(itn, &t->parms);

        hlist_add_head_rcu(&t->hash_node, head);
}

static void ip_tunnel_del(struct ip_tunnel *t)
{
        hlist_del_init_rcu(&t->hash_node);
}

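/* Exact-match lookup used by the configuration paths: saddr, daddr, key,
 * link and device type must all match, unlike the receive-side lookup.
 */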
static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
                                        struct ip_tunnel_parm *parms,
                                        int type)
{
        __be32 remote = parms->iph.daddr;
        __be32 local = parms->iph.saddr;
        __be32 key = parms->i_key;
        int link = parms->link;
        struct ip_tunnel *t = NULL;
        struct hlist_head *head = ip_bucket(itn, parms);

        hlist_for_each_entry_rcu(t, head, hash_node) {
                if (local == t->parms.iph.saddr &&
                    remote == t->parms.iph.daddr &&
                    key == t->parms.i_key &&
                    link == t->parms.link &&
                    type == t->dev->type)
                        break;
        }
        return t;
}

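/* Allocate and register a tunnel netdevice, generating an "<kind>%d" name
 * when the caller did not supply one. Must be called under RTNL.
 */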
static struct net_device *__ip_tunnel_create(struct net *net,
                                             const struct rtnl_link_ops *ops,
                                             struct ip_tunnel_parm *parms)
{
        int err;
        struct ip_tunnel *tunnel;
        struct net_device *dev;
        char name[IFNAMSIZ];

        if (parms->name[0])
                strlcpy(name, parms->name, IFNAMSIZ);
        else {
                if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
                        err = -E2BIG;
                        goto failed;
                }
                strlcpy(name, ops->kind, IFNAMSIZ);
                strncat(name, "%d", 2);
        }

        ASSERT_RTNL();
        dev = alloc_netdev(ops->priv_size, name, ops->setup);
        if (!dev) {
                err = -ENOMEM;
                goto failed;
        }
        dev_net_set(dev, net);

        dev->rtnl_link_ops = ops;

        tunnel = netdev_priv(dev);
        tunnel->parms = *parms;
        tunnel->net = net;

        err = register_netdevice(dev);
        if (err)
                goto failed_free;

        return dev;

failed_free:
        free_netdev(dev);
failed:
        return ERR_PTR(err);
}

static inline void init_tunnel_flow(struct flowi4 *fl4,
                                    int proto,
                                    __be32 daddr, __be32 saddr,
                                    __be32 key, __u8 tos, int oif)
{
        memset(fl4, 0, sizeof(*fl4));
        fl4->flowi4_oif = oif;
        fl4->daddr = daddr;
        fl4->saddr = saddr;
        fl4->flowi4_tos = tos;
        fl4->flowi4_proto = proto;
        fl4->fl4_gre_key = key;
}

static int ip_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *iph;
        int hlen = LL_MAX_HEADER;
        int mtu = ETH_DATA_LEN;
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */
        if (iph->daddr) {
                struct flowi4 fl4;
                struct rtable *rt;

                init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
                                 iph->saddr, tunnel->parms.o_key,
                                 RT_TOS(iph->tos), tunnel->parms.link);
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (!IS_ERR(rt)) {
                        tdev = rt->dst.dev;
                        tunnel_dst_set(tunnel, dst_clone(&rt->dst));
                        ip_rt_put(rt);
                }
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        dev->needed_headroom = t_hlen + hlen;
        mtu -= (dev->hard_header_len + t_hlen);

        if (mtu < 68)
                mtu = 68;

        return mtu;
}

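/* Create a tunnel device using the fallback device's rtnl_link_ops and
 * hash it into the per-netns table; returns NULL on failure.
 */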
static struct ip_tunnel *ip_tunnel_create(struct net *net,
                                          struct ip_tunnel_net *itn,
                                          struct ip_tunnel_parm *parms)
{
        struct ip_tunnel *nt, *fbt;
        struct net_device *dev;

        BUG_ON(!itn->fb_tunnel_dev);
        fbt = netdev_priv(itn->fb_tunnel_dev);
        dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
        if (IS_ERR(dev))
                return NULL;

        dev->mtu = ip_tunnel_bind_dev(dev);

        nt = netdev_priv(dev);
        ip_tunnel_add(itn, nt);
        return nt;
}

int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
                  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
        struct pcpu_sw_netstats *tstats;
        const struct iphdr *iph = ip_hdr(skb);
        int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
        if (ipv4_is_multicast(iph->daddr)) {
                /* Looped back packet, drop it! */
                if (rt_is_output_route(skb_rtable(skb)))
                        goto drop;
                tunnel->dev->stats.multicast++;
                skb->pkt_type = PACKET_BROADCAST;
        }
#endif

        if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
             ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
                tunnel->dev->stats.rx_crc_errors++;
                tunnel->dev->stats.rx_errors++;
                goto drop;
        }

        if (tunnel->parms.i_flags&TUNNEL_SEQ) {
                if (!(tpi->flags&TUNNEL_SEQ) ||
                    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
                        tunnel->dev->stats.rx_fifo_errors++;
                        tunnel->dev->stats.rx_errors++;
                        goto drop;
                }
                tunnel->i_seqno = ntohl(tpi->seq) + 1;
        }

        err = IP_ECN_decapsulate(iph, skb);
        if (unlikely(err)) {
                if (log_ecn_error)
                        net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
                                        &iph->saddr, iph->tos);
                if (err > 1) {
                        ++tunnel->dev->stats.rx_frame_errors;
                        ++tunnel->dev->stats.rx_errors;
                        goto drop;
                }
        }

        tstats = this_cpu_ptr(tunnel->dev->tstats);
        u64_stats_update_begin(&tstats->syncp);
        tstats->rx_packets++;
        tstats->rx_bytes += skb->len;
        u64_stats_update_end(&tstats->syncp);

        skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

        if (tunnel->dev->type == ARPHRD_ETHER) {
                skb->protocol = eth_type_trans(skb, tunnel->dev);
                skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
        } else {
                skb->dev = tunnel->dev;
        }

        gro_cells_receive(&tunnel->gro_cells, skb);
        return 0;

drop:
        kfree_skb(skb);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);

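/* Check the packet against the route's path MTU and propagate the result
 * to the inner flow: emit ICMP "fragmentation needed" (IPv4) or "packet
 * too big" (IPv6) and return -E2BIG when the packet does not fit.
 */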
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
                            struct rtable *rt, __be16 df)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
        int mtu;

        if (df)
                mtu = dst_mtu(&rt->dst) - dev->hard_header_len
                                        - sizeof(struct iphdr) - tunnel->hlen;
        else
                mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

        if (skb_dst(skb))
                skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

        if (skb->protocol == htons(ETH_P_IP)) {
                if (!skb_is_gso(skb) &&
                    (df & htons(IP_DF)) && mtu < pkt_size) {
                        memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
                        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
                        return -E2BIG;
                }
        }
#if IS_ENABLED(CONFIG_IPV6)
        else if (skb->protocol == htons(ETH_P_IPV6)) {
                struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

                if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
                           mtu >= IPV6_MIN_MTU) {
                        if ((tunnel->parms.iph.daddr &&
                            !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
                            rt6->rt6i_dst.plen == 128) {
                                rt6->rt6i_flags |= RTF_MODIFIED;
                                dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
                        }
                }

                if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
                                        mtu < pkt_size) {
                        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                        return -E2BIG;
                }
        }
#endif
        return 0;
}

void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
                    const struct iphdr *tnl_params, const u8 protocol)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        const struct iphdr *inner_iph;
        struct flowi4 fl4;
        u8     tos, ttl;
        __be16 df;
        struct rtable *rt = NULL;       /* Route to the other host */
        unsigned int max_headroom;      /* The extra header space needed */
        __be32 dst;
        int err;
        bool connected = true;

        inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

        dst = tnl_params->daddr;
        if (dst == 0) {
                /* NBMA tunnel */

                if (skb_dst(skb) == NULL) {
                        dev->stats.tx_fifo_errors++;
                        goto tx_error;
                }

                if (skb->protocol == htons(ETH_P_IP)) {
                        rt = skb_rtable(skb);
                        dst = rt_nexthop(rt, inner_iph->daddr);
                }
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6)) {
                        const struct in6_addr *addr6;
                        struct neighbour *neigh;
                        bool do_tx_error_icmp;
                        int addr_type;

                        neigh = dst_neigh_lookup(skb_dst(skb),
                                                 &ipv6_hdr(skb)->daddr);
                        if (neigh == NULL)
                                goto tx_error;

                        addr6 = (const struct in6_addr *)&neigh->primary_key;
                        addr_type = ipv6_addr_type(addr6);

                        if (addr_type == IPV6_ADDR_ANY) {
                                addr6 = &ipv6_hdr(skb)->daddr;
                                addr_type = ipv6_addr_type(addr6);
                        }

                        if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
                                do_tx_error_icmp = true;
                        else {
                                do_tx_error_icmp = false;
                                dst = addr6->s6_addr32[3];
                        }
                        neigh_release(neigh);
                        if (do_tx_error_icmp)
                                goto tx_error_icmp;
                }
#endif
                else
                        goto tx_error;

                connected = false;
        }

        tos = tnl_params->tos;
        if (tos & 0x1) {
                tos &= ~0x1;
                if (skb->protocol == htons(ETH_P_IP)) {
                        tos = inner_iph->tos;
                        connected = false;
                } else if (skb->protocol == htons(ETH_P_IPV6)) {
                        tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
                        connected = false;
                }
        }

        init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
                         tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

        if (connected)
                rt = (struct rtable *)tunnel_dst_check(tunnel, 0);

        if (!rt) {
                rt = ip_route_output_key(tunnel->net, &fl4);

                if (IS_ERR(rt)) {
                        dev->stats.tx_carrier_errors++;
                        goto tx_error;
                }
                if (connected)
                        tunnel_dst_set(tunnel, dst_clone(&rt->dst));
        }

        if (rt->dst.dev == dev) {
                ip_rt_put(rt);
                dev->stats.collisions++;
                goto tx_error;
        }

        if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
                ip_rt_put(rt);
                goto tx_error;
        }

        if (tunnel->err_count > 0) {
                if (time_before(jiffies,
                                tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
                        tunnel->err_count--;

                        dst_link_failure(skb);
                } else
                        tunnel->err_count = 0;
        }

        tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
        ttl = tnl_params->ttl;
        if (ttl == 0) {
                if (skb->protocol == htons(ETH_P_IP))
                        ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
                else if (skb->protocol == htons(ETH_P_IPV6))
                        ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
                else
                        ttl = ip4_dst_hoplimit(&rt->dst);
        }

        df = tnl_params->frag_off;
        if (skb->protocol == htons(ETH_P_IP))
                df |= (inner_iph->frag_off&htons(IP_DF));

        max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
                        + rt->dst.header_len;
        if (max_headroom > dev->needed_headroom)
                dev->needed_headroom = max_headroom;

        if (skb_cow_head(skb, dev->needed_headroom)) {
                dev->stats.tx_dropped++;
                dev_kfree_skb(skb);
                return;
        }

        err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
                            tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
        iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

        return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
        dst_link_failure(skb);
#endif
tx_error:
        dev->stats.tx_errors++;
        dev_kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);

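/* Apply new parameters to an existing tunnel: re-hash it under the new
 * addresses and keys, rebind the underlying device if the link changed,
 * and flush the per-cpu route cache.
 */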
static void ip_tunnel_update(struct ip_tunnel_net *itn,
                             struct ip_tunnel *t,
                             struct net_device *dev,
                             struct ip_tunnel_parm *p,
                             bool set_mtu)
{
        ip_tunnel_del(t);
        t->parms.iph.saddr = p->iph.saddr;
        t->parms.iph.daddr = p->iph.daddr;
        t->parms.i_key = p->i_key;
        t->parms.o_key = p->o_key;
        if (dev->type != ARPHRD_ETHER) {
                memcpy(dev->dev_addr, &p->iph.saddr, 4);
                memcpy(dev->broadcast, &p->iph.daddr, 4);
        }
        ip_tunnel_add(itn, t);

        t->parms.iph.ttl = p->iph.ttl;
        t->parms.iph.tos = p->iph.tos;
        t->parms.iph.frag_off = p->iph.frag_off;

        if (t->parms.link != p->link) {
                int mtu;

                t->parms.link = p->link;
                mtu = ip_tunnel_bind_dev(dev);
                if (set_mtu)
                        dev->mtu = mtu;
        }
        tunnel_dst_reset_all(t);
        netdev_state_change(dev);
}

int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
        int err = 0;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        BUG_ON(!itn->fb_tunnel_dev);
        switch (cmd) {
        case SIOCGETTUNNEL:
                t = NULL;
                if (dev == itn->fb_tunnel_dev)
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(p, &t->parms, sizeof(*p));
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;
                if (p->iph.ttl)
                        p->iph.frag_off |= htons(IP_DF);
                if (!(p->i_flags&TUNNEL_KEY))
                        p->i_key = 0;
                if (!(p->o_flags&TUNNEL_KEY))
                        p->o_key = 0;

                t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

                if (!t && (cmd == SIOCADDTUNNEL))
                        t = ip_tunnel_create(net, itn, p);

                if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned int nflags = 0;

                                if (ipv4_is_multicast(p->iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p->iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }

                                t = netdev_priv(dev);
                        }
                }

                if (t) {
                        err = 0;
                        ip_tunnel_update(itn, t, dev, p, true);
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
                        goto done;

                if (dev == itn->fb_tunnel_dev) {
                        err = -ENOENT;
                        t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
                        if (t == NULL)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(itn->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);

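/* 68 is the minimum IPv4 MTU; the upper bound leaves room for the outer
 * IP header plus this tunnel's own encapsulation header.
 */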
int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        int t_hlen = tunnel->hlen + sizeof(struct iphdr);

        if (new_mtu < 68 ||
            new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
                return -EINVAL;
        dev->mtu = new_mtu;
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);

static void ip_tunnel_dev_free(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        gro_cells_destroy(&tunnel->gro_cells);
        free_percpu(tunnel->dst_cache);
        free_percpu(dev->tstats);
        free_netdev(dev);
}

void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct ip_tunnel_net *itn;

        itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);

        if (itn->fb_tunnel_dev != dev) {
                ip_tunnel_del(netdev_priv(dev));
                unregister_netdevice_queue(dev, head);
        }
}
EXPORT_SYMBOL_GPL(ip_tunnel_dellink);

int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
                                  struct rtnl_link_ops *ops, char *devname)
{
        struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
        struct ip_tunnel_parm parms;
        unsigned int i;

        for (i = 0; i < IP_TNL_HASH_SIZE; i++)
                INIT_HLIST_HEAD(&itn->tunnels[i]);

        if (!ops) {
                itn->fb_tunnel_dev = NULL;
                return 0;
        }

        memset(&parms, 0, sizeof(parms));
        if (devname)
                strlcpy(parms.name, devname, IFNAMSIZ);

        rtnl_lock();
        itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
        /* FB netdevice is special: we have one, and only one per netns.
         * Allowing it to move to another netns is clearly unsafe.
         */
        if (!IS_ERR(itn->fb_tunnel_dev)) {
                itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
                ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
        }
        rtnl_unlock();

        return PTR_RET(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);

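/* Queue for unregistration every device in this netns created through
 * @ops, plus any tunnel hashed here whose device lives in another netns.
 */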
static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
                              struct rtnl_link_ops *ops)
{
        struct net *net = dev_net(itn->fb_tunnel_dev);
        struct net_device *dev, *aux;
        int h;

        for_each_netdev_safe(net, dev, aux)
                if (dev->rtnl_link_ops == ops)
                        unregister_netdevice_queue(dev, head);

        for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
                struct ip_tunnel *t;
                struct hlist_node *n;
                struct hlist_head *thead = &itn->tunnels[h];

                hlist_for_each_entry_safe(t, n, thead, hash_node)
                        /* If dev is in the same netns, it has already
                         * been added to the list by the previous loop.
                         */
                        if (!net_eq(dev_net(t->dev), net))
                                unregister_netdevice_queue(t->dev, head);
        }
}

void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);

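/* rtnl_link creation path: refuse duplicates, register the device, and
 * inherit the bound device's MTU unless IFLA_MTU was specified.
 */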
int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
                      struct ip_tunnel_parm *p)
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ip_tunnel_net *itn;
        int mtu;
        int err;

        nt = netdev_priv(dev);
        itn = net_generic(net, nt->ip_tnl_net_id);

        if (ip_tunnel_find(itn, p, dev->type))
                return -EEXIST;

        nt->net = net;
        nt->parms = *p;
        err = register_netdevice(dev);
        if (err)
                goto out;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);

        mtu = ip_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        ip_tunnel_add(itn, nt);

out:
        return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_newlink);

int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);

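/* ndo_init for tunnel devices: allocate the per-cpu stats and per-cpu
 * route cache and set up GRO cells before the device is brought up.
 */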
int ip_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct iphdr *iph = &tunnel->parms.iph;
        int i, err;

        dev->destructor = ip_tunnel_dev_free;
        dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                struct pcpu_sw_netstats *ipt_stats;
                ipt_stats = per_cpu_ptr(dev->tstats, i);
                u64_stats_init(&ipt_stats->syncp);
        }

        tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
        if (!tunnel->dst_cache) {
                free_percpu(dev->tstats);
                return -ENOMEM;
        }

        for_each_possible_cpu(i) {
                struct ip_tunnel_dst *idst = per_cpu_ptr(tunnel->dst_cache, i);
                idst->dst = NULL;
                spin_lock_init(&idst->lock);
        }

        err = gro_cells_init(&tunnel->gro_cells, dev);
        if (err) {
                free_percpu(tunnel->dst_cache);
                free_percpu(dev->tstats);
                return err;
        }

        tunnel->dev = dev;
        tunnel->net = dev_net(dev);
        strcpy(tunnel->parms.name, dev->name);
        iph->version            = 4;
        iph->ihl                = 5;

        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_init);

void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(netdev_priv(dev));

        tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);

/* Do the least required initialization; the rest is done in the
 * tunnel_init call.
 */
void ip_tunnel_setup(struct net_device *dev, int net_id)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);

        tunnel->ip_tnl_net_id = net_id;
}
EXPORT_SYMBOL_GPL(ip_tunnel_setup);

MODULE_LICENSE("GPL");