/* net/ipv4/ip_tunnel.c
 * ip_tunnel: Fix dst ref-count.
 */
1 /*
2  * Copyright (c) 2013 Nicira, Inc.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of version 2 of the GNU General Public
6  * License as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public License
14  * along with this program; if not, write to the Free Software
15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16  * 02110-1301, USA
17  */
18
19 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
20
21 #include <linux/capability.h>
22 #include <linux/module.h>
23 #include <linux/types.h>
24 #include <linux/kernel.h>
25 #include <linux/slab.h>
26 #include <linux/uaccess.h>
27 #include <linux/skbuff.h>
28 #include <linux/netdevice.h>
29 #include <linux/in.h>
30 #include <linux/tcp.h>
31 #include <linux/udp.h>
32 #include <linux/if_arp.h>
33 #include <linux/mroute.h>
34 #include <linux/init.h>
35 #include <linux/in6.h>
36 #include <linux/inetdevice.h>
37 #include <linux/igmp.h>
38 #include <linux/netfilter_ipv4.h>
39 #include <linux/etherdevice.h>
40 #include <linux/if_ether.h>
41 #include <linux/if_vlan.h>
42 #include <linux/rculist.h>
43 #include <linux/err.h>
44
45 #include <net/sock.h>
46 #include <net/ip.h>
47 #include <net/icmp.h>
48 #include <net/protocol.h>
49 #include <net/ip_tunnels.h>
50 #include <net/arp.h>
51 #include <net/checksum.h>
52 #include <net/dsfield.h>
53 #include <net/inet_ecn.h>
54 #include <net/xfrm.h>
55 #include <net/net_namespace.h>
56 #include <net/netns/generic.h>
57 #include <net/rtnetlink.h>
58
59 #if IS_ENABLED(CONFIG_IPV6)
60 #include <net/ipv6.h>
61 #include <net/ip6_fib.h>
62 #include <net/ip6_route.h>
63 #endif
64
65 static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
66 {
67         return hash_32((__force u32)key ^ (__force u32)remote,
68                          IP_TNL_HASH_BITS);
69 }
70
/* Install @dst as the cached route in @idst, releasing the old entry.
 *
 * Routes flagged DST_NOCACHE must not be cached: their reference-count
 * discipline differs, so storing one here would corrupt its ref-count
 * (this check is the point of the "Fix dst ref-count" change).  A
 * cacheable route gets an extra reference via dst_clone() before being
 * published; xchg() swaps the pointer atomically so concurrent callers
 * never see a half-updated slot, and the displaced route's reference
 * is dropped afterwards.
 */
static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
			     struct dst_entry *dst)
{
	struct dst_entry *old_dst;

	if (dst) {
		if (dst->flags & DST_NOCACHE)
			dst = NULL;
		else
			dst_clone(dst);
	}
	old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
	dst_release(old_dst);	/* dst_release(NULL) is a no-op */
}
85
86 static void tunnel_dst_set(struct ip_tunnel *t, struct dst_entry *dst)
87 {
88         __tunnel_dst_set(this_cpu_ptr(t->dst_cache), dst);
89 }
90
91 static void tunnel_dst_reset(struct ip_tunnel *t)
92 {
93         tunnel_dst_set(t, NULL);
94 }
95
96 void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
97 {
98         int i;
99
100         for_each_possible_cpu(i)
101                 __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL);
102 }
103 EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
104
/* Return this CPU's cached route with a reference held, or NULL.
 *
 * An obsolete entry is revalidated through its ->check() method with
 * @cookie; if that fails, the per-cpu cache slot is cleared (outside
 * the RCU read section, since tunnel_dst_reset() releases the route)
 * and NULL is returned so the caller performs a fresh route lookup.
 * On success a reference is taken under RCU before the pointer can be
 * swapped out by a concurrent __tunnel_dst_set().
 */
static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie)
{
	struct dst_entry *dst;

	rcu_read_lock();
	dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst);
	if (dst) {
		if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
			rcu_read_unlock();
			tunnel_dst_reset(t);
			return NULL;
		}
		dst_hold(dst);
	}
	rcu_read_unlock();
	return (struct rtable *)dst;
}
122
123 static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
124                                 __be16 flags, __be32 key)
125 {
126         if (p->i_flags & TUNNEL_KEY) {
127                 if (flags & TUNNEL_KEY)
128                         return key == p->i_key;
129                 else
130                         /* key expected, none present */
131                         return false;
132         } else
133                 return !(flags & TUNNEL_KEY);
134 }
135
/* Fallback tunnel: no source, no destination, no key, no options

   Tunnel hash table:
   We require an exact key match i.e. if a key is present in the packet
   it will match only a tunnel with the same key; if it is not present,
   it will match only a keyless tunnel.

   All keyless packets, if not matching any configured keyless tunnel,
   will match the fallback tunnel.
   Given src, dst and key, find the appropriate tunnel for the input packet.
*/
/* Find the best tunnel for an incoming packet described by
 * (link, flags, remote, local, key).
 *
 * Candidates are scanned in order of decreasing specificity:
 *   1. exact (remote, local) match in the (key, remote) hash bucket;
 *   2. remote-only match in the same bucket;
 *   3. wildcard-remote bucket (remote hashed as 0): local-address match,
 *      or multicast destination match;
 *   4. same wildcard bucket: raw i_key match (skipped when the packet
 *      explicitly carries no key).
 * A tunnel that matches on everything but @link is remembered as a
 * fallback candidate rather than returned immediately.  If nothing
 * matches, the per-netns fallback device catches the packet when it is
 * up; otherwise NULL is returned.  Runs under RCU.
 */
struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn,
				   int link, __be16 flags,
				   __be32 remote, __be32 local,
				   __be32 key)
{
	unsigned int hash;
	struct ip_tunnel *t, *cand = NULL;
	struct hlist_head *head;

	hash = ip_tunnel_hash(key, remote);
	head = &itn->tunnels[hash];

	/* Pass 1: exact local + remote match. */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else
			cand = t;
	}

	/* Pass 2: remote match only (any local address). */
	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (remote != t->parms.iph.daddr ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	/* Passes 3 and 4 use the bucket of wildcard-remote tunnels. */
	hash = ip_tunnel_hash(key, 0);
	head = &itn->tunnels[hash];

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (!ip_tunnel_key_match(&t->parms, flags, key))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

	if (flags & TUNNEL_NO_KEY)
		goto skip_key_lookup;

	hlist_for_each_entry_rcu(t, head, hash_node) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->parms.link == link)
			return t;
		else if (!cand)
			cand = t;
	}

skip_key_lookup:
	if (cand)
		return cand;

	if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP)
		return netdev_priv(itn->fb_tunnel_dev);

	return NULL;
}
EXPORT_SYMBOL_GPL(ip_tunnel_lookup);
232
233 static struct hlist_head *ip_bucket(struct ip_tunnel_net *itn,
234                                     struct ip_tunnel_parm *parms)
235 {
236         unsigned int h;
237         __be32 remote;
238
239         if (parms->iph.daddr && !ipv4_is_multicast(parms->iph.daddr))
240                 remote = parms->iph.daddr;
241         else
242                 remote = 0;
243
244         h = ip_tunnel_hash(parms->i_key, remote);
245         return &itn->tunnels[h];
246 }
247
248 static void ip_tunnel_add(struct ip_tunnel_net *itn, struct ip_tunnel *t)
249 {
250         struct hlist_head *head = ip_bucket(itn, &t->parms);
251
252         hlist_add_head_rcu(&t->hash_node, head);
253 }
254
/* Unlink tunnel @t from its hash bucket.  RCU readers may still walk
 * past the node until a grace period elapses; the _init variant leaves
 * the node safe to re-add later.
 */
static void ip_tunnel_del(struct ip_tunnel *t)
{
	hlist_del_init_rcu(&t->hash_node);
}
259
260 static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn,
261                                         struct ip_tunnel_parm *parms,
262                                         int type)
263 {
264         __be32 remote = parms->iph.daddr;
265         __be32 local = parms->iph.saddr;
266         __be32 key = parms->i_key;
267         int link = parms->link;
268         struct ip_tunnel *t = NULL;
269         struct hlist_head *head = ip_bucket(itn, parms);
270
271         hlist_for_each_entry_rcu(t, head, hash_node) {
272                 if (local == t->parms.iph.saddr &&
273                     remote == t->parms.iph.daddr &&
274                     key == t->parms.i_key &&
275                     link == t->parms.link &&
276                     type == t->dev->type)
277                         break;
278         }
279         return t;
280 }
281
282 static struct net_device *__ip_tunnel_create(struct net *net,
283                                              const struct rtnl_link_ops *ops,
284                                              struct ip_tunnel_parm *parms)
285 {
286         int err;
287         struct ip_tunnel *tunnel;
288         struct net_device *dev;
289         char name[IFNAMSIZ];
290
291         if (parms->name[0])
292                 strlcpy(name, parms->name, IFNAMSIZ);
293         else {
294                 if (strlen(ops->kind) > (IFNAMSIZ - 3)) {
295                         err = -E2BIG;
296                         goto failed;
297                 }
298                 strlcpy(name, ops->kind, IFNAMSIZ);
299                 strncat(name, "%d", 2);
300         }
301
302         ASSERT_RTNL();
303         dev = alloc_netdev(ops->priv_size, name, ops->setup);
304         if (!dev) {
305                 err = -ENOMEM;
306                 goto failed;
307         }
308         dev_net_set(dev, net);
309
310         dev->rtnl_link_ops = ops;
311
312         tunnel = netdev_priv(dev);
313         tunnel->parms = *parms;
314         tunnel->net = net;
315
316         err = register_netdevice(dev);
317         if (err)
318                 goto failed_free;
319
320         return dev;
321
322 failed_free:
323         free_netdev(dev);
324 failed:
325         return ERR_PTR(err);
326 }
327
328 static inline void init_tunnel_flow(struct flowi4 *fl4,
329                                     int proto,
330                                     __be32 daddr, __be32 saddr,
331                                     __be32 key, __u8 tos, int oif)
332 {
333         memset(fl4, 0, sizeof(*fl4));
334         fl4->flowi4_oif = oif;
335         fl4->daddr = daddr;
336         fl4->saddr = saddr;
337         fl4->flowi4_tos = tos;
338         fl4->flowi4_proto = proto;
339         fl4->fl4_gre_key = key;
340 }
341
/* Bind tunnel @dev to its underlying output device and derive an MTU.
 *
 * With a configured remote endpoint the route is resolved (and primed
 * into the per-cpu dst cache); otherwise parms.link, if set, names the
 * lower device.  needed_headroom is sized from the lower device so the
 * xmit path rarely has to reallocate.  Returns the suggested MTU,
 * clamped to at least 68 (the RFC 791 minimum).
 */
static int ip_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *iph;
	int hlen = LL_MAX_HEADER;
	int mtu = ETH_DATA_LEN;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */
	if (iph->daddr) {
		struct flowi4 fl4;
		struct rtable *rt;

		init_tunnel_flow(&fl4, iph->protocol, iph->daddr,
				 iph->saddr, tunnel->parms.o_key,
				 RT_TOS(iph->tos), tunnel->parms.link);
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (!IS_ERR(rt)) {
			tdev = rt->dst.dev;
			/* Prime the per-cpu route cache for xmit. */
			tunnel_dst_set(tunnel, &rt->dst);
			ip_rt_put(rt);
		}
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	dev->needed_headroom = t_hlen + hlen;
	mtu -= (dev->hard_header_len + t_hlen);

	if (mtu < 68)
		mtu = 68;

	return mtu;
}
389
390 static struct ip_tunnel *ip_tunnel_create(struct net *net,
391                                           struct ip_tunnel_net *itn,
392                                           struct ip_tunnel_parm *parms)
393 {
394         struct ip_tunnel *nt, *fbt;
395         struct net_device *dev;
396
397         BUG_ON(!itn->fb_tunnel_dev);
398         fbt = netdev_priv(itn->fb_tunnel_dev);
399         dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
400         if (IS_ERR(dev))
401                 return NULL;
402
403         dev->mtu = ip_tunnel_bind_dev(dev);
404
405         nt = netdev_priv(dev);
406         ip_tunnel_add(itn, nt);
407         return nt;
408 }
409
/* Common receive path for IP tunnels, called after tunnel lookup with
 * the outer header already parsed into @tpi.
 *
 * Enforces the tunnel's checksum and sequence-number expectations,
 * decapsulates ECN, updates per-cpu stats, scrubs cross-netns state,
 * and hands the inner packet to GRO.  Always consumes @skb and
 * returns 0.
 */
int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
		  const struct tnl_ptk_info *tpi, bool log_ecn_error)
{
	struct pcpu_sw_netstats *tstats;
	const struct iphdr *iph = ip_hdr(skb);
	int err;

#ifdef CONFIG_NET_IPGRE_BROADCAST
	if (ipv4_is_multicast(iph->daddr)) {
		tunnel->dev->stats.multicast++;
		skb->pkt_type = PACKET_BROADCAST;
	}
#endif

	/* Checksum presence on the wire must match the tunnel config
	 * exactly, in both directions. */
	if ((!(tpi->flags&TUNNEL_CSUM) &&  (tunnel->parms.i_flags&TUNNEL_CSUM)) ||
	     ((tpi->flags&TUNNEL_CSUM) && !(tunnel->parms.i_flags&TUNNEL_CSUM))) {
		tunnel->dev->stats.rx_crc_errors++;
		tunnel->dev->stats.rx_errors++;
		goto drop;
	}

	if (tunnel->parms.i_flags&TUNNEL_SEQ) {
		/* Drop packets without a sequence number or with one that
		 * went backwards; the signed difference handles wrap. */
		if (!(tpi->flags&TUNNEL_SEQ) ||
		    (tunnel->i_seqno && (s32)(ntohl(tpi->seq) - tunnel->i_seqno) < 0)) {
			tunnel->dev->stats.rx_fifo_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		tunnel->i_seqno = ntohl(tpi->seq) + 1;
	}

	/* err > 1 means the ECN combination is invalid and the packet
	 * must be dropped; err == 1 is only worth a ratelimited log. */
	err = IP_ECN_decapsulate(iph, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					&iph->saddr, iph->tos);
		if (err > 1) {
			++tunnel->dev->stats.rx_frame_errors;
			++tunnel->dev->stats.rx_errors;
			goto drop;
		}
	}

	tstats = this_cpu_ptr(tunnel->dev->tstats);
	u64_stats_update_begin(&tstats->syncp);
	tstats->rx_packets++;
	tstats->rx_bytes += skb->len;
	u64_stats_update_end(&tstats->syncp);

	/* Forget per-netns state when crossing a namespace boundary. */
	skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(tunnel->dev)));

	if (tunnel->dev->type == ARPHRD_ETHER) {
		skb->protocol = eth_type_trans(skb, tunnel->dev);
		skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
	} else {
		skb->dev = tunnel->dev;
	}

	gro_cells_receive(&tunnel->gro_cells, skb);
	return 0;

drop:
	kfree_skb(skb);
	return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_rcv);
476
/* Propagate path-MTU information from the outer route @rt to the inner
 * flow, and emit ICMP/ICMPv6 "packet too big" errors when an oversized
 * non-GSO inner packet cannot be fragmented.
 *
 * Returns 0 to continue transmission, or -E2BIG when the packet was
 * rejected and an error was sent back to the originator.
 */
static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
			    struct rtable *rt, __be16 df)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int pkt_size = skb->len - tunnel->hlen - dev->hard_header_len;
	int mtu;

	if (df)
		/* DF set: the tunnel overhead shrinks the usable MTU. */
		mtu = dst_mtu(&rt->dst) - dev->hard_header_len
					- sizeof(struct iphdr) - tunnel->hlen;
	else
		mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;

	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		if (!skb_is_gso(skb) &&
		    (df & htons(IP_DF)) && mtu < pkt_size) {
			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			return -E2BIG;
		}
	}
#if IS_ENABLED(CONFIG_IPV6)
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);

		/* Record the reduced MTU on host routes, or when the
		 * tunnel has a fixed unicast endpoint. */
		if (rt6 && mtu < dst_mtu(skb_dst(skb)) &&
			   mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			    !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				dst_metric_set(skb_dst(skb), RTAX_MTU, mtu);
			}
		}

		if (!skb_is_gso(skb) && mtu >= IPV6_MIN_MTU &&
					mtu < pkt_size) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
			return -E2BIG;
		}
	}
#endif
	return 0;
}
524
/* Common transmit path for IP tunnels: route to the remote endpoint,
 * apply PMTU handling, inherit TOS/TTL/DF from the inner packet where
 * configured, and encapsulate via iptunnel_xmit().
 *
 * @tnl_params describes the outer header template; a zero daddr means
 * an NBMA tunnel, whose endpoint is recovered per-packet from the
 * inner destination.  The per-cpu route cache is only consulted and
 * refilled when the flow is "connected" (endpoint and TOS fixed by
 * config, not derived per-packet).  Consumes @skb in all cases.
 */
void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
		    const struct iphdr *tnl_params, const u8 protocol)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	const struct iphdr *inner_iph;
	struct flowi4 fl4;
	u8     tos, ttl;
	__be16 df;
	struct rtable *rt;		/* Route to the other host */
	unsigned int max_headroom;	/* The extra header space needed */
	__be32 dst;
	int err;
	bool connected = true;

	inner_iph = (const struct iphdr *)skb_inner_network_header(skb);

	dst = tnl_params->daddr;
	if (dst == 0) {
		/* NBMA tunnel: derive the endpoint from the inner packet. */

		if (skb_dst(skb) == NULL) {
			dev->stats.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb_rtable(skb);
			dst = rt_nexthop(rt, inner_iph->daddr);
		}
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			const struct in6_addr *addr6;
			struct neighbour *neigh;
			bool do_tx_error_icmp;
			int addr_type;

			neigh = dst_neigh_lookup(skb_dst(skb),
						 &ipv6_hdr(skb)->daddr);
			if (neigh == NULL)
				goto tx_error;

			addr6 = (const struct in6_addr *)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible IPv6 addresses embed a usable
			 * IPv4 endpoint in their low 32 bits. */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				do_tx_error_icmp = true;
			else {
				do_tx_error_icmp = false;
				dst = addr6->s6_addr32[3];
			}
			neigh_release(neigh);
			if (do_tx_error_icmp)
				goto tx_error_icmp;
		}
#endif
		else
			goto tx_error;

		connected = false;
	}

	/* tos bit 0 set means "inherit TOS from the inner packet". */
	tos = tnl_params->tos;
	if (tos & 0x1) {
		tos &= ~0x1;
		if (skb->protocol == htons(ETH_P_IP)) {
			tos = inner_iph->tos;
			connected = false;
		} else if (skb->protocol == htons(ETH_P_IPV6)) {
			tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph);
			connected = false;
		}
	}

	init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr,
			 tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link);

	/* Only a fully-determined flow may use the per-cpu route cache. */
	rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL;

	if (!rt) {
		rt = ip_route_output_key(tunnel->net, &fl4);

		if (IS_ERR(rt)) {
			dev->stats.tx_carrier_errors++;
			goto tx_error;
		}
		if (connected)
			tunnel_dst_set(tunnel, &rt->dst);
	}

	/* Routing back out of the tunnel device would loop forever. */
	if (rt->dst.dev == dev) {
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	if (tnl_update_pmtu(dev, skb, rt, tnl_params->frag_off)) {
		ip_rt_put(rt);
		goto tx_error;
	}

	/* Replay recent ICMP errors to the sender for a short window. */
	if (tunnel->err_count > 0) {
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
			tunnel->err_count--;

			memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
	ttl = tnl_params->ttl;
	if (ttl == 0) {
		/* ttl 0 means "inherit from the inner packet". */
		if (skb->protocol == htons(ETH_P_IP))
			ttl = inner_iph->ttl;
#if IS_ENABLED(CONFIG_IPV6)
		else if (skb->protocol == htons(ETH_P_IPV6))
			ttl = ((const struct ipv6hdr *)inner_iph)->hop_limit;
#endif
		else
			ttl = ip4_dst_hoplimit(&rt->dst);
	}

	df = tnl_params->frag_off;
	if (skb->protocol == htons(ETH_P_IP))
		df |= (inner_iph->frag_off&htons(IP_DF));

	max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
			+ rt->dst.header_len;
	if (max_headroom > dev->needed_headroom)
		dev->needed_headroom = max_headroom;

	if (skb_cow_head(skb, dev->needed_headroom)) {
		dev->stats.tx_dropped++;
		kfree_skb(skb);
		return;
	}

	err = iptunnel_xmit(rt, skb, fl4.saddr, fl4.daddr, protocol,
			    tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);

	return;

#if IS_ENABLED(CONFIG_IPV6)
tx_error_icmp:
	dst_link_failure(skb);
#endif
tx_error:
	dev->stats.tx_errors++;
	kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(ip_tunnel_xmit);
684
/* Apply new parameters @p to an existing tunnel @t.
 *
 * The tunnel is unhashed before the addresses/keys change and rehashed
 * afterwards, because those fields determine its hash bucket.  A
 * changed lower link triggers a rebind to recompute headroom and MTU.
 * The per-cpu route cache is always flushed, since any of the updated
 * parameters may invalidate cached routes.
 */
static void ip_tunnel_update(struct ip_tunnel_net *itn,
			     struct ip_tunnel *t,
			     struct net_device *dev,
			     struct ip_tunnel_parm *p,
			     bool set_mtu)
{
	ip_tunnel_del(t);
	t->parms.iph.saddr = p->iph.saddr;
	t->parms.iph.daddr = p->iph.daddr;
	t->parms.i_key = p->i_key;
	t->parms.o_key = p->o_key;
	if (dev->type != ARPHRD_ETHER) {
		/* Point-to-point tunnels mirror the IPv4 endpoints in
		 * their link-layer addresses. */
		memcpy(dev->dev_addr, &p->iph.saddr, 4);
		memcpy(dev->broadcast, &p->iph.daddr, 4);
	}
	ip_tunnel_add(itn, t);

	t->parms.iph.ttl = p->iph.ttl;
	t->parms.iph.tos = p->iph.tos;
	t->parms.iph.frag_off = p->iph.frag_off;

	if (t->parms.link != p->link) {
		int mtu;

		t->parms.link = p->link;
		mtu = ip_tunnel_bind_dev(dev);
		if (set_mtu)
			dev->mtu = mtu;
	}
	ip_tunnel_dst_reset_all(t);
	netdev_state_change(dev);
}
717
/* Legacy ioctl configuration interface (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 *
 * On the per-netns fallback device, GET/CHG/DEL operate on the tunnel
 * described by @p; on other devices they act on @dev itself.  ADD, CHG
 * and DEL require CAP_NET_ADMIN in the device's user namespace.  For
 * GET, @p is filled in with the tunnel's parameters (the caller copies
 * it back to userspace).  Returns 0 or a negative errno.
 */
int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd)
{
	int err = 0;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

	BUG_ON(!itn->fb_tunnel_dev);
	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == itn->fb_tunnel_dev)
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(p, &t->parms, sizeof(*p));
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;
		/* A finite TTL implies PMTU discovery, hence DF. */
		if (p->iph.ttl)
			p->iph.frag_off |= htons(IP_DF);
		/* Keys are meaningless without the TUNNEL_KEY flag. */
		if (!(p->i_flags&TUNNEL_KEY))
			p->i_key = 0;
		if (!(p->o_flags&TUNNEL_KEY))
			p->o_key = 0;

		t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);

		if (!t && (cmd == SIOCADDTUNNEL))
			t = ip_tunnel_create(net, itn, p);

		if (dev != itn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* @p matches a different device: refuse to
				 * create a duplicate configuration. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				/* Changing this device: the new config must
				 * keep its broadcast/p-t-p nature intact. */
				unsigned int nflags = 0;

				if (ipv4_is_multicast(p->iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p->iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}

				t = netdev_priv(dev);
			}
		}

		if (t) {
			err = 0;
			ip_tunnel_update(itn, t, dev, p, true);
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			goto done;

		if (dev == itn->fb_tunnel_dev) {
			err = -ENOENT;
			t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
			if (t == NULL)
				goto done;
			err = -EPERM;
			/* The fallback device itself may never be deleted. */
			if (t == netdev_priv(itn->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
811
812 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
813 {
814         struct ip_tunnel *tunnel = netdev_priv(dev);
815         int t_hlen = tunnel->hlen + sizeof(struct iphdr);
816
817         if (new_mtu < 68 ||
818             new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
819                 return -EINVAL;
820         dev->mtu = new_mtu;
821         return 0;
822 }
823 EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
824
/* dev->destructor: tear down per-device state (GRO cells, per-cpu dst
 * cache, per-cpu stats) before freeing the net_device itself, which
 * also frees the netdev_priv() area holding the tunnel.
 */
static void ip_tunnel_dev_free(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	gro_cells_destroy(&tunnel->gro_cells);
	free_percpu(tunnel->dst_cache);
	free_percpu(dev->tstats);
	free_netdev(dev);
}
834
835 void ip_tunnel_dellink(struct net_device *dev, struct list_head *head)
836 {
837         struct ip_tunnel *tunnel = netdev_priv(dev);
838         struct ip_tunnel_net *itn;
839
840         itn = net_generic(tunnel->net, tunnel->ip_tnl_net_id);
841
842         if (itn->fb_tunnel_dev != dev) {
843                 ip_tunnel_del(netdev_priv(dev));
844                 unregister_netdevice_queue(dev, head);
845         }
846 }
847 EXPORT_SYMBOL_GPL(ip_tunnel_dellink);
848
/* Per-netns initialization for a tunnel module: set up the hash table
 * and, unless @ops is NULL, create the fallback device named @devname.
 * Returns 0 on success or a negative errno from fallback creation.
 */
int ip_tunnel_init_net(struct net *net, int ip_tnl_net_id,
				  struct rtnl_link_ops *ops, char *devname)
{
	struct ip_tunnel_net *itn = net_generic(net, ip_tnl_net_id);
	struct ip_tunnel_parm parms;
	unsigned int i;

	for (i = 0; i < IP_TNL_HASH_SIZE; i++)
		INIT_HLIST_HEAD(&itn->tunnels[i]);

	if (!ops) {
		/* No rtnl ops: hash table only, no fallback device. */
		itn->fb_tunnel_dev = NULL;
		return 0;
	}

	memset(&parms, 0, sizeof(parms));
	if (devname)
		strlcpy(parms.name, devname, IFNAMSIZ);

	rtnl_lock();
	itn->fb_tunnel_dev = __ip_tunnel_create(net, ops, &parms);
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	if (!IS_ERR(itn->fb_tunnel_dev)) {
		itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
		ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
	}
	rtnl_unlock();

	return PTR_ERR_OR_ZERO(itn->fb_tunnel_dev);
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
882
883 static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
884                               struct rtnl_link_ops *ops)
885 {
886         struct net *net = dev_net(itn->fb_tunnel_dev);
887         struct net_device *dev, *aux;
888         int h;
889
890         for_each_netdev_safe(net, dev, aux)
891                 if (dev->rtnl_link_ops == ops)
892                         unregister_netdevice_queue(dev, head);
893
894         for (h = 0; h < IP_TNL_HASH_SIZE; h++) {
895                 struct ip_tunnel *t;
896                 struct hlist_node *n;
897                 struct hlist_head *thead = &itn->tunnels[h];
898
899                 hlist_for_each_entry_safe(t, n, thead, hash_node)
900                         /* If dev is in the same netns, it has already
901                          * been added to the list by the previous loop.
902                          */
903                         if (!net_eq(dev_net(t->dev), net))
904                                 unregister_netdevice_queue(t->dev, head);
905         }
906 }
907
/* Per-netns exit helper: gather every tunnel device of type @ops in
 * @itn (including those moved to other netns) and unregister them in
 * one batch while holding the RTNL lock.
 */
void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops)
{
        LIST_HEAD(list);

        rtnl_lock();
        ip_tunnel_destroy(itn, &list, ops);
        unregister_netdevice_many(&list);
        rtnl_unlock();
}
EXPORT_SYMBOL_GPL(ip_tunnel_delete_net);
918
919 int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[],
920                       struct ip_tunnel_parm *p)
921 {
922         struct ip_tunnel *nt;
923         struct net *net = dev_net(dev);
924         struct ip_tunnel_net *itn;
925         int mtu;
926         int err;
927
928         nt = netdev_priv(dev);
929         itn = net_generic(net, nt->ip_tnl_net_id);
930
931         if (ip_tunnel_find(itn, p, dev->type))
932                 return -EEXIST;
933
934         nt->net = net;
935         nt->parms = *p;
936         err = register_netdevice(dev);
937         if (err)
938                 goto out;
939
940         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
941                 eth_hw_addr_random(dev);
942
943         mtu = ip_tunnel_bind_dev(dev);
944         if (!tb[IFLA_MTU])
945                 dev->mtu = mtu;
946
947         ip_tunnel_add(itn, nt);
948
949 out:
950         return err;
951 }
952 EXPORT_SYMBOL_GPL(ip_tunnel_newlink);
953
/* rtnl_link_ops->changelink helper: validate and apply new parameters
 * @p to tunnel @dev.  Returns 0 on success, -EINVAL for the fallback
 * device or a link-flag mismatch, -EEXIST if @p already belongs to a
 * different tunnel.
 */
int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[],
                         struct ip_tunnel_parm *p)
{
        struct ip_tunnel *t;
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn = net_generic(net, tunnel->ip_tnl_net_id);

        /* The per-netns fallback device cannot be reconfigured. */
        if (dev == itn->fb_tunnel_dev)
                return -EINVAL;

        t = ip_tunnel_find(itn, p, dev->type);

        if (t) {
                /* Parameters already in use by another device. */
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = tunnel;

                if (dev->type != ARPHRD_ETHER) {
                        /* Compute the link flags the new daddr implies and
                         * refuse a change that would flip the device between
                         * broadcast and point-to-point modes.
                         */
                        unsigned int nflags = 0;

                        if (ipv4_is_multicast(p->iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p->iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }
        }

        ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]);
        return 0;
}
EXPORT_SYMBOL_GPL(ip_tunnel_changelink);
991
992 int ip_tunnel_init(struct net_device *dev)
993 {
994         struct ip_tunnel *tunnel = netdev_priv(dev);
995         struct iphdr *iph = &tunnel->parms.iph;
996         int i, err;
997
998         dev->destructor = ip_tunnel_dev_free;
999         dev->tstats = alloc_percpu(struct pcpu_sw_netstats);
1000         if (!dev->tstats)
1001                 return -ENOMEM;
1002
1003         for_each_possible_cpu(i) {
1004                 struct pcpu_sw_netstats *ipt_stats;
1005                 ipt_stats = per_cpu_ptr(dev->tstats, i);
1006                 u64_stats_init(&ipt_stats->syncp);
1007         }
1008
1009         tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
1010         if (!tunnel->dst_cache) {
1011                 free_percpu(dev->tstats);
1012                 return -ENOMEM;
1013         }
1014
1015         err = gro_cells_init(&tunnel->gro_cells, dev);
1016         if (err) {
1017                 free_percpu(tunnel->dst_cache);
1018                 free_percpu(dev->tstats);
1019                 return err;
1020         }
1021
1022         tunnel->dev = dev;
1023         tunnel->net = dev_net(dev);
1024         strcpy(tunnel->parms.name, dev->name);
1025         iph->version            = 4;
1026         iph->ihl                = 5;
1027
1028         return 0;
1029 }
1030 EXPORT_SYMBOL_GPL(ip_tunnel_init);
1031
/* netdev ndo_uninit: unhash the tunnel from the itn lookup table
 * (except the per-netns fallback device, which is handled by the
 * net-exit path) and reset the per-CPU dst cache so its cached route
 * references are dropped.
 */
void ip_tunnel_uninit(struct net_device *dev)
{
        struct ip_tunnel *tunnel = netdev_priv(dev);
        struct net *net = tunnel->net;
        struct ip_tunnel_net *itn;

        itn = net_generic(net, tunnel->ip_tnl_net_id);
        /* fb_tunnel_dev will be unregistered in the net-exit call. */
        if (itn->fb_tunnel_dev != dev)
                ip_tunnel_del(netdev_priv(dev));

        ip_tunnel_dst_reset_all(tunnel);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
1046
1047 /* Do least required initialization, rest of init is done in tunnel_init call */
1048 void ip_tunnel_setup(struct net_device *dev, int net_id)
1049 {
1050         struct ip_tunnel *tunnel = netdev_priv(dev);
1051         tunnel->ip_tnl_net_id = net_id;
1052 }
1053 EXPORT_SYMBOL_GPL(ip_tunnel_setup);
1054
1055 MODULE_LICENSE("GPL");