]> Pileus Git - ~andy/linux/blob - net/ipv4/ip_gre.c
gre: Add Transparent Ethernet Bridging
[~andy/linux] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: t->recursion lock breaks dead loops. It looks
70    like dev->tbusy flag, but I preferred new variable, because
71    the semantics is different. One day, when hard_start_xmit
72    will be multithreaded we will have to use skb->encapsulation.
73
74
75
76    2. Networking dead loops would not kill routers, but would really
77    kill network. IP hop limit plays role of "t->recursion" in this case,
78    if we copy it from packet being encapsulated to upper header.
79    It is very good solution, but it introduces two problems:
80
81    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
82      do not work over tunnels.
83    - traceroute does not work. I planned to relay ICMP from tunnel,
84      so that this problem would be solved and traceroute output
85      would even more informative. This idea appeared to be wrong:
86      only Linux complies to rfc1812 now (yes, guys, Linux is the only
87      true router now :-)), all routers (at least, in neighbourhood of mine)
88      return only 8 bytes of payload. It is the end.
89
90    Hence, if we want that OSPF worked or traceroute said something reasonable,
91    we should search for another solution.
92
93    One of them is to parse packet trying to detect inner encapsulation
94    made by our node. It is difficult or even impossible, especially,
95    taking into account fragmentation. To be short, it is not a solution at all.
96
97    Current solution: The solution was UNEXPECTEDLY SIMPLE.
98    We force DF flag on tunnels with preconfigured hop limit,
99    that is ALL. :-) Well, it does not remove the problem completely,
100    but exponential growth of network traffic is changed to linear
101    (branches, that exceed pmtu are pruned) and tunnel mtu
102    quickly degrades to a value <68, where looping stops.
103    Yes, it is not good if there exists a router in the loop,
104    which does not force DF, even when encapsulating packets have DF set.
105    But it is not our problem! Nobody could accuse us, we made
106    all that we could make. Even if it is your gated who injected
107    fatal route to network, even if it were you who configured
108    fatal static route: you are innocent. :-)
109
110
111
112    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
113    practically identical code. It would be good to glue them
114    together, but it is not very evident, how to make them modular.
115    sit is integral part of IPv6, ipip and gre are naturally modular.
116    We could extract common parts (hash table, ioctl etc)
117    to a separate module (ip_tunnel.c).
118
119    Alexey Kuznetsov.
120  */
121
122 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
123 static int ipgre_tunnel_init(struct net_device *dev);
124 static void ipgre_tunnel_setup(struct net_device *dev);
125 static int ipgre_tunnel_bind_dev(struct net_device *dev);
126
127 /* Fallback tunnel: no source, no destination, no key, no options */
128
129 static int ipgre_fb_tunnel_init(struct net_device *dev);
130
131 #define HASH_SIZE  16
132
133 static int ipgre_net_id;
/* Per-network-namespace state for the GRE module; retrieved with
 * net_generic(net, ipgre_net_id). */
struct ipgre_net {
	/* Tunnel hash tables, indexed as [prio][hash]; the prio/table
	 * layout is described in the "4 hash tables" comment below. */
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	/* Fallback device: receives keyless packets that match no
	 * configured tunnel (see ipgre_tunnel_lookup()). */
	struct net_device *fb_tunnel_dev;
};
139
140 /* Tunnel hash table */
141
142 /*
143    4 hash tables:
144
145    3: (remote,local)
146    2: (remote,*)
147    1: (*,local)
148    0: (*,*)
149
150    We require exact key match i.e. if a key is present in packet
151    it will match only tunnel with the same key; if it is not present,
152    it will match only keyless tunnel.
153
154    All keyless packets, if not matching configured keyless tunnels,
155    will match the fallback tunnel.
156  */
157
/* Fold an address or key into a 4-bit bucket index (low nibble XOR
 * next nibble). */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases into struct ipgre_net.tunnels[]; the index is the "prio"
 * computed in __ipgre_bucket() (bit 0: local set, bit 1: unicast
 * remote set). */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]

/* Guards the tunnel hash tables: readers in the rx/err paths, writers
 * in link/unlink. */
static DEFINE_RWLOCK(ipgre_lock);
166
/*
 * Given src, dst and key, find appropriate for input tunnel.
 *
 * Callers hold ipgre_lock for read (see ipgre_rcv()/ipgre_err()).
 * The four tables are probed in decreasing specificity:
 * (remote,local) -> (remote,*) -> (*,local) -> (*,*).  The key must
 * match exactly in every case.  A tunnel whose device type equals
 * dev_type (ARPHRD_ETHER for ETH_P_TEB payloads, ARPHRD_IPGRE
 * otherwise) wins immediately; the most specific ARPHRD_IPGRE match
 * is remembered in t2 as a second-best answer.  If nothing matches,
 * the namespace's fallback device catches the packet when it is up.
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;
	struct ip_tunnel *t2 = NULL;	/* best ARPHRD_IPGRE fallback */
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;

	/* Exact (remote, local) tunnels. */
	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* (remote, *) tunnels. */
	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* (*, local) tunnels.  A multicast "local" address is stored in
	 * parms.iph.daddr (see __ipgre_bucket()), hence the second test. */
	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		     (local == t->parms.iph.daddr &&
		      ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
				if (t->dev->type == dev_type)
					return t;
				if (t->dev->type == ARPHRD_IPGRE && !t2)
					t2 = t;
			}
		}
	}

	/* Wildcard (*, *) tunnels. */
	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && t->dev->flags & IFF_UP) {
			if (t->dev->type == dev_type)
				return t;
			if (t->dev->type == ARPHRD_IPGRE && !t2)
				t2 = t;
		}
	}

	if (t2)
		return t2;

	/* Last resort: the per-namespace fallback tunnel, if up. */
	if (ign->fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);
	return NULL;
}
232
233 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
234                 struct ip_tunnel_parm *parms)
235 {
236         __be32 remote = parms->iph.daddr;
237         __be32 local = parms->iph.saddr;
238         __be32 key = parms->i_key;
239         unsigned h = HASH(key);
240         int prio = 0;
241
242         if (local)
243                 prio |= 1;
244         if (remote && !ipv4_is_multicast(remote)) {
245                 prio |= 2;
246                 h ^= HASH(remote);
247         }
248
249         return &ign->tunnels[prio][h];
250 }
251
/* Convenience wrapper: bucket for tunnel t's configured parameters. */
static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
257
/* Insert tunnel t at the head of its hash bucket.
 * NOTE(review): t->next is written before ipgre_lock is taken —
 * presumably safe because writers are serialized elsewhere (RTNL?)
 * and readers only need the bucket-head store to be guarded; confirm. */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
267
/* Remove tunnel t from its hash bucket, if present.  The chain walk
 * runs unlocked; only the unlinking store is guarded — presumably
 * writers are mutually serialized elsewhere (RTNL?); confirm. */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
281
282 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
283                                            struct ip_tunnel_parm *parms,
284                                            int type)
285 {
286         __be32 remote = parms->iph.daddr;
287         __be32 local = parms->iph.saddr;
288         __be32 key = parms->i_key;
289         struct ip_tunnel *t, **tp;
290         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
291
292         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
293                 if (local == t->parms.iph.saddr &&
294                     remote == t->parms.iph.daddr &&
295                     key == t->parms.i_key &&
296                     type == t->dev->type)
297                         break;
298
299         return t;
300 }
301
/*
 * Find an ARPHRD_IPGRE tunnel matching @parms; when none exists and
 * @create is non-zero, allocate, register and link a new tunnel
 * device.  Returns the tunnel, or NULL on allocation/registration
 * failure (or when not found and @create is 0).
 * NOTE(review): register_netdevice() implies this runs under RTNL —
 * confirm against callers.
 */
static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	struct ip_tunnel *t, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
	if (t || !create)
		return t;

	/* Use the requested name, or a "gre%d" template for
	 * dev_alloc_name() to number below ("%%" -> literal '%'). */
	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	nt = netdev_priv(dev);
	nt->parms = *parms;
	dev->rtnl_link_ops = &ipgre_link_ops;

	dev->mtu = ipgre_tunnel_bind_dev(dev);

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Reference held while the tunnel sits in the hash table;
	 * dropped in ipgre_tunnel_uninit(). */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
347
/* netdev uninit hook: remove the tunnel from the per-namespace hash
 * table and drop the device reference held while it was linked. */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
356
357
/*
 * ICMP error handler for IPPROTO_GRE: attribute an incoming ICMP error
 * (dest-unreachable / TTL-exceeded) to the tunnel that sent the
 * offending packet and bump its rate-limited error state, which the
 * transmit path consumes via err_count/err_time.
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the returned copy of OUR outer IP header;
	 * the GRE header follows it. */
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* outer IP + basic GRE header */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* Extend grehlen past the key (and the checksum word that
		 * precedes it) so the key extraction below lands right. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	/* daddr/saddr are deliberately swapped: we look up the tunnel
	 * that transmitted the offending packet. */
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl==0 means TTL is inherited from the inner packet, so a
	 * TIME_EXCEEDED (e.g. traceroute) is expected — not an error. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Rate-limited accounting consumed by ipgre_tunnel_xmit(). */
	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}
446
447 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
448 {
449         if (INET_ECN_is_ce(iph->tos)) {
450                 if (skb->protocol == htons(ETH_P_IP)) {
451                         IP_ECN_set_ce(ip_hdr(skb));
452                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
453                         IP6_ECN_set_ce(ipv6_hdr(skb));
454                 }
455         }
456 }
457
458 static inline u8
459 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
460 {
461         u8 inner = 0;
462         if (skb->protocol == htons(ETH_P_IP))
463                 inner = old_iph->tos;
464         else if (skb->protocol == htons(ETH_P_IPV6))
465                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
466         return INET_ECN_encapsulate(tos, inner);
467 }
468
/*
 * GRE receive handler.  Validates the GRE header (version must be 0,
 * routing headers unsupported), verifies the optional checksum / key /
 * sequence fields, looks up the owning tunnel and re-injects the
 * decapsulated packet via netif_rx().  Always returns 0; packets that
 * match no tunnel are answered with ICMP port-unreachable.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* basic GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 bytes covers the basic header plus the largest optional
	 * field combination we parse below. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through: hardware sum didn't
				 * verify, recompute in software */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct net_device_stats *stats = &tunnel->dev->stats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* Not an IPv4 header yet -> WCCPv2 redirect
			 * header present, skip it too. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			stats->multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop on bad checksum, or when the tunnel requires a
		 * checksum the packet doesn't carry. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			stats->rx_crc_errors++;
			stats->rx_errors++;
			goto drop;
		}
		/* Enforce in-order delivery when sequencing is enabled;
		 * the signed diff tolerates sequence-number wraparound. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				stats->rx_fifo_errors++;
				stats->rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				stats->rx_length_errors++;
				stats->rx_errors++;
				goto drop;
			}

			/* Re-fetch after the pull (see warning above). */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		stats->rx_packets++;
		stats->rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	/* No tunnel claimed the packet. */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
605
/*
 * Transmit path: wrap skb in an outer IPv4 + GRE header and hand it to
 * the routed output device via IPTUNNEL_XMIT().  Always returns 0;
 * failures are accounted in dev->stats and the skb is freed.
 */
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->dev->stats;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;                      /* Route to the other host */
	struct net_device *tdev;                        /* Device to other host */
	struct iphdr  *iph;                     /* Our new IP header */
	unsigned int max_headroom;              /* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	/* Break local dead loops — see the "Problems & solutions"
	 * comment at the top of the file. */
	if (tunnel->recursion++) {
		stats->collisions++;
		goto tx_error;
	}

	if (dev->type == ARPHRD_ETHER)
		IPCB(skb)->flags = 0;

	/* With header_ops the caller already built the outer header at
	 * skb->data; otherwise use the tunnel's configured template. */
	if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel: no configured remote, derive the tunnel
		 * endpoint from the inner packet's next hop. */

		if (skb->dst == NULL) {
			stats->tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only an IPv4-compatible IPv6 address carries a
			 * usable IPv4 endpoint (its low 32 bits). */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* Low TOS bit set means "inherit TOS from the inner IPv4
	 * header"; the bit itself never goes on the wire. */
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			stats->tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Route points back at this tunnel device: would loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		stats->collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* Inner packet has DF set but exceeds the path MTU:
		 * tell the sender instead of transmitting. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		/* Record the tighter tunnel MTU on the IPv6 route when it
		 * is host-specific or the tunnel has a fixed remote. */
		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Recent ICMP errors recorded by ipgre_err(): report link
	 * failure to the sender for a while instead of transmitting. */
	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Reallocate when there is no room to push the outer headers,
	 * or when the skb is shared / not writable. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* Configured TTL 0 means "inherit from the inner packet". */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* GRE header: flags then protocol; TEB for bridged frames. */
	((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
	((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
				   htons(ETH_P_TEB) : skb->protocol;

	/* Fill optional fields back-to-front from the end of the GRE
	 * header: seq, then key, then checksum (over GRE hdr+payload). */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
843
844 static int ipgre_tunnel_bind_dev(struct net_device *dev)
845 {
846         struct net_device *tdev = NULL;
847         struct ip_tunnel *tunnel;
848         struct iphdr *iph;
849         int hlen = LL_MAX_HEADER;
850         int mtu = ETH_DATA_LEN;
851         int addend = sizeof(struct iphdr) + 4;
852
853         tunnel = netdev_priv(dev);
854         iph = &tunnel->parms.iph;
855
856         /* Guess output device to choose reasonable mtu and needed_headroom */
857
858         if (iph->daddr) {
859                 struct flowi fl = { .oif = tunnel->parms.link,
860                                     .nl_u = { .ip4_u =
861                                               { .daddr = iph->daddr,
862                                                 .saddr = iph->saddr,
863                                                 .tos = RT_TOS(iph->tos) } },
864                                     .proto = IPPROTO_GRE };
865                 struct rtable *rt;
866                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
867                         tdev = rt->u.dst.dev;
868                         ip_rt_put(rt);
869                 }
870
871                 if (dev->type != ARPHRD_ETHER)
872                         dev->flags |= IFF_POINTOPOINT;
873         }
874
875         if (!tdev && tunnel->parms.link)
876                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
877
878         if (tdev) {
879                 hlen = tdev->hard_header_len + tdev->needed_headroom;
880                 mtu = tdev->mtu;
881         }
882         dev->iflink = tunnel->parms.link;
883
884         /* Precalculate GRE options length */
885         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
886                 if (tunnel->parms.o_flags&GRE_CSUM)
887                         addend += 4;
888                 if (tunnel->parms.o_flags&GRE_KEY)
889                         addend += 4;
890                 if (tunnel->parms.o_flags&GRE_SEQ)
891                         addend += 4;
892         }
893         dev->needed_headroom = addend + hlen;
894         mtu -= dev->hard_header_len - addend;
895
896         if (mtu < 68)
897                 mtu = 68;
898
899         tunnel->hlen = addend;
900
901         return mtu;
902 }
903
/*
 * Legacy ioctl interface (SIOCGETTUNNEL/SIOCADDTUNNEL/SIOCCHGTUNNEL/
 * SIOCDELTUNNEL).  Tunnel parameters are copied to/from user space via
 * ifr->ifr_ifru.ifru_data.  Operations addressed to the fallback device
 * (gre0) act on the tunnel matching the supplied parameters; otherwise
 * they act on the device itself.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			/* Query through gre0: look up the tunnel the user
			 * described; fall back to gre0's own parms below. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4/GRE headers without options, and no
		 * GRE version/routing bits, are supported. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are only meaningful when the corresponding flag
		 * is set; normalize so hash lookups are consistent. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		/* For ADD, create the tunnel if it does not exist yet. */
		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* Parameters already belong to another
				 * tunnel device: refuse the change. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				/* The broadcast/pointopoint nature of the
				 * device cannot change after creation. */
				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Addresses/keys feed the hash: unlink,
				 * update, then relink under the new key. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				/* These fields do not affect hashing and
				 * may be updated in place. */
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parameters to user space. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			/* Deleting via gre0: find the target tunnel, but
			 * never allow deleting the fallback device itself. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1033
1034 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1035 {
1036         struct ip_tunnel *tunnel = netdev_priv(dev);
1037         if (new_mtu < 68 ||
1038             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1039                 return -EINVAL;
1040         dev->mtu = new_mtu;
1041         return 0;
1042 }
1043
1044 /* Nice toy. Unfortunately, useless in real life :-)
1045    It allows to construct virtual multiprotocol broadcast "LAN"
1046    over the Internet, provided multicast routing is tuned.
1047
1048
1049    I have no idea whether this bicycle was invented before me,
1050    so I had to set ARPHRD_IPGRE to a random value.
1051    I have an impression, that Cisco could make something similar,
1052    but this feature is apparently missing in IOS<=11.2(8).
1053
1054    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1055    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1056
1057    ping -t 255 224.66.66.66
1058
1059    If nobody answers, mbone does not work.
1060
1061    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1062    ip addr add 10.66.66.<somewhat>/24 dev Universe
1063    ifconfig Universe up
1064    ifconfig Universe add fe80::<Your_real_addr>/10
1065    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1066    ftp 10.66.66.66
1067    ...
1068    ftp fec0:6666:6666::193.233.7.65
1069    ...
1070
1071  */
1072
1073 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1074                         unsigned short type,
1075                         const void *daddr, const void *saddr, unsigned len)
1076 {
1077         struct ip_tunnel *t = netdev_priv(dev);
1078         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1079         __be16 *p = (__be16*)(iph+1);
1080
1081         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1082         p[0]            = t->parms.o_flags;
1083         p[1]            = htons(type);
1084
1085         /*
1086          *      Set the source hardware address.
1087          */
1088
1089         if (saddr)
1090                 memcpy(&iph->saddr, saddr, 4);
1091
1092         if (daddr) {
1093                 memcpy(&iph->daddr, daddr, 4);
1094                 return t->hlen;
1095         }
1096         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1097                 return t->hlen;
1098
1099         return -t->hlen;
1100 }
1101
1102 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1103 {
1104         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1105         memcpy(haddr, &iph->saddr, 4);
1106         return 4;
1107 }
1108
/* header_ops used by tunnels that need an on-wire pseudo link-layer
 * header: broadcast/multicast GRE and NBMA tunnels without a peer. */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1113
1114 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * Open a broadcast-mode GRE device: join the multicast group of the
 * tunnel destination on the physical device the route resolves to.
 * Called under RTNL.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		/* Route towards the multicast destination to find the
		 * underlying device on which to join the group. */
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		/* Remember the joined link so ipgre_close() can leave
		 * the group again. */
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1138
1139 static int ipgre_close(struct net_device *dev)
1140 {
1141         struct ip_tunnel *t = netdev_priv(dev);
1142         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1143                 struct in_device *in_dev;
1144                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1145                 if (in_dev) {
1146                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1147                         in_dev_put(in_dev);
1148                 }
1149         }
1150         return 0;
1151 }
1152
1153 #endif
1154
/*
 * netdev setup callback for plain (layer-3) GRE tunnels: install the
 * driver entry points and sane defaults for a freshly allocated device.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->init               = ipgre_tunnel_init;
	dev->uninit             = ipgre_tunnel_uninit;
	dev->destructor         = free_netdev;
	dev->hard_start_xmit    = ipgre_tunnel_xmit;
	dev->do_ioctl           = ipgre_tunnel_ioctl;
	dev->change_mtu         = ipgre_tunnel_change_mtu;

	dev->type               = ARPHRD_IPGRE;
	/* Defaults assuming IP header + 4-byte basic GRE header; refined
	 * later by ipgre_tunnel_bind_dev() once options are known. */
	dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags              = IFF_NOARP;
	dev->iflink             = 0;
	dev->addr_len           = 4;
	/* Tunnels are not allowed to move between network namespaces. */
	dev->features           |= NETIF_F_NETNS_LOCAL;
}
1172
1173 static int ipgre_tunnel_init(struct net_device *dev)
1174 {
1175         struct ip_tunnel *tunnel;
1176         struct iphdr *iph;
1177
1178         tunnel = netdev_priv(dev);
1179         iph = &tunnel->parms.iph;
1180
1181         tunnel->dev = dev;
1182         strcpy(tunnel->parms.name, dev->name);
1183
1184         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1185         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1186
1187         if (iph->daddr) {
1188 #ifdef CONFIG_NET_IPGRE_BROADCAST
1189                 if (ipv4_is_multicast(iph->daddr)) {
1190                         if (!iph->saddr)
1191                                 return -EINVAL;
1192                         dev->flags = IFF_BROADCAST;
1193                         dev->header_ops = &ipgre_header_ops;
1194                         dev->open = ipgre_open;
1195                         dev->stop = ipgre_close;
1196                 }
1197 #endif
1198         } else
1199                 dev->header_ops = &ipgre_header_ops;
1200
1201         return 0;
1202 }
1203
1204 static int ipgre_fb_tunnel_init(struct net_device *dev)
1205 {
1206         struct ip_tunnel *tunnel = netdev_priv(dev);
1207         struct iphdr *iph = &tunnel->parms.iph;
1208         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1209
1210         tunnel->dev = dev;
1211         strcpy(tunnel->parms.name, dev->name);
1212
1213         iph->version            = 4;
1214         iph->protocol           = IPPROTO_GRE;
1215         iph->ihl                = 5;
1216         tunnel->hlen            = sizeof(struct iphdr) + 4;
1217
1218         dev_hold(dev);
1219         ign->tunnels_wc[0]      = tunnel;
1220         return 0;
1221 }
1222
1223
/* IPPROTO_GRE protocol hooks: receive path and ICMP error handling. */
static struct net_protocol ipgre_protocol = {
	.handler        =       ipgre_rcv,
	.err_handler    =       ipgre_err,
	.netns_ok       =       1,
};
1229
1230 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1231 {
1232         int prio;
1233
1234         for (prio = 0; prio < 4; prio++) {
1235                 int h;
1236                 for (h = 0; h < HASH_SIZE; h++) {
1237                         struct ip_tunnel *t;
1238                         while ((t = ign->tunnels[prio][h]) != NULL)
1239                                 unregister_netdevice(t->dev);
1240                 }
1241         }
1242 }
1243
/*
 * Per-namespace init: allocate the ipgre_net state and create the
 * fallback "gre0" device.  Uses the standard goto-unwind pattern so
 * each failure point releases exactly what was acquired before it.
 */
static int ipgre_init_net(struct net *net)
{
	int err;
	struct ipgre_net *ign;

	err = -ENOMEM;
	ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
	if (ign == NULL)
		goto err_alloc;

	err = net_assign_generic(net, ipgre_net_id, ign);
	if (err < 0)
		goto err_assign;

	ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
					   ipgre_tunnel_setup);
	if (!ign->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}

	/* The fallback device gets its own init routine and is tied to
	 * this namespace before registration. */
	ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
	dev_net_set(ign->fb_tunnel_dev, net);
	ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

	if ((err = register_netdev(ign->fb_tunnel_dev)))
		goto err_reg_dev;

	return 0;

err_reg_dev:
	free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
	/* nothing */
err_assign:
	kfree(ign);
err_alloc:
	return err;
}
1283
1284 static void ipgre_exit_net(struct net *net)
1285 {
1286         struct ipgre_net *ign;
1287
1288         ign = net_generic(net, ipgre_net_id);
1289         rtnl_lock();
1290         ipgre_destroy_tunnels(ign);
1291         rtnl_unlock();
1292         kfree(ign);
1293 }
1294
/* Per-network-namespace lifecycle hooks for the GRE driver. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1299
1300 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1301 {
1302         __be16 flags;
1303
1304         if (!data)
1305                 return 0;
1306
1307         flags = 0;
1308         if (data[IFLA_GRE_IFLAGS])
1309                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1310         if (data[IFLA_GRE_OFLAGS])
1311                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1312         if (flags & (GRE_VERSION|GRE_ROUTING))
1313                 return -EINVAL;
1314
1315         return 0;
1316 }
1317
1318 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1319 {
1320         __be32 daddr;
1321
1322         if (tb[IFLA_ADDRESS]) {
1323                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1324                         return -EINVAL;
1325                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1326                         return -EADDRNOTAVAIL;
1327         }
1328
1329         if (!data)
1330                 goto out;
1331
1332         if (data[IFLA_GRE_REMOTE]) {
1333                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1334                 if (!daddr)
1335                         return -EINVAL;
1336         }
1337
1338 out:
1339         return ipgre_tunnel_validate(tb, data);
1340 }
1341
1342 static void ipgre_netlink_parms(struct nlattr *data[],
1343                                 struct ip_tunnel_parm *parms)
1344 {
1345         memset(parms, 0, sizeof(parms));
1346
1347         parms->iph.protocol = IPPROTO_GRE;
1348
1349         if (!data)
1350                 return;
1351
1352         if (data[IFLA_GRE_LINK])
1353                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1354
1355         if (data[IFLA_GRE_IFLAGS])
1356                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1357
1358         if (data[IFLA_GRE_OFLAGS])
1359                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1360
1361         if (data[IFLA_GRE_IKEY])
1362                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1363
1364         if (data[IFLA_GRE_OKEY])
1365                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1366
1367         if (data[IFLA_GRE_LOCAL])
1368                 memcpy(&parms->iph.saddr, nla_data(data[IFLA_GRE_LOCAL]), 4);
1369
1370         if (data[IFLA_GRE_REMOTE])
1371                 memcpy(&parms->iph.daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1372
1373         if (data[IFLA_GRE_TTL])
1374                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1375
1376         if (data[IFLA_GRE_TOS])
1377                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1378
1379         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1380                 parms->iph.frag_off = htons(IP_DF);
1381 }
1382
1383 static int ipgre_tap_init(struct net_device *dev)
1384 {
1385         struct ip_tunnel *tunnel;
1386
1387         tunnel = netdev_priv(dev);
1388
1389         tunnel->dev = dev;
1390         strcpy(tunnel->parms.name, dev->name);
1391
1392         ipgre_tunnel_bind_dev(dev);
1393
1394         return 0;
1395 }
1396
/*
 * netdev setup callback for "gretap": start from Ethernet defaults
 * (ether_setup must run first so the assignments below override its
 * values), then install the GRE driver entry points.
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->init               = ipgre_tap_init;
	dev->uninit             = ipgre_tunnel_uninit;
	dev->destructor         = free_netdev;
	dev->hard_start_xmit    = ipgre_tunnel_xmit;
	dev->change_mtu         = ipgre_tunnel_change_mtu;

	dev->iflink             = 0;
	/* Tunnels are not allowed to move between network namespaces. */
	dev->features           |= NETIF_F_NETNS_LOCAL;
}
1411
1412 static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1413                          struct nlattr *data[])
1414 {
1415         struct ip_tunnel *nt;
1416         struct net *net = dev_net(dev);
1417         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1418         int mtu;
1419         int err;
1420
1421         nt = netdev_priv(dev);
1422         ipgre_netlink_parms(data, &nt->parms);
1423
1424         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1425                 return -EEXIST;
1426
1427         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1428                 random_ether_addr(dev->dev_addr);
1429
1430         mtu = ipgre_tunnel_bind_dev(dev);
1431         if (!tb[IFLA_MTU])
1432                 dev->mtu = mtu;
1433
1434         err = register_netdevice(dev);
1435         if (err)
1436                 goto out;
1437
1438         dev_hold(dev);
1439         ipgre_tunnel_link(ign, nt);
1440
1441 out:
1442         return err;
1443 }
1444
/*
 * rtnl_link_ops->changelink: update an existing tunnel's parameters.
 * Fields that feed the hash lookup (addresses, i_key) require the
 * tunnel to be unlinked and relinked; the rest are updated in place.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	/* The fallback device's parameters are fixed. */
	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* The requested parameters already belong to another
		 * tunnel: refuse rather than create a duplicate. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		unsigned nflags = 0;

		t = nt;

		/* The broadcast/pointopoint nature of the device cannot
		 * change after creation. */
		if (ipv4_is_multicast(p.iph.daddr))
			nflags = IFF_BROADCAST;
		else if (p.iph.daddr)
			nflags = IFF_POINTOPOINT;

		if ((dev->flags ^ nflags) &
		    (IFF_POINTOPOINT | IFF_BROADCAST))
			return -EINVAL;

		/* Rehash under the new addresses/input key. */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		memcpy(dev->dev_addr, &p.iph.saddr, 4);
		memcpy(dev->broadcast, &p.iph.daddr, 4);
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* Non-hashed fields can be updated unconditionally. */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		/* A new underlying link may change headroom and MTU. */
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1504
1505 static size_t ipgre_get_size(const struct net_device *dev)
1506 {
1507         return
1508                 /* IFLA_GRE_LINK */
1509                 nla_total_size(4) +
1510                 /* IFLA_GRE_IFLAGS */
1511                 nla_total_size(2) +
1512                 /* IFLA_GRE_OFLAGS */
1513                 nla_total_size(2) +
1514                 /* IFLA_GRE_IKEY */
1515                 nla_total_size(4) +
1516                 /* IFLA_GRE_OKEY */
1517                 nla_total_size(4) +
1518                 /* IFLA_GRE_LOCAL */
1519                 nla_total_size(4) +
1520                 /* IFLA_GRE_REMOTE */
1521                 nla_total_size(4) +
1522                 /* IFLA_GRE_TTL */
1523                 nla_total_size(1) +
1524                 /* IFLA_GRE_TOS */
1525                 nla_total_size(1) +
1526                 /* IFLA_GRE_PMTUDISC */
1527                 nla_total_size(1) +
1528                 0;
1529 }
1530
1531 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1532 {
1533         struct ip_tunnel *t = netdev_priv(dev);
1534         struct ip_tunnel_parm *p = &t->parms;
1535
1536         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1537         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1538         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1539         NLA_PUT_BE32(skb, IFLA_GRE_IFLAGS, p->i_flags);
1540         NLA_PUT_BE32(skb, IFLA_GRE_OFLAGS, p->o_flags);
1541         NLA_PUT(skb, IFLA_GRE_LOCAL, 4, &p->iph.saddr);
1542         NLA_PUT(skb, IFLA_GRE_REMOTE, 4, &p->iph.daddr);
1543         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1544         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1545         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1546
1547         return 0;
1548
1549 nla_put_failure:
1550         return -EMSGSIZE;
1551 }
1552
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]         = { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_IKEY]         = { .type = NLA_U32 },
	[IFLA_GRE_OKEY]         = { .type = NLA_U32 },
	/* Endpoint addresses are raw 4-byte IPv4 addresses. */
	[IFLA_GRE_LOCAL]        = { .len = 4 },
	[IFLA_GRE_REMOTE]       = { .len = 4 },
	[IFLA_GRE_TTL]          = { .type = NLA_U8 },
	[IFLA_GRE_TOS]          = { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1565
/* rtnl_link_ops for layer-3 "gre" tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind           = "gre",
	.maxtype        = IFLA_GRE_MAX,
	.policy         = ipgre_policy,
	.priv_size      = sizeof(struct ip_tunnel),
	.setup          = ipgre_tunnel_setup,
	.validate       = ipgre_tunnel_validate,
	.newlink        = ipgre_newlink,
	.changelink     = ipgre_changelink,
	.get_size       = ipgre_get_size,
	.fill_info      = ipgre_fill_info,
};
1578
/* rtnl_link_ops for "gretap" devices: Ethernet frames over GRE
 * (Transparent Ethernet Bridging); shares most hooks with "gre". */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind           = "gretap",
	.maxtype        = IFLA_GRE_MAX,
	.policy         = ipgre_policy,
	.priv_size      = sizeof(struct ip_tunnel),
	.setup          = ipgre_tap_setup,
	.validate       = ipgre_tap_validate,
	.newlink        = ipgre_newlink,
	.changelink     = ipgre_changelink,
	.get_size       = ipgre_get_size,
	.fill_info      = ipgre_fill_info,
};
1591
1592 /*
1593  *      And now the modules code and kernel interface.
1594  */
1595
/*
 * Module init: register the IPPROTO_GRE handler, the per-namespace
 * operations, and both rtnl link ops.  Failures unwind in reverse
 * registration order via the goto chain.
 */
static int __init ipgre_init(void)
{
	int err;

	printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

	if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
		/* Another GRE handler is already installed. */
		printk(KERN_INFO "ipgre init: can't add protocol\n");
		return -EAGAIN;
	}

	err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
	if (err < 0)
		goto gen_device_failed;

	err = rtnl_link_register(&ipgre_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	err = rtnl_link_register(&ipgre_tap_ops);
	if (err < 0)
		goto tap_ops_failed;

out:
	return err;

tap_ops_failed:
	rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
gen_device_failed:
	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
	goto out;
}
1630
/* Module exit: tear everything down in reverse registration order. */
static void __exit ipgre_fini(void)
{
	rtnl_link_unregister(&ipgre_tap_ops);
	rtnl_link_unregister(&ipgre_link_ops);
	unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
		printk(KERN_INFO "ipgre close: can't remove protocol\n");
}
1639
/* Module entry points and aliases so "ip link add type gre/gretap"
 * can auto-load this module. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS("rtnl-link-gre");
MODULE_ALIAS("rtnl-link-gretap");