]> Pileus Git - ~andy/linux/blob - net/ipv6/route.c
7946b53692da1ae28d5c882c8b06e913b684cd5e
[~andy/linux] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  Set to 3 to get tracing: with
 * RT6_DEBUG >= 3 both RDBG() and RT6_TRACE() expand to printk() calls,
 * otherwise they expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
/* x is pasted directly after printk, so it must be a parenthesised argument list */
#define RDBG(x) printk x
/* printf-style trace message emitted at KERN_DEBUG level */
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
74
/*
 * Forward declarations of file-local functions.  Several of them are
 * referenced by the dst_ops tables and route templates below.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route-information helpers, only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           struct in6_addr *prefix, int prefixlen,
                                           struct in6_addr *gwaddr, int ifindex);
#endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
130 static struct dst_ops ip6_dst_ops_template = {
131         .family                 =       AF_INET6,
132         .protocol               =       cpu_to_be16(ETH_P_IPV6),
133         .gc                     =       ip6_dst_gc,
134         .gc_thresh              =       1024,
135         .check                  =       ip6_dst_check,
136         .default_advmss         =       ip6_default_advmss,
137         .default_mtu            =       ip6_default_mtu,
138         .cow_metrics            =       ipv6_cow_metrics,
139         .destroy                =       ip6_dst_destroy,
140         .ifdown                 =       ip6_dst_ifdown,
141         .negative_advice        =       ip6_negative_advice,
142         .link_failure           =       ip6_link_failure,
143         .update_pmtu            =       ip6_rt_update_pmtu,
144         .local_out              =       __ip6_local_out,
145 };
146
/* default_mtu hook of ip6_dst_blackhole_ops: always reports 0; @dst is unused. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        return 0;
}
151
/* update_pmtu hook of ip6_dst_blackhole_ops: does nothing with @dst or @mtu. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
155
156 static struct dst_ops ip6_dst_blackhole_ops = {
157         .family                 =       AF_INET6,
158         .protocol               =       cpu_to_be16(ETH_P_IPV6),
159         .destroy                =       ip6_dst_destroy,
160         .check                  =       ip6_dst_check,
161         .default_mtu            =       ip6_blackhole_default_mtu,
162         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
163 };
164
/*
 * Metrics template: only the hop-limit slot (index RTAX_HOPLIMIT - 1)
 * is set, to 255; every other entry is zero-initialised.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
168
169 static struct rt6_info ip6_null_entry_template = {
170         .dst = {
171                 .__refcnt       = ATOMIC_INIT(1),
172                 .__use          = 1,
173                 .obsolete       = -1,
174                 .error          = -ENETUNREACH,
175                 .input          = ip6_pkt_discard,
176                 .output         = ip6_pkt_discard_out,
177         },
178         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
179         .rt6i_protocol  = RTPROT_KERNEL,
180         .rt6i_metric    = ~(u32) 0,
181         .rt6i_ref       = ATOMIC_INIT(1),
182 };
183
184 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
185
186 static int ip6_pkt_prohibit(struct sk_buff *skb);
187 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
188
189 static struct rt6_info ip6_prohibit_entry_template = {
190         .dst = {
191                 .__refcnt       = ATOMIC_INIT(1),
192                 .__use          = 1,
193                 .obsolete       = -1,
194                 .error          = -EACCES,
195                 .input          = ip6_pkt_prohibit,
196                 .output         = ip6_pkt_prohibit_out,
197         },
198         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
199         .rt6i_protocol  = RTPROT_KERNEL,
200         .rt6i_metric    = ~(u32) 0,
201         .rt6i_ref       = ATOMIC_INIT(1),
202 };
203
204 static struct rt6_info ip6_blk_hole_entry_template = {
205         .dst = {
206                 .__refcnt       = ATOMIC_INIT(1),
207                 .__use          = 1,
208                 .obsolete       = -1,
209                 .error          = -EINVAL,
210                 .input          = dst_discard,
211                 .output         = dst_discard,
212         },
213         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
214         .rt6i_protocol  = RTPROT_KERNEL,
215         .rt6i_metric    = ~(u32) 0,
216         .rt6i_ref       = ATOMIC_INIT(1),
217 };
218
219 #endif
220
/*
 * Allocate a route entry from @ops via dst_alloc(), passing 0 as its
 * second argument, and hand it back typed as struct rt6_info.
 * The result is whatever dst_alloc() returned.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
        void *entry = dst_alloc(ops, 0);

        return (struct rt6_info *)entry;
}
226
227 static void ip6_dst_destroy(struct dst_entry *dst)
228 {
229         struct rt6_info *rt = (struct rt6_info *)dst;
230         struct inet6_dev *idev = rt->rt6i_idev;
231         struct inet_peer *peer = rt->rt6i_peer;
232
233         if (idev != NULL) {
234                 rt->rt6i_idev = NULL;
235                 in6_dev_put(idev);
236         }
237         if (peer) {
238                 rt->rt6i_peer = NULL;
239                 inet_putpeer(peer);
240         }
241 }
242
243 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
244
245 static u32 rt6_peer_genid(void)
246 {
247         return atomic_read(&__rt6_peer_genid);
248 }
249
250 void rt6_bind_peer(struct rt6_info *rt, int create)
251 {
252         struct inet_peer *peer;
253
254         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
255         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
256                 inet_putpeer(peer);
257         else
258                 rt->rt6i_peer_genid = rt6_peer_genid();
259 }
260
261 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
262                            int how)
263 {
264         struct rt6_info *rt = (struct rt6_info *)dst;
265         struct inet6_dev *idev = rt->rt6i_idev;
266         struct net_device *loopback_dev =
267                 dev_net(dev)->loopback_dev;
268
269         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
270                 struct inet6_dev *loopback_idev =
271                         in6_dev_get(loopback_dev);
272                 if (loopback_idev != NULL) {
273                         rt->rt6i_idev = loopback_idev;
274                         in6_dev_put(idev);
275                 }
276         }
277 }
278
279 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
280 {
281         return (rt->rt6i_flags & RTF_EXPIRES) &&
282                 time_after(jiffies, rt->rt6i_expires);
283 }
284
285 static inline int rt6_need_strict(struct in6_addr *daddr)
286 {
287         return ipv6_addr_type(daddr) &
288                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
289 }
290
291 /*
292  *      Route lookup. Any table->tb6_lock is implied.
293  */
294
295 static inline struct rt6_info *rt6_device_match(struct net *net,
296                                                     struct rt6_info *rt,
297                                                     struct in6_addr *saddr,
298                                                     int oif,
299                                                     int flags)
300 {
301         struct rt6_info *local = NULL;
302         struct rt6_info *sprt;
303
304         if (!oif && ipv6_addr_any(saddr))
305                 goto out;
306
307         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
308                 struct net_device *dev = sprt->rt6i_dev;
309
310                 if (oif) {
311                         if (dev->ifindex == oif)
312                                 return sprt;
313                         if (dev->flags & IFF_LOOPBACK) {
314                                 if (sprt->rt6i_idev == NULL ||
315                                     sprt->rt6i_idev->dev->ifindex != oif) {
316                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
317                                                 continue;
318                                         if (local && (!oif ||
319                                                       local->rt6i_idev->dev->ifindex == oif))
320                                                 continue;
321                                 }
322                                 local = sprt;
323                         }
324                 } else {
325                         if (ipv6_chk_addr(net, saddr, dev,
326                                           flags & RT6_LOOKUP_F_IFACE))
327                                 return sprt;
328                 }
329         }
330
331         if (oif) {
332                 if (local)
333                         return local;
334
335                 if (flags & RT6_LOOKUP_F_IFACE)
336                         return net->ipv6.ip6_null_entry;
337         }
338 out:
339         return rt;
340 }
341
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing.
 *
 * If the next hop of @rt has no NUD_VALID neighbour state and more than
 * rtr_probe_interval jiffies have passed since neigh->updated, stamp
 * neigh->updated and send a neighbour solicitation to the solicited-node
 * multicast address of the neighbour's primary key.  A NULL @rt or a
 * route without a next hop is ignored.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one
 * per minute; the interval check above is what enforces the limit here.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        struct in6_addr mcaddr;
        struct in6_addr *target;
        int probe_due;

        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;

        /* re-check the state and claim the probe slot under the lock */
        read_lock_bh(&neigh->lock);
        probe_due = !(neigh->nud_state & NUD_VALID) &&
                    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
        if (probe_due)
                neigh->updated = jiffies;
        read_unlock_bh(&neigh->lock);

        if (!probe_due)
                return;

        /* the solicitation itself is sent after the lock is dropped */
        target = (struct in6_addr *)&neigh->primary_key;
        addrconf_addr_solict_mult(target, &mcaddr);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
376
377 /*
378  * Default Router Selection (RFC 2461 6.3.6)
379  */
380 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
381 {
382         struct net_device *dev = rt->rt6i_dev;
383         if (!oif || dev->ifindex == oif)
384                 return 2;
385         if ((dev->flags & IFF_LOOPBACK) &&
386             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
387                 return 1;
388         return 0;
389 }
390
391 static inline int rt6_check_neigh(struct rt6_info *rt)
392 {
393         struct neighbour *neigh = rt->rt6i_nexthop;
394         int m;
395         if (rt->rt6i_flags & RTF_NONEXTHOP ||
396             !(rt->rt6i_flags & RTF_GATEWAY))
397                 m = 1;
398         else if (neigh) {
399                 read_lock_bh(&neigh->lock);
400                 if (neigh->nud_state & NUD_VALID)
401                         m = 2;
402 #ifdef CONFIG_IPV6_ROUTER_PREF
403                 else if (neigh->nud_state & NUD_FAILED)
404                         m = 0;
405 #endif
406                 else
407                         m = 1;
408                 read_unlock_bh(&neigh->lock);
409         } else
410                 m = 0;
411         return m;
412 }
413
414 static int rt6_score_route(struct rt6_info *rt, int oif,
415                            int strict)
416 {
417         int m, n;
418
419         m = rt6_check_dev(rt, oif);
420         if (!m && (strict & RT6_LOOKUP_F_IFACE))
421                 return -1;
422 #ifdef CONFIG_IPV6_ROUTER_PREF
423         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
424 #endif
425         n = rt6_check_neigh(rt);
426         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
427                 return -1;
428         return m;
429 }
430
431 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
432                                    int *mpri, struct rt6_info *match)
433 {
434         int m;
435
436         if (rt6_check_expired(rt))
437                 goto out;
438
439         m = rt6_score_route(rt, oif, strict);
440         if (m < 0)
441                 goto out;
442
443         if (m > *mpri) {
444                 if (strict & RT6_LOOKUP_F_REACHABLE)
445                         rt6_probe(match);
446                 *mpri = m;
447                 match = rt;
448         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
449                 rt6_probe(rt);
450         }
451
452 out:
453         return match;
454 }
455
456 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
457                                      struct rt6_info *rr_head,
458                                      u32 metric, int oif, int strict)
459 {
460         struct rt6_info *rt, *match;
461         int mpri = -1;
462
463         match = NULL;
464         for (rt = rr_head; rt && rt->rt6i_metric == metric;
465              rt = rt->dst.rt6_next)
466                 match = find_match(rt, oif, strict, &mpri, match);
467         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
468              rt = rt->dst.rt6_next)
469                 match = find_match(rt, oif, strict, &mpri, match);
470
471         return match;
472 }
473
474 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
475 {
476         struct rt6_info *match, *rt0;
477         struct net *net;
478
479         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
480                   __func__, fn->leaf, oif);
481
482         rt0 = fn->rr_ptr;
483         if (!rt0)
484                 fn->rr_ptr = rt0 = fn->leaf;
485
486         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
487
488         if (!match &&
489             (strict & RT6_LOOKUP_F_REACHABLE)) {
490                 struct rt6_info *next = rt0->dst.rt6_next;
491
492                 /* no entries matched; do round-robin */
493                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
494                         next = fn->leaf;
495
496                 if (next != rt0)
497                         fn->rr_ptr = next;
498         }
499
500         RT6_TRACE("%s() => %p\n",
501                   __func__, match);
502
503         net = dev_net(rt0->rt6i_dev);
504         return match ? match : net->ipv6.ip6_null_entry;
505 }
506
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address to associate with the route
 *
 * A zero lifetime deletes a matching existing route.  A non-zero
 * lifetime adds the route if missing, or refreshes the preference bits
 * of an existing one.  The expiry is then set from the lifetime, or
 * cleared when addrconf_finite_timeout() reports the lifetime as not
 * finite.
 *
 * Returns 0 on success (including "nothing to do"), -EINVAL when the
 * option is too short, its length/prefix_len fields are inconsistent,
 * or the preference is ICMPV6_ROUTER_PREF_INVALID.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr masked_prefix;
        struct in6_addr *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3 || rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        if (rinfo->length == 3) {
                /* use the prefix in place */
                prefix = (struct in6_addr *)rinfo->prefix;
        } else {
                /* this function is safe */
                ipv6_addr_prefix(&masked_prefix,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &masked_prefix;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* lifetime 0 withdraws an existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt) {
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        }

        if (!rt)
                return 0;

        if (addrconf_finite_timeout(lifetime)) {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        } else {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        }
        dst_release(&rt->dst);

        return 0;
}
#endif
580
/*
 * BACKTRACK(__net, saddr) - continue a failed lookup further up the tree.
 *
 * Must be expanded inside a function that has local variables "rt"
 * (current result) and "fn" (current fib6 node) and labels "restart"
 * and "out".
 *
 * Does nothing unless rt is the namespace's null entry.  Otherwise it
 * walks towards the root: at a node flagged RTN_TL_ROOT it jumps to
 * "out".  If the parent has a source subtree other than fn itself,
 * that subtree is searched for saddr; otherwise the walk moves to the
 * parent.  As soon as the node reached carries RTN_RTINFO it jumps to
 * "restart" so the caller retries selection there.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
598
/*
 * Lookup callback passed to fib6_rule_lookup() by rt6_lookup().
 *
 * Finds the node for fl6_dst/fl6_src in @table, narrows its leaf list
 * with rt6_device_match() and backtracks towards the root while the
 * result is the null entry.  The whole search runs under the table's
 * read lock.  The route returned has been passed through dst_use().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
        /* "restart" and "out" are jump targets used by BACKTRACK() */
restart:
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
        BACKTRACK(net, &fl->fl6_src);
out:
        /* NOTE(review): dst_use() presumably takes a reference and stamps last use - confirm in net/dst.h */
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
618
619 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
620                             const struct in6_addr *saddr, int oif, int strict)
621 {
622         struct flowi fl = {
623                 .oif = oif,
624                 .fl6_dst = *daddr,
625         };
626         struct dst_entry *dst;
627         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
628
629         if (saddr) {
630                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
631                 flags |= RT6_LOOKUP_F_HAS_SADDR;
632         }
633
634         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
635         if (dst->error == 0)
636                 return (struct rt6_info *) dst;
637
638         dst_release(dst);
639
640         return NULL;
641 }
642
643 EXPORT_SYMBOL(rt6_lookup);
644
645 /* ip6_ins_rt is called with FREE table->tb6_lock.
646    It takes new route entry, the addition fails by any reason the
647    route is freed. In any case, if caller does not hold it, it may
648    be destroyed.
649  */
650
651 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
652 {
653         int err;
654         struct fib6_table *table;
655
656         table = rt->rt6i_table;
657         write_lock_bh(&table->tb6_lock);
658         err = fib6_add(&table->tb6_root, rt, info);
659         write_unlock_bh(&table->tb6_lock);
660
661         return err;
662 }
663
/*
 * Insert @rt into its table (rt->rt6i_table).  The nl_info handed to
 * __ip6_ins_rt() carries only the network namespace of the route's
 * device; all its other fields are zero.  Returns the result of
 * __ip6_ins_rt().
 */
int ip6_ins_rt(struct rt6_info *rt)
{
        struct nl_info info = {
                .nl_net = dev_net(rt->rt6i_dev),
        };
        return __ip6_ins_rt(rt, &info);
}
671
672 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
673                                       struct in6_addr *saddr)
674 {
675         struct rt6_info *rt;
676
677         /*
678          *      Clone the route.
679          */
680
681         rt = ip6_rt_copy(ort);
682
683         if (rt) {
684                 struct neighbour *neigh;
685                 int attempts = !in_softirq();
686
687                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
688                         if (rt->rt6i_dst.plen != 128 &&
689                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
690                                 rt->rt6i_flags |= RTF_ANYCAST;
691                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
692                 }
693
694                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695                 rt->rt6i_dst.plen = 128;
696                 rt->rt6i_flags |= RTF_CACHE;
697                 rt->dst.flags |= DST_HOST;
698
699 #ifdef CONFIG_IPV6_SUBTREES
700                 if (rt->rt6i_src.plen && saddr) {
701                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
702                         rt->rt6i_src.plen = 128;
703                 }
704 #endif
705
706         retry:
707                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
708                 if (IS_ERR(neigh)) {
709                         struct net *net = dev_net(rt->rt6i_dev);
710                         int saved_rt_min_interval =
711                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
712                         int saved_rt_elasticity =
713                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
714
715                         if (attempts-- > 0) {
716                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
717                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
718
719                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
720
721                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
722                                         saved_rt_elasticity;
723                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
724                                         saved_rt_min_interval;
725                                 goto retry;
726                         }
727
728                         if (net_ratelimit())
729                                 printk(KERN_WARNING
730                                        "ipv6: Neighbour table overflow.\n");
731                         dst_free(&rt->dst);
732                         return NULL;
733                 }
734                 rt->rt6i_nexthop = neigh;
735
736         }
737
738         return rt;
739 }
740
741 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
742 {
743         struct rt6_info *rt = ip6_rt_copy(ort);
744         if (rt) {
745                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
746                 rt->rt6i_dst.plen = 128;
747                 rt->rt6i_flags |= RTF_CACHE;
748                 rt->dst.flags |= DST_HOST;
749                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
750         }
751         return rt;
752 }
753
/*
 * Common lookup worker behind ip6_pol_route_input() and
 * ip6_pol_route_output(), which pass fl->iif and fl->oif as @oif.
 *
 * Selects a route for fl6_dst/fl6_src in @table with rt6_select().  If
 * the result is the null entry or already an RTF_CACHE route it is
 * returned as is.  Otherwise a /128 copy is created (rt6_alloc_cow()
 * when the route has no next hop and lacks RTF_NONEXTHOP, else
 * rt6_alloc_clone()) and inserted with ip6_ins_rt().
 *
 * The returned route has had dst_hold() called on it, and its lastuse
 * and __use fields are updated before returning.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi *fl, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        /* upper bound on copy+insert rounds before giving up */
        int attempts = 3;
        int err;
        /* with forwarding disabled, the first pass insists on a reachable next hop */
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);

        /* "restart" and "out" are also jump targets of BACKTRACK() */
restart:
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl->fl6_src);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* pin the selected route while the table lock is dropped for the copy */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
        else
                nrt = rt6_alloc_clone(rt, &fl->fl6_dst);

        dst_release(&rt->dst);
        /* fall back to the null entry when the copy could not be made */
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /*
         * Reached with the table lock still held.  If the reachability
         * requirement is still set, drop it and redo the lookup once.
         */
        if (reachable) {
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
821
822 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
823                                             struct flowi *fl, int flags)
824 {
825         return ip6_pol_route(net, table, fl->iif, fl, flags);
826 }
827
828 void ip6_route_input(struct sk_buff *skb)
829 {
830         struct ipv6hdr *iph = ipv6_hdr(skb);
831         struct net *net = dev_net(skb->dev);
832         int flags = RT6_LOOKUP_F_HAS_SADDR;
833         struct flowi fl = {
834                 .iif = skb->dev->ifindex,
835                 .fl6_dst = iph->daddr,
836                 .fl6_src = iph->saddr,
837                 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
838                 .mark = skb->mark,
839                 .proto = iph->nexthdr,
840         };
841
842         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
843                 flags |= RT6_LOOKUP_F_IFACE;
844
845         skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
846 }
847
848 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
849                                              struct flowi *fl, int flags)
850 {
851         return ip6_pol_route(net, table, fl->oif, fl, flags);
852 }
853
854 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
855                                     struct flowi *fl)
856 {
857         int flags = 0;
858
859         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
860                 flags |= RT6_LOOKUP_F_IFACE;
861
862         if (!ipv6_addr_any(&fl->fl6_src))
863                 flags |= RT6_LOOKUP_F_HAS_SADDR;
864         else if (sk)
865                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
866
867         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
868 }
869
870 EXPORT_SYMBOL(ip6_route_output);
871
872 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
873 {
874         struct rt6_info *ort = (struct rt6_info *) *dstp;
875         struct rt6_info *rt = (struct rt6_info *)
876                 dst_alloc(&ip6_dst_blackhole_ops, 1);
877         struct dst_entry *new = NULL;
878
879         if (rt) {
880                 new = &rt->dst;
881
882                 new->__use = 1;
883                 new->input = dst_discard;
884                 new->output = dst_discard;
885
886                 dst_copy_metrics(new, &ort->dst);
887                 new->dev = ort->dst.dev;
888                 if (new->dev)
889                         dev_hold(new->dev);
890                 rt->rt6i_idev = ort->rt6i_idev;
891                 if (rt->rt6i_idev)
892                         in6_dev_hold(rt->rt6i_idev);
893                 rt->rt6i_expires = 0;
894
895                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
896                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
897                 rt->rt6i_metric = 0;
898
899                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
900 #ifdef CONFIG_IPV6_SUBTREES
901                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
902 #endif
903
904                 dst_free(new);
905         }
906
907         dst_release(*dstp);
908         *dstp = new;
909         return new ? 0 : -ENOMEM;
910 }
911 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
912
913 /*
914  *      Destination cache support functions
915  */
916
917 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
918 {
919         struct rt6_info *rt;
920
921         rt = (struct rt6_info *) dst;
922
923         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
924                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
925                         if (!rt->rt6i_peer)
926                                 rt6_bind_peer(rt, 0);
927                         rt->rt6i_peer_genid = rt6_peer_genid();
928                 }
929                 return dst;
930         }
931         return NULL;
932 }
933
934 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
935 {
936         struct rt6_info *rt = (struct rt6_info *) dst;
937
938         if (rt) {
939                 if (rt->rt6i_flags & RTF_CACHE) {
940                         if (rt6_check_expired(rt)) {
941                                 ip6_del_rt(rt);
942                                 dst = NULL;
943                         }
944                 } else {
945                         dst_release(dst);
946                         dst = NULL;
947                 }
948         }
949         return dst;
950 }
951
952 static void ip6_link_failure(struct sk_buff *skb)
953 {
954         struct rt6_info *rt;
955
956         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
957
958         rt = (struct rt6_info *) skb_dst(skb);
959         if (rt) {
960                 if (rt->rt6i_flags&RTF_CACHE) {
961                         dst_set_expires(&rt->dst, 0);
962                         rt->rt6i_flags |= RTF_EXPIRES;
963                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
964                         rt->rt6i_node->fn_sernum = -1;
965         }
966 }
967
968 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
969 {
970         struct rt6_info *rt6 = (struct rt6_info*)dst;
971
972         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
973                 rt6->rt6i_flags |= RTF_MODIFIED;
974                 if (mtu < IPV6_MIN_MTU) {
975                         u32 features = dst_metric(dst, RTAX_FEATURES);
976                         mtu = IPV6_MIN_MTU;
977                         features |= RTAX_FEATURE_ALLFRAG;
978                         dst_metric_set(dst, RTAX_FEATURES, features);
979                 }
980                 dst_metric_set(dst, RTAX_MTU, mtu);
981         }
982 }
983
984 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
985 {
986         struct net_device *dev = dst->dev;
987         unsigned int mtu = dst_mtu(dst);
988         struct net *net = dev_net(dev);
989
990         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
991
992         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
993                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
994
995         /*
996          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
997          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
998          * IPV6_MAXPLEN is also valid and means: "any MSS,
999          * rely only on pmtu discovery"
1000          */
1001         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1002                 mtu = IPV6_MAXPLEN;
1003         return mtu;
1004 }
1005
1006 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1007 {
1008         unsigned int mtu = IPV6_MIN_MTU;
1009         struct inet6_dev *idev;
1010
1011         rcu_read_lock();
1012         idev = __in6_dev_get(dst->dev);
1013         if (idev)
1014                 mtu = idev->cnf.mtu6;
1015         rcu_read_unlock();
1016
1017         return mtu;
1018 }
1019
/*
 * Head of the singly linked list (chained through dst->next) of entries
 * created by icmp6_dst_alloc(); icmp6_dst_gc() unlinks and frees the ones
 * whose reference count has dropped to zero.
 */
1020 static struct dst_entry *icmp6_dst_gc_list;
/* Taken (BH-disabling) around every walk or update of icmp6_dst_gc_list. */
1021 static DEFINE_SPINLOCK(icmp6_dst_lock);
1022
1023 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1024                                   struct neighbour *neigh,
1025                                   const struct in6_addr *addr)
1026 {
1027         struct rt6_info *rt;
1028         struct inet6_dev *idev = in6_dev_get(dev);
1029         struct net *net = dev_net(dev);
1030
1031         if (unlikely(idev == NULL))
1032                 return NULL;
1033
1034         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1035         if (unlikely(rt == NULL)) {
1036                 in6_dev_put(idev);
1037                 goto out;
1038         }
1039
1040         dev_hold(dev);
1041         if (neigh)
1042                 neigh_hold(neigh);
1043         else {
1044                 neigh = ndisc_get_neigh(dev, addr);
1045                 if (IS_ERR(neigh))
1046                         neigh = NULL;
1047         }
1048
1049         rt->rt6i_dev      = dev;
1050         rt->rt6i_idev     = idev;
1051         rt->rt6i_nexthop  = neigh;
1052         atomic_set(&rt->dst.__refcnt, 1);
1053         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1054         rt->dst.output  = ip6_output;
1055
1056 #if 0   /* there's no chance to use these for ndisc */
1057         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1058                                 ? DST_HOST
1059                                 : 0;
1060         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1061         rt->rt6i_dst.plen = 128;
1062 #endif
1063
1064         spin_lock_bh(&icmp6_dst_lock);
1065         rt->dst.next = icmp6_dst_gc_list;
1066         icmp6_dst_gc_list = &rt->dst;
1067         spin_unlock_bh(&icmp6_dst_lock);
1068
1069         fib6_force_start_gc(net);
1070
1071 out:
1072         return &rt->dst;
1073 }
1074
1075 int icmp6_dst_gc(void)
1076 {
1077         struct dst_entry *dst, *next, **pprev;
1078         int more = 0;
1079
1080         next = NULL;
1081
1082         spin_lock_bh(&icmp6_dst_lock);
1083         pprev = &icmp6_dst_gc_list;
1084
1085         while ((dst = *pprev) != NULL) {
1086                 if (!atomic_read(&dst->__refcnt)) {
1087                         *pprev = dst->next;
1088                         dst_free(dst);
1089                 } else {
1090                         pprev = &dst->next;
1091                         ++more;
1092                 }
1093         }
1094
1095         spin_unlock_bh(&icmp6_dst_lock);
1096
1097         return more;
1098 }
1099
1100 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1101                             void *arg)
1102 {
1103         struct dst_entry *dst, **pprev;
1104
1105         spin_lock_bh(&icmp6_dst_lock);
1106         pprev = &icmp6_dst_gc_list;
1107         while ((dst = *pprev) != NULL) {
1108                 struct rt6_info *rt = (struct rt6_info *) dst;
1109                 if (func(rt, arg)) {
1110                         *pprev = dst->next;
1111                         dst_free(dst);
1112                 } else {
1113                         pprev = &dst->next;
1114                 }
1115         }
1116         spin_unlock_bh(&icmp6_dst_lock);
1117 }
1118
1119 static int ip6_dst_gc(struct dst_ops *ops)
1120 {
1121         unsigned long now = jiffies;
1122         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1123         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1124         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1125         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1126         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1127         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1128         int entries;
1129
1130         entries = dst_entries_get_fast(ops);
1131         if (time_after(rt_last_gc + rt_min_interval, now) &&
1132             entries <= rt_max_size)
1133                 goto out;
1134
1135         net->ipv6.ip6_rt_gc_expire++;
1136         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1137         net->ipv6.ip6_rt_last_gc = now;
1138         entries = dst_entries_get_slow(ops);
1139         if (entries < ops->gc_thresh)
1140                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1141 out:
1142         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1143         return entries > rt_max_size;
1144 }
1145
1146 /* Clean host part of a prefix. Not necessary in radix tree,
1147    but results in cleaner routing tables.
1148
1149    Remove it only when all the things will work!
1150  */
1151
1152 int ip6_dst_hoplimit(struct dst_entry *dst)
1153 {
1154         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1155         if (hoplimit == 0) {
1156                 struct net_device *dev = dst->dev;
1157                 struct inet6_dev *idev;
1158
1159                 rcu_read_lock();
1160                 idev = __in6_dev_get(dev);
1161                 if (idev)
1162                         hoplimit = idev->cnf.hop_limit;
1163                 else
1164                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1165                 rcu_read_unlock();
1166         }
1167         return hoplimit;
1168 }
1169 EXPORT_SYMBOL(ip6_dst_hoplimit);
1170
1171 /*
1172  *
1173  */
1174
/*
 * ip6_route_add - build a route from @cfg and insert it into its table.
 * @cfg: route description; fc_metric, fc_protocol and fc_nlinfo.nl_net
 *       may be overwritten here (defaults / device namespace).
 *
 * Steps, in order:
 *  - reject prefix lengths above 128 (and any source prefix when
 *    CONFIG_IPV6_SUBTREES is off);
 *  - take references on the device/inet6 device named by fc_ifindex;
 *  - get (or create) the table, allocate the route and fill in expiry,
 *    protocol, input/output handlers, keys and metric;
 *  - RTF_REJECT routes, and non-local routes via a loopback device to a
 *    non-loopback destination, become discard routes bound to the
 *    namespace loopback device;
 *  - for RTF_GATEWAY, validate the gateway and, if no device was given,
 *    borrow the device of the route that reaches the gateway;
 *  - look up the next-hop neighbour, apply netlink metrics, attach
 *    dev/idev/table and hand the route to __ip6_ins_rt().
 *
 * Returns the result of __ip6_ins_rt() on the success path, otherwise a
 * negative errno (-EINVAL, -ENODEV, -ENOBUFS, -ENOMEM, -EHOSTUNREACH or
 * the neighbour lookup error).  On every failure path reached via "out"
 * the device references are dropped and the route, if allocated, is
 * passed to dst_free().
 */
1175 int ip6_route_add(struct fib6_config *cfg)
1176 {
1177         int err;
1178         struct net *net = cfg->fc_nlinfo.nl_net;
1179         struct rt6_info *rt = NULL;
1180         struct net_device *dev = NULL;
1181         struct inet6_dev *idev = NULL;
1182         struct fib6_table *table;
1183         int addr_type;
1184
1185         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1186                 return -EINVAL;
1187 #ifndef CONFIG_IPV6_SUBTREES
1188         if (cfg->fc_src_len)
1189                 return -EINVAL;
1190 #endif
             /* Both lookups below take references that "out" must drop. */
1191         if (cfg->fc_ifindex) {
1192                 err = -ENODEV;
1193                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1194                 if (!dev)
1195                         goto out;
1196                 idev = in6_dev_get(dev);
1197                 if (!idev)
1198                         goto out;
1199         }
1200
1201         if (cfg->fc_metric == 0)
1202                 cfg->fc_metric = IP6_RT_PRIO_USER;
1203
1204         table = fib6_new_table(net, cfg->fc_table);
1205         if (table == NULL) {
1206                 err = -ENOBUFS;
1207                 goto out;
1208         }
1209
1210         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1211
1212         if (rt == NULL) {
1213                 err = -ENOMEM;
1214                 goto out;
1215         }
1216
1217         rt->dst.obsolete = -1;
1218         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1219                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1220                                 0;
1221
1222         if (cfg->fc_protocol == RTPROT_UNSPEC)
1223                 cfg->fc_protocol = RTPROT_BOOT;
1224         rt->rt6i_protocol = cfg->fc_protocol;
1225
1226         addr_type = ipv6_addr_type(&cfg->fc_dst);
1227
             /* Input handler: multicast, local delivery, or forwarding. */
1228         if (addr_type & IPV6_ADDR_MULTICAST)
1229                 rt->dst.input = ip6_mc_input;
1230         else if (cfg->fc_flags & RTF_LOCAL)
1231                 rt->dst.input = ip6_input;
1232         else
1233                 rt->dst.input = ip6_forward;
1234
1235         rt->dst.output = ip6_output;
1236
1237         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1238         rt->rt6i_dst.plen = cfg->fc_dst_len;
1239         if (rt->rt6i_dst.plen == 128)
1240                rt->dst.flags = DST_HOST;
1241
1242 #ifdef CONFIG_IPV6_SUBTREES
1243         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1244         rt->rt6i_src.plen = cfg->fc_src_len;
1245 #endif
1246
1247         rt->rt6i_metric = cfg->fc_metric;
1248
1249         /* We cannot add true routes via loopback here,
1250            they would result in kernel looping; promote them to reject routes
1251          */
1252         if ((cfg->fc_flags & RTF_REJECT) ||
1253             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1254                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1255                 /* hold loopback dev/idev if we haven't done so. */
1256                 if (dev != net->loopback_dev) {
1257                         if (dev) {
1258                                 dev_put(dev);
1259                                 in6_dev_put(idev);
1260                         }
1261                         dev = net->loopback_dev;
1262                         dev_hold(dev);
1263                         idev = in6_dev_get(dev);
1264                         if (!idev) {
1265                                 err = -ENODEV;
1266                                 goto out;
1267                         }
1268                 }
1269                 rt->dst.output = ip6_pkt_discard_out;
1270                 rt->dst.input = ip6_pkt_discard;
1271                 rt->dst.error = -ENETUNREACH;
1272                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1273                 goto install_route;
1274         }
1275
1276         if (cfg->fc_flags & RTF_GATEWAY) {
1277                 struct in6_addr *gw_addr;
1278                 int gwa_type;
1279
1280                 gw_addr = &cfg->fc_gateway;
1281                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1282                 gwa_type = ipv6_addr_type(gw_addr);
1283
1284                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1285                         struct rt6_info *grt;
1286
1287                         /* IPv6 strictly inhibits using not link-local
1288                            addresses as nexthop address.
1289                            Otherwise, router will not able to send redirects.
1290                            It is very good, but in some (rare!) circumstances
1291                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1292                            some exceptions. --ANK
1293                          */
1294                         err = -EINVAL;
1295                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1296                                 goto out;
1297
1298                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1299
                             /*
                              * err stays -EHOSTUNREACH from here on unless the
                              * route to the gateway is itself not a gateway
                              * route (checked below, which clears err).
                              */
1300                         err = -EHOSTUNREACH;
1301                         if (grt == NULL)
1302                                 goto out;
1303                         if (dev) {
1304                                 if (dev != grt->rt6i_dev) {
1305                                         dst_release(&grt->dst);
1306                                         goto out;
1307                                 }
1308                         } else {
                                     /* No device given: adopt the gateway route's, with our own refs. */
1309                                 dev = grt->rt6i_dev;
1310                                 idev = grt->rt6i_idev;
1311                                 dev_hold(dev);
1312                                 in6_dev_hold(grt->rt6i_idev);
1313                         }
1314                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1315                                 err = 0;
1316                         dst_release(&grt->dst);
1317
1318                         if (err)
1319                                 goto out;
1320                 }
1321                 err = -EINVAL;
1322                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1323                         goto out;
1324         }
1325
1326         err = -ENODEV;
1327         if (dev == NULL)
1328                 goto out;
1329
1330         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1331                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1332                 if (IS_ERR(rt->rt6i_nexthop)) {
1333                         err = PTR_ERR(rt->rt6i_nexthop);
                             /* Clear the error pointer before the route is freed at "out". */
1334                         rt->rt6i_nexthop = NULL;
1335                         goto out;
1336                 }
1337         }
1338
1339         rt->rt6i_flags = cfg->fc_flags;
1340
1341 install_route:
1342         if (cfg->fc_mx) {
1343                 struct nlattr *nla;
1344                 int remaining;
1345
1346                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1347                         int type = nla_type(nla);
1348
                             /* Attribute type 0 is skipped; anything above RTAX_MAX is rejected. */
1349                         if (type) {
1350                                 if (type > RTAX_MAX) {
1351                                         err = -EINVAL;
1352                                         goto out;
1353                                 }
1354
1355                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1356                         }
1357                 }
1358         }
1359
             /* dev/idev are stored in the route here rather than dropped. */
1360         rt->dst.dev = dev;
1361         rt->rt6i_idev = idev;
1362         rt->rt6i_table = table;
1363
1364         cfg->fc_nlinfo.nl_net = dev_net(dev);
1365
1366         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1367
1368 out:
1369         if (dev)
1370                 dev_put(dev);
1371         if (idev)
1372                 in6_dev_put(idev);
1373         if (rt)
1374                 dst_free(&rt->dst);
1375         return err;
1376 }
1377
1378 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1379 {
1380         int err;
1381         struct fib6_table *table;
1382         struct net *net = dev_net(rt->rt6i_dev);
1383
1384         if (rt == net->ipv6.ip6_null_entry)
1385                 return -ENOENT;
1386
1387         table = rt->rt6i_table;
1388         write_lock_bh(&table->tb6_lock);
1389
1390         err = fib6_del(rt, info);
1391         dst_release(&rt->dst);
1392
1393         write_unlock_bh(&table->tb6_lock);
1394
1395         return err;
1396 }
1397
1398 int ip6_del_rt(struct rt6_info *rt)
1399 {
1400         struct nl_info info = {
1401                 .nl_net = dev_net(rt->rt6i_dev),
1402         };
1403         return __ip6_del_rt(rt, &info);
1404 }
1405
1406 static int ip6_route_del(struct fib6_config *cfg)
1407 {
1408         struct fib6_table *table;
1409         struct fib6_node *fn;
1410         struct rt6_info *rt;
1411         int err = -ESRCH;
1412
1413         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1414         if (table == NULL)
1415                 return err;
1416
1417         read_lock_bh(&table->tb6_lock);
1418
1419         fn = fib6_locate(&table->tb6_root,
1420                          &cfg->fc_dst, cfg->fc_dst_len,
1421                          &cfg->fc_src, cfg->fc_src_len);
1422
1423         if (fn) {
1424                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1425                         if (cfg->fc_ifindex &&
1426                             (rt->rt6i_dev == NULL ||
1427                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1428                                 continue;
1429                         if (cfg->fc_flags & RTF_GATEWAY &&
1430                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1431                                 continue;
1432                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1433                                 continue;
1434                         dst_hold(&rt->dst);
1435                         read_unlock_bh(&table->tb6_lock);
1436
1437                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1438                 }
1439         }
1440         read_unlock_bh(&table->tb6_lock);
1441
1442         return err;
1443 }
1444
1445 /*
1446  *      Handle redirects
1447  */
/*
 * Flow key for redirect lookups: an ordinary flow plus the gateway
 * address that candidate routes are compared against.  The lookup
 * callback receives a struct flowi * and casts it back to this type, so
 * fl has to stay the first member.
 */
1448 struct ip6rd_flowi {
1449         struct flowi fl;
1450         struct in6_addr gateway;
1451 };
1452
1453 static struct rt6_info *__ip6_route_redirect(struct net *net,
1454                                              struct fib6_table *table,
1455                                              struct flowi *fl,
1456                                              int flags)
1457 {
1458         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1459         struct rt6_info *rt;
1460         struct fib6_node *fn;
1461
1462         /*
1463          * Get the "current" route for this destination and
1464          * check if the redirect has come from approriate router.
1465          *
1466          * RFC 2461 specifies that redirects should only be
1467          * accepted if they come from the nexthop to the target.
1468          * Due to the way the routes are chosen, this notion
1469          * is a bit fuzzy and one might need to check all possible
1470          * routes.
1471          */
1472
1473         read_lock_bh(&table->tb6_lock);
1474         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1475 restart:
1476         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1477                 /*
1478                  * Current route is on-link; redirect is always invalid.
1479                  *
1480                  * Seems, previous statement is not true. It could
1481                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1482                  * But then router serving it might decide, that we should
1483                  * know truth 8)8) --ANK (980726).
1484                  */
1485                 if (rt6_check_expired(rt))
1486                         continue;
1487                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1488                         continue;
1489                 if (fl->oif != rt->rt6i_dev->ifindex)
1490                         continue;
1491                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1492                         continue;
1493                 break;
1494         }
1495
1496         if (!rt)
1497                 rt = net->ipv6.ip6_null_entry;
1498         BACKTRACK(net, &fl->fl6_src);
1499 out:
1500         dst_hold(&rt->dst);
1501
1502         read_unlock_bh(&table->tb6_lock);
1503
1504         return rt;
1505 };
1506
1507 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1508                                            struct in6_addr *src,
1509                                            struct in6_addr *gateway,
1510                                            struct net_device *dev)
1511 {
1512         int flags = RT6_LOOKUP_F_HAS_SADDR;
1513         struct net *net = dev_net(dev);
1514         struct ip6rd_flowi rdfl = {
1515                 .fl = {
1516                         .oif = dev->ifindex,
1517                         .fl6_dst = *dest,
1518                         .fl6_src = *src,
1519                 },
1520         };
1521
1522         ipv6_addr_copy(&rdfl.gateway, gateway);
1523
1524         if (rt6_need_strict(dest))
1525                 flags |= RT6_LOOKUP_F_IFACE;
1526
1527         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1528                                                    flags, __ip6_route_redirect);
1529 }
1530
1531 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1532                   struct in6_addr *saddr,
1533                   struct neighbour *neigh, u8 *lladdr, int on_link)
1534 {
1535         struct rt6_info *rt, *nrt = NULL;
1536         struct netevent_redirect netevent;
1537         struct net *net = dev_net(neigh->dev);
1538
1539         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1540
1541         if (rt == net->ipv6.ip6_null_entry) {
1542                 if (net_ratelimit())
1543                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1544                                "for redirect target\n");
1545                 goto out;
1546         }
1547
1548         /*
1549          *      We have finally decided to accept it.
1550          */
1551
1552         neigh_update(neigh, lladdr, NUD_STALE,
1553                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1554                      NEIGH_UPDATE_F_OVERRIDE|
1555                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1556                                      NEIGH_UPDATE_F_ISROUTER))
1557                      );
1558
1559         /*
1560          * Redirect received -> path was valid.
1561          * Look, redirects are sent only in response to data packets,
1562          * so that this nexthop apparently is reachable. --ANK
1563          */
1564         dst_confirm(&rt->dst);
1565
1566         /* Duplicate redirect: silently ignore. */
1567         if (neigh == rt->dst.neighbour)
1568                 goto out;
1569
1570         nrt = ip6_rt_copy(rt);
1571         if (nrt == NULL)
1572                 goto out;
1573
1574         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1575         if (on_link)
1576                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1577
1578         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1579         nrt->rt6i_dst.plen = 128;
1580         nrt->dst.flags |= DST_HOST;
1581
1582         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1583         nrt->rt6i_nexthop = neigh_clone(neigh);
1584
1585         if (ip6_ins_rt(nrt))
1586                 goto out;
1587
1588         netevent.old = &rt->dst;
1589         netevent.new = &nrt->dst;
1590         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1591
1592         if (rt->rt6i_flags&RTF_CACHE) {
1593                 ip6_del_rt(rt);
1594                 return;
1595         }
1596
1597 out:
1598         dst_release(&rt->dst);
1599 }
1600
1601 /*
1602  *      Handle ICMP "packet too big" messages
1603  *      i.e. Path MTU discovery
1604  */
1605
1606 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1607                              struct net *net, u32 pmtu, int ifindex)
1608 {
1609         struct rt6_info *rt, *nrt;
1610         int allfrag = 0;
1611 again:
1612         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1613         if (rt == NULL)
1614                 return;
1615
1616         if (rt6_check_expired(rt)) {
1617                 ip6_del_rt(rt);
1618                 goto again;
1619         }
1620
1621         if (pmtu >= dst_mtu(&rt->dst))
1622                 goto out;
1623
1624         if (pmtu < IPV6_MIN_MTU) {
1625                 /*
1626                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1627                  * MTU (1280) and a fragment header should always be included
1628                  * after a node receiving Too Big message reporting PMTU is
1629                  * less than the IPv6 Minimum Link MTU.
1630                  */
1631                 pmtu = IPV6_MIN_MTU;
1632                 allfrag = 1;
1633         }
1634
1635         /* New mtu received -> path was valid.
1636            They are sent only in response to data packets,
1637            so that this nexthop apparently is reachable. --ANK
1638          */
1639         dst_confirm(&rt->dst);
1640
1641         /* Host route. If it is static, it would be better
1642            not to override it, but add new one, so that
1643            when cache entry will expire old pmtu
1644            would return automatically.
1645          */
1646         if (rt->rt6i_flags & RTF_CACHE) {
1647                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1648                 if (allfrag) {
1649                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1650                         features |= RTAX_FEATURE_ALLFRAG;
1651                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1652                 }
1653                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1654                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1655                 goto out;
1656         }
1657
1658         /* Network route.
1659            Two cases are possible:
1660            1. It is connected route. Action: COW
1661            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1662          */
1663         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1664                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1665         else
1666                 nrt = rt6_alloc_clone(rt, daddr);
1667
1668         if (nrt) {
1669                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1670                 if (allfrag) {
1671                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1672                         features |= RTAX_FEATURE_ALLFRAG;
1673                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1674                 }
1675
1676                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1677                  * happened within 5 mins, the recommended timer is 10 mins.
1678                  * Here this route expiration time is set to ip6_rt_mtu_expires
1679                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1680                  * and detecting PMTU increase will be automatically happened.
1681                  */
1682                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1683                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1684
1685                 ip6_ins_rt(nrt);
1686         }
1687 out:
1688         dst_release(&rt->dst);
1689 }
1690
1691 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1692                         struct net_device *dev, u32 pmtu)
1693 {
1694         struct net *net = dev_net(dev);
1695
1696         /*
1697          * RFC 1981 states that a node "MUST reduce the size of the packets it
1698          * is sending along the path" that caused the Packet Too Big message.
1699          * Since it's not possible in the general case to determine which
1700          * interface was used to send the original packet, we update the MTU
1701          * on the interface that will be used to send future packets. We also
1702          * update the MTU on the interface that received the Packet Too Big in
1703          * case the original packet was forced out that interface with
1704          * SO_BINDTODEVICE or similar. This is the next best thing to the
1705          * correct behaviour, which would be to update the MTU on all
1706          * interfaces.
1707          */
1708         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1709         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1710 }
1711
1712 /*
1713  *      Misc support functions
1714  */
1715
1716 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1717 {
1718         struct net *net = dev_net(ort->rt6i_dev);
1719         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1720
1721         if (rt) {
1722                 rt->dst.input = ort->dst.input;
1723                 rt->dst.output = ort->dst.output;
1724
1725                 dst_copy_metrics(&rt->dst, &ort->dst);
1726                 rt->dst.error = ort->dst.error;
1727                 rt->dst.dev = ort->dst.dev;
1728                 if (rt->dst.dev)
1729                         dev_hold(rt->dst.dev);
1730                 rt->rt6i_idev = ort->rt6i_idev;
1731                 if (rt->rt6i_idev)
1732                         in6_dev_hold(rt->rt6i_idev);
1733                 rt->dst.lastuse = jiffies;
1734                 rt->rt6i_expires = 0;
1735
1736                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1737                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1738                 rt->rt6i_metric = 0;
1739
1740                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1741 #ifdef CONFIG_IPV6_SUBTREES
1742                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1743 #endif
1744                 rt->rt6i_table = ort->rt6i_table;
1745         }
1746         return rt;
1747 }
1748
1749 #ifdef CONFIG_IPV6_ROUTE_INFO
1750 static struct rt6_info *rt6_get_route_info(struct net *net,
1751                                            struct in6_addr *prefix, int prefixlen,
1752                                            struct in6_addr *gwaddr, int ifindex)
1753 {
1754         struct fib6_node *fn;
1755         struct rt6_info *rt = NULL;
1756         struct fib6_table *table;
1757
1758         table = fib6_get_table(net, RT6_TABLE_INFO);
1759         if (table == NULL)
1760                 return NULL;
1761
1762         write_lock_bh(&table->tb6_lock);
1763         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1764         if (!fn)
1765                 goto out;
1766
1767         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1768                 if (rt->rt6i_dev->ifindex != ifindex)
1769                         continue;
1770                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1771                         continue;
1772                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1773                         continue;
1774                 dst_hold(&rt->dst);
1775                 break;
1776         }
1777 out:
1778         write_unlock_bh(&table->tb6_lock);
1779         return rt;
1780 }
1781
1782 static struct rt6_info *rt6_add_route_info(struct net *net,
1783                                            struct in6_addr *prefix, int prefixlen,
1784                                            struct in6_addr *gwaddr, int ifindex,
1785                                            unsigned pref)
1786 {
1787         struct fib6_config cfg = {
1788                 .fc_table       = RT6_TABLE_INFO,
1789                 .fc_metric      = IP6_RT_PRIO_USER,
1790                 .fc_ifindex     = ifindex,
1791                 .fc_dst_len     = prefixlen,
1792                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1793                                   RTF_UP | RTF_PREF(pref),
1794                 .fc_nlinfo.pid = 0,
1795                 .fc_nlinfo.nlh = NULL,
1796                 .fc_nlinfo.nl_net = net,
1797         };
1798
1799         ipv6_addr_copy(&cfg.fc_dst, prefix);
1800         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1801
1802         /* We should treat it as a default route if prefix length is 0. */
1803         if (!prefixlen)
1804                 cfg.fc_flags |= RTF_DEFAULT;
1805
1806         ip6_route_add(&cfg);
1807
1808         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1809 }
1810 #endif
1811
1812 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1813 {
1814         struct rt6_info *rt;
1815         struct fib6_table *table;
1816
1817         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1818         if (table == NULL)
1819                 return NULL;
1820
1821         write_lock_bh(&table->tb6_lock);
1822         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1823                 if (dev == rt->rt6i_dev &&
1824                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1825                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1826                         break;
1827         }
1828         if (rt)
1829                 dst_hold(&rt->dst);
1830         write_unlock_bh(&table->tb6_lock);
1831         return rt;
1832 }
1833
1834 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1835                                      struct net_device *dev,
1836                                      unsigned int pref)
1837 {
1838         struct fib6_config cfg = {
1839                 .fc_table       = RT6_TABLE_DFLT,
1840                 .fc_metric      = IP6_RT_PRIO_USER,
1841                 .fc_ifindex     = dev->ifindex,
1842                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1843                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1844                 .fc_nlinfo.pid = 0,
1845                 .fc_nlinfo.nlh = NULL,
1846                 .fc_nlinfo.nl_net = dev_net(dev),
1847         };
1848
1849         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1850
1851         ip6_route_add(&cfg);
1852
1853         return rt6_get_dflt_router(gwaddr, dev);
1854 }
1855
1856 void rt6_purge_dflt_routers(struct net *net)
1857 {
1858         struct rt6_info *rt;
1859         struct fib6_table *table;
1860
1861         /* NOTE: Keep consistent with rt6_get_dflt_router */
1862         table = fib6_get_table(net, RT6_TABLE_DFLT);
1863         if (table == NULL)
1864                 return;
1865
1866 restart:
1867         read_lock_bh(&table->tb6_lock);
1868         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1869                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1870                         dst_hold(&rt->dst);
1871                         read_unlock_bh(&table->tb6_lock);
1872                         ip6_del_rt(rt);
1873                         goto restart;
1874                 }
1875         }
1876         read_unlock_bh(&table->tb6_lock);
1877 }
1878
1879 static void rtmsg_to_fib6_config(struct net *net,
1880                                  struct in6_rtmsg *rtmsg,
1881                                  struct fib6_config *cfg)
1882 {
1883         memset(cfg, 0, sizeof(*cfg));
1884
1885         cfg->fc_table = RT6_TABLE_MAIN;
1886         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1887         cfg->fc_metric = rtmsg->rtmsg_metric;
1888         cfg->fc_expires = rtmsg->rtmsg_info;
1889         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1890         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1891         cfg->fc_flags = rtmsg->rtmsg_flags;
1892
1893         cfg->fc_nlinfo.nl_net = net;
1894
1895         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1896         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1897         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1898 }
1899
1900 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1901 {
1902         struct fib6_config cfg;
1903         struct in6_rtmsg rtmsg;
1904         int err;
1905
1906         switch(cmd) {
1907         case SIOCADDRT:         /* Add a route */
1908         case SIOCDELRT:         /* Delete a route */
1909                 if (!capable(CAP_NET_ADMIN))
1910                         return -EPERM;
1911                 err = copy_from_user(&rtmsg, arg,
1912                                      sizeof(struct in6_rtmsg));
1913                 if (err)
1914                         return -EFAULT;
1915
1916                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1917
1918                 rtnl_lock();
1919                 switch (cmd) {
1920                 case SIOCADDRT:
1921                         err = ip6_route_add(&cfg);
1922                         break;
1923                 case SIOCDELRT:
1924                         err = ip6_route_del(&cfg);
1925                         break;
1926                 default:
1927                         err = -EINVAL;
1928                 }
1929                 rtnl_unlock();
1930
1931                 return err;
1932         }
1933
1934         return -EINVAL;
1935 }
1936
1937 /*
1938  *      Drop the packet on the floor
1939  */
1940
1941 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1942 {
1943         int type;
1944         struct dst_entry *dst = skb_dst(skb);
1945         switch (ipstats_mib_noroutes) {
1946         case IPSTATS_MIB_INNOROUTES:
1947                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1948                 if (type == IPV6_ADDR_ANY) {
1949                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1950                                       IPSTATS_MIB_INADDRERRORS);
1951                         break;
1952                 }
1953                 /* FALLTHROUGH */
1954         case IPSTATS_MIB_OUTNOROUTES:
1955                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1956                               ipstats_mib_noroutes);
1957                 break;
1958         }
1959         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1960         kfree_skb(skb);
1961         return 0;
1962 }
1963
1964 static int ip6_pkt_discard(struct sk_buff *skb)
1965 {
1966         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1967 }
1968
1969 static int ip6_pkt_discard_out(struct sk_buff *skb)
1970 {
1971         skb->dev = skb_dst(skb)->dev;
1972         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1973 }
1974
1975 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1976
1977 static int ip6_pkt_prohibit(struct sk_buff *skb)
1978 {
1979         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1980 }
1981
1982 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1983 {
1984         skb->dev = skb_dst(skb)->dev;
1985         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1986 }
1987
1988 #endif
1989
1990 /*
1991  *      Allocate a dst for local (unicast / anycast) address.
1992  */
1993
1994 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1995                                     const struct in6_addr *addr,
1996                                     int anycast)
1997 {
1998         struct net *net = dev_net(idev->dev);
1999         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
2000         struct neighbour *neigh;
2001
2002         if (rt == NULL) {
2003                 if (net_ratelimit())
2004                         pr_warning("IPv6:  Maximum number of routes reached,"
2005                                    " consider increasing route/max_size.\n");
2006                 return ERR_PTR(-ENOMEM);
2007         }
2008
2009         dev_hold(net->loopback_dev);
2010         in6_dev_hold(idev);
2011
2012         rt->dst.flags = DST_HOST;
2013         rt->dst.input = ip6_input;
2014         rt->dst.output = ip6_output;
2015         rt->rt6i_dev = net->loopback_dev;
2016         rt->rt6i_idev = idev;
2017         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
2018         rt->dst.obsolete = -1;
2019
2020         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2021         if (anycast)
2022                 rt->rt6i_flags |= RTF_ANYCAST;
2023         else
2024                 rt->rt6i_flags |= RTF_LOCAL;
2025         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2026         if (IS_ERR(neigh)) {
2027                 dst_free(&rt->dst);
2028
2029                 /* We are casting this because that is the return
2030                  * value type.  But an errno encoded pointer is the
2031                  * same regardless of the underlying pointer type,
2032                  * and that's what we are returning.  So this is OK.
2033                  */
2034                 return (struct rt6_info *) neigh;
2035         }
2036         rt->rt6i_nexthop = neigh;
2037
2038         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2039         rt->rt6i_dst.plen = 128;
2040         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2041
2042         atomic_set(&rt->dst.__refcnt, 1);
2043
2044         return rt;
2045 }
2046
2047 struct arg_dev_net {
2048         struct net_device *dev;
2049         struct net *net;
2050 };
2051
2052 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2053 {
2054         const struct arg_dev_net *adn = arg;
2055         const struct net_device *dev = adn->dev;
2056
2057         if ((rt->rt6i_dev == dev || dev == NULL) &&
2058             rt != adn->net->ipv6.ip6_null_entry) {
2059                 RT6_TRACE("deleted by ifdown %p\n", rt);
2060                 return -1;
2061         }
2062         return 0;
2063 }
2064
2065 void rt6_ifdown(struct net *net, struct net_device *dev)
2066 {
2067         struct arg_dev_net adn = {
2068                 .dev = dev,
2069                 .net = net,
2070         };
2071
2072         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2073         icmp6_clean_all(fib6_ifdown, &adn);
2074 }
2075
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
2081
2082 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2083 {
2084         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2085         struct inet6_dev *idev;
2086
2087         /* In IPv6 pmtu discovery is not optional,
2088            so that RTAX_MTU lock cannot disable it.
2089            We still use this lock to block changes
2090            caused by addrconf/ndisc.
2091         */
2092
2093         idev = __in6_dev_get(arg->dev);
2094         if (idev == NULL)
2095                 return 0;
2096
2097         /* For administrative MTU increase, there is no way to discover
2098            IPv6 PMTU increase, so PMTU increase should be updated here.
2099            Since RFC 1981 doesn't include administrative MTU increase
2100            update PMTU increase is a MUST. (i.e. jumbo frame)
2101          */
2102         /*
2103            If new MTU is less than route PMTU, this new MTU will be the
2104            lowest MTU in the path, update the route PMTU to reflect PMTU
2105            decreases; if new MTU is greater than route PMTU, and the
2106            old MTU is the lowest MTU in the path, update the route PMTU
2107            to reflect the increase. In this case if the other nodes' MTU
2108            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2109            PMTU discouvery.
2110          */
2111         if (rt->rt6i_dev == arg->dev &&
2112             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2113             (dst_mtu(&rt->dst) >= arg->mtu ||
2114              (dst_mtu(&rt->dst) < arg->mtu &&
2115               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2116                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2117         }
2118         return 0;
2119 }
2120
2121 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2122 {
2123         struct rt6_mtu_change_arg arg = {
2124                 .dev = dev,
2125                 .mtu = mtu,
2126         };
2127
2128         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2129 }
2130
2131 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2132         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2133         [RTA_OIF]               = { .type = NLA_U32 },
2134         [RTA_IIF]               = { .type = NLA_U32 },
2135         [RTA_PRIORITY]          = { .type = NLA_U32 },
2136         [RTA_METRICS]           = { .type = NLA_NESTED },
2137 };
2138
2139 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2140                               struct fib6_config *cfg)
2141 {
2142         struct rtmsg *rtm;
2143         struct nlattr *tb[RTA_MAX+1];
2144         int err;
2145
2146         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2147         if (err < 0)
2148                 goto errout;
2149
2150         err = -EINVAL;
2151         rtm = nlmsg_data(nlh);
2152         memset(cfg, 0, sizeof(*cfg));
2153
2154         cfg->fc_table = rtm->rtm_table;
2155         cfg->fc_dst_len = rtm->rtm_dst_len;
2156         cfg->fc_src_len = rtm->rtm_src_len;
2157         cfg->fc_flags = RTF_UP;
2158         cfg->fc_protocol = rtm->rtm_protocol;
2159
2160         if (rtm->rtm_type == RTN_UNREACHABLE)
2161                 cfg->fc_flags |= RTF_REJECT;
2162
2163         if (rtm->rtm_type == RTN_LOCAL)
2164                 cfg->fc_flags |= RTF_LOCAL;
2165
2166         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2167         cfg->fc_nlinfo.nlh = nlh;
2168         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2169
2170         if (tb[RTA_GATEWAY]) {
2171                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2172                 cfg->fc_flags |= RTF_GATEWAY;
2173         }
2174
2175         if (tb[RTA_DST]) {
2176                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2177
2178                 if (nla_len(tb[RTA_DST]) < plen)
2179                         goto errout;
2180
2181                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2182         }
2183
2184         if (tb[RTA_SRC]) {
2185                 int plen = (rtm->rtm_src_len + 7) >> 3;
2186
2187                 if (nla_len(tb[RTA_SRC]) < plen)
2188                         goto errout;
2189
2190                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2191         }
2192
2193         if (tb[RTA_OIF])
2194                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2195
2196         if (tb[RTA_PRIORITY])
2197                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2198
2199         if (tb[RTA_METRICS]) {
2200                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2201                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2202         }
2203
2204         if (tb[RTA_TABLE])
2205                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2206
2207         err = 0;
2208 errout:
2209         return err;
2210 }
2211
2212 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2213 {
2214         struct fib6_config cfg;
2215         int err;
2216
2217         err = rtm_to_fib6_config(skb, nlh, &cfg);
2218         if (err < 0)
2219                 return err;
2220
2221         return ip6_route_del(&cfg);
2222 }
2223
2224 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2225 {
2226         struct fib6_config cfg;
2227         int err;
2228
2229         err = rtm_to_fib6_config(skb, nlh, &cfg);
2230         if (err < 0)
2231                 return err;
2232
2233         return ip6_route_add(&cfg);
2234 }
2235
2236 static inline size_t rt6_nlmsg_size(void)
2237 {
2238         return NLMSG_ALIGN(sizeof(struct rtmsg))
2239                + nla_total_size(16) /* RTA_SRC */
2240                + nla_total_size(16) /* RTA_DST */
2241                + nla_total_size(16) /* RTA_GATEWAY */
2242                + nla_total_size(16) /* RTA_PREFSRC */
2243                + nla_total_size(4) /* RTA_TABLE */
2244                + nla_total_size(4) /* RTA_IIF */
2245                + nla_total_size(4) /* RTA_OIF */
2246                + nla_total_size(4) /* RTA_PRIORITY */
2247                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2248                + nla_total_size(sizeof(struct rta_cacheinfo));
2249 }
2250
2251 static int rt6_fill_node(struct net *net,
2252                          struct sk_buff *skb, struct rt6_info *rt,
2253                          struct in6_addr *dst, struct in6_addr *src,
2254                          int iif, int type, u32 pid, u32 seq,
2255                          int prefix, int nowait, unsigned int flags)
2256 {
2257         struct rtmsg *rtm;
2258         struct nlmsghdr *nlh;
2259         long expires;
2260         u32 table;
2261
2262         if (prefix) {   /* user wants prefix routes only */
2263                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2264                         /* success since this is not a prefix route */
2265                         return 1;
2266                 }
2267         }
2268
2269         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2270         if (nlh == NULL)
2271                 return -EMSGSIZE;
2272
2273         rtm = nlmsg_data(nlh);
2274         rtm->rtm_family = AF_INET6;
2275         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2276         rtm->rtm_src_len = rt->rt6i_src.plen;
2277         rtm->rtm_tos = 0;
2278         if (rt->rt6i_table)
2279                 table = rt->rt6i_table->tb6_id;
2280         else
2281                 table = RT6_TABLE_UNSPEC;
2282         rtm->rtm_table = table;
2283         NLA_PUT_U32(skb, RTA_TABLE, table);
2284         if (rt->rt6i_flags&RTF_REJECT)
2285                 rtm->rtm_type = RTN_UNREACHABLE;
2286         else if (rt->rt6i_flags&RTF_LOCAL)
2287                 rtm->rtm_type = RTN_LOCAL;
2288         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2289                 rtm->rtm_type = RTN_LOCAL;
2290         else
2291                 rtm->rtm_type = RTN_UNICAST;
2292         rtm->rtm_flags = 0;
2293         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2294         rtm->rtm_protocol = rt->rt6i_protocol;
2295         if (rt->rt6i_flags&RTF_DYNAMIC)
2296                 rtm->rtm_protocol = RTPROT_REDIRECT;
2297         else if (rt->rt6i_flags & RTF_ADDRCONF)
2298                 rtm->rtm_protocol = RTPROT_KERNEL;
2299         else if (rt->rt6i_flags&RTF_DEFAULT)
2300                 rtm->rtm_protocol = RTPROT_RA;
2301
2302         if (rt->rt6i_flags&RTF_CACHE)
2303                 rtm->rtm_flags |= RTM_F_CLONED;
2304
2305         if (dst) {
2306                 NLA_PUT(skb, RTA_DST, 16, dst);
2307                 rtm->rtm_dst_len = 128;
2308         } else if (rtm->rtm_dst_len)
2309                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2310 #ifdef CONFIG_IPV6_SUBTREES
2311         if (src) {
2312                 NLA_PUT(skb, RTA_SRC, 16, src);
2313                 rtm->rtm_src_len = 128;
2314         } else if (rtm->rtm_src_len)
2315                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2316 #endif
2317         if (iif) {
2318 #ifdef CONFIG_IPV6_MROUTE
2319                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2320                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2321                         if (err <= 0) {
2322                                 if (!nowait) {
2323                                         if (err == 0)
2324                                                 return 0;
2325                                         goto nla_put_failure;
2326                                 } else {
2327                                         if (err == -EMSGSIZE)
2328                                                 goto nla_put_failure;
2329                                 }
2330                         }
2331                 } else
2332 #endif
2333                         NLA_PUT_U32(skb, RTA_IIF, iif);
2334         } else if (dst) {
2335                 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2336                 struct in6_addr saddr_buf;
2337                 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2338                                        dst, 0, &saddr_buf) == 0)
2339                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2340         }
2341
2342         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2343                 goto nla_put_failure;
2344
2345         if (rt->dst.neighbour)
2346                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2347
2348         if (rt->dst.dev)
2349                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2350
2351         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2352
2353         if (!(rt->rt6i_flags & RTF_EXPIRES))
2354                 expires = 0;
2355         else if (rt->rt6i_expires - jiffies < INT_MAX)
2356                 expires = rt->rt6i_expires - jiffies;
2357         else
2358                 expires = INT_MAX;
2359
2360         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2361                                expires, rt->dst.error) < 0)
2362                 goto nla_put_failure;
2363
2364         return nlmsg_end(skb, nlh);
2365
2366 nla_put_failure:
2367         nlmsg_cancel(skb, nlh);
2368         return -EMSGSIZE;
2369 }
2370
2371 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2372 {
2373         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2374         int prefix;
2375
2376         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2377                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2378                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2379         } else
2380                 prefix = 0;
2381
2382         return rt6_fill_node(arg->net,
2383                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2384                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2385                      prefix, 0, NLM_F_MULTI);
2386 }
2387
2388 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2389 {
2390         struct net *net = sock_net(in_skb->sk);
2391         struct nlattr *tb[RTA_MAX+1];
2392         struct rt6_info *rt;
2393         struct sk_buff *skb;
2394         struct rtmsg *rtm;
2395         struct flowi fl;
2396         int err, iif = 0;
2397
2398         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2399         if (err < 0)
2400                 goto errout;
2401
2402         err = -EINVAL;
2403         memset(&fl, 0, sizeof(fl));
2404
2405         if (tb[RTA_SRC]) {
2406                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2407                         goto errout;
2408
2409                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2410         }
2411
2412         if (tb[RTA_DST]) {
2413                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2414                         goto errout;
2415
2416                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2417         }
2418
2419         if (tb[RTA_IIF])
2420                 iif = nla_get_u32(tb[RTA_IIF]);
2421
2422         if (tb[RTA_OIF])
2423                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2424
2425         if (iif) {
2426                 struct net_device *dev;
2427                 dev = __dev_get_by_index(net, iif);
2428                 if (!dev) {
2429                         err = -ENODEV;
2430                         goto errout;
2431                 }
2432         }
2433
2434         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2435         if (skb == NULL) {
2436                 err = -ENOBUFS;
2437                 goto errout;
2438         }
2439
2440         /* Reserve room for dummy headers, this skb can pass
2441            through good chunk of routing engine.
2442          */
2443         skb_reset_mac_header(skb);
2444         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2445
2446         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2447         skb_dst_set(skb, &rt->dst);
2448
2449         err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2450                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2451                             nlh->nlmsg_seq, 0, 0, 0);
2452         if (err < 0) {
2453                 kfree_skb(skb);
2454                 goto errout;
2455         }
2456
2457         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2458 errout:
2459         return err;
2460 }
2461
2462 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2463 {
2464         struct sk_buff *skb;
2465         struct net *net = info->nl_net;
2466         u32 seq;
2467         int err;
2468
2469         err = -ENOBUFS;
2470         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2471
2472         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2473         if (skb == NULL)
2474                 goto errout;
2475
2476         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2477                                 event, info->pid, seq, 0, 0, 0);
2478         if (err < 0) {
2479                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2480                 WARN_ON(err == -EMSGSIZE);
2481                 kfree_skb(skb);
2482                 goto errout;
2483         }
2484         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2485                     info->nlh, gfp_any());
2486         return;
2487 errout:
2488         if (err < 0)
2489                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2490 }
2491
2492 static int ip6_route_dev_notify(struct notifier_block *this,
2493                                 unsigned long event, void *data)
2494 {
2495         struct net_device *dev = (struct net_device *)data;
2496         struct net *net = dev_net(dev);
2497
2498         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2499                 net->ipv6.ip6_null_entry->dst.dev = dev;
2500                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2501 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2502                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2503                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2504                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2505                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2506 #endif
2507         }
2508
2509         return NOTIFY_OK;
2510 }
2511
2512 /*
2513  *      /proc
2514  */
2515
2516 #ifdef CONFIG_PROC_FS
2517
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure (the /proc handlers below are seq_file based) - confirm
 * whether it is still needed.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2526
2527 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2528 {
2529         struct seq_file *m = p_arg;
2530
2531         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2532
2533 #ifdef CONFIG_IPV6_SUBTREES
2534         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2535 #else
2536         seq_puts(m, "00000000000000000000000000000000 00 ");
2537 #endif
2538
2539         if (rt->rt6i_nexthop) {
2540                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2541         } else {
2542                 seq_puts(m, "00000000000000000000000000000000");
2543         }
2544         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2545                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2546                    rt->dst.__use, rt->rt6i_flags,
2547                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2548         return 0;
2549 }
2550
2551 static int ipv6_route_show(struct seq_file *m, void *v)
2552 {
2553         struct net *net = (struct net *)m->private;
2554         fib6_clean_all(net, rt6_info_route, 0, m);
2555         return 0;
2556 }
2557
2558 static int ipv6_route_open(struct inode *inode, struct file *file)
2559 {
2560         return single_open_net(inode, file, ipv6_route_show);
2561 }
2562
2563 static const struct file_operations ipv6_route_proc_fops = {
2564         .owner          = THIS_MODULE,
2565         .open           = ipv6_route_open,
2566         .read           = seq_read,
2567         .llseek         = seq_lseek,
2568         .release        = single_release_net,
2569 };
2570
2571 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2572 {
2573         struct net *net = (struct net *)seq->private;
2574         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2575                    net->ipv6.rt6_stats->fib_nodes,
2576                    net->ipv6.rt6_stats->fib_route_nodes,
2577                    net->ipv6.rt6_stats->fib_rt_alloc,
2578                    net->ipv6.rt6_stats->fib_rt_entries,
2579                    net->ipv6.rt6_stats->fib_rt_cache,
2580                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2581                    net->ipv6.rt6_stats->fib_discarded_routes);
2582
2583         return 0;
2584 }
2585
2586 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2587 {
2588         return single_open_net(inode, file, rt6_stats_seq_show);
2589 }
2590
2591 static const struct file_operations rt6_stats_seq_fops = {
2592         .owner   = THIS_MODULE,
2593         .open    = rt6_stats_seq_open,
2594         .read    = seq_read,
2595         .llseek  = seq_lseek,
2596         .release = single_release_net,
2597 };
2598 #endif  /* CONFIG_PROC_FS */
2599
2600 #ifdef CONFIG_SYSCTL
2601
2602 static
2603 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2604                               void __user *buffer, size_t *lenp, loff_t *ppos)
2605 {
2606         struct net *net = current->nsproxy->net_ns;
2607         int delay = net->ipv6.sysctl.flush_delay;
2608         if (write) {
2609                 proc_dointvec(ctl, write, buffer, lenp, ppos);
2610                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2611                 return 0;
2612         } else
2613                 return -EINVAL;
2614 }
2615
2616 ctl_table ipv6_route_table_template[] = {
2617         {
2618                 .procname       =       "flush",
2619                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2620                 .maxlen         =       sizeof(int),
2621                 .mode           =       0200,
2622                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2623         },
2624         {
2625                 .procname       =       "gc_thresh",
2626                 .data           =       &ip6_dst_ops_template.gc_thresh,
2627                 .maxlen         =       sizeof(int),
2628                 .mode           =       0644,
2629                 .proc_handler   =       proc_dointvec,
2630         },
2631         {
2632                 .procname       =       "max_size",
2633                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2634                 .maxlen         =       sizeof(int),
2635                 .mode           =       0644,
2636                 .proc_handler   =       proc_dointvec,
2637         },
2638         {
2639                 .procname       =       "gc_min_interval",
2640                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2641                 .maxlen         =       sizeof(int),
2642                 .mode           =       0644,
2643                 .proc_handler   =       proc_dointvec_jiffies,
2644         },
2645         {
2646                 .procname       =       "gc_timeout",
2647                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2648                 .maxlen         =       sizeof(int),
2649                 .mode           =       0644,
2650                 .proc_handler   =       proc_dointvec_jiffies,
2651         },
2652         {
2653                 .procname       =       "gc_interval",
2654                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2655                 .maxlen         =       sizeof(int),
2656                 .mode           =       0644,
2657                 .proc_handler   =       proc_dointvec_jiffies,
2658         },
2659         {
2660                 .procname       =       "gc_elasticity",
2661                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2662                 .maxlen         =       sizeof(int),
2663                 .mode           =       0644,
2664                 .proc_handler   =       proc_dointvec,
2665         },
2666         {
2667                 .procname       =       "mtu_expires",
2668                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2669                 .maxlen         =       sizeof(int),
2670                 .mode           =       0644,
2671                 .proc_handler   =       proc_dointvec_jiffies,
2672         },
2673         {
2674                 .procname       =       "min_adv_mss",
2675                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2676                 .maxlen         =       sizeof(int),
2677                 .mode           =       0644,
2678                 .proc_handler   =       proc_dointvec,
2679         },
2680         {
2681                 .procname       =       "gc_min_interval_ms",
2682                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2683                 .maxlen         =       sizeof(int),
2684                 .mode           =       0644,
2685                 .proc_handler   =       proc_dointvec_ms_jiffies,
2686         },
2687         { }
2688 };
2689
2690 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2691 {
2692         struct ctl_table *table;
2693
2694         table = kmemdup(ipv6_route_table_template,
2695                         sizeof(ipv6_route_table_template),
2696                         GFP_KERNEL);
2697
2698         if (table) {
2699                 table[0].data = &net->ipv6.sysctl.flush_delay;
2700                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2701                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2702                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2703                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2704                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2705                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2706                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2707                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2708                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2709         }
2710
2711         return table;
2712 }
2713 #endif
2714
2715 static int __net_init ip6_route_net_init(struct net *net)
2716 {
2717         int ret = -ENOMEM;
2718
2719         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2720                sizeof(net->ipv6.ip6_dst_ops));
2721
2722         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2723                 goto out_ip6_dst_ops;
2724
2725         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2726                                            sizeof(*net->ipv6.ip6_null_entry),
2727                                            GFP_KERNEL);
2728         if (!net->ipv6.ip6_null_entry)
2729                 goto out_ip6_dst_entries;
2730         net->ipv6.ip6_null_entry->dst.path =
2731                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2732         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2733         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2734                          ip6_template_metrics, true);
2735
2736 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2737         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2738                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2739                                                GFP_KERNEL);
2740         if (!net->ipv6.ip6_prohibit_entry)
2741                 goto out_ip6_null_entry;
2742         net->ipv6.ip6_prohibit_entry->dst.path =
2743                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2744         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2745         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2746                          ip6_template_metrics, true);
2747
2748         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2749                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2750                                                GFP_KERNEL);
2751         if (!net->ipv6.ip6_blk_hole_entry)
2752                 goto out_ip6_prohibit_entry;
2753         net->ipv6.ip6_blk_hole_entry->dst.path =
2754                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2755         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2756         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2757                          ip6_template_metrics, true);
2758 #endif
2759
2760         net->ipv6.sysctl.flush_delay = 0;
2761         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2762         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2763         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2764         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2765         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2766         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2767         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2768
2769 #ifdef CONFIG_PROC_FS
2770         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2771         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2772 #endif
2773         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2774
2775         ret = 0;
2776 out:
2777         return ret;
2778
2779 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2780 out_ip6_prohibit_entry:
2781         kfree(net->ipv6.ip6_prohibit_entry);
2782 out_ip6_null_entry:
2783         kfree(net->ipv6.ip6_null_entry);
2784 #endif
2785 out_ip6_dst_entries:
2786         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2787 out_ip6_dst_ops:
2788         goto out;
2789 }
2790
2791 static void __net_exit ip6_route_net_exit(struct net *net)
2792 {
2793 #ifdef CONFIG_PROC_FS
2794         proc_net_remove(net, "ipv6_route");
2795         proc_net_remove(net, "rt6_stats");
2796 #endif
2797         kfree(net->ipv6.ip6_null_entry);
2798 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2799         kfree(net->ipv6.ip6_prohibit_entry);
2800         kfree(net->ipv6.ip6_blk_hole_entry);
2801 #endif
2802         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2803 }
2804
2805 static struct pernet_operations ip6_route_net_ops = {
2806         .init = ip6_route_net_init,
2807         .exit = ip6_route_net_exit,
2808 };
2809
2810 static struct notifier_block ip6_route_dev_notifier = {
2811         .notifier_call = ip6_route_dev_notify,
2812         .priority = 0,
2813 };
2814
2815 int __init ip6_route_init(void)
2816 {
2817         int ret;
2818
2819         ret = -ENOMEM;
2820         ip6_dst_ops_template.kmem_cachep =
2821                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2822                                   SLAB_HWCACHE_ALIGN, NULL);
2823         if (!ip6_dst_ops_template.kmem_cachep)
2824                 goto out;
2825
2826         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2827         if (ret)
2828                 goto out_kmem_cache;
2829
2830         ret = register_pernet_subsys(&ip6_route_net_ops);
2831         if (ret)
2832                 goto out_dst_entries;
2833
2834         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2835
2836         /* Registering of the loopback is done before this portion of code,
2837          * the loopback reference in rt6_info will not be taken, do it
2838          * manually for init_net */
2839         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2840         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2841   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2842         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2843         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2844         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2845         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2846   #endif
2847         ret = fib6_init();
2848         if (ret)
2849                 goto out_register_subsys;
2850
2851         ret = xfrm6_init();
2852         if (ret)
2853                 goto out_fib6_init;
2854
2855         ret = fib6_rules_init();
2856         if (ret)
2857                 goto xfrm6_init;
2858
2859         ret = -ENOBUFS;
2860         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2861             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2862             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2863                 goto fib6_rules_init;
2864
2865         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2866         if (ret)
2867                 goto fib6_rules_init;
2868
2869 out:
2870         return ret;
2871
2872 fib6_rules_init:
2873         fib6_rules_cleanup();
2874 xfrm6_init:
2875         xfrm6_fini();
2876 out_fib6_init:
2877         fib6_gc_cleanup();
2878 out_register_subsys:
2879         unregister_pernet_subsys(&ip6_route_net_ops);
2880 out_dst_entries:
2881         dst_entries_destroy(&ip6_dst_blackhole_ops);
2882 out_kmem_cache:
2883         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2884         goto out;
2885 }
2886
2887 void ip6_route_cleanup(void)
2888 {
2889         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2890         fib6_rules_cleanup();
2891         xfrm6_fini();
2892         fib6_gc_cleanup();
2893         unregister_pernet_subsys(&ip6_route_net_ops);
2894         dst_entries_destroy(&ip6_dst_blackhole_ops);
2895         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2896 }