Pileus Git - ~andy/linux/blob - net/ipv6/route.c
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[~andy/linux] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/export.h>
30 #include <linux/types.h>
31 #include <linux/times.h>
32 #include <linux/socket.h>
33 #include <linux/sockios.h>
34 #include <linux/net.h>
35 #include <linux/route.h>
36 #include <linux/netdevice.h>
37 #include <linux/in6.h>
38 #include <linux/mroute6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <linux/slab.h>
45 #include <net/net_namespace.h>
46 #include <net/snmp.h>
47 #include <net/ipv6.h>
48 #include <net/ip6_fib.h>
49 #include <net/ip6_route.h>
50 #include <net/ndisc.h>
51 #include <net/addrconf.h>
52 #include <net/tcp.h>
53 #include <linux/rtnetlink.h>
54 #include <net/dst.h>
55 #include <net/xfrm.h>
56 #include <net/netevent.h>
57 #include <net/netlink.h>
58
59 #include <asm/uaccess.h>
60
61 #ifdef CONFIG_SYSCTL
62 #include <linux/sysctl.h>
63 #endif
64
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() forward to printk; below that both expand to nothing.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif
75
76 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
77                                     const struct in6_addr *dest);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
80 static unsigned int      ip6_mtu(const struct dst_entry *dst);
81 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
82 static void             ip6_dst_destroy(struct dst_entry *);
83 static void             ip6_dst_ifdown(struct dst_entry *,
84                                        struct net_device *dev, int how);
85 static int               ip6_dst_gc(struct dst_ops *ops);
86
87 static int              ip6_pkt_discard(struct sk_buff *skb);
88 static int              ip6_pkt_discard_out(struct sk_buff *skb);
89 static void             ip6_link_failure(struct sk_buff *skb);
90 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
91
92 #ifdef CONFIG_IPV6_ROUTE_INFO
93 static struct rt6_info *rt6_add_route_info(struct net *net,
94                                            const struct in6_addr *prefix, int prefixlen,
95                                            const struct in6_addr *gwaddr, int ifindex,
96                                            unsigned pref);
97 static struct rt6_info *rt6_get_route_info(struct net *net,
98                                            const struct in6_addr *prefix, int prefixlen,
99                                            const struct in6_addr *gwaddr, int ifindex);
100 #endif
101
102 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
103 {
104         struct rt6_info *rt = (struct rt6_info *) dst;
105         struct inet_peer *peer;
106         u32 *p = NULL;
107
108         if (!(rt->dst.flags & DST_HOST))
109                 return NULL;
110
111         if (!rt->rt6i_peer)
112                 rt6_bind_peer(rt, 1);
113
114         peer = rt->rt6i_peer;
115         if (peer) {
116                 u32 *old_p = __DST_METRICS_PTR(old);
117                 unsigned long prev, new;
118
119                 p = peer->metrics;
120                 if (inet_metrics_new(peer))
121                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
122
123                 new = (unsigned long) p;
124                 prev = cmpxchg(&dst->_metrics, old, new);
125
126                 if (prev != old) {
127                         p = __DST_METRICS_PTR(prev);
128                         if (prev & DST_METRICS_READ_ONLY)
129                                 p = NULL;
130                 }
131         }
132         return p;
133 }
134
135 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
136 {
137         return __neigh_lookup_errno(&nd_tbl, daddr, dst->dev);
138 }
139
140 static struct dst_ops ip6_dst_ops_template = {
141         .family                 =       AF_INET6,
142         .protocol               =       cpu_to_be16(ETH_P_IPV6),
143         .gc                     =       ip6_dst_gc,
144         .gc_thresh              =       1024,
145         .check                  =       ip6_dst_check,
146         .default_advmss         =       ip6_default_advmss,
147         .mtu                    =       ip6_mtu,
148         .cow_metrics            =       ipv6_cow_metrics,
149         .destroy                =       ip6_dst_destroy,
150         .ifdown                 =       ip6_dst_ifdown,
151         .negative_advice        =       ip6_negative_advice,
152         .link_failure           =       ip6_link_failure,
153         .update_pmtu            =       ip6_rt_update_pmtu,
154         .local_out              =       __ip6_local_out,
155         .neigh_lookup           =       ip6_neigh_lookup,
156 };
157
158 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159 {
160         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
161
162         return mtu ? : dst->dev->mtu;
163 }
164
165 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
166 {
167 }
168
169 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
170                                          unsigned long old)
171 {
172         return NULL;
173 }
174
175 static struct dst_ops ip6_dst_blackhole_ops = {
176         .family                 =       AF_INET6,
177         .protocol               =       cpu_to_be16(ETH_P_IPV6),
178         .destroy                =       ip6_dst_destroy,
179         .check                  =       ip6_dst_check,
180         .mtu                    =       ip6_blackhole_mtu,
181         .default_advmss         =       ip6_default_advmss,
182         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
183         .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
184         .neigh_lookup           =       ip6_neigh_lookup,
185 };
186
187 static const u32 ip6_template_metrics[RTAX_MAX] = {
188         [RTAX_HOPLIMIT - 1] = 255,
189 };
190
191 static struct rt6_info ip6_null_entry_template = {
192         .dst = {
193                 .__refcnt       = ATOMIC_INIT(1),
194                 .__use          = 1,
195                 .obsolete       = -1,
196                 .error          = -ENETUNREACH,
197                 .input          = ip6_pkt_discard,
198                 .output         = ip6_pkt_discard_out,
199         },
200         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
201         .rt6i_protocol  = RTPROT_KERNEL,
202         .rt6i_metric    = ~(u32) 0,
203         .rt6i_ref       = ATOMIC_INIT(1),
204 };
205
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Template for the "prohibit" route entry: a reject route without a next
 * hop whose dst reports -EACCES and uses the ip6_pkt_prohibit handlers.
 */
static struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
};

/*
 * Template for the "blackhole" route entry: a reject route without a next
 * hop whose dst reports -EINVAL and passes packets to dst_discard in both
 * directions.
 */
static struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
	},
	.rt6i_ref	= ATOMIC_INIT(1),
	.rt6i_protocol	= RTPROT_KERNEL,
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_metric	= ~(u32) 0,
};

#endif
242
243 /* allocate dst with ip6_dst_ops */
244 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245                                              struct net_device *dev,
246                                              int flags)
247 {
248         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249
250         if (rt != NULL)
251                 memset(&rt->rt6i_table, 0,
252                         sizeof(*rt) - sizeof(struct dst_entry));
253
254         return rt;
255 }
256
257 static void ip6_dst_destroy(struct dst_entry *dst)
258 {
259         struct rt6_info *rt = (struct rt6_info *)dst;
260         struct inet6_dev *idev = rt->rt6i_idev;
261         struct inet_peer *peer = rt->rt6i_peer;
262
263         if (!(rt->dst.flags & DST_HOST))
264                 dst_destroy_metrics_generic(dst);
265
266         if (idev != NULL) {
267                 rt->rt6i_idev = NULL;
268                 in6_dev_put(idev);
269         }
270         if (peer) {
271                 rt->rt6i_peer = NULL;
272                 inet_putpeer(peer);
273         }
274 }
275
276 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
277
278 static u32 rt6_peer_genid(void)
279 {
280         return atomic_read(&__rt6_peer_genid);
281 }
282
283 void rt6_bind_peer(struct rt6_info *rt, int create)
284 {
285         struct inet_peer *peer;
286
287         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
288         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
289                 inet_putpeer(peer);
290         else
291                 rt->rt6i_peer_genid = rt6_peer_genid();
292 }
293
294 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
295                            int how)
296 {
297         struct rt6_info *rt = (struct rt6_info *)dst;
298         struct inet6_dev *idev = rt->rt6i_idev;
299         struct net_device *loopback_dev =
300                 dev_net(dev)->loopback_dev;
301
302         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
303                 struct inet6_dev *loopback_idev =
304                         in6_dev_get(loopback_dev);
305                 if (loopback_idev != NULL) {
306                         rt->rt6i_idev = loopback_idev;
307                         in6_dev_put(idev);
308                 }
309         }
310 }
311
312 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
313 {
314         return (rt->rt6i_flags & RTF_EXPIRES) &&
315                 time_after(jiffies, rt->rt6i_expires);
316 }
317
318 static inline int rt6_need_strict(const struct in6_addr *daddr)
319 {
320         return ipv6_addr_type(daddr) &
321                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
322 }
323
324 /*
325  *      Route lookup. Any table->tb6_lock is implied.
326  */
327
328 static inline struct rt6_info *rt6_device_match(struct net *net,
329                                                     struct rt6_info *rt,
330                                                     const struct in6_addr *saddr,
331                                                     int oif,
332                                                     int flags)
333 {
334         struct rt6_info *local = NULL;
335         struct rt6_info *sprt;
336
337         if (!oif && ipv6_addr_any(saddr))
338                 goto out;
339
340         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341                 struct net_device *dev = sprt->rt6i_dev;
342
343                 if (oif) {
344                         if (dev->ifindex == oif)
345                                 return sprt;
346                         if (dev->flags & IFF_LOOPBACK) {
347                                 if (sprt->rt6i_idev == NULL ||
348                                     sprt->rt6i_idev->dev->ifindex != oif) {
349                                         if (flags & RT6_LOOKUP_F_IFACE && oif)
350                                                 continue;
351                                         if (local && (!oif ||
352                                                       local->rt6i_idev->dev->ifindex == oif))
353                                                 continue;
354                                 }
355                                 local = sprt;
356                         }
357                 } else {
358                         if (ipv6_chk_addr(net, saddr, dev,
359                                           flags & RT6_LOOKUP_F_IFACE))
360                                 return sprt;
361                 }
362         }
363
364         if (oif) {
365                 if (local)
366                         return local;
367
368                 if (flags & RT6_LOOKUP_F_IFACE)
369                         return net->ipv6.ip6_null_entry;
370         }
371 out:
372         return rt;
373 }
374
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing for @rt (may be NULL).
 *
 * If the route's neighbour entry exists and is not in a NUD_VALID state,
 * and at least rtr_probe_interval jiffies have passed since the entry was
 * last updated, send a Neighbour Solicitation to the solicited-node
 * multicast address of the neighbour.
 *
 * Router Reachability Probe MUST be rate-limited to no more than one per
 * minute.  The rate limit is enforced through neigh->updated, which this
 * function both tests and rewrites.  That test-and-set is done under the
 * write side of neigh->lock: with only the read side held, several CPUs
 * could pass the check concurrently, all send a probe, and race on the
 * store to neigh->updated.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
	/* Cheap lockless pre-check; re-tested under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		/* Claim this probe slot before dropping the lock. */
		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF no probing is done. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
414
415 /*
416  * Default Router Selection (RFC 2461 6.3.6)
417  */
418 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 {
420         struct net_device *dev = rt->rt6i_dev;
421         if (!oif || dev->ifindex == oif)
422                 return 2;
423         if ((dev->flags & IFF_LOOPBACK) &&
424             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
425                 return 1;
426         return 0;
427 }
428
429 static inline int rt6_check_neigh(struct rt6_info *rt)
430 {
431         struct neighbour *neigh;
432         int m;
433
434         rcu_read_lock();
435         neigh = dst_get_neighbour(&rt->dst);
436         if (rt->rt6i_flags & RTF_NONEXTHOP ||
437             !(rt->rt6i_flags & RTF_GATEWAY))
438                 m = 1;
439         else if (neigh) {
440                 read_lock_bh(&neigh->lock);
441                 if (neigh->nud_state & NUD_VALID)
442                         m = 2;
443 #ifdef CONFIG_IPV6_ROUTER_PREF
444                 else if (neigh->nud_state & NUD_FAILED)
445                         m = 0;
446 #endif
447                 else
448                         m = 1;
449                 read_unlock_bh(&neigh->lock);
450         } else
451                 m = 0;
452         rcu_read_unlock();
453         return m;
454 }
455
456 static int rt6_score_route(struct rt6_info *rt, int oif,
457                            int strict)
458 {
459         int m, n;
460
461         m = rt6_check_dev(rt, oif);
462         if (!m && (strict & RT6_LOOKUP_F_IFACE))
463                 return -1;
464 #ifdef CONFIG_IPV6_ROUTER_PREF
465         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
466 #endif
467         n = rt6_check_neigh(rt);
468         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469                 return -1;
470         return m;
471 }
472
473 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
474                                    int *mpri, struct rt6_info *match)
475 {
476         int m;
477
478         if (rt6_check_expired(rt))
479                 goto out;
480
481         m = rt6_score_route(rt, oif, strict);
482         if (m < 0)
483                 goto out;
484
485         if (m > *mpri) {
486                 if (strict & RT6_LOOKUP_F_REACHABLE)
487                         rt6_probe(match);
488                 *mpri = m;
489                 match = rt;
490         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
491                 rt6_probe(rt);
492         }
493
494 out:
495         return match;
496 }
497
498 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
499                                      struct rt6_info *rr_head,
500                                      u32 metric, int oif, int strict)
501 {
502         struct rt6_info *rt, *match;
503         int mpri = -1;
504
505         match = NULL;
506         for (rt = rr_head; rt && rt->rt6i_metric == metric;
507              rt = rt->dst.rt6_next)
508                 match = find_match(rt, oif, strict, &mpri, match);
509         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
510              rt = rt->dst.rt6_next)
511                 match = find_match(rt, oif, strict, &mpri, match);
512
513         return match;
514 }
515
516 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
517 {
518         struct rt6_info *match, *rt0;
519         struct net *net;
520
521         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522                   __func__, fn->leaf, oif);
523
524         rt0 = fn->rr_ptr;
525         if (!rt0)
526                 fn->rr_ptr = rt0 = fn->leaf;
527
528         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
529
530         if (!match &&
531             (strict & RT6_LOOKUP_F_REACHABLE)) {
532                 struct rt6_info *next = rt0->dst.rt6_next;
533
534                 /* no entries matched; do round-robin */
535                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
536                         next = fn->leaf;
537
538                 if (next != rt0)
539                         fn->rr_ptr = next;
540         }
541
542         RT6_TRACE("%s() => %p\n",
543                   __func__, match);
544
545         net = dev_net(rt0->rt6i_dev);
546         return match ? match : net->ipv6.ip6_null_entry;
547 }
548
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the router that sent it
 *
 * Returns -EINVAL for a malformed option (too short, inconsistent
 * length/prefix_len, or an invalid preference) and 0 otherwise.
 *
 * A zero lifetime deletes an existing matching route.  A non-zero lifetime
 * creates the route if it is missing, or refreshes the flags/preference of
 * an existing one.  The route's expiry is then set from the lifetime, or
 * cleared when the lifetime is not finite.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr masked_prefix, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/*
	 * Sanity check for prefix_len and length.
	 * NOTE(review): confirm these minimum lengths against the option
	 * layout defined by the specification.
	 */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;

	if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2)
			return -EINVAL;
	} else if (rinfo->prefix_len > 0 && rinfo->length < 1) {
		return -EINVAL;
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* Full 128-bit prefix present: use it in place. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&masked_prefix,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &masked_prefix;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime)) {
		rt->rt6i_expires = jiffies + HZ * lifetime;
		rt->rt6i_flags |= RTF_EXPIRES;
	} else {
		rt->rt6i_flags &= ~RTF_EXPIRES;
	}

	dst_release(&rt->dst);
	return 0;
}
#endif
622
/*
 * BACKTRACK - climb back up the fib6 tree after a lookup hit the null entry.
 *
 * Expands inside a lookup function and depends on that function providing
 * the locals "rt" and "fn" plus the labels "out" and "restart".
 *
 * When rt is the namespace's null entry, fn is moved towards the root one
 * step at a time: to the parent, or, when the parent has a subtree other
 * than fn, to the result of a fib6_lookup() by @saddr in that subtree.
 * Reaching a node flagged RTN_TL_ROOT jumps to "out"; reaching a node
 * flagged RTN_RTINFO jumps to "restart".
 */
#define BACKTRACK(__net, saddr)			\
do { \
	if (rt == (__net)->ipv6.ip6_null_entry) { \
		struct fib6_node *pn; \
		for (;;) { \
			if (fn->fn_flags & RTN_TL_ROOT) \
				goto out; \
			pn = fn->parent; \
			if (!FIB6_SUBTREE(pn) || FIB6_SUBTREE(pn) == fn) \
				fn = pn; \
			else \
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			if (fn->fn_flags & RTN_RTINFO) \
				goto restart; \
		} \
	} \
} while (0)
640
641 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
642                                              struct fib6_table *table,
643                                              struct flowi6 *fl6, int flags)
644 {
645         struct fib6_node *fn;
646         struct rt6_info *rt;
647
648         read_lock_bh(&table->tb6_lock);
649         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
650 restart:
651         rt = fn->leaf;
652         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
653         BACKTRACK(net, &fl6->saddr);
654 out:
655         dst_use(&rt->dst, jiffies);
656         read_unlock_bh(&table->tb6_lock);
657         return rt;
658
659 }
660
661 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
662                             const struct in6_addr *saddr, int oif, int strict)
663 {
664         struct flowi6 fl6 = {
665                 .flowi6_oif = oif,
666                 .daddr = *daddr,
667         };
668         struct dst_entry *dst;
669         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
670
671         if (saddr) {
672                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673                 flags |= RT6_LOOKUP_F_HAS_SADDR;
674         }
675
676         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
677         if (dst->error == 0)
678                 return (struct rt6_info *) dst;
679
680         dst_release(dst);
681
682         return NULL;
683 }
684
685 EXPORT_SYMBOL(rt6_lookup);
686
687 /* ip6_ins_rt is called with FREE table->tb6_lock.
688    It takes new route entry, the addition fails by any reason the
689    route is freed. In any case, if caller does not hold it, it may
690    be destroyed.
691  */
692
693 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
694 {
695         int err;
696         struct fib6_table *table;
697
698         table = rt->rt6i_table;
699         write_lock_bh(&table->tb6_lock);
700         err = fib6_add(&table->tb6_root, rt, info);
701         write_unlock_bh(&table->tb6_lock);
702
703         return err;
704 }
705
706 int ip6_ins_rt(struct rt6_info *rt)
707 {
708         struct nl_info info = {
709                 .nl_net = dev_net(rt->rt6i_dev),
710         };
711         return __ip6_ins_rt(rt, &info);
712 }
713
714 static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
715                                       const struct in6_addr *daddr,
716                                       const struct in6_addr *saddr)
717 {
718         struct rt6_info *rt;
719
720         /*
721          *      Clone the route.
722          */
723
724         rt = ip6_rt_copy(ort, daddr);
725
726         if (rt) {
727                 struct neighbour *neigh;
728                 int attempts = !in_softirq();
729
730                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
731                         if (rt->rt6i_dst.plen != 128 &&
732                             ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733                                 rt->rt6i_flags |= RTF_ANYCAST;
734                         rt->rt6i_gateway = *daddr;
735                 }
736
737                 rt->rt6i_flags |= RTF_CACHE;
738
739 #ifdef CONFIG_IPV6_SUBTREES
740                 if (rt->rt6i_src.plen && saddr) {
741                         rt->rt6i_src.addr = *saddr;
742                         rt->rt6i_src.plen = 128;
743                 }
744 #endif
745
746         retry:
747                 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
748                 if (IS_ERR(neigh)) {
749                         struct net *net = dev_net(rt->rt6i_dev);
750                         int saved_rt_min_interval =
751                                 net->ipv6.sysctl.ip6_rt_gc_min_interval;
752                         int saved_rt_elasticity =
753                                 net->ipv6.sysctl.ip6_rt_gc_elasticity;
754
755                         if (attempts-- > 0) {
756                                 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
757                                 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
758
759                                 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
760
761                                 net->ipv6.sysctl.ip6_rt_gc_elasticity =
762                                         saved_rt_elasticity;
763                                 net->ipv6.sysctl.ip6_rt_gc_min_interval =
764                                         saved_rt_min_interval;
765                                 goto retry;
766                         }
767
768                         if (net_ratelimit())
769                                 printk(KERN_WARNING
770                                        "ipv6: Neighbour table overflow.\n");
771                         dst_free(&rt->dst);
772                         return NULL;
773                 }
774                 dst_set_neighbour(&rt->dst, neigh);
775
776         }
777
778         return rt;
779 }
780
781 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
782                                         const struct in6_addr *daddr)
783 {
784         struct rt6_info *rt = ip6_rt_copy(ort, daddr);
785
786         if (rt) {
787                 rt->rt6i_flags |= RTF_CACHE;
788                 dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
789         }
790         return rt;
791 }
792
793 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
794                                       struct flowi6 *fl6, int flags)
795 {
796         struct fib6_node *fn;
797         struct rt6_info *rt, *nrt;
798         int strict = 0;
799         int attempts = 3;
800         int err;
801         int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
802
803         strict |= flags & RT6_LOOKUP_F_IFACE;
804
805 relookup:
806         read_lock_bh(&table->tb6_lock);
807
808 restart_2:
809         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
810
811 restart:
812         rt = rt6_select(fn, oif, strict | reachable);
813
814         BACKTRACK(net, &fl6->saddr);
815         if (rt == net->ipv6.ip6_null_entry ||
816             rt->rt6i_flags & RTF_CACHE)
817                 goto out;
818
819         dst_hold(&rt->dst);
820         read_unlock_bh(&table->tb6_lock);
821
822         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
823                 nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
824         else if (!(rt->dst.flags & DST_HOST))
825                 nrt = rt6_alloc_clone(rt, &fl6->daddr);
826         else
827                 goto out2;
828
829         dst_release(&rt->dst);
830         rt = nrt ? : net->ipv6.ip6_null_entry;
831
832         dst_hold(&rt->dst);
833         if (nrt) {
834                 err = ip6_ins_rt(nrt);
835                 if (!err)
836                         goto out2;
837         }
838
839         if (--attempts <= 0)
840                 goto out2;
841
842         /*
843          * Race condition! In the gap, when table->tb6_lock was
844          * released someone could insert this route.  Relookup.
845          */
846         dst_release(&rt->dst);
847         goto relookup;
848
849 out:
850         if (reachable) {
851                 reachable = 0;
852                 goto restart_2;
853         }
854         dst_hold(&rt->dst);
855         read_unlock_bh(&table->tb6_lock);
856 out2:
857         rt->dst.lastuse = jiffies;
858         rt->dst.__use++;
859
860         return rt;
861 }
862
863 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
864                                             struct flowi6 *fl6, int flags)
865 {
866         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
867 }
868
869 void ip6_route_input(struct sk_buff *skb)
870 {
871         const struct ipv6hdr *iph = ipv6_hdr(skb);
872         struct net *net = dev_net(skb->dev);
873         int flags = RT6_LOOKUP_F_HAS_SADDR;
874         struct flowi6 fl6 = {
875                 .flowi6_iif = skb->dev->ifindex,
876                 .daddr = iph->daddr,
877                 .saddr = iph->saddr,
878                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
879                 .flowi6_mark = skb->mark,
880                 .flowi6_proto = iph->nexthdr,
881         };
882
883         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
884                 flags |= RT6_LOOKUP_F_IFACE;
885
886         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
887 }
888
889 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
890                                              struct flowi6 *fl6, int flags)
891 {
892         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
893 }
894
895 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
896                                     struct flowi6 *fl6)
897 {
898         int flags = 0;
899
900         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
901                 flags |= RT6_LOOKUP_F_IFACE;
902
903         if (!ipv6_addr_any(&fl6->saddr))
904                 flags |= RT6_LOOKUP_F_HAS_SADDR;
905         else if (sk)
906                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
907
908         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
909 }
910
911 EXPORT_SYMBOL(ip6_route_output);
912
913 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
914 {
915         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
916         struct dst_entry *new = NULL;
917
918         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
919         if (rt) {
920                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
921
922                 new = &rt->dst;
923
924                 new->__use = 1;
925                 new->input = dst_discard;
926                 new->output = dst_discard;
927
928                 if (dst_metrics_read_only(&ort->dst))
929                         new->_metrics = ort->dst._metrics;
930                 else
931                         dst_copy_metrics(new, &ort->dst);
932                 rt->rt6i_idev = ort->rt6i_idev;
933                 if (rt->rt6i_idev)
934                         in6_dev_hold(rt->rt6i_idev);
935                 rt->rt6i_expires = 0;
936
937                 rt->rt6i_gateway = ort->rt6i_gateway;
938                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
939                 rt->rt6i_metric = 0;
940
941                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
942 #ifdef CONFIG_IPV6_SUBTREES
943                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
944 #endif
945
946                 dst_free(new);
947         }
948
949         dst_release(dst_orig);
950         return new ? new : ERR_PTR(-ENOMEM);
951 }
952
953 /*
954  *      Destination cache support functions
955  */
956
957 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
958 {
959         struct rt6_info *rt;
960
961         rt = (struct rt6_info *) dst;
962
963         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
964                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
965                         if (!rt->rt6i_peer)
966                                 rt6_bind_peer(rt, 0);
967                         rt->rt6i_peer_genid = rt6_peer_genid();
968                 }
969                 return dst;
970         }
971         return NULL;
972 }
973
974 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
975 {
976         struct rt6_info *rt = (struct rt6_info *) dst;
977
978         if (rt) {
979                 if (rt->rt6i_flags & RTF_CACHE) {
980                         if (rt6_check_expired(rt)) {
981                                 ip6_del_rt(rt);
982                                 dst = NULL;
983                         }
984                 } else {
985                         dst_release(dst);
986                         dst = NULL;
987                 }
988         }
989         return dst;
990 }
991
992 static void ip6_link_failure(struct sk_buff *skb)
993 {
994         struct rt6_info *rt;
995
996         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
997
998         rt = (struct rt6_info *) skb_dst(skb);
999         if (rt) {
1000                 if (rt->rt6i_flags&RTF_CACHE) {
1001                         dst_set_expires(&rt->dst, 0);
1002                         rt->rt6i_flags |= RTF_EXPIRES;
1003                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
1004                         rt->rt6i_node->fn_sernum = -1;
1005         }
1006 }
1007
1008 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1009 {
1010         struct rt6_info *rt6 = (struct rt6_info*)dst;
1011
1012         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
1013                 rt6->rt6i_flags |= RTF_MODIFIED;
1014                 if (mtu < IPV6_MIN_MTU) {
1015                         u32 features = dst_metric(dst, RTAX_FEATURES);
1016                         mtu = IPV6_MIN_MTU;
1017                         features |= RTAX_FEATURE_ALLFRAG;
1018                         dst_metric_set(dst, RTAX_FEATURES, features);
1019                 }
1020                 dst_metric_set(dst, RTAX_MTU, mtu);
1021         }
1022 }
1023
1024 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1025 {
1026         struct net_device *dev = dst->dev;
1027         unsigned int mtu = dst_mtu(dst);
1028         struct net *net = dev_net(dev);
1029
1030         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1031
1032         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1033                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1034
1035         /*
1036          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1037          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1038          * IPV6_MAXPLEN is also valid and means: "any MSS,
1039          * rely only on pmtu discovery"
1040          */
1041         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1042                 mtu = IPV6_MAXPLEN;
1043         return mtu;
1044 }
1045
1046 static unsigned int ip6_mtu(const struct dst_entry *dst)
1047 {
1048         struct inet6_dev *idev;
1049         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
1050
1051         if (mtu)
1052                 return mtu;
1053
1054         mtu = IPV6_MIN_MTU;
1055
1056         rcu_read_lock();
1057         idev = __in6_dev_get(dst->dev);
1058         if (idev)
1059                 mtu = idev->cnf.mtu6;
1060         rcu_read_unlock();
1061
1062         return mtu;
1063 }
1064
/*
 * Singly linked list (chained through dst.next) of the entries created by
 * icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() walk and prune
 * it; every access in this file is made with icmp6_dst_lock held.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1067
1068 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1069                                   struct neighbour *neigh,
1070                                   const struct in6_addr *addr)
1071 {
1072         struct rt6_info *rt;
1073         struct inet6_dev *idev = in6_dev_get(dev);
1074         struct net *net = dev_net(dev);
1075
1076         if (unlikely(idev == NULL))
1077                 return NULL;
1078
1079         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1080         if (unlikely(rt == NULL)) {
1081                 in6_dev_put(idev);
1082                 goto out;
1083         }
1084
1085         if (neigh)
1086                 neigh_hold(neigh);
1087         else {
1088                 neigh = ndisc_get_neigh(dev, addr);
1089                 if (IS_ERR(neigh))
1090                         neigh = NULL;
1091         }
1092
1093         rt->dst.flags |= DST_HOST;
1094         rt->dst.output  = ip6_output;
1095         dst_set_neighbour(&rt->dst, neigh);
1096         atomic_set(&rt->dst.__refcnt, 1);
1097         rt->rt6i_dst.addr = *addr;
1098         rt->rt6i_dst.plen = 128;
1099         rt->rt6i_idev     = idev;
1100         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1101
1102         spin_lock_bh(&icmp6_dst_lock);
1103         rt->dst.next = icmp6_dst_gc_list;
1104         icmp6_dst_gc_list = &rt->dst;
1105         spin_unlock_bh(&icmp6_dst_lock);
1106
1107         fib6_force_start_gc(net);
1108
1109 out:
1110         return &rt->dst;
1111 }
1112
1113 int icmp6_dst_gc(void)
1114 {
1115         struct dst_entry *dst, **pprev;
1116         int more = 0;
1117
1118         spin_lock_bh(&icmp6_dst_lock);
1119         pprev = &icmp6_dst_gc_list;
1120
1121         while ((dst = *pprev) != NULL) {
1122                 if (!atomic_read(&dst->__refcnt)) {
1123                         *pprev = dst->next;
1124                         dst_free(dst);
1125                 } else {
1126                         pprev = &dst->next;
1127                         ++more;
1128                 }
1129         }
1130
1131         spin_unlock_bh(&icmp6_dst_lock);
1132
1133         return more;
1134 }
1135
1136 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1137                             void *arg)
1138 {
1139         struct dst_entry *dst, **pprev;
1140
1141         spin_lock_bh(&icmp6_dst_lock);
1142         pprev = &icmp6_dst_gc_list;
1143         while ((dst = *pprev) != NULL) {
1144                 struct rt6_info *rt = (struct rt6_info *) dst;
1145                 if (func(rt, arg)) {
1146                         *pprev = dst->next;
1147                         dst_free(dst);
1148                 } else {
1149                         pprev = &dst->next;
1150                 }
1151         }
1152         spin_unlock_bh(&icmp6_dst_lock);
1153 }
1154
1155 static int ip6_dst_gc(struct dst_ops *ops)
1156 {
1157         unsigned long now = jiffies;
1158         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1159         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1160         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1161         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1162         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1163         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1164         int entries;
1165
1166         entries = dst_entries_get_fast(ops);
1167         if (time_after(rt_last_gc + rt_min_interval, now) &&
1168             entries <= rt_max_size)
1169                 goto out;
1170
1171         net->ipv6.ip6_rt_gc_expire++;
1172         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1173         net->ipv6.ip6_rt_last_gc = now;
1174         entries = dst_entries_get_slow(ops);
1175         if (entries < ops->gc_thresh)
1176                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1177 out:
1178         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1179         return entries > rt_max_size;
1180 }
1181
1182 /* Clean host part of a prefix. Not necessary in radix tree,
1183    but results in cleaner routing tables.
1184
1185    Remove it only when all the things will work!
1186  */
1187
1188 int ip6_dst_hoplimit(struct dst_entry *dst)
1189 {
1190         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1191         if (hoplimit == 0) {
1192                 struct net_device *dev = dst->dev;
1193                 struct inet6_dev *idev;
1194
1195                 rcu_read_lock();
1196                 idev = __in6_dev_get(dev);
1197                 if (idev)
1198                         hoplimit = idev->cnf.hop_limit;
1199                 else
1200                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1201                 rcu_read_unlock();
1202         }
1203         return hoplimit;
1204 }
1205 EXPORT_SYMBOL(ip6_dst_hoplimit);
1206
1207 /*
1208  *
1209  */
1210
1211 int ip6_route_add(struct fib6_config *cfg)
1212 {
1213         int err;
1214         struct net *net = cfg->fc_nlinfo.nl_net;
1215         struct rt6_info *rt = NULL;
1216         struct net_device *dev = NULL;
1217         struct inet6_dev *idev = NULL;
1218         struct fib6_table *table;
1219         int addr_type;
1220
1221         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1222                 return -EINVAL;
1223 #ifndef CONFIG_IPV6_SUBTREES
1224         if (cfg->fc_src_len)
1225                 return -EINVAL;
1226 #endif
1227         if (cfg->fc_ifindex) {
1228                 err = -ENODEV;
1229                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1230                 if (!dev)
1231                         goto out;
1232                 idev = in6_dev_get(dev);
1233                 if (!idev)
1234                         goto out;
1235         }
1236
1237         if (cfg->fc_metric == 0)
1238                 cfg->fc_metric = IP6_RT_PRIO_USER;
1239
1240         err = -ENOBUFS;
1241         if (NULL != cfg->fc_nlinfo.nlh &&
1242             !(cfg->fc_nlinfo.nlh->nlmsg_flags&NLM_F_CREATE)) {
1243                 table = fib6_get_table(net, cfg->fc_table);
1244                 if (table == NULL) {
1245                         printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
1246                         table = fib6_new_table(net, cfg->fc_table);
1247                 }
1248         } else {
1249                 table = fib6_new_table(net, cfg->fc_table);
1250         }
1251         if (table == NULL) {
1252                 goto out;
1253         }
1254
1255         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
1256
1257         if (rt == NULL) {
1258                 err = -ENOMEM;
1259                 goto out;
1260         }
1261
1262         rt->dst.obsolete = -1;
1263         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1264                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1265                                 0;
1266
1267         if (cfg->fc_protocol == RTPROT_UNSPEC)
1268                 cfg->fc_protocol = RTPROT_BOOT;
1269         rt->rt6i_protocol = cfg->fc_protocol;
1270
1271         addr_type = ipv6_addr_type(&cfg->fc_dst);
1272
1273         if (addr_type & IPV6_ADDR_MULTICAST)
1274                 rt->dst.input = ip6_mc_input;
1275         else if (cfg->fc_flags & RTF_LOCAL)
1276                 rt->dst.input = ip6_input;
1277         else
1278                 rt->dst.input = ip6_forward;
1279
1280         rt->dst.output = ip6_output;
1281
1282         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1283         rt->rt6i_dst.plen = cfg->fc_dst_len;
1284         if (rt->rt6i_dst.plen == 128)
1285                rt->dst.flags |= DST_HOST;
1286
1287         if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
1288                 u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1289                 if (!metrics) {
1290                         err = -ENOMEM;
1291                         goto out;
1292                 }
1293                 dst_init_metrics(&rt->dst, metrics, 0);
1294         }
1295 #ifdef CONFIG_IPV6_SUBTREES
1296         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1297         rt->rt6i_src.plen = cfg->fc_src_len;
1298 #endif
1299
1300         rt->rt6i_metric = cfg->fc_metric;
1301
1302         /* We cannot add true routes via loopback here,
1303            they would result in kernel looping; promote them to reject routes
1304          */
1305         if ((cfg->fc_flags & RTF_REJECT) ||
1306             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1307                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1308                 /* hold loopback dev/idev if we haven't done so. */
1309                 if (dev != net->loopback_dev) {
1310                         if (dev) {
1311                                 dev_put(dev);
1312                                 in6_dev_put(idev);
1313                         }
1314                         dev = net->loopback_dev;
1315                         dev_hold(dev);
1316                         idev = in6_dev_get(dev);
1317                         if (!idev) {
1318                                 err = -ENODEV;
1319                                 goto out;
1320                         }
1321                 }
1322                 rt->dst.output = ip6_pkt_discard_out;
1323                 rt->dst.input = ip6_pkt_discard;
1324                 rt->dst.error = -ENETUNREACH;
1325                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1326                 goto install_route;
1327         }
1328
1329         if (cfg->fc_flags & RTF_GATEWAY) {
1330                 const struct in6_addr *gw_addr;
1331                 int gwa_type;
1332
1333                 gw_addr = &cfg->fc_gateway;
1334                 rt->rt6i_gateway = *gw_addr;
1335                 gwa_type = ipv6_addr_type(gw_addr);
1336
1337                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1338                         struct rt6_info *grt;
1339
1340                         /* IPv6 strictly inhibits using not link-local
1341                            addresses as nexthop address.
1342                            Otherwise, router will not able to send redirects.
1343                            It is very good, but in some (rare!) circumstances
1344                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1345                            some exceptions. --ANK
1346                          */
1347                         err = -EINVAL;
1348                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1349                                 goto out;
1350
1351                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1352
1353                         err = -EHOSTUNREACH;
1354                         if (grt == NULL)
1355                                 goto out;
1356                         if (dev) {
1357                                 if (dev != grt->rt6i_dev) {
1358                                         dst_release(&grt->dst);
1359                                         goto out;
1360                                 }
1361                         } else {
1362                                 dev = grt->rt6i_dev;
1363                                 idev = grt->rt6i_idev;
1364                                 dev_hold(dev);
1365                                 in6_dev_hold(grt->rt6i_idev);
1366                         }
1367                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1368                                 err = 0;
1369                         dst_release(&grt->dst);
1370
1371                         if (err)
1372                                 goto out;
1373                 }
1374                 err = -EINVAL;
1375                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1376                         goto out;
1377         }
1378
1379         err = -ENODEV;
1380         if (dev == NULL)
1381                 goto out;
1382
1383         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1384                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1385                         err = -EINVAL;
1386                         goto out;
1387                 }
1388                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1389                 rt->rt6i_prefsrc.plen = 128;
1390         } else
1391                 rt->rt6i_prefsrc.plen = 0;
1392
1393         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1394                 struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1395                 if (IS_ERR(n)) {
1396                         err = PTR_ERR(n);
1397                         goto out;
1398                 }
1399                 dst_set_neighbour(&rt->dst, n);
1400         }
1401
1402         rt->rt6i_flags = cfg->fc_flags;
1403
1404 install_route:
1405         if (cfg->fc_mx) {
1406                 struct nlattr *nla;
1407                 int remaining;
1408
1409                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1410                         int type = nla_type(nla);
1411
1412                         if (type) {
1413                                 if (type > RTAX_MAX) {
1414                                         err = -EINVAL;
1415                                         goto out;
1416                                 }
1417
1418                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1419                         }
1420                 }
1421         }
1422
1423         rt->dst.dev = dev;
1424         rt->rt6i_idev = idev;
1425         rt->rt6i_table = table;
1426
1427         cfg->fc_nlinfo.nl_net = dev_net(dev);
1428
1429         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1430
1431 out:
1432         if (dev)
1433                 dev_put(dev);
1434         if (idev)
1435                 in6_dev_put(idev);
1436         if (rt)
1437                 dst_free(&rt->dst);
1438         return err;
1439 }
1440
1441 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1442 {
1443         int err;
1444         struct fib6_table *table;
1445         struct net *net = dev_net(rt->rt6i_dev);
1446
1447         if (rt == net->ipv6.ip6_null_entry)
1448                 return -ENOENT;
1449
1450         table = rt->rt6i_table;
1451         write_lock_bh(&table->tb6_lock);
1452
1453         err = fib6_del(rt, info);
1454         dst_release(&rt->dst);
1455
1456         write_unlock_bh(&table->tb6_lock);
1457
1458         return err;
1459 }
1460
1461 int ip6_del_rt(struct rt6_info *rt)
1462 {
1463         struct nl_info info = {
1464                 .nl_net = dev_net(rt->rt6i_dev),
1465         };
1466         return __ip6_del_rt(rt, &info);
1467 }
1468
1469 static int ip6_route_del(struct fib6_config *cfg)
1470 {
1471         struct fib6_table *table;
1472         struct fib6_node *fn;
1473         struct rt6_info *rt;
1474         int err = -ESRCH;
1475
1476         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1477         if (table == NULL)
1478                 return err;
1479
1480         read_lock_bh(&table->tb6_lock);
1481
1482         fn = fib6_locate(&table->tb6_root,
1483                          &cfg->fc_dst, cfg->fc_dst_len,
1484                          &cfg->fc_src, cfg->fc_src_len);
1485
1486         if (fn) {
1487                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1488                         if (cfg->fc_ifindex &&
1489                             (rt->rt6i_dev == NULL ||
1490                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1491                                 continue;
1492                         if (cfg->fc_flags & RTF_GATEWAY &&
1493                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1494                                 continue;
1495                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1496                                 continue;
1497                         dst_hold(&rt->dst);
1498                         read_unlock_bh(&table->tb6_lock);
1499
1500                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1501                 }
1502         }
1503         read_unlock_bh(&table->tb6_lock);
1504
1505         return err;
1506 }
1507
1508 /*
1509  *      Handle redirects
1510  */
/*
 * Flow key used for redirect validation: a regular flowi6 followed by the
 * address of the router that sent the redirect.  fl6 must stay the first
 * member, because __ip6_route_redirect() receives a struct flowi6 pointer
 * and casts it back to this type.
 */
struct ip6rd_flowi {
        struct flowi6 fl6;
        struct in6_addr gateway;        /* router the redirect came from */
};
1515
1516 static struct rt6_info *__ip6_route_redirect(struct net *net,
1517                                              struct fib6_table *table,
1518                                              struct flowi6 *fl6,
1519                                              int flags)
1520 {
1521         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1522         struct rt6_info *rt;
1523         struct fib6_node *fn;
1524
1525         /*
1526          * Get the "current" route for this destination and
1527          * check if the redirect has come from approriate router.
1528          *
1529          * RFC 2461 specifies that redirects should only be
1530          * accepted if they come from the nexthop to the target.
1531          * Due to the way the routes are chosen, this notion
1532          * is a bit fuzzy and one might need to check all possible
1533          * routes.
1534          */
1535
1536         read_lock_bh(&table->tb6_lock);
1537         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1538 restart:
1539         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1540                 /*
1541                  * Current route is on-link; redirect is always invalid.
1542                  *
1543                  * Seems, previous statement is not true. It could
1544                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1545                  * But then router serving it might decide, that we should
1546                  * know truth 8)8) --ANK (980726).
1547                  */
1548                 if (rt6_check_expired(rt))
1549                         continue;
1550                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1551                         continue;
1552                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1553                         continue;
1554                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1555                         continue;
1556                 break;
1557         }
1558
1559         if (!rt)
1560                 rt = net->ipv6.ip6_null_entry;
1561         BACKTRACK(net, &fl6->saddr);
1562 out:
1563         dst_hold(&rt->dst);
1564
1565         read_unlock_bh(&table->tb6_lock);
1566
1567         return rt;
1568 };
1569
1570 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1571                                            const struct in6_addr *src,
1572                                            const struct in6_addr *gateway,
1573                                            struct net_device *dev)
1574 {
1575         int flags = RT6_LOOKUP_F_HAS_SADDR;
1576         struct net *net = dev_net(dev);
1577         struct ip6rd_flowi rdfl = {
1578                 .fl6 = {
1579                         .flowi6_oif = dev->ifindex,
1580                         .daddr = *dest,
1581                         .saddr = *src,
1582                 },
1583         };
1584
1585         rdfl.gateway = *gateway;
1586
1587         if (rt6_need_strict(dest))
1588                 flags |= RT6_LOOKUP_F_IFACE;
1589
1590         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1591                                                    flags, __ip6_route_redirect);
1592 }
1593
1594 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1595                   const struct in6_addr *saddr,
1596                   struct neighbour *neigh, u8 *lladdr, int on_link)
1597 {
1598         struct rt6_info *rt, *nrt = NULL;
1599         struct netevent_redirect netevent;
1600         struct net *net = dev_net(neigh->dev);
1601
1602         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1603
1604         if (rt == net->ipv6.ip6_null_entry) {
1605                 if (net_ratelimit())
1606                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1607                                "for redirect target\n");
1608                 goto out;
1609         }
1610
1611         /*
1612          *      We have finally decided to accept it.
1613          */
1614
1615         neigh_update(neigh, lladdr, NUD_STALE,
1616                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1617                      NEIGH_UPDATE_F_OVERRIDE|
1618                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1619                                      NEIGH_UPDATE_F_ISROUTER))
1620                      );
1621
1622         /*
1623          * Redirect received -> path was valid.
1624          * Look, redirects are sent only in response to data packets,
1625          * so that this nexthop apparently is reachable. --ANK
1626          */
1627         dst_confirm(&rt->dst);
1628
1629         /* Duplicate redirect: silently ignore. */
1630         if (neigh == dst_get_neighbour_raw(&rt->dst))
1631                 goto out;
1632
1633         nrt = ip6_rt_copy(rt, dest);
1634         if (nrt == NULL)
1635                 goto out;
1636
1637         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1638         if (on_link)
1639                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1640
1641         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1642         dst_set_neighbour(&nrt->dst, neigh_clone(neigh));
1643
1644         if (ip6_ins_rt(nrt))
1645                 goto out;
1646
1647         netevent.old = &rt->dst;
1648         netevent.new = &nrt->dst;
1649         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1650
1651         if (rt->rt6i_flags&RTF_CACHE) {
1652                 ip6_del_rt(rt);
1653                 return;
1654         }
1655
1656 out:
1657         dst_release(&rt->dst);
1658 }
1659
1660 /*
1661  *      Handle ICMP "packet too big" messages
1662  *      i.e. Path MTU discovery
1663  */
1664
1665 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1666                              struct net *net, u32 pmtu, int ifindex)
1667 {
1668         struct rt6_info *rt, *nrt;
1669         int allfrag = 0;
1670 again:
1671         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1672         if (rt == NULL)
1673                 return;
1674
1675         if (rt6_check_expired(rt)) {
1676                 ip6_del_rt(rt);
1677                 goto again;
1678         }
1679
1680         if (pmtu >= dst_mtu(&rt->dst))
1681                 goto out;
1682
1683         if (pmtu < IPV6_MIN_MTU) {
1684                 /*
1685                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1686                  * MTU (1280) and a fragment header should always be included
1687                  * after a node receiving Too Big message reporting PMTU is
1688                  * less than the IPv6 Minimum Link MTU.
1689                  */
1690                 pmtu = IPV6_MIN_MTU;
1691                 allfrag = 1;
1692         }
1693
1694         /* New mtu received -> path was valid.
1695            They are sent only in response to data packets,
1696            so that this nexthop apparently is reachable. --ANK
1697          */
1698         dst_confirm(&rt->dst);
1699
1700         /* Host route. If it is static, it would be better
1701            not to override it, but add new one, so that
1702            when cache entry will expire old pmtu
1703            would return automatically.
1704          */
1705         if (rt->rt6i_flags & RTF_CACHE) {
1706                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1707                 if (allfrag) {
1708                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1709                         features |= RTAX_FEATURE_ALLFRAG;
1710                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1711                 }
1712                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1713                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1714                 goto out;
1715         }
1716
1717         /* Network route.
1718            Two cases are possible:
1719            1. It is connected route. Action: COW
1720            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1721          */
1722         if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
1723                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1724         else
1725                 nrt = rt6_alloc_clone(rt, daddr);
1726
1727         if (nrt) {
1728                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1729                 if (allfrag) {
1730                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1731                         features |= RTAX_FEATURE_ALLFRAG;
1732                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1733                 }
1734
1735                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1736                  * happened within 5 mins, the recommended timer is 10 mins.
1737                  * Here this route expiration time is set to ip6_rt_mtu_expires
1738                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1739                  * and detecting PMTU increase will be automatically happened.
1740                  */
1741                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1742                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1743
1744                 ip6_ins_rt(nrt);
1745         }
1746 out:
1747         dst_release(&rt->dst);
1748 }
1749
1750 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1751                         struct net_device *dev, u32 pmtu)
1752 {
1753         struct net *net = dev_net(dev);
1754
1755         /*
1756          * RFC 1981 states that a node "MUST reduce the size of the packets it
1757          * is sending along the path" that caused the Packet Too Big message.
1758          * Since it's not possible in the general case to determine which
1759          * interface was used to send the original packet, we update the MTU
1760          * on the interface that will be used to send future packets. We also
1761          * update the MTU on the interface that received the Packet Too Big in
1762          * case the original packet was forced out that interface with
1763          * SO_BINDTODEVICE or similar. This is the next best thing to the
1764          * correct behaviour, which would be to update the MTU on all
1765          * interfaces.
1766          */
1767         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1768         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1769 }
1770
1771 /*
1772  *      Misc support functions
1773  */
1774
1775 static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
1776                                     const struct in6_addr *dest)
1777 {
1778         struct net *net = dev_net(ort->rt6i_dev);
1779         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1780                                             ort->dst.dev, 0);
1781
1782         if (rt) {
1783                 rt->dst.input = ort->dst.input;
1784                 rt->dst.output = ort->dst.output;
1785                 rt->dst.flags |= DST_HOST;
1786
1787                 rt->rt6i_dst.addr = *dest;
1788                 rt->rt6i_dst.plen = 128;
1789                 dst_copy_metrics(&rt->dst, &ort->dst);
1790                 rt->dst.error = ort->dst.error;
1791                 rt->rt6i_idev = ort->rt6i_idev;
1792                 if (rt->rt6i_idev)
1793                         in6_dev_hold(rt->rt6i_idev);
1794                 rt->dst.lastuse = jiffies;
1795                 rt->rt6i_expires = 0;
1796
1797                 rt->rt6i_gateway = ort->rt6i_gateway;
1798                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1799                 rt->rt6i_metric = 0;
1800
1801 #ifdef CONFIG_IPV6_SUBTREES
1802                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1803 #endif
1804                 memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
1805                 rt->rt6i_table = ort->rt6i_table;
1806         }
1807         return rt;
1808 }
1809
1810 #ifdef CONFIG_IPV6_ROUTE_INFO
1811 static struct rt6_info *rt6_get_route_info(struct net *net,
1812                                            const struct in6_addr *prefix, int prefixlen,
1813                                            const struct in6_addr *gwaddr, int ifindex)
1814 {
1815         struct fib6_node *fn;
1816         struct rt6_info *rt = NULL;
1817         struct fib6_table *table;
1818
1819         table = fib6_get_table(net, RT6_TABLE_INFO);
1820         if (table == NULL)
1821                 return NULL;
1822
1823         write_lock_bh(&table->tb6_lock);
1824         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1825         if (!fn)
1826                 goto out;
1827
1828         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1829                 if (rt->rt6i_dev->ifindex != ifindex)
1830                         continue;
1831                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1832                         continue;
1833                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1834                         continue;
1835                 dst_hold(&rt->dst);
1836                 break;
1837         }
1838 out:
1839         write_unlock_bh(&table->tb6_lock);
1840         return rt;
1841 }
1842
1843 static struct rt6_info *rt6_add_route_info(struct net *net,
1844                                            const struct in6_addr *prefix, int prefixlen,
1845                                            const struct in6_addr *gwaddr, int ifindex,
1846                                            unsigned pref)
1847 {
1848         struct fib6_config cfg = {
1849                 .fc_table       = RT6_TABLE_INFO,
1850                 .fc_metric      = IP6_RT_PRIO_USER,
1851                 .fc_ifindex     = ifindex,
1852                 .fc_dst_len     = prefixlen,
1853                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1854                                   RTF_UP | RTF_PREF(pref),
1855                 .fc_nlinfo.pid = 0,
1856                 .fc_nlinfo.nlh = NULL,
1857                 .fc_nlinfo.nl_net = net,
1858         };
1859
1860         cfg.fc_dst = *prefix;
1861         cfg.fc_gateway = *gwaddr;
1862
1863         /* We should treat it as a default route if prefix length is 0. */
1864         if (!prefixlen)
1865                 cfg.fc_flags |= RTF_DEFAULT;
1866
1867         ip6_route_add(&cfg);
1868
1869         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1870 }
1871 #endif
1872
1873 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1874 {
1875         struct rt6_info *rt;
1876         struct fib6_table *table;
1877
1878         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1879         if (table == NULL)
1880                 return NULL;
1881
1882         write_lock_bh(&table->tb6_lock);
1883         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1884                 if (dev == rt->rt6i_dev &&
1885                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1886                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1887                         break;
1888         }
1889         if (rt)
1890                 dst_hold(&rt->dst);
1891         write_unlock_bh(&table->tb6_lock);
1892         return rt;
1893 }
1894
1895 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1896                                      struct net_device *dev,
1897                                      unsigned int pref)
1898 {
1899         struct fib6_config cfg = {
1900                 .fc_table       = RT6_TABLE_DFLT,
1901                 .fc_metric      = IP6_RT_PRIO_USER,
1902                 .fc_ifindex     = dev->ifindex,
1903                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1904                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1905                 .fc_nlinfo.pid = 0,
1906                 .fc_nlinfo.nlh = NULL,
1907                 .fc_nlinfo.nl_net = dev_net(dev),
1908         };
1909
1910         cfg.fc_gateway = *gwaddr;
1911
1912         ip6_route_add(&cfg);
1913
1914         return rt6_get_dflt_router(gwaddr, dev);
1915 }
1916
1917 void rt6_purge_dflt_routers(struct net *net)
1918 {
1919         struct rt6_info *rt;
1920         struct fib6_table *table;
1921
1922         /* NOTE: Keep consistent with rt6_get_dflt_router */
1923         table = fib6_get_table(net, RT6_TABLE_DFLT);
1924         if (table == NULL)
1925                 return;
1926
1927 restart:
1928         read_lock_bh(&table->tb6_lock);
1929         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1930                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1931                         dst_hold(&rt->dst);
1932                         read_unlock_bh(&table->tb6_lock);
1933                         ip6_del_rt(rt);
1934                         goto restart;
1935                 }
1936         }
1937         read_unlock_bh(&table->tb6_lock);
1938 }
1939
1940 static void rtmsg_to_fib6_config(struct net *net,
1941                                  struct in6_rtmsg *rtmsg,
1942                                  struct fib6_config *cfg)
1943 {
1944         memset(cfg, 0, sizeof(*cfg));
1945
1946         cfg->fc_table = RT6_TABLE_MAIN;
1947         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1948         cfg->fc_metric = rtmsg->rtmsg_metric;
1949         cfg->fc_expires = rtmsg->rtmsg_info;
1950         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1951         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1952         cfg->fc_flags = rtmsg->rtmsg_flags;
1953
1954         cfg->fc_nlinfo.nl_net = net;
1955
1956         cfg->fc_dst = rtmsg->rtmsg_dst;
1957         cfg->fc_src = rtmsg->rtmsg_src;
1958         cfg->fc_gateway = rtmsg->rtmsg_gateway;
1959 }
1960
1961 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1962 {
1963         struct fib6_config cfg;
1964         struct in6_rtmsg rtmsg;
1965         int err;
1966
1967         switch(cmd) {
1968         case SIOCADDRT:         /* Add a route */
1969         case SIOCDELRT:         /* Delete a route */
1970                 if (!capable(CAP_NET_ADMIN))
1971                         return -EPERM;
1972                 err = copy_from_user(&rtmsg, arg,
1973                                      sizeof(struct in6_rtmsg));
1974                 if (err)
1975                         return -EFAULT;
1976
1977                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1978
1979                 rtnl_lock();
1980                 switch (cmd) {
1981                 case SIOCADDRT:
1982                         err = ip6_route_add(&cfg);
1983                         break;
1984                 case SIOCDELRT:
1985                         err = ip6_route_del(&cfg);
1986                         break;
1987                 default:
1988                         err = -EINVAL;
1989                 }
1990                 rtnl_unlock();
1991
1992                 return err;
1993         }
1994
1995         return -EINVAL;
1996 }
1997
1998 /*
1999  *      Drop the packet on the floor
2000  */
2001
2002 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2003 {
2004         int type;
2005         struct dst_entry *dst = skb_dst(skb);
2006         switch (ipstats_mib_noroutes) {
2007         case IPSTATS_MIB_INNOROUTES:
2008                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2009                 if (type == IPV6_ADDR_ANY) {
2010                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2011                                       IPSTATS_MIB_INADDRERRORS);
2012                         break;
2013                 }
2014                 /* FALLTHROUGH */
2015         case IPSTATS_MIB_OUTNOROUTES:
2016                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2017                               ipstats_mib_noroutes);
2018                 break;
2019         }
2020         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2021         kfree_skb(skb);
2022         return 0;
2023 }
2024
2025 static int ip6_pkt_discard(struct sk_buff *skb)
2026 {
2027         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2028 }
2029
2030 static int ip6_pkt_discard_out(struct sk_buff *skb)
2031 {
2032         skb->dev = skb_dst(skb)->dev;
2033         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2034 }
2035
2036 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2037
2038 static int ip6_pkt_prohibit(struct sk_buff *skb)
2039 {
2040         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2041 }
2042
2043 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
2044 {
2045         skb->dev = skb_dst(skb)->dev;
2046         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2047 }
2048
2049 #endif
2050
2051 /*
2052  *      Allocate a dst for local (unicast / anycast) address.
2053  */
2054
2055 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2056                                     const struct in6_addr *addr,
2057                                     int anycast)
2058 {
2059         struct net *net = dev_net(idev->dev);
2060         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2061                                             net->loopback_dev, 0);
2062         struct neighbour *neigh;
2063
2064         if (rt == NULL) {
2065                 if (net_ratelimit())
2066                         pr_warning("IPv6:  Maximum number of routes reached,"
2067                                    " consider increasing route/max_size.\n");
2068                 return ERR_PTR(-ENOMEM);
2069         }
2070
2071         in6_dev_hold(idev);
2072
2073         rt->dst.flags |= DST_HOST;
2074         rt->dst.input = ip6_input;
2075         rt->dst.output = ip6_output;
2076         rt->rt6i_idev = idev;
2077         rt->dst.obsolete = -1;
2078
2079         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2080         if (anycast)
2081                 rt->rt6i_flags |= RTF_ANYCAST;
2082         else
2083                 rt->rt6i_flags |= RTF_LOCAL;
2084         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2085         if (IS_ERR(neigh)) {
2086                 dst_free(&rt->dst);
2087
2088                 return ERR_CAST(neigh);
2089         }
2090         dst_set_neighbour(&rt->dst, neigh);
2091
2092         rt->rt6i_dst.addr = *addr;
2093         rt->rt6i_dst.plen = 128;
2094         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2095
2096         atomic_set(&rt->dst.__refcnt, 1);
2097
2098         return rt;
2099 }
2100
2101 int ip6_route_get_saddr(struct net *net,
2102                         struct rt6_info *rt,
2103                         const struct in6_addr *daddr,
2104                         unsigned int prefs,
2105                         struct in6_addr *saddr)
2106 {
2107         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2108         int err = 0;
2109         if (rt->rt6i_prefsrc.plen)
2110                 *saddr = rt->rt6i_prefsrc.addr;
2111         else
2112                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2113                                          daddr, prefs, saddr);
2114         return err;
2115 }
2116
2117 /* remove deleted ip from prefsrc entries */
2118 struct arg_dev_net_ip {	/* walker argument consumed by fib6_remove_prefsrc() */
2119         struct net_device *dev;	/* only routes on this device are touched; NULL matches any device */
2120         struct net *net;	/* namespace whose ip6_null_entry is left alone */
2121         struct in6_addr *addr;	/* address compared against each route's rt6i_prefsrc.addr */
2122 };
2123
2124 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2125 {
2126         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2127         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2128         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2129
2130         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2131             rt != net->ipv6.ip6_null_entry &&
2132             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2133                 /* remove prefsrc entry */
2134                 rt->rt6i_prefsrc.plen = 0;
2135         }
2136         return 0;
2137 }
2138
2139 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2140 {
2141         struct net *net = dev_net(ifp->idev->dev);
2142         struct arg_dev_net_ip adni = {
2143                 .dev = ifp->idev->dev,
2144                 .net = net,
2145                 .addr = &ifp->addr,
2146         };
2147         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2148 }
2149
2150 struct arg_dev_net {	/* walker argument consumed by fib6_ifdown() */
2151         struct net_device *dev;	/* device to match against rt6i_dev; NULL matches any device */
2152         struct net *net;	/* namespace whose ip6_null_entry is never selected */
2153 };
2154
2155 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2156 {
2157         const struct arg_dev_net *adn = arg;
2158         const struct net_device *dev = adn->dev;
2159
2160         if ((rt->rt6i_dev == dev || dev == NULL) &&
2161             rt != adn->net->ipv6.ip6_null_entry) {
2162                 RT6_TRACE("deleted by ifdown %p\n", rt);
2163                 return -1;
2164         }
2165         return 0;
2166 }
2167
2168 void rt6_ifdown(struct net *net, struct net_device *dev)
2169 {
2170         struct arg_dev_net adn = {
2171                 .dev = dev,
2172                 .net = net,
2173         };
2174
2175         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2176         icmp6_clean_all(fib6_ifdown, &adn);
2177 }
2178
2179 struct rt6_mtu_change_arg	/* walker argument consumed by rt6_mtu_change_route() */
2180 {
2181         struct net_device *dev;	/* device whose routes are considered for an update */
2182         unsigned mtu;	/* new MTU value written into RTAX_MTU of matching routes */
2183 };
2184
2185 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2186 {
2187         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2188         struct inet6_dev *idev;
2189
2190         /* In IPv6 pmtu discovery is not optional,
2191            so that RTAX_MTU lock cannot disable it.
2192            We still use this lock to block changes
2193            caused by addrconf/ndisc.
2194         */
2195
2196         idev = __in6_dev_get(arg->dev);
2197         if (idev == NULL)
2198                 return 0;
2199
2200         /* For administrative MTU increase, there is no way to discover
2201            IPv6 PMTU increase, so PMTU increase should be updated here.
2202            Since RFC 1981 doesn't include administrative MTU increase
2203            update PMTU increase is a MUST. (i.e. jumbo frame)
2204          */
2205         /*
2206            If new MTU is less than route PMTU, this new MTU will be the
2207            lowest MTU in the path, update the route PMTU to reflect PMTU
2208            decreases; if new MTU is greater than route PMTU, and the
2209            old MTU is the lowest MTU in the path, update the route PMTU
2210            to reflect the increase. In this case if the other nodes' MTU
2211            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2212            PMTU discouvery.
2213          */
2214         if (rt->rt6i_dev == arg->dev &&
2215             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2216             (dst_mtu(&rt->dst) >= arg->mtu ||
2217              (dst_mtu(&rt->dst) < arg->mtu &&
2218               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2219                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2220         }
2221         return 0;
2222 }
2223
2224 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2225 {
2226         struct rt6_mtu_change_arg arg = {
2227                 .dev = dev,
2228                 .mtu = mtu,
2229         };
2230
2231         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2232 }
2233
2234 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {	/* passed to nlmsg_parse() for IPv6 route messages */
2235         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },	/* length constraint: one IPv6 address */
2236         [RTA_OIF]               = { .type = NLA_U32 },	/* read with nla_get_u32() as an interface index */
2237         [RTA_IIF]               = { .type = NLA_U32 },	/* read with nla_get_u32() as an interface index */
2238         [RTA_PRIORITY]          = { .type = NLA_U32 },	/* read with nla_get_u32() into fc_metric */
2239         [RTA_METRICS]           = { .type = NLA_NESTED },	/* payload handed on as fc_mx / fc_mx_len */
2240 };
2241
2242 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2243                               struct fib6_config *cfg)
2244 {
2245         struct rtmsg *rtm;
2246         struct nlattr *tb[RTA_MAX+1];
2247         int err;
2248
2249         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2250         if (err < 0)
2251                 goto errout;
2252
2253         err = -EINVAL;
2254         rtm = nlmsg_data(nlh);
2255         memset(cfg, 0, sizeof(*cfg));
2256
2257         cfg->fc_table = rtm->rtm_table;
2258         cfg->fc_dst_len = rtm->rtm_dst_len;
2259         cfg->fc_src_len = rtm->rtm_src_len;
2260         cfg->fc_flags = RTF_UP;
2261         cfg->fc_protocol = rtm->rtm_protocol;
2262
2263         if (rtm->rtm_type == RTN_UNREACHABLE)
2264                 cfg->fc_flags |= RTF_REJECT;
2265
2266         if (rtm->rtm_type == RTN_LOCAL)
2267                 cfg->fc_flags |= RTF_LOCAL;
2268
2269         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2270         cfg->fc_nlinfo.nlh = nlh;
2271         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2272
2273         if (tb[RTA_GATEWAY]) {
2274                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2275                 cfg->fc_flags |= RTF_GATEWAY;
2276         }
2277
2278         if (tb[RTA_DST]) {
2279                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2280
2281                 if (nla_len(tb[RTA_DST]) < plen)
2282                         goto errout;
2283
2284                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2285         }
2286
2287         if (tb[RTA_SRC]) {
2288                 int plen = (rtm->rtm_src_len + 7) >> 3;
2289
2290                 if (nla_len(tb[RTA_SRC]) < plen)
2291                         goto errout;
2292
2293                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2294         }
2295
2296         if (tb[RTA_PREFSRC])
2297                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2298
2299         if (tb[RTA_OIF])
2300                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2301
2302         if (tb[RTA_PRIORITY])
2303                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2304
2305         if (tb[RTA_METRICS]) {
2306                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2307                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2308         }
2309
2310         if (tb[RTA_TABLE])
2311                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2312
2313         err = 0;
2314 errout:
2315         return err;
2316 }
2317
2318 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2319 {
2320         struct fib6_config cfg;
2321         int err;
2322
2323         err = rtm_to_fib6_config(skb, nlh, &cfg);
2324         if (err < 0)
2325                 return err;
2326
2327         return ip6_route_del(&cfg);
2328 }
2329
2330 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2331 {
2332         struct fib6_config cfg;
2333         int err;
2334
2335         err = rtm_to_fib6_config(skb, nlh, &cfg);
2336         if (err < 0)
2337                 return err;
2338
2339         return ip6_route_add(&cfg);
2340 }
2341
2342 static inline size_t rt6_nlmsg_size(void)
2343 {
2344         return NLMSG_ALIGN(sizeof(struct rtmsg))
2345                + nla_total_size(16) /* RTA_SRC */
2346                + nla_total_size(16) /* RTA_DST */
2347                + nla_total_size(16) /* RTA_GATEWAY */
2348                + nla_total_size(16) /* RTA_PREFSRC */
2349                + nla_total_size(4) /* RTA_TABLE */
2350                + nla_total_size(4) /* RTA_IIF */
2351                + nla_total_size(4) /* RTA_OIF */
2352                + nla_total_size(4) /* RTA_PRIORITY */
2353                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2354                + nla_total_size(sizeof(struct rta_cacheinfo));
2355 }
2356
2357 static int rt6_fill_node(struct net *net,
2358                          struct sk_buff *skb, struct rt6_info *rt,
2359                          struct in6_addr *dst, struct in6_addr *src,
2360                          int iif, int type, u32 pid, u32 seq,
2361                          int prefix, int nowait, unsigned int flags)
2362 {
2363         struct rtmsg *rtm;
2364         struct nlmsghdr *nlh;
2365         long expires;
2366         u32 table;
2367         struct neighbour *n;
2368
2369         if (prefix) {   /* user wants prefix routes only */
2370                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2371                         /* success since this is not a prefix route */
2372                         return 1;
2373                 }
2374         }
2375
2376         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2377         if (nlh == NULL)
2378                 return -EMSGSIZE;
2379
2380         rtm = nlmsg_data(nlh);
2381         rtm->rtm_family = AF_INET6;
2382         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2383         rtm->rtm_src_len = rt->rt6i_src.plen;
2384         rtm->rtm_tos = 0;
2385         if (rt->rt6i_table)
2386                 table = rt->rt6i_table->tb6_id;
2387         else
2388                 table = RT6_TABLE_UNSPEC;
2389         rtm->rtm_table = table;
2390         NLA_PUT_U32(skb, RTA_TABLE, table);
2391         if (rt->rt6i_flags&RTF_REJECT)
2392                 rtm->rtm_type = RTN_UNREACHABLE;
2393         else if (rt->rt6i_flags&RTF_LOCAL)
2394                 rtm->rtm_type = RTN_LOCAL;
2395         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2396                 rtm->rtm_type = RTN_LOCAL;
2397         else
2398                 rtm->rtm_type = RTN_UNICAST;
2399         rtm->rtm_flags = 0;
2400         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2401         rtm->rtm_protocol = rt->rt6i_protocol;
2402         if (rt->rt6i_flags&RTF_DYNAMIC)
2403                 rtm->rtm_protocol = RTPROT_REDIRECT;
2404         else if (rt->rt6i_flags & RTF_ADDRCONF)
2405                 rtm->rtm_protocol = RTPROT_KERNEL;
2406         else if (rt->rt6i_flags&RTF_DEFAULT)
2407                 rtm->rtm_protocol = RTPROT_RA;
2408
2409         if (rt->rt6i_flags&RTF_CACHE)
2410                 rtm->rtm_flags |= RTM_F_CLONED;
2411
2412         if (dst) {
2413                 NLA_PUT(skb, RTA_DST, 16, dst);
2414                 rtm->rtm_dst_len = 128;
2415         } else if (rtm->rtm_dst_len)
2416                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2417 #ifdef CONFIG_IPV6_SUBTREES
2418         if (src) {
2419                 NLA_PUT(skb, RTA_SRC, 16, src);
2420                 rtm->rtm_src_len = 128;
2421         } else if (rtm->rtm_src_len)
2422                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2423 #endif
2424         if (iif) {
2425 #ifdef CONFIG_IPV6_MROUTE
2426                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2427                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2428                         if (err <= 0) {
2429                                 if (!nowait) {
2430                                         if (err == 0)
2431                                                 return 0;
2432                                         goto nla_put_failure;
2433                                 } else {
2434                                         if (err == -EMSGSIZE)
2435                                                 goto nla_put_failure;
2436                                 }
2437                         }
2438                 } else
2439 #endif
2440                         NLA_PUT_U32(skb, RTA_IIF, iif);
2441         } else if (dst) {
2442                 struct in6_addr saddr_buf;
2443                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2444                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2445         }
2446
2447         if (rt->rt6i_prefsrc.plen) {
2448                 struct in6_addr saddr_buf;
2449                 saddr_buf = rt->rt6i_prefsrc.addr;
2450                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2451         }
2452
2453         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2454                 goto nla_put_failure;
2455
2456         rcu_read_lock();
2457         n = dst_get_neighbour(&rt->dst);
2458         if (n)
2459                 NLA_PUT(skb, RTA_GATEWAY, 16, &n->primary_key);
2460         rcu_read_unlock();
2461
2462         if (rt->dst.dev)
2463                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2464
2465         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2466
2467         if (!(rt->rt6i_flags & RTF_EXPIRES))
2468                 expires = 0;
2469         else if (rt->rt6i_expires - jiffies < INT_MAX)
2470                 expires = rt->rt6i_expires - jiffies;
2471         else
2472                 expires = INT_MAX;
2473
2474         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2475                                expires, rt->dst.error) < 0)
2476                 goto nla_put_failure;
2477
2478         return nlmsg_end(skb, nlh);
2479
2480 nla_put_failure:
2481         nlmsg_cancel(skb, nlh);
2482         return -EMSGSIZE;
2483 }
2484
2485 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2486 {
2487         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2488         int prefix;
2489
2490         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2491                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2492                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2493         } else
2494                 prefix = 0;
2495
2496         return rt6_fill_node(arg->net,
2497                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2498                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2499                      prefix, 0, NLM_F_MULTI);
2500 }
2501
2502 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2503 {
2504         struct net *net = sock_net(in_skb->sk);
2505         struct nlattr *tb[RTA_MAX+1];
2506         struct rt6_info *rt;
2507         struct sk_buff *skb;
2508         struct rtmsg *rtm;
2509         struct flowi6 fl6;
2510         int err, iif = 0;
2511
2512         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2513         if (err < 0)
2514                 goto errout;
2515
2516         err = -EINVAL;
2517         memset(&fl6, 0, sizeof(fl6));
2518
2519         if (tb[RTA_SRC]) {
2520                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2521                         goto errout;
2522
2523                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2524         }
2525
2526         if (tb[RTA_DST]) {
2527                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2528                         goto errout;
2529
2530                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2531         }
2532
2533         if (tb[RTA_IIF])
2534                 iif = nla_get_u32(tb[RTA_IIF]);
2535
2536         if (tb[RTA_OIF])
2537                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2538
2539         if (iif) {
2540                 struct net_device *dev;
2541                 dev = __dev_get_by_index(net, iif);
2542                 if (!dev) {
2543                         err = -ENODEV;
2544                         goto errout;
2545                 }
2546         }
2547
2548         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2549         if (skb == NULL) {
2550                 err = -ENOBUFS;
2551                 goto errout;
2552         }
2553
2554         /* Reserve room for dummy headers, this skb can pass
2555            through good chunk of routing engine.
2556          */
2557         skb_reset_mac_header(skb);
2558         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2559
2560         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2561         skb_dst_set(skb, &rt->dst);
2562
2563         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2564                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2565                             nlh->nlmsg_seq, 0, 0, 0);
2566         if (err < 0) {
2567                 kfree_skb(skb);
2568                 goto errout;
2569         }
2570
2571         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2572 errout:
2573         return err;
2574 }
2575
2576 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2577 {
2578         struct sk_buff *skb;
2579         struct net *net = info->nl_net;
2580         u32 seq;
2581         int err;
2582
2583         err = -ENOBUFS;
2584         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2585
2586         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2587         if (skb == NULL)
2588                 goto errout;
2589
2590         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2591                                 event, info->pid, seq, 0, 0, 0);
2592         if (err < 0) {
2593                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2594                 WARN_ON(err == -EMSGSIZE);
2595                 kfree_skb(skb);
2596                 goto errout;
2597         }
2598         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2599                     info->nlh, gfp_any());
2600         return;
2601 errout:
2602         if (err < 0)
2603                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2604 }
2605
2606 static int ip6_route_dev_notify(struct notifier_block *this,
2607                                 unsigned long event, void *data)
2608 {
2609         struct net_device *dev = (struct net_device *)data;
2610         struct net *net = dev_net(dev);
2611
2612         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2613                 net->ipv6.ip6_null_entry->dst.dev = dev;
2614                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2615 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2616                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2617                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2618                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2619                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2620 #endif
2621         }
2622
2623         return NOTIFY_OK;
2624 }
2625
2626 /*
2627  *      /proc
2628  */
2629
2630 #ifdef CONFIG_PROC_FS
2631
/*
 * Buffer bookkeeping record.
 * NOTE(review): nothing in the visible part of this file references this
 * structure (the /proc handlers below go through seq_file instead), so the
 * exact meaning of each field cannot be confirmed from here.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2640
2641 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2642 {
2643         struct seq_file *m = p_arg;
2644         struct neighbour *n;
2645
2646         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2647
2648 #ifdef CONFIG_IPV6_SUBTREES
2649         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2650 #else
2651         seq_puts(m, "00000000000000000000000000000000 00 ");
2652 #endif
2653         rcu_read_lock();
2654         n = dst_get_neighbour(&rt->dst);
2655         if (n) {
2656                 seq_printf(m, "%pi6", n->primary_key);
2657         } else {
2658                 seq_puts(m, "00000000000000000000000000000000");
2659         }
2660         rcu_read_unlock();
2661         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2662                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2663                    rt->dst.__use, rt->rt6i_flags,
2664                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2665         return 0;
2666 }
2667
2668 static int ipv6_route_show(struct seq_file *m, void *v)
2669 {
2670         struct net *net = (struct net *)m->private;
2671         fib6_clean_all(net, rt6_info_route, 0, m);
2672         return 0;
2673 }
2674
2675 static int ipv6_route_open(struct inode *inode, struct file *file)
2676 {
2677         return single_open_net(inode, file, ipv6_route_show);
2678 }
2679
2680 static const struct file_operations ipv6_route_proc_fops = {
2681         .owner          = THIS_MODULE,
2682         .open           = ipv6_route_open,
2683         .read           = seq_read,
2684         .llseek         = seq_lseek,
2685         .release        = single_release_net,
2686 };
2687
2688 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2689 {
2690         struct net *net = (struct net *)seq->private;
2691         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2692                    net->ipv6.rt6_stats->fib_nodes,
2693                    net->ipv6.rt6_stats->fib_route_nodes,
2694                    net->ipv6.rt6_stats->fib_rt_alloc,
2695                    net->ipv6.rt6_stats->fib_rt_entries,
2696                    net->ipv6.rt6_stats->fib_rt_cache,
2697                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2698                    net->ipv6.rt6_stats->fib_discarded_routes);
2699
2700         return 0;
2701 }
2702
2703 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2704 {
2705         return single_open_net(inode, file, rt6_stats_seq_show);
2706 }
2707
2708 static const struct file_operations rt6_stats_seq_fops = {
2709         .owner   = THIS_MODULE,
2710         .open    = rt6_stats_seq_open,
2711         .read    = seq_read,
2712         .llseek  = seq_lseek,
2713         .release = single_release_net,
2714 };
2715 #endif  /* CONFIG_PROC_FS */
2716
2717 #ifdef CONFIG_SYSCTL
2718
2719 static
2720 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2721                               void __user *buffer, size_t *lenp, loff_t *ppos)
2722 {
2723         struct net *net;
2724         int delay;
2725         if (!write)
2726                 return -EINVAL;
2727
2728         net = (struct net *)ctl->extra1;
2729         delay = net->ipv6.sysctl.flush_delay;
2730         proc_dointvec(ctl, write, buffer, lenp, ppos);
2731         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2732         return 0;
2733 }
2734
2735 ctl_table ipv6_route_table_template[] = {
2736         {
2737                 .procname       =       "flush",
2738                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2739                 .maxlen         =       sizeof(int),
2740                 .mode           =       0200,
2741                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2742         },
2743         {
2744                 .procname       =       "gc_thresh",
2745                 .data           =       &ip6_dst_ops_template.gc_thresh,
2746                 .maxlen         =       sizeof(int),
2747                 .mode           =       0644,
2748                 .proc_handler   =       proc_dointvec,
2749         },
2750         {
2751                 .procname       =       "max_size",
2752                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2753                 .maxlen         =       sizeof(int),
2754                 .mode           =       0644,
2755                 .proc_handler   =       proc_dointvec,
2756         },
2757         {
2758                 .procname       =       "gc_min_interval",
2759                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2760                 .maxlen         =       sizeof(int),
2761                 .mode           =       0644,
2762                 .proc_handler   =       proc_dointvec_jiffies,
2763         },
2764         {
2765                 .procname       =       "gc_timeout",
2766                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2767                 .maxlen         =       sizeof(int),
2768                 .mode           =       0644,
2769                 .proc_handler   =       proc_dointvec_jiffies,
2770         },
2771         {
2772                 .procname       =       "gc_interval",
2773                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2774                 .maxlen         =       sizeof(int),
2775                 .mode           =       0644,
2776                 .proc_handler   =       proc_dointvec_jiffies,
2777         },
2778         {
2779                 .procname       =       "gc_elasticity",
2780                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2781                 .maxlen         =       sizeof(int),
2782                 .mode           =       0644,
2783                 .proc_handler   =       proc_dointvec,
2784         },
2785         {
2786                 .procname       =       "mtu_expires",
2787                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2788                 .maxlen         =       sizeof(int),
2789                 .mode           =       0644,
2790                 .proc_handler   =       proc_dointvec_jiffies,
2791         },
2792         {
2793                 .procname       =       "min_adv_mss",
2794                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2795                 .maxlen         =       sizeof(int),
2796                 .mode           =       0644,
2797                 .proc_handler   =       proc_dointvec,
2798         },
2799         {
2800                 .procname       =       "gc_min_interval_ms",
2801                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2802                 .maxlen         =       sizeof(int),
2803                 .mode           =       0644,
2804                 .proc_handler   =       proc_dointvec_ms_jiffies,
2805         },
2806         { }
2807 };
2808
2809 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2810 {
2811         struct ctl_table *table;
2812
2813         table = kmemdup(ipv6_route_table_template,
2814                         sizeof(ipv6_route_table_template),
2815                         GFP_KERNEL);
2816
2817         if (table) {
2818                 table[0].data = &net->ipv6.sysctl.flush_delay;
2819                 table[0].extra1 = net;
2820                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2821                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2822                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2823                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2824                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2825                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2826                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2827                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2828                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2829         }
2830
2831         return table;
2832 }
2833 #endif
2834
2835 static int __net_init ip6_route_net_init(struct net *net)
2836 {
2837         int ret = -ENOMEM;
2838
2839         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2840                sizeof(net->ipv6.ip6_dst_ops));
2841
2842         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2843                 goto out_ip6_dst_ops;
2844
2845         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2846                                            sizeof(*net->ipv6.ip6_null_entry),
2847                                            GFP_KERNEL);
2848         if (!net->ipv6.ip6_null_entry)
2849                 goto out_ip6_dst_entries;
2850         net->ipv6.ip6_null_entry->dst.path =
2851                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2852         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2853         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2854                          ip6_template_metrics, true);
2855
2856 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2857         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2858                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2859                                                GFP_KERNEL);
2860         if (!net->ipv6.ip6_prohibit_entry)
2861                 goto out_ip6_null_entry;
2862         net->ipv6.ip6_prohibit_entry->dst.path =
2863                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2864         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2865         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2866                          ip6_template_metrics, true);
2867
2868         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2869                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2870                                                GFP_KERNEL);
2871         if (!net->ipv6.ip6_blk_hole_entry)
2872                 goto out_ip6_prohibit_entry;
2873         net->ipv6.ip6_blk_hole_entry->dst.path =
2874                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2875         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2876         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2877                          ip6_template_metrics, true);
2878 #endif
2879
2880         net->ipv6.sysctl.flush_delay = 0;
2881         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2882         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2883         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2884         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2885         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2886         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2887         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2888
2889 #ifdef CONFIG_PROC_FS
2890         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2891         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2892 #endif
2893         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2894
2895         ret = 0;
2896 out:
2897         return ret;
2898
2899 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2900 out_ip6_prohibit_entry:
2901         kfree(net->ipv6.ip6_prohibit_entry);
2902 out_ip6_null_entry:
2903         kfree(net->ipv6.ip6_null_entry);
2904 #endif
2905 out_ip6_dst_entries:
2906         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2907 out_ip6_dst_ops:
2908         goto out;
2909 }
2910
2911 static void __net_exit ip6_route_net_exit(struct net *net)
2912 {
2913 #ifdef CONFIG_PROC_FS
2914         proc_net_remove(net, "ipv6_route");
2915         proc_net_remove(net, "rt6_stats");
2916 #endif
2917         kfree(net->ipv6.ip6_null_entry);
2918 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2919         kfree(net->ipv6.ip6_prohibit_entry);
2920         kfree(net->ipv6.ip6_blk_hole_entry);
2921 #endif
2922         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2923 }
2924
2925 static struct pernet_operations ip6_route_net_ops = {
2926         .init = ip6_route_net_init,
2927         .exit = ip6_route_net_exit,
2928 };
2929
2930 static struct notifier_block ip6_route_dev_notifier = {
2931         .notifier_call = ip6_route_dev_notify,
2932         .priority = 0,
2933 };
2934
2935 int __init ip6_route_init(void)
2936 {
2937         int ret;
2938
2939         ret = -ENOMEM;
2940         ip6_dst_ops_template.kmem_cachep =
2941                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2942                                   SLAB_HWCACHE_ALIGN, NULL);
2943         if (!ip6_dst_ops_template.kmem_cachep)
2944                 goto out;
2945
2946         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2947         if (ret)
2948                 goto out_kmem_cache;
2949
2950         ret = register_pernet_subsys(&ip6_route_net_ops);
2951         if (ret)
2952                 goto out_dst_entries;
2953
2954         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2955
2956         /* Registering of the loopback is done before this portion of code,
2957          * the loopback reference in rt6_info will not be taken, do it
2958          * manually for init_net */
2959         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2960         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2961   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2962         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2963         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2964         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2965         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2966   #endif
2967         ret = fib6_init();
2968         if (ret)
2969                 goto out_register_subsys;
2970
2971         ret = xfrm6_init();
2972         if (ret)
2973                 goto out_fib6_init;
2974
2975         ret = fib6_rules_init();
2976         if (ret)
2977                 goto xfrm6_init;
2978
2979         ret = -ENOBUFS;
2980         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
2981             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
2982             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
2983                 goto fib6_rules_init;
2984
2985         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2986         if (ret)
2987                 goto fib6_rules_init;
2988
2989 out:
2990         return ret;
2991
2992 fib6_rules_init:
2993         fib6_rules_cleanup();
2994 xfrm6_init:
2995         xfrm6_fini();
2996 out_fib6_init:
2997         fib6_gc_cleanup();
2998 out_register_subsys:
2999         unregister_pernet_subsys(&ip6_route_net_ops);
3000 out_dst_entries:
3001         dst_entries_destroy(&ip6_dst_blackhole_ops);
3002 out_kmem_cache:
3003         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3004         goto out;
3005 }
3006
3007 void ip6_route_cleanup(void)
3008 {
3009         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3010         fib6_rules_cleanup();
3011         xfrm6_fini();
3012         fib6_gc_cleanup();
3013         unregister_pernet_subsys(&ip6_route_net_ops);
3014         dst_entries_destroy(&ip6_dst_blackhole_ops);
3015         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3016 }