]> Pileus Git - ~andy/linux/blob - net/ipv6/route.c
Merge ssh://master.kernel.org/pub/scm/linux/kernel/git/linville/wireless-next-2.6...
[~andy/linux] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug level for this file.  Set to 3 to get tracing.
 *
 * RDBG() and RT6_TRACE() expand to printk() calls when RT6_DEBUG >= 3
 * and to nothing otherwise.
 */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif
74
/*
 * Forward declarations of static helpers that are referenced (for
 * example by the dst_ops tables and route templates below) before
 * their definitions.
 */
static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
static unsigned int      ip6_default_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void             ip6_dst_destroy(struct dst_entry *);
static void             ip6_dst_ifdown(struct dst_entry *,
                                       struct net_device *dev, int how);
static int               ip6_dst_gc(struct dst_ops *ops);

static int              ip6_pkt_discard(struct sk_buff *skb);
static int              ip6_pkt_discard_out(struct sk_buff *skb);
static void             ip6_link_failure(struct sk_buff *skb);
static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

/* Route-information helpers, only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex,
                                           unsigned pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *prefix, int prefixlen,
                                           const struct in6_addr *gwaddr, int ifindex);
#endif
99
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
101 {
102         struct rt6_info *rt = (struct rt6_info *) dst;
103         struct inet_peer *peer;
104         u32 *p = NULL;
105
106         if (!rt->rt6i_peer)
107                 rt6_bind_peer(rt, 1);
108
109         peer = rt->rt6i_peer;
110         if (peer) {
111                 u32 *old_p = __DST_METRICS_PTR(old);
112                 unsigned long prev, new;
113
114                 p = peer->metrics;
115                 if (inet_metrics_new(peer))
116                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
117
118                 new = (unsigned long) p;
119                 prev = cmpxchg(&dst->_metrics, old, new);
120
121                 if (prev != old) {
122                         p = __DST_METRICS_PTR(prev);
123                         if (prev & DST_METRICS_READ_ONLY)
124                                 p = NULL;
125                 }
126         }
127         return p;
128 }
129
/*
 * dst_ops table for IPv6 route entries: plugs the IPv6 handlers
 * declared above (gc, validity check, metrics handling, destroy,
 * ifdown, PMTU update, ...) into the generic dst operations.
 * NOTE(review): named a "template"; presumably copied into each
 * namespace's net->ipv6.ip6_dst_ops - confirm at the init site.
 */
static struct dst_ops ip6_dst_ops_template = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .gc                     =       ip6_dst_gc,
        .gc_thresh              =       1024,
        .check                  =       ip6_dst_check,
        .default_advmss         =       ip6_default_advmss,
        .default_mtu            =       ip6_default_mtu,
        .cow_metrics            =       ipv6_cow_metrics,
        .destroy                =       ip6_dst_destroy,
        .ifdown                 =       ip6_dst_ifdown,
        .negative_advice        =       ip6_negative_advice,
        .link_failure           =       ip6_link_failure,
        .update_pmtu            =       ip6_rt_update_pmtu,
        .local_out              =       __ip6_local_out,
};
146
/* .default_mtu handler for blackhole entries: always reports 0. */
static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
{
        const unsigned int blackhole_mtu = 0;

        return blackhole_mtu;
}
151
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
153 {
154 }
155
156 static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
157                                          unsigned long old)
158 {
159         return NULL;
160 }
161
/*
 * dst_ops used for the entries created by ip6_blackhole_route().
 * No gc handler is set, and the MTU / PMTU / metrics hooks are the
 * do-nothing blackhole stubs defined above.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
        .family                 =       AF_INET6,
        .protocol               =       cpu_to_be16(ETH_P_IPV6),
        .destroy                =       ip6_dst_destroy,
        .check                  =       ip6_dst_check,
        .default_mtu            =       ip6_blackhole_default_mtu,
        .default_advmss         =       ip6_default_advmss,
        .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
        .cow_metrics            =       ip6_rt_blackhole_cow_metrics,
};
172
/*
 * Metrics template: only the RTAX_HOPLIMIT slot is set (to 255); every
 * other entry is implicitly zero.
 * NOTE(review): presumably attached to the template routes below when
 * they are instantiated - confirm at the init site.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
        [RTAX_HOPLIMIT - 1] = 255,
};
176
/*
 * Template for the "null" route: a reject entry (RTF_REJECT |
 * RTF_NONEXTHOP) whose dst error is -ENETUNREACH and whose input and
 * output handlers are ip6_pkt_discard() / ip6_pkt_discard_out().
 * The metric is the largest u32 value.
 * NOTE(review): presumably the source of net->ipv6.ip6_null_entry -
 * confirm at the init site.
 */
static struct rt6_info ip6_null_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -ENETUNREACH,
                .input          = ip6_pkt_discard,
                .output         = ip6_pkt_discard_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
191
192 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
193
/* Handlers used by the prohibit template below; defined elsewhere. */
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);
196
/*
 * Template for the "prohibit" route (CONFIG_IPV6_MULTIPLE_TABLES only):
 * a reject entry whose dst error is -EACCES and whose input and output
 * handlers are ip6_pkt_prohibit() / ip6_pkt_prohibit_out().
 */
static struct rt6_info ip6_prohibit_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EACCES,
                .input          = ip6_pkt_prohibit,
                .output         = ip6_pkt_prohibit_out,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
211
/*
 * Template for the "blackhole" route (CONFIG_IPV6_MULTIPLE_TABLES
 * only): a reject entry whose dst error is -EINVAL and whose input and
 * output handlers are both dst_discard().
 */
static struct rt6_info ip6_blk_hole_entry_template = {
        .dst = {
                .__refcnt       = ATOMIC_INIT(1),
                .__use          = 1,
                .obsolete       = -1,
                .error          = -EINVAL,
                .input          = dst_discard,
                .output         = dst_discard,
        },
        .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_protocol  = RTPROT_KERNEL,
        .rt6i_metric    = ~(u32) 0,
        .rt6i_ref       = ATOMIC_INIT(1),
};
226
227 #endif
228
229 /* allocate dst with ip6_dst_ops */
230 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
231                                              struct net_device *dev)
232 {
233         struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, 0);
234
235         memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
236
237         return rt;
238 }
239
240 static void ip6_dst_destroy(struct dst_entry *dst)
241 {
242         struct rt6_info *rt = (struct rt6_info *)dst;
243         struct inet6_dev *idev = rt->rt6i_idev;
244         struct inet_peer *peer = rt->rt6i_peer;
245
246         if (idev != NULL) {
247                 rt->rt6i_idev = NULL;
248                 in6_dev_put(idev);
249         }
250         if (peer) {
251                 rt->rt6i_peer = NULL;
252                 inet_putpeer(peer);
253         }
254 }
255
256 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
257
258 static u32 rt6_peer_genid(void)
259 {
260         return atomic_read(&__rt6_peer_genid);
261 }
262
263 void rt6_bind_peer(struct rt6_info *rt, int create)
264 {
265         struct inet_peer *peer;
266
267         peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
268         if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
269                 inet_putpeer(peer);
270         else
271                 rt->rt6i_peer_genid = rt6_peer_genid();
272 }
273
274 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
275                            int how)
276 {
277         struct rt6_info *rt = (struct rt6_info *)dst;
278         struct inet6_dev *idev = rt->rt6i_idev;
279         struct net_device *loopback_dev =
280                 dev_net(dev)->loopback_dev;
281
282         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
283                 struct inet6_dev *loopback_idev =
284                         in6_dev_get(loopback_dev);
285                 if (loopback_idev != NULL) {
286                         rt->rt6i_idev = loopback_idev;
287                         in6_dev_put(idev);
288                 }
289         }
290 }
291
292 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
293 {
294         return (rt->rt6i_flags & RTF_EXPIRES) &&
295                 time_after(jiffies, rt->rt6i_expires);
296 }
297
298 static inline int rt6_need_strict(const struct in6_addr *daddr)
299 {
300         return ipv6_addr_type(daddr) &
301                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
302 }
303
304 /*
305  *      Route lookup. Any table->tb6_lock is implied.
306  */
307
/*
 * Pick, from the route list starting at @rt (linked via dst.rt6_next),
 * the entry that matches the requested interface or source address.
 *
 * - With neither @oif nor a specific @saddr, @rt is returned as is.
 * - With @oif set, the first route whose device has that ifindex wins.
 *   Routes on a loopback device are remembered as a fallback ("local").
 * - Without @oif, the first route whose device passes ipv6_chk_addr()
 *   for @saddr wins.
 *
 * If nothing matched and @oif is set, the loopback fallback is used if
 * there is one; otherwise, with RT6_LOOKUP_F_IFACE, the null entry is
 * returned.  In all remaining cases the head @rt is returned.
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
                                                    struct rt6_info *rt,
                                                    const struct in6_addr *saddr,
                                                    int oif,
                                                    int flags)
{
        struct rt6_info *local = NULL;
        struct rt6_info *sprt;

        /* no constraint to match against */
        if (!oif && ipv6_addr_any(saddr))
                goto out;

        for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
                struct net_device *dev = sprt->rt6i_dev;

                if (oif) {
                        /* exact interface match */
                        if (dev->ifindex == oif)
                                return sprt;
                        if (dev->flags & IFF_LOOPBACK) {
                                /*
                                 * Loopback route whose inet6_dev is not on
                                 * the requested interface: skip it in strict
                                 * mode, or when a fallback on the requested
                                 * interface was already recorded.
                                 */
                                if (sprt->rt6i_idev == NULL ||
                                    sprt->rt6i_idev->dev->ifindex != oif) {
                                        if (flags & RT6_LOOKUP_F_IFACE && oif)
                                                continue;
                                        if (local && (!oif ||
                                                      local->rt6i_idev->dev->ifindex == oif))
                                                continue;
                                }
                                local = sprt;
                        }
                } else {
                        /* match on source address only */
                        if (ipv6_chk_addr(net, saddr, dev,
                                          flags & RT6_LOOKUP_F_IFACE))
                                return sprt;
                }
        }

        if (oif) {
                if (local)
                        return local;

                /* strict interface lookup with no usable route */
                if (flags & RT6_LOOKUP_F_IFACE)
                        return net->ipv6.ip6_null_entry;
        }
out:
        return rt;
}
354
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Probe the next hop of @rt: send a neighbour solicitation when its
 * neighbour entry is not in a NUD_VALID state and was last updated
 * more than the interface's rtr_probe_interval ago.
 * Does nothing when @rt is NULL or has no next-hop neighbour.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
        /*
         * Okay, this does not seem to be appropriate
         * for now, however, we need to check if it
         * is really so; aka Router Reachability Probing.
         *
         * Router Reachability Probe MUST be rate-limited
         * to no more than one per minute.
         */
        /* unlocked quick check; repeated under the lock below */
        if (!neigh || (neigh->nud_state & NUD_VALID))
                return;
        read_lock_bh(&neigh->lock);
        if (!(neigh->nud_state & NUD_VALID) &&
            time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                struct in6_addr mcaddr;
                struct in6_addr *target;

                /*
                 * Stamp the entry first so the interval test above
                 * limits how often a probe is sent.
                 * NOTE(review): this store happens with only the read
                 * lock held - confirm that is intended.
                 */
                neigh->updated = jiffies;
                /* release the lock before transmitting */
                read_unlock_bh(&neigh->lock);

                target = (struct in6_addr *)&neigh->primary_key;
                /* mcaddr is derived from target by addrconf_addr_solict_mult() */
                addrconf_addr_solict_mult(target, &mcaddr);
                ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
        } else
                read_unlock_bh(&neigh->lock);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
389
390 /*
391  * Default Router Selection (RFC 2461 6.3.6)
392  */
393 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
394 {
395         struct net_device *dev = rt->rt6i_dev;
396         if (!oif || dev->ifindex == oif)
397                 return 2;
398         if ((dev->flags & IFF_LOOPBACK) &&
399             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
400                 return 1;
401         return 0;
402 }
403
404 static inline int rt6_check_neigh(struct rt6_info *rt)
405 {
406         struct neighbour *neigh = rt->rt6i_nexthop;
407         int m;
408         if (rt->rt6i_flags & RTF_NONEXTHOP ||
409             !(rt->rt6i_flags & RTF_GATEWAY))
410                 m = 1;
411         else if (neigh) {
412                 read_lock_bh(&neigh->lock);
413                 if (neigh->nud_state & NUD_VALID)
414                         m = 2;
415 #ifdef CONFIG_IPV6_ROUTER_PREF
416                 else if (neigh->nud_state & NUD_FAILED)
417                         m = 0;
418 #endif
419                 else
420                         m = 1;
421                 read_unlock_bh(&neigh->lock);
422         } else
423                 m = 0;
424         return m;
425 }
426
427 static int rt6_score_route(struct rt6_info *rt, int oif,
428                            int strict)
429 {
430         int m, n;
431
432         m = rt6_check_dev(rt, oif);
433         if (!m && (strict & RT6_LOOKUP_F_IFACE))
434                 return -1;
435 #ifdef CONFIG_IPV6_ROUTER_PREF
436         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
437 #endif
438         n = rt6_check_neigh(rt);
439         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
440                 return -1;
441         return m;
442 }
443
444 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
445                                    int *mpri, struct rt6_info *match)
446 {
447         int m;
448
449         if (rt6_check_expired(rt))
450                 goto out;
451
452         m = rt6_score_route(rt, oif, strict);
453         if (m < 0)
454                 goto out;
455
456         if (m > *mpri) {
457                 if (strict & RT6_LOOKUP_F_REACHABLE)
458                         rt6_probe(match);
459                 *mpri = m;
460                 match = rt;
461         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
462                 rt6_probe(rt);
463         }
464
465 out:
466         return match;
467 }
468
469 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
470                                      struct rt6_info *rr_head,
471                                      u32 metric, int oif, int strict)
472 {
473         struct rt6_info *rt, *match;
474         int mpri = -1;
475
476         match = NULL;
477         for (rt = rr_head; rt && rt->rt6i_metric == metric;
478              rt = rt->dst.rt6_next)
479                 match = find_match(rt, oif, strict, &mpri, match);
480         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
481              rt = rt->dst.rt6_next)
482                 match = find_match(rt, oif, strict, &mpri, match);
483
484         return match;
485 }
486
487 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
488 {
489         struct rt6_info *match, *rt0;
490         struct net *net;
491
492         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
493                   __func__, fn->leaf, oif);
494
495         rt0 = fn->rr_ptr;
496         if (!rt0)
497                 fn->rr_ptr = rt0 = fn->leaf;
498
499         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
500
501         if (!match &&
502             (strict & RT6_LOOKUP_F_REACHABLE)) {
503                 struct rt6_info *next = rt0->dst.rt6_next;
504
505                 /* no entries matched; do round-robin */
506                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
507                         next = fn->leaf;
508
509                 if (next != rt0)
510                         fn->rr_ptr = next;
511         }
512
513         RT6_TRACE("%s() => %p\n",
514                   __func__, match);
515
516         net = dev_net(rt0->rt6i_dev);
517         return match ? match : net->ipv6.ip6_null_entry;
518 }
519
520 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising gateway
 *
 * Depending on the advertised lifetime the matching route is deleted
 * (lifetime 0), created (no existing route, non-zero lifetime) or has
 * its preference and expiry refreshed.
 *
 * Returns 0 on success and -EINVAL when the option is too short, its
 * length/prefix_len combination is inconsistent, or the preference is
 * ICMPV6_ROUTER_PREF_INVALID.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  const struct in6_addr *gwaddr)
{
        struct net *net = dev_net(dev);
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr prefix_buf, *prefix;
        unsigned int pref;
        unsigned long lifetime;
        struct rt6_info *rt;

        if (len < sizeof(struct route_info)) {
                return -EINVAL;
        }

        /* Sanity check for prefix_len and length */
        if (rinfo->length > 3) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 128) {
                return -EINVAL;
        } else if (rinfo->prefix_len > 64) {
                if (rinfo->length < 2) {
                        return -EINVAL;
                }
        } else if (rinfo->prefix_len > 0) {
                if (rinfo->length < 1) {
                        return -EINVAL;
                }
        }

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                return -EINVAL;

        lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

        /*
         * With length 3 the prefix is used in place; otherwise it is
         * copied into prefix_buf via ipv6_addr_prefix().
         */
        if (rinfo->length == 3)
                prefix = (struct in6_addr *)rinfo->prefix;
        else {
                /* this function is safe */
                ipv6_addr_prefix(&prefix_buf,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &prefix_buf;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* zero lifetime: remove the existing route */
        if (rt && !lifetime) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        /* create a new route, or refresh the preference of the old one */
        if (!rt && lifetime)
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        else if (rt)
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

        if (rt) {
                /* a non-finite lifetime means the route never expires */
                if (!addrconf_finite_timeout(lifetime)) {
                        rt->rt6i_flags &= ~RTF_EXPIRES;
                } else {
                        rt->rt6i_expires = jiffies + HZ * lifetime;
                        rt->rt6i_flags |= RTF_EXPIRES;
                }
                /* drop the reference held on rt */
                dst_release(&rt->dst);
        }
        return 0;
}
592 #endif
593
/*
 * BACKTRACK - retry a failed lookup further up the FIB tree.
 *
 * Only acts when the current result `rt` is the null entry of @__net.
 * Starting from `fn` it repeatedly:
 *   - jumps to `out` if fn is a top-level root (RTN_TL_ROOT);
 *   - otherwise moves to the parent, or - when the parent has a
 *     subtree that is not fn itself - to the node found by looking up
 *     @saddr in that subtree;
 *   - jumps to `restart` as soon as the new node carries RTN_RTINFO.
 *
 * The enclosing function must provide the locals `rt` and `fn` and the
 * labels `out` and `restart`.
 */
#define BACKTRACK(__net, saddr)                 \
do { \
        if (rt == __net->ipv6.ip6_null_entry) { \
                struct fib6_node *pn; \
                while (1) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        pn = fn->parent; \
                        if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
                        else \
                                fn = pn; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
611
/*
 * Plain route lookup in @table for the flow @fl6.
 *
 * Runs under table->tb6_lock (read side).  The leaf list of the node
 * found by fib6_lookup() is filtered with rt6_device_match(); when
 * that yields the null entry, BACKTRACK() retries on other nodes.
 * The returned route has been passed through dst_use().
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
                                             struct fib6_table *table,
                                             struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt;

        read_lock_bh(&table->tb6_lock);
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
        /* BACKTRACK() jumps back here with an updated fn */
        rt = fn->leaf;
        rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
        BACKTRACK(net, &fl6->saddr);
out:
        /* reached by fall-through or from BACKTRACK() at the tree root */
        dst_use(&rt->dst, jiffies);
        read_unlock_bh(&table->tb6_lock);
        return rt;

}
631
632 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
633                             const struct in6_addr *saddr, int oif, int strict)
634 {
635         struct flowi6 fl6 = {
636                 .flowi6_oif = oif,
637                 .daddr = *daddr,
638         };
639         struct dst_entry *dst;
640         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
641
642         if (saddr) {
643                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
644                 flags |= RT6_LOOKUP_F_HAS_SADDR;
645         }
646
647         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
648         if (dst->error == 0)
649                 return (struct rt6_info *) dst;
650
651         dst_release(dst);
652
653         return NULL;
654 }
655
656 EXPORT_SYMBOL(rt6_lookup);
657
/* ip6_ins_rt is called with table->tb6_lock NOT held.
   It takes a new route entry; if the addition fails for any reason,
   the route is freed. In any case, if the caller does not hold a
   reference to it, it may be destroyed.
 */
663
664 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
665 {
666         int err;
667         struct fib6_table *table;
668
669         table = rt->rt6i_table;
670         write_lock_bh(&table->tb6_lock);
671         err = fib6_add(&table->tb6_root, rt, info);
672         write_unlock_bh(&table->tb6_lock);
673
674         return err;
675 }
676
677 int ip6_ins_rt(struct rt6_info *rt)
678 {
679         struct nl_info info = {
680                 .nl_net = dev_net(rt->rt6i_dev),
681         };
682         return __ip6_ins_rt(rt, &info);
683 }
684
/*
 * Create a host (/128) cache route for @daddr from @ort and bind a
 * neighbour entry to it.
 *
 * For non-gateway routes the gateway is set to @daddr itself.  With
 * CONFIG_IPV6_SUBTREES the source key is narrowed to @saddr when the
 * original route has a source prefix and @saddr is given.
 *
 * If the neighbour lookup fails, one retry is made after forcing a
 * dst garbage collection - but only when not running in softirq
 * context.  Returns the new route, or NULL when the copy or the
 * neighbour lookup failed (the copy is freed in the latter case).
 */
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, const struct in6_addr *daddr,
                                      const struct in6_addr *saddr)
{
        struct rt6_info *rt;

        /*
         *      Clone the route.
         */

        rt = ip6_rt_copy(ort);

        if (rt) {
                struct neighbour *neigh;
                /* 1 retry outside softirq, none inside */
                int attempts = !in_softirq();

                if (!(rt->rt6i_flags&RTF_GATEWAY)) {
                        /* daddr equals the (non-host) prefix address itself */
                        if (rt->rt6i_dst.plen != 128 &&
                            ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
                                rt->rt6i_flags |= RTF_ANYCAST;
                        ipv6_addr_copy(&rt->rt6i_gateway, daddr);
                }

                /* turn the copy into a cached host route for daddr */
                ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
                rt->rt6i_dst.plen = 128;
                rt->rt6i_flags |= RTF_CACHE;
                rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
                if (rt->rt6i_src.plen && saddr) {
                        ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
                        rt->rt6i_src.plen = 128;
                }
#endif

        retry:
                neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
                if (IS_ERR(neigh)) {
                        struct net *net = dev_net(rt->rt6i_dev);
                        int saved_rt_min_interval =
                                net->ipv6.sysctl.ip6_rt_gc_min_interval;
                        int saved_rt_elasticity =
                                net->ipv6.sysctl.ip6_rt_gc_elasticity;

                        if (attempts-- > 0) {
                                /*
                                 * Temporarily override the gc sysctls, run
                                 * the dst gc, restore them and try again.
                                 */
                                net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

                                ip6_dst_gc(&net->ipv6.ip6_dst_ops);

                                net->ipv6.sysctl.ip6_rt_gc_elasticity =
                                        saved_rt_elasticity;
                                net->ipv6.sysctl.ip6_rt_gc_min_interval =
                                        saved_rt_min_interval;
                                goto retry;
                        }

                        if (net_ratelimit())
                                printk(KERN_WARNING
                                       "ipv6: Neighbour table overflow.\n");
                        dst_free(&rt->dst);
                        return NULL;
                }
                rt->rt6i_nexthop = neigh;

        }

        return rt;
}
753
754 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, const struct in6_addr *daddr)
755 {
756         struct rt6_info *rt = ip6_rt_copy(ort);
757         if (rt) {
758                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
759                 rt->rt6i_dst.plen = 128;
760                 rt->rt6i_flags |= RTF_CACHE;
761                 rt->dst.flags |= DST_HOST;
762                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
763         }
764         return rt;
765 }
766
/*
 * Core policy-routing lookup shared by the input and output paths.
 *
 * Looks up @fl6 in @table and selects a route with rt6_select() for
 * interface @oif.  When forwarding is disabled the first pass only
 * accepts routes with a reachable next hop (RT6_LOOKUP_F_REACHABLE);
 * if that pass ends in the null entry or a cached route, the lookup is
 * repeated once without that restriction.
 *
 * A selected route that is neither the null entry nor a cache entry is
 * turned into a per-destination cache entry (rt6_alloc_cow() or
 * rt6_alloc_clone()) and inserted into the table, unless it already is
 * a host route.  If the insert fails, the whole lookup is retried, up
 * to 3 attempts in total.
 *
 * Returns a route with a reference held and lastuse/__use updated; the
 * null entry is returned when nothing matches or allocation fails.
 */
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
                                      struct flowi6 *fl6, int flags)
{
        struct fib6_node *fn;
        struct rt6_info *rt, *nrt;
        int strict = 0;
        int attempts = 3;
        int err;
        int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;

        strict |= flags & RT6_LOOKUP_F_IFACE;

relookup:
        read_lock_bh(&table->tb6_lock);

restart_2:
        fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

restart:
        rt = rt6_select(fn, oif, strict | reachable);

        BACKTRACK(net, &fl6->saddr);
        if (rt == net->ipv6.ip6_null_entry ||
            rt->rt6i_flags & RTF_CACHE)
                goto out;

        /* keep rt alive while the table lock is dropped */
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);

        if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
                nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
        else if (!(rt->dst.flags & DST_HOST))
                nrt = rt6_alloc_clone(rt, &fl6->daddr);
        else
                goto out2;

        /* switch from the original route to the new one (or null entry) */
        dst_release(&rt->dst);
        rt = nrt ? : net->ipv6.ip6_null_entry;

        dst_hold(&rt->dst);
        if (nrt) {
                err = ip6_ins_rt(nrt);
                if (!err)
                        goto out2;
        }

        if (--attempts <= 0)
                goto out2;

        /*
         * Race condition! In the gap, when table->tb6_lock was
         * released someone could insert this route.  Relookup.
         */
        dst_release(&rt->dst);
        goto relookup;

out:
        /* still holding the table lock here */
        if (reachable) {
                /* second pass: drop the reachability requirement */
                reachable = 0;
                goto restart_2;
        }
        dst_hold(&rt->dst);
        read_unlock_bh(&table->tb6_lock);
out2:
        rt->dst.lastuse = jiffies;
        rt->dst.__use++;

        return rt;
}
836
837 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
838                                             struct flowi6 *fl6, int flags)
839 {
840         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
841 }
842
843 void ip6_route_input(struct sk_buff *skb)
844 {
845         const struct ipv6hdr *iph = ipv6_hdr(skb);
846         struct net *net = dev_net(skb->dev);
847         int flags = RT6_LOOKUP_F_HAS_SADDR;
848         struct flowi6 fl6 = {
849                 .flowi6_iif = skb->dev->ifindex,
850                 .daddr = iph->daddr,
851                 .saddr = iph->saddr,
852                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
853                 .flowi6_mark = skb->mark,
854                 .flowi6_proto = iph->nexthdr,
855         };
856
857         if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
858                 flags |= RT6_LOOKUP_F_IFACE;
859
860         skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
861 }
862
863 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
864                                              struct flowi6 *fl6, int flags)
865 {
866         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
867 }
868
869 struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
870                                     struct flowi6 *fl6)
871 {
872         int flags = 0;
873
874         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
875                 flags |= RT6_LOOKUP_F_IFACE;
876
877         if (!ipv6_addr_any(&fl6->saddr))
878                 flags |= RT6_LOOKUP_F_HAS_SADDR;
879         else if (sk)
880                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
881
882         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
883 }
884
885 EXPORT_SYMBOL(ip6_route_output);
886
887 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
888 {
889         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
890         struct dst_entry *new = NULL;
891
892         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
893         if (rt) {
894                 memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));
895
896                 new = &rt->dst;
897
898                 new->__use = 1;
899                 new->input = dst_discard;
900                 new->output = dst_discard;
901
902                 dst_copy_metrics(new, &ort->dst);
903                 rt->rt6i_idev = ort->rt6i_idev;
904                 if (rt->rt6i_idev)
905                         in6_dev_hold(rt->rt6i_idev);
906                 rt->rt6i_expires = 0;
907
908                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
909                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
910                 rt->rt6i_metric = 0;
911
912                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
913 #ifdef CONFIG_IPV6_SUBTREES
914                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
915 #endif
916
917                 dst_free(new);
918         }
919
920         dst_release(dst_orig);
921         return new ? new : ERR_PTR(-ENOMEM);
922 }
923
924 /*
925  *      Destination cache support functions
926  */
927
928 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
929 {
930         struct rt6_info *rt;
931
932         rt = (struct rt6_info *) dst;
933
934         if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
935                 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
936                         if (!rt->rt6i_peer)
937                                 rt6_bind_peer(rt, 0);
938                         rt->rt6i_peer_genid = rt6_peer_genid();
939                 }
940                 return dst;
941         }
942         return NULL;
943 }
944
945 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
946 {
947         struct rt6_info *rt = (struct rt6_info *) dst;
948
949         if (rt) {
950                 if (rt->rt6i_flags & RTF_CACHE) {
951                         if (rt6_check_expired(rt)) {
952                                 ip6_del_rt(rt);
953                                 dst = NULL;
954                         }
955                 } else {
956                         dst_release(dst);
957                         dst = NULL;
958                 }
959         }
960         return dst;
961 }
962
963 static void ip6_link_failure(struct sk_buff *skb)
964 {
965         struct rt6_info *rt;
966
967         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
968
969         rt = (struct rt6_info *) skb_dst(skb);
970         if (rt) {
971                 if (rt->rt6i_flags&RTF_CACHE) {
972                         dst_set_expires(&rt->dst, 0);
973                         rt->rt6i_flags |= RTF_EXPIRES;
974                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
975                         rt->rt6i_node->fn_sernum = -1;
976         }
977 }
978
979 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
980 {
981         struct rt6_info *rt6 = (struct rt6_info*)dst;
982
983         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
984                 rt6->rt6i_flags |= RTF_MODIFIED;
985                 if (mtu < IPV6_MIN_MTU) {
986                         u32 features = dst_metric(dst, RTAX_FEATURES);
987                         mtu = IPV6_MIN_MTU;
988                         features |= RTAX_FEATURE_ALLFRAG;
989                         dst_metric_set(dst, RTAX_FEATURES, features);
990                 }
991                 dst_metric_set(dst, RTAX_MTU, mtu);
992         }
993 }
994
995 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
996 {
997         struct net_device *dev = dst->dev;
998         unsigned int mtu = dst_mtu(dst);
999         struct net *net = dev_net(dev);
1000
1001         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1002
1003         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1004                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1005
1006         /*
1007          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1008          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1009          * IPV6_MAXPLEN is also valid and means: "any MSS,
1010          * rely only on pmtu discovery"
1011          */
1012         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1013                 mtu = IPV6_MAXPLEN;
1014         return mtu;
1015 }
1016
1017 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1018 {
1019         unsigned int mtu = IPV6_MIN_MTU;
1020         struct inet6_dev *idev;
1021
1022         rcu_read_lock();
1023         idev = __in6_dev_get(dst->dev);
1024         if (idev)
1025                 mtu = idev->cnf.mtu6;
1026         rcu_read_unlock();
1027
1028         return mtu;
1029 }
1030
1031 static struct dst_entry *icmp6_dst_gc_list;
1032 static DEFINE_SPINLOCK(icmp6_dst_lock);
1033
1034 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1035                                   struct neighbour *neigh,
1036                                   const struct in6_addr *addr)
1037 {
1038         struct rt6_info *rt;
1039         struct inet6_dev *idev = in6_dev_get(dev);
1040         struct net *net = dev_net(dev);
1041
1042         if (unlikely(idev == NULL))
1043                 return NULL;
1044
1045         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev);
1046         if (unlikely(rt == NULL)) {
1047                 in6_dev_put(idev);
1048                 goto out;
1049         }
1050
1051         if (neigh)
1052                 neigh_hold(neigh);
1053         else {
1054                 neigh = ndisc_get_neigh(dev, addr);
1055                 if (IS_ERR(neigh))
1056                         neigh = NULL;
1057         }
1058
1059         rt->rt6i_idev     = idev;
1060         rt->rt6i_nexthop  = neigh;
1061         atomic_set(&rt->dst.__refcnt, 1);
1062         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1063         rt->dst.output  = ip6_output;
1064
1065 #if 0   /* there's no chance to use these for ndisc */
1066         rt->dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1067                                 ? DST_HOST
1068                                 : 0;
1069         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1070         rt->rt6i_dst.plen = 128;
1071 #endif
1072
1073         spin_lock_bh(&icmp6_dst_lock);
1074         rt->dst.next = icmp6_dst_gc_list;
1075         icmp6_dst_gc_list = &rt->dst;
1076         spin_unlock_bh(&icmp6_dst_lock);
1077
1078         fib6_force_start_gc(net);
1079
1080 out:
1081         return &rt->dst;
1082 }
1083
1084 int icmp6_dst_gc(void)
1085 {
1086         struct dst_entry *dst, **pprev;
1087         int more = 0;
1088
1089         spin_lock_bh(&icmp6_dst_lock);
1090         pprev = &icmp6_dst_gc_list;
1091
1092         while ((dst = *pprev) != NULL) {
1093                 if (!atomic_read(&dst->__refcnt)) {
1094                         *pprev = dst->next;
1095                         dst_free(dst);
1096                 } else {
1097                         pprev = &dst->next;
1098                         ++more;
1099                 }
1100         }
1101
1102         spin_unlock_bh(&icmp6_dst_lock);
1103
1104         return more;
1105 }
1106
1107 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1108                             void *arg)
1109 {
1110         struct dst_entry *dst, **pprev;
1111
1112         spin_lock_bh(&icmp6_dst_lock);
1113         pprev = &icmp6_dst_gc_list;
1114         while ((dst = *pprev) != NULL) {
1115                 struct rt6_info *rt = (struct rt6_info *) dst;
1116                 if (func(rt, arg)) {
1117                         *pprev = dst->next;
1118                         dst_free(dst);
1119                 } else {
1120                         pprev = &dst->next;
1121                 }
1122         }
1123         spin_unlock_bh(&icmp6_dst_lock);
1124 }
1125
1126 static int ip6_dst_gc(struct dst_ops *ops)
1127 {
1128         unsigned long now = jiffies;
1129         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1130         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1131         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1132         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1133         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1134         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1135         int entries;
1136
1137         entries = dst_entries_get_fast(ops);
1138         if (time_after(rt_last_gc + rt_min_interval, now) &&
1139             entries <= rt_max_size)
1140                 goto out;
1141
1142         net->ipv6.ip6_rt_gc_expire++;
1143         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1144         net->ipv6.ip6_rt_last_gc = now;
1145         entries = dst_entries_get_slow(ops);
1146         if (entries < ops->gc_thresh)
1147                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1148 out:
1149         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1150         return entries > rt_max_size;
1151 }
1152
1153 /* Clean host part of a prefix. Not necessary in radix tree,
1154    but results in cleaner routing tables.
1155
1156    Remove it only when all the things will work!
1157  */
1158
1159 int ip6_dst_hoplimit(struct dst_entry *dst)
1160 {
1161         int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1162         if (hoplimit == 0) {
1163                 struct net_device *dev = dst->dev;
1164                 struct inet6_dev *idev;
1165
1166                 rcu_read_lock();
1167                 idev = __in6_dev_get(dev);
1168                 if (idev)
1169                         hoplimit = idev->cnf.hop_limit;
1170                 else
1171                         hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1172                 rcu_read_unlock();
1173         }
1174         return hoplimit;
1175 }
1176 EXPORT_SYMBOL(ip6_dst_hoplimit);
1177
1178 /*
1179  *
1180  */
1181
1182 int ip6_route_add(struct fib6_config *cfg)
1183 {
1184         int err;
1185         struct net *net = cfg->fc_nlinfo.nl_net;
1186         struct rt6_info *rt = NULL;
1187         struct net_device *dev = NULL;
1188         struct inet6_dev *idev = NULL;
1189         struct fib6_table *table;
1190         int addr_type;
1191
1192         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1193                 return -EINVAL;
1194 #ifndef CONFIG_IPV6_SUBTREES
1195         if (cfg->fc_src_len)
1196                 return -EINVAL;
1197 #endif
1198         if (cfg->fc_ifindex) {
1199                 err = -ENODEV;
1200                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1201                 if (!dev)
1202                         goto out;
1203                 idev = in6_dev_get(dev);
1204                 if (!idev)
1205                         goto out;
1206         }
1207
1208         if (cfg->fc_metric == 0)
1209                 cfg->fc_metric = IP6_RT_PRIO_USER;
1210
1211         table = fib6_new_table(net, cfg->fc_table);
1212         if (table == NULL) {
1213                 err = -ENOBUFS;
1214                 goto out;
1215         }
1216
1217         rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL);
1218
1219         if (rt == NULL) {
1220                 err = -ENOMEM;
1221                 goto out;
1222         }
1223
1224         rt->dst.obsolete = -1;
1225         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1226                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1227                                 0;
1228
1229         if (cfg->fc_protocol == RTPROT_UNSPEC)
1230                 cfg->fc_protocol = RTPROT_BOOT;
1231         rt->rt6i_protocol = cfg->fc_protocol;
1232
1233         addr_type = ipv6_addr_type(&cfg->fc_dst);
1234
1235         if (addr_type & IPV6_ADDR_MULTICAST)
1236                 rt->dst.input = ip6_mc_input;
1237         else if (cfg->fc_flags & RTF_LOCAL)
1238                 rt->dst.input = ip6_input;
1239         else
1240                 rt->dst.input = ip6_forward;
1241
1242         rt->dst.output = ip6_output;
1243
1244         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1245         rt->rt6i_dst.plen = cfg->fc_dst_len;
1246         if (rt->rt6i_dst.plen == 128)
1247                rt->dst.flags = DST_HOST;
1248
1249 #ifdef CONFIG_IPV6_SUBTREES
1250         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1251         rt->rt6i_src.plen = cfg->fc_src_len;
1252 #endif
1253
1254         rt->rt6i_metric = cfg->fc_metric;
1255
1256         /* We cannot add true routes via loopback here,
1257            they would result in kernel looping; promote them to reject routes
1258          */
1259         if ((cfg->fc_flags & RTF_REJECT) ||
1260             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1261                                               && !(cfg->fc_flags&RTF_LOCAL))) {
1262                 /* hold loopback dev/idev if we haven't done so. */
1263                 if (dev != net->loopback_dev) {
1264                         if (dev) {
1265                                 dev_put(dev);
1266                                 in6_dev_put(idev);
1267                         }
1268                         dev = net->loopback_dev;
1269                         dev_hold(dev);
1270                         idev = in6_dev_get(dev);
1271                         if (!idev) {
1272                                 err = -ENODEV;
1273                                 goto out;
1274                         }
1275                 }
1276                 rt->dst.output = ip6_pkt_discard_out;
1277                 rt->dst.input = ip6_pkt_discard;
1278                 rt->dst.error = -ENETUNREACH;
1279                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1280                 goto install_route;
1281         }
1282
1283         if (cfg->fc_flags & RTF_GATEWAY) {
1284                 const struct in6_addr *gw_addr;
1285                 int gwa_type;
1286
1287                 gw_addr = &cfg->fc_gateway;
1288                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1289                 gwa_type = ipv6_addr_type(gw_addr);
1290
1291                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1292                         struct rt6_info *grt;
1293
1294                         /* IPv6 strictly inhibits using not link-local
1295                            addresses as nexthop address.
1296                            Otherwise, router will not able to send redirects.
1297                            It is very good, but in some (rare!) circumstances
1298                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1299                            some exceptions. --ANK
1300                          */
1301                         err = -EINVAL;
1302                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1303                                 goto out;
1304
1305                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1306
1307                         err = -EHOSTUNREACH;
1308                         if (grt == NULL)
1309                                 goto out;
1310                         if (dev) {
1311                                 if (dev != grt->rt6i_dev) {
1312                                         dst_release(&grt->dst);
1313                                         goto out;
1314                                 }
1315                         } else {
1316                                 dev = grt->rt6i_dev;
1317                                 idev = grt->rt6i_idev;
1318                                 dev_hold(dev);
1319                                 in6_dev_hold(grt->rt6i_idev);
1320                         }
1321                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1322                                 err = 0;
1323                         dst_release(&grt->dst);
1324
1325                         if (err)
1326                                 goto out;
1327                 }
1328                 err = -EINVAL;
1329                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1330                         goto out;
1331         }
1332
1333         err = -ENODEV;
1334         if (dev == NULL)
1335                 goto out;
1336
1337         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
1338                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
1339                         err = -EINVAL;
1340                         goto out;
1341                 }
1342                 ipv6_addr_copy(&rt->rt6i_prefsrc.addr, &cfg->fc_prefsrc);
1343                 rt->rt6i_prefsrc.plen = 128;
1344         } else
1345                 rt->rt6i_prefsrc.plen = 0;
1346
1347         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1348                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1349                 if (IS_ERR(rt->rt6i_nexthop)) {
1350                         err = PTR_ERR(rt->rt6i_nexthop);
1351                         rt->rt6i_nexthop = NULL;
1352                         goto out;
1353                 }
1354         }
1355
1356         rt->rt6i_flags = cfg->fc_flags;
1357
1358 install_route:
1359         if (cfg->fc_mx) {
1360                 struct nlattr *nla;
1361                 int remaining;
1362
1363                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1364                         int type = nla_type(nla);
1365
1366                         if (type) {
1367                                 if (type > RTAX_MAX) {
1368                                         err = -EINVAL;
1369                                         goto out;
1370                                 }
1371
1372                                 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1373                         }
1374                 }
1375         }
1376
1377         rt->dst.dev = dev;
1378         rt->rt6i_idev = idev;
1379         rt->rt6i_table = table;
1380
1381         cfg->fc_nlinfo.nl_net = dev_net(dev);
1382
1383         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1384
1385 out:
1386         if (dev)
1387                 dev_put(dev);
1388         if (idev)
1389                 in6_dev_put(idev);
1390         if (rt)
1391                 dst_free(&rt->dst);
1392         return err;
1393 }
1394
1395 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1396 {
1397         int err;
1398         struct fib6_table *table;
1399         struct net *net = dev_net(rt->rt6i_dev);
1400
1401         if (rt == net->ipv6.ip6_null_entry)
1402                 return -ENOENT;
1403
1404         table = rt->rt6i_table;
1405         write_lock_bh(&table->tb6_lock);
1406
1407         err = fib6_del(rt, info);
1408         dst_release(&rt->dst);
1409
1410         write_unlock_bh(&table->tb6_lock);
1411
1412         return err;
1413 }
1414
1415 int ip6_del_rt(struct rt6_info *rt)
1416 {
1417         struct nl_info info = {
1418                 .nl_net = dev_net(rt->rt6i_dev),
1419         };
1420         return __ip6_del_rt(rt, &info);
1421 }
1422
1423 static int ip6_route_del(struct fib6_config *cfg)
1424 {
1425         struct fib6_table *table;
1426         struct fib6_node *fn;
1427         struct rt6_info *rt;
1428         int err = -ESRCH;
1429
1430         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1431         if (table == NULL)
1432                 return err;
1433
1434         read_lock_bh(&table->tb6_lock);
1435
1436         fn = fib6_locate(&table->tb6_root,
1437                          &cfg->fc_dst, cfg->fc_dst_len,
1438                          &cfg->fc_src, cfg->fc_src_len);
1439
1440         if (fn) {
1441                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1442                         if (cfg->fc_ifindex &&
1443                             (rt->rt6i_dev == NULL ||
1444                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1445                                 continue;
1446                         if (cfg->fc_flags & RTF_GATEWAY &&
1447                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1448                                 continue;
1449                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1450                                 continue;
1451                         dst_hold(&rt->dst);
1452                         read_unlock_bh(&table->tb6_lock);
1453
1454                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1455                 }
1456         }
1457         read_unlock_bh(&table->tb6_lock);
1458
1459         return err;
1460 }
1461
1462 /*
1463  *      Handle redirects
1464  */
1465 struct ip6rd_flowi {
1466         struct flowi6 fl6;
1467         struct in6_addr gateway;
1468 };
1469
1470 static struct rt6_info *__ip6_route_redirect(struct net *net,
1471                                              struct fib6_table *table,
1472                                              struct flowi6 *fl6,
1473                                              int flags)
1474 {
1475         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1476         struct rt6_info *rt;
1477         struct fib6_node *fn;
1478
1479         /*
1480          * Get the "current" route for this destination and
1481          * check if the redirect has come from approriate router.
1482          *
1483          * RFC 2461 specifies that redirects should only be
1484          * accepted if they come from the nexthop to the target.
1485          * Due to the way the routes are chosen, this notion
1486          * is a bit fuzzy and one might need to check all possible
1487          * routes.
1488          */
1489
1490         read_lock_bh(&table->tb6_lock);
1491         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1492 restart:
1493         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1494                 /*
1495                  * Current route is on-link; redirect is always invalid.
1496                  *
1497                  * Seems, previous statement is not true. It could
1498                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1499                  * But then router serving it might decide, that we should
1500                  * know truth 8)8) --ANK (980726).
1501                  */
1502                 if (rt6_check_expired(rt))
1503                         continue;
1504                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1505                         continue;
1506                 if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1507                         continue;
1508                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1509                         continue;
1510                 break;
1511         }
1512
1513         if (!rt)
1514                 rt = net->ipv6.ip6_null_entry;
1515         BACKTRACK(net, &fl6->saddr);
1516 out:
1517         dst_hold(&rt->dst);
1518
1519         read_unlock_bh(&table->tb6_lock);
1520
1521         return rt;
1522 };
1523
1524 static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
1525                                            const struct in6_addr *src,
1526                                            const struct in6_addr *gateway,
1527                                            struct net_device *dev)
1528 {
1529         int flags = RT6_LOOKUP_F_HAS_SADDR;
1530         struct net *net = dev_net(dev);
1531         struct ip6rd_flowi rdfl = {
1532                 .fl6 = {
1533                         .flowi6_oif = dev->ifindex,
1534                         .daddr = *dest,
1535                         .saddr = *src,
1536                 },
1537         };
1538
1539         ipv6_addr_copy(&rdfl.gateway, gateway);
1540
1541         if (rt6_need_strict(dest))
1542                 flags |= RT6_LOOKUP_F_IFACE;
1543
1544         return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1545                                                    flags, __ip6_route_redirect);
1546 }
1547
1548 void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
1549                   const struct in6_addr *saddr,
1550                   struct neighbour *neigh, u8 *lladdr, int on_link)
1551 {
1552         struct rt6_info *rt, *nrt = NULL;
1553         struct netevent_redirect netevent;
1554         struct net *net = dev_net(neigh->dev);
1555
1556         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1557
1558         if (rt == net->ipv6.ip6_null_entry) {
1559                 if (net_ratelimit())
1560                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1561                                "for redirect target\n");
1562                 goto out;
1563         }
1564
1565         /*
1566          *      We have finally decided to accept it.
1567          */
1568
1569         neigh_update(neigh, lladdr, NUD_STALE,
1570                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1571                      NEIGH_UPDATE_F_OVERRIDE|
1572                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1573                                      NEIGH_UPDATE_F_ISROUTER))
1574                      );
1575
1576         /*
1577          * Redirect received -> path was valid.
1578          * Look, redirects are sent only in response to data packets,
1579          * so that this nexthop apparently is reachable. --ANK
1580          */
1581         dst_confirm(&rt->dst);
1582
1583         /* Duplicate redirect: silently ignore. */
1584         if (neigh == rt->dst.neighbour)
1585                 goto out;
1586
1587         nrt = ip6_rt_copy(rt);
1588         if (nrt == NULL)
1589                 goto out;
1590
1591         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1592         if (on_link)
1593                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1594
1595         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1596         nrt->rt6i_dst.plen = 128;
1597         nrt->dst.flags |= DST_HOST;
1598
1599         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1600         nrt->rt6i_nexthop = neigh_clone(neigh);
1601
1602         if (ip6_ins_rt(nrt))
1603                 goto out;
1604
1605         netevent.old = &rt->dst;
1606         netevent.new = &nrt->dst;
1607         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1608
1609         if (rt->rt6i_flags&RTF_CACHE) {
1610                 ip6_del_rt(rt);
1611                 return;
1612         }
1613
1614 out:
1615         dst_release(&rt->dst);
1616 }
1617
1618 /*
1619  *      Handle ICMP "packet too big" messages
1620  *      i.e. Path MTU discovery
1621  */
1622
1623 static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
1624                              struct net *net, u32 pmtu, int ifindex)
1625 {
1626         struct rt6_info *rt, *nrt;
1627         int allfrag = 0;
1628 again:
1629         rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1630         if (rt == NULL)
1631                 return;
1632
1633         if (rt6_check_expired(rt)) {
1634                 ip6_del_rt(rt);
1635                 goto again;
1636         }
1637
1638         if (pmtu >= dst_mtu(&rt->dst))
1639                 goto out;
1640
1641         if (pmtu < IPV6_MIN_MTU) {
1642                 /*
1643                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1644                  * MTU (1280) and a fragment header should always be included
1645                  * after a node receiving Too Big message reporting PMTU is
1646                  * less than the IPv6 Minimum Link MTU.
1647                  */
1648                 pmtu = IPV6_MIN_MTU;
1649                 allfrag = 1;
1650         }
1651
1652         /* New mtu received -> path was valid.
1653            They are sent only in response to data packets,
1654            so that this nexthop apparently is reachable. --ANK
1655          */
1656         dst_confirm(&rt->dst);
1657
1658         /* Host route. If it is static, it would be better
1659            not to override it, but add new one, so that
1660            when cache entry will expire old pmtu
1661            would return automatically.
1662          */
1663         if (rt->rt6i_flags & RTF_CACHE) {
1664                 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1665                 if (allfrag) {
1666                         u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1667                         features |= RTAX_FEATURE_ALLFRAG;
1668                         dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1669                 }
1670                 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1671                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1672                 goto out;
1673         }
1674
1675         /* Network route.
1676            Two cases are possible:
1677            1. It is connected route. Action: COW
1678            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1679          */
1680         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1681                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1682         else
1683                 nrt = rt6_alloc_clone(rt, daddr);
1684
1685         if (nrt) {
1686                 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1687                 if (allfrag) {
1688                         u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1689                         features |= RTAX_FEATURE_ALLFRAG;
1690                         dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1691                 }
1692
1693                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1694                  * happened within 5 mins, the recommended timer is 10 mins.
1695                  * Here this route expiration time is set to ip6_rt_mtu_expires
1696                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1697                  * and detecting PMTU increase will be automatically happened.
1698                  */
1699                 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1700                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1701
1702                 ip6_ins_rt(nrt);
1703         }
1704 out:
1705         dst_release(&rt->dst);
1706 }
1707
1708 void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
1709                         struct net_device *dev, u32 pmtu)
1710 {
1711         struct net *net = dev_net(dev);
1712
1713         /*
1714          * RFC 1981 states that a node "MUST reduce the size of the packets it
1715          * is sending along the path" that caused the Packet Too Big message.
1716          * Since it's not possible in the general case to determine which
1717          * interface was used to send the original packet, we update the MTU
1718          * on the interface that will be used to send future packets. We also
1719          * update the MTU on the interface that received the Packet Too Big in
1720          * case the original packet was forced out that interface with
1721          * SO_BINDTODEVICE or similar. This is the next best thing to the
1722          * correct behaviour, which would be to update the MTU on all
1723          * interfaces.
1724          */
1725         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1726         rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1727 }
1728
1729 /*
1730  *      Misc support functions
1731  */
1732
1733 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1734 {
1735         struct net *net = dev_net(ort->rt6i_dev);
1736         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1737                                             ort->dst.dev);
1738
1739         if (rt) {
1740                 rt->dst.input = ort->dst.input;
1741                 rt->dst.output = ort->dst.output;
1742
1743                 dst_copy_metrics(&rt->dst, &ort->dst);
1744                 rt->dst.error = ort->dst.error;
1745                 rt->rt6i_idev = ort->rt6i_idev;
1746                 if (rt->rt6i_idev)
1747                         in6_dev_hold(rt->rt6i_idev);
1748                 rt->dst.lastuse = jiffies;
1749                 rt->rt6i_expires = 0;
1750
1751                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1752                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1753                 rt->rt6i_metric = 0;
1754
1755                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1756 #ifdef CONFIG_IPV6_SUBTREES
1757                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1758 #endif
1759                 rt->rt6i_table = ort->rt6i_table;
1760         }
1761         return rt;
1762 }
1763
1764 #ifdef CONFIG_IPV6_ROUTE_INFO
1765 static struct rt6_info *rt6_get_route_info(struct net *net,
1766                                            const struct in6_addr *prefix, int prefixlen,
1767                                            const struct in6_addr *gwaddr, int ifindex)
1768 {
1769         struct fib6_node *fn;
1770         struct rt6_info *rt = NULL;
1771         struct fib6_table *table;
1772
1773         table = fib6_get_table(net, RT6_TABLE_INFO);
1774         if (table == NULL)
1775                 return NULL;
1776
1777         write_lock_bh(&table->tb6_lock);
1778         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1779         if (!fn)
1780                 goto out;
1781
1782         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1783                 if (rt->rt6i_dev->ifindex != ifindex)
1784                         continue;
1785                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1786                         continue;
1787                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1788                         continue;
1789                 dst_hold(&rt->dst);
1790                 break;
1791         }
1792 out:
1793         write_unlock_bh(&table->tb6_lock);
1794         return rt;
1795 }
1796
1797 static struct rt6_info *rt6_add_route_info(struct net *net,
1798                                            const struct in6_addr *prefix, int prefixlen,
1799                                            const struct in6_addr *gwaddr, int ifindex,
1800                                            unsigned pref)
1801 {
1802         struct fib6_config cfg = {
1803                 .fc_table       = RT6_TABLE_INFO,
1804                 .fc_metric      = IP6_RT_PRIO_USER,
1805                 .fc_ifindex     = ifindex,
1806                 .fc_dst_len     = prefixlen,
1807                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1808                                   RTF_UP | RTF_PREF(pref),
1809                 .fc_nlinfo.pid = 0,
1810                 .fc_nlinfo.nlh = NULL,
1811                 .fc_nlinfo.nl_net = net,
1812         };
1813
1814         ipv6_addr_copy(&cfg.fc_dst, prefix);
1815         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1816
1817         /* We should treat it as a default route if prefix length is 0. */
1818         if (!prefixlen)
1819                 cfg.fc_flags |= RTF_DEFAULT;
1820
1821         ip6_route_add(&cfg);
1822
1823         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1824 }
1825 #endif
1826
1827 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1828 {
1829         struct rt6_info *rt;
1830         struct fib6_table *table;
1831
1832         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1833         if (table == NULL)
1834                 return NULL;
1835
1836         write_lock_bh(&table->tb6_lock);
1837         for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1838                 if (dev == rt->rt6i_dev &&
1839                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1840                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1841                         break;
1842         }
1843         if (rt)
1844                 dst_hold(&rt->dst);
1845         write_unlock_bh(&table->tb6_lock);
1846         return rt;
1847 }
1848
1849 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1850                                      struct net_device *dev,
1851                                      unsigned int pref)
1852 {
1853         struct fib6_config cfg = {
1854                 .fc_table       = RT6_TABLE_DFLT,
1855                 .fc_metric      = IP6_RT_PRIO_USER,
1856                 .fc_ifindex     = dev->ifindex,
1857                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1858                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1859                 .fc_nlinfo.pid = 0,
1860                 .fc_nlinfo.nlh = NULL,
1861                 .fc_nlinfo.nl_net = dev_net(dev),
1862         };
1863
1864         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1865
1866         ip6_route_add(&cfg);
1867
1868         return rt6_get_dflt_router(gwaddr, dev);
1869 }
1870
1871 void rt6_purge_dflt_routers(struct net *net)
1872 {
1873         struct rt6_info *rt;
1874         struct fib6_table *table;
1875
1876         /* NOTE: Keep consistent with rt6_get_dflt_router */
1877         table = fib6_get_table(net, RT6_TABLE_DFLT);
1878         if (table == NULL)
1879                 return;
1880
1881 restart:
1882         read_lock_bh(&table->tb6_lock);
1883         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1884                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1885                         dst_hold(&rt->dst);
1886                         read_unlock_bh(&table->tb6_lock);
1887                         ip6_del_rt(rt);
1888                         goto restart;
1889                 }
1890         }
1891         read_unlock_bh(&table->tb6_lock);
1892 }
1893
1894 static void rtmsg_to_fib6_config(struct net *net,
1895                                  struct in6_rtmsg *rtmsg,
1896                                  struct fib6_config *cfg)
1897 {
1898         memset(cfg, 0, sizeof(*cfg));
1899
1900         cfg->fc_table = RT6_TABLE_MAIN;
1901         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1902         cfg->fc_metric = rtmsg->rtmsg_metric;
1903         cfg->fc_expires = rtmsg->rtmsg_info;
1904         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1905         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1906         cfg->fc_flags = rtmsg->rtmsg_flags;
1907
1908         cfg->fc_nlinfo.nl_net = net;
1909
1910         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1911         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1912         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1913 }
1914
1915 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1916 {
1917         struct fib6_config cfg;
1918         struct in6_rtmsg rtmsg;
1919         int err;
1920
1921         switch(cmd) {
1922         case SIOCADDRT:         /* Add a route */
1923         case SIOCDELRT:         /* Delete a route */
1924                 if (!capable(CAP_NET_ADMIN))
1925                         return -EPERM;
1926                 err = copy_from_user(&rtmsg, arg,
1927                                      sizeof(struct in6_rtmsg));
1928                 if (err)
1929                         return -EFAULT;
1930
1931                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1932
1933                 rtnl_lock();
1934                 switch (cmd) {
1935                 case SIOCADDRT:
1936                         err = ip6_route_add(&cfg);
1937                         break;
1938                 case SIOCDELRT:
1939                         err = ip6_route_del(&cfg);
1940                         break;
1941                 default:
1942                         err = -EINVAL;
1943                 }
1944                 rtnl_unlock();
1945
1946                 return err;
1947         }
1948
1949         return -EINVAL;
1950 }
1951
1952 /*
1953  *      Drop the packet on the floor
1954  */
1955
1956 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1957 {
1958         int type;
1959         struct dst_entry *dst = skb_dst(skb);
1960         switch (ipstats_mib_noroutes) {
1961         case IPSTATS_MIB_INNOROUTES:
1962                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1963                 if (type == IPV6_ADDR_ANY) {
1964                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1965                                       IPSTATS_MIB_INADDRERRORS);
1966                         break;
1967                 }
1968                 /* FALLTHROUGH */
1969         case IPSTATS_MIB_OUTNOROUTES:
1970                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1971                               ipstats_mib_noroutes);
1972                 break;
1973         }
1974         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1975         kfree_skb(skb);
1976         return 0;
1977 }
1978
1979 static int ip6_pkt_discard(struct sk_buff *skb)
1980 {
1981         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1982 }
1983
1984 static int ip6_pkt_discard_out(struct sk_buff *skb)
1985 {
1986         skb->dev = skb_dst(skb)->dev;
1987         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1988 }
1989
1990 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1991
1992 static int ip6_pkt_prohibit(struct sk_buff *skb)
1993 {
1994         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1995 }
1996
1997 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1998 {
1999         skb->dev = skb_dst(skb)->dev;
2000         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2001 }
2002
2003 #endif
2004
2005 /*
2006  *      Allocate a dst for local (unicast / anycast) address.
2007  */
2008
2009 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2010                                     const struct in6_addr *addr,
2011                                     int anycast)
2012 {
2013         struct net *net = dev_net(idev->dev);
2014         struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2015                                             net->loopback_dev);
2016         struct neighbour *neigh;
2017
2018         if (rt == NULL) {
2019                 if (net_ratelimit())
2020                         pr_warning("IPv6:  Maximum number of routes reached,"
2021                                    " consider increasing route/max_size.\n");
2022                 return ERR_PTR(-ENOMEM);
2023         }
2024
2025         in6_dev_hold(idev);
2026
2027         rt->dst.flags = DST_HOST;
2028         rt->dst.input = ip6_input;
2029         rt->dst.output = ip6_output;
2030         rt->rt6i_idev = idev;
2031         rt->dst.obsolete = -1;
2032
2033         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2034         if (anycast)
2035                 rt->rt6i_flags |= RTF_ANYCAST;
2036         else
2037                 rt->rt6i_flags |= RTF_LOCAL;
2038         neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2039         if (IS_ERR(neigh)) {
2040                 dst_free(&rt->dst);
2041
2042                 return ERR_CAST(neigh);
2043         }
2044         rt->rt6i_nexthop = neigh;
2045
2046         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2047         rt->rt6i_dst.plen = 128;
2048         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2049
2050         atomic_set(&rt->dst.__refcnt, 1);
2051
2052         return rt;
2053 }
2054
2055 int ip6_route_get_saddr(struct net *net,
2056                         struct rt6_info *rt,
2057                         const struct in6_addr *daddr,
2058                         unsigned int prefs,
2059                         struct in6_addr *saddr)
2060 {
2061         struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
2062         int err = 0;
2063         if (rt->rt6i_prefsrc.plen)
2064                 ipv6_addr_copy(saddr, &rt->rt6i_prefsrc.addr);
2065         else
2066                 err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2067                                          daddr, prefs, saddr);
2068         return err;
2069 }
2070
2071 /* remove deleted ip from prefsrc entries */
2072 struct arg_dev_net_ip {
2073         struct net_device *dev;
2074         struct net *net;
2075         struct in6_addr *addr;
2076 };
2077
2078 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2079 {
2080         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2081         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2082         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2083
2084         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
2085             rt != net->ipv6.ip6_null_entry &&
2086             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2087                 /* remove prefsrc entry */
2088                 rt->rt6i_prefsrc.plen = 0;
2089         }
2090         return 0;
2091 }
2092
2093 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2094 {
2095         struct net *net = dev_net(ifp->idev->dev);
2096         struct arg_dev_net_ip adni = {
2097                 .dev = ifp->idev->dev,
2098                 .net = net,
2099                 .addr = &ifp->addr,
2100         };
2101         fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
2102 }
2103
2104 struct arg_dev_net {
2105         struct net_device *dev;
2106         struct net *net;
2107 };
2108
2109 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2110 {
2111         const struct arg_dev_net *adn = arg;
2112         const struct net_device *dev = adn->dev;
2113
2114         if ((rt->rt6i_dev == dev || dev == NULL) &&
2115             rt != adn->net->ipv6.ip6_null_entry) {
2116                 RT6_TRACE("deleted by ifdown %p\n", rt);
2117                 return -1;
2118         }
2119         return 0;
2120 }
2121
2122 void rt6_ifdown(struct net *net, struct net_device *dev)
2123 {
2124         struct arg_dev_net adn = {
2125                 .dev = dev,
2126                 .net = net,
2127         };
2128
2129         fib6_clean_all(net, fib6_ifdown, 0, &adn);
2130         icmp6_clean_all(fib6_ifdown, &adn);
2131 }
2132
2133 struct rt6_mtu_change_arg
2134 {
2135         struct net_device *dev;
2136         unsigned mtu;
2137 };
2138
2139 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2140 {
2141         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2142         struct inet6_dev *idev;
2143
2144         /* In IPv6 pmtu discovery is not optional,
2145            so that RTAX_MTU lock cannot disable it.
2146            We still use this lock to block changes
2147            caused by addrconf/ndisc.
2148         */
2149
2150         idev = __in6_dev_get(arg->dev);
2151         if (idev == NULL)
2152                 return 0;
2153
2154         /* For administrative MTU increase, there is no way to discover
2155            IPv6 PMTU increase, so PMTU increase should be updated here.
2156            Since RFC 1981 doesn't include administrative MTU increase
2157            update PMTU increase is a MUST. (i.e. jumbo frame)
2158          */
2159         /*
2160            If new MTU is less than route PMTU, this new MTU will be the
2161            lowest MTU in the path, update the route PMTU to reflect PMTU
2162            decreases; if new MTU is greater than route PMTU, and the
2163            old MTU is the lowest MTU in the path, update the route PMTU
2164            to reflect the increase. In this case if the other nodes' MTU
2165            also have the lowest MTU, TOO BIG MESSAGE will be lead to
2166            PMTU discouvery.
2167          */
2168         if (rt->rt6i_dev == arg->dev &&
2169             !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2170             (dst_mtu(&rt->dst) >= arg->mtu ||
2171              (dst_mtu(&rt->dst) < arg->mtu &&
2172               dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2173                 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2174         }
2175         return 0;
2176 }
2177
2178 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2179 {
2180         struct rt6_mtu_change_arg arg = {
2181                 .dev = dev,
2182                 .mtu = mtu,
2183         };
2184
2185         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
2186 }
2187
2188 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2189         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2190         [RTA_OIF]               = { .type = NLA_U32 },
2191         [RTA_IIF]               = { .type = NLA_U32 },
2192         [RTA_PRIORITY]          = { .type = NLA_U32 },
2193         [RTA_METRICS]           = { .type = NLA_NESTED },
2194 };
2195
2196 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2197                               struct fib6_config *cfg)
2198 {
2199         struct rtmsg *rtm;
2200         struct nlattr *tb[RTA_MAX+1];
2201         int err;
2202
2203         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2204         if (err < 0)
2205                 goto errout;
2206
2207         err = -EINVAL;
2208         rtm = nlmsg_data(nlh);
2209         memset(cfg, 0, sizeof(*cfg));
2210
2211         cfg->fc_table = rtm->rtm_table;
2212         cfg->fc_dst_len = rtm->rtm_dst_len;
2213         cfg->fc_src_len = rtm->rtm_src_len;
2214         cfg->fc_flags = RTF_UP;
2215         cfg->fc_protocol = rtm->rtm_protocol;
2216
2217         if (rtm->rtm_type == RTN_UNREACHABLE)
2218                 cfg->fc_flags |= RTF_REJECT;
2219
2220         if (rtm->rtm_type == RTN_LOCAL)
2221                 cfg->fc_flags |= RTF_LOCAL;
2222
2223         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2224         cfg->fc_nlinfo.nlh = nlh;
2225         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2226
2227         if (tb[RTA_GATEWAY]) {
2228                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2229                 cfg->fc_flags |= RTF_GATEWAY;
2230         }
2231
2232         if (tb[RTA_DST]) {
2233                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2234
2235                 if (nla_len(tb[RTA_DST]) < plen)
2236                         goto errout;
2237
2238                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2239         }
2240
2241         if (tb[RTA_SRC]) {
2242                 int plen = (rtm->rtm_src_len + 7) >> 3;
2243
2244                 if (nla_len(tb[RTA_SRC]) < plen)
2245                         goto errout;
2246
2247                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2248         }
2249
2250         if (tb[RTA_PREFSRC])
2251                 nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);
2252
2253         if (tb[RTA_OIF])
2254                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2255
2256         if (tb[RTA_PRIORITY])
2257                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2258
2259         if (tb[RTA_METRICS]) {
2260                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2261                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2262         }
2263
2264         if (tb[RTA_TABLE])
2265                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2266
2267         err = 0;
2268 errout:
2269         return err;
2270 }
2271
2272 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2273 {
2274         struct fib6_config cfg;
2275         int err;
2276
2277         err = rtm_to_fib6_config(skb, nlh, &cfg);
2278         if (err < 0)
2279                 return err;
2280
2281         return ip6_route_del(&cfg);
2282 }
2283
2284 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2285 {
2286         struct fib6_config cfg;
2287         int err;
2288
2289         err = rtm_to_fib6_config(skb, nlh, &cfg);
2290         if (err < 0)
2291                 return err;
2292
2293         return ip6_route_add(&cfg);
2294 }
2295
2296 static inline size_t rt6_nlmsg_size(void)
2297 {
2298         return NLMSG_ALIGN(sizeof(struct rtmsg))
2299                + nla_total_size(16) /* RTA_SRC */
2300                + nla_total_size(16) /* RTA_DST */
2301                + nla_total_size(16) /* RTA_GATEWAY */
2302                + nla_total_size(16) /* RTA_PREFSRC */
2303                + nla_total_size(4) /* RTA_TABLE */
2304                + nla_total_size(4) /* RTA_IIF */
2305                + nla_total_size(4) /* RTA_OIF */
2306                + nla_total_size(4) /* RTA_PRIORITY */
2307                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2308                + nla_total_size(sizeof(struct rta_cacheinfo));
2309 }
2310
2311 static int rt6_fill_node(struct net *net,
2312                          struct sk_buff *skb, struct rt6_info *rt,
2313                          struct in6_addr *dst, struct in6_addr *src,
2314                          int iif, int type, u32 pid, u32 seq,
2315                          int prefix, int nowait, unsigned int flags)
2316 {
2317         struct rtmsg *rtm;
2318         struct nlmsghdr *nlh;
2319         long expires;
2320         u32 table;
2321
2322         if (prefix) {   /* user wants prefix routes only */
2323                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2324                         /* success since this is not a prefix route */
2325                         return 1;
2326                 }
2327         }
2328
2329         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2330         if (nlh == NULL)
2331                 return -EMSGSIZE;
2332
2333         rtm = nlmsg_data(nlh);
2334         rtm->rtm_family = AF_INET6;
2335         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2336         rtm->rtm_src_len = rt->rt6i_src.plen;
2337         rtm->rtm_tos = 0;
2338         if (rt->rt6i_table)
2339                 table = rt->rt6i_table->tb6_id;
2340         else
2341                 table = RT6_TABLE_UNSPEC;
2342         rtm->rtm_table = table;
2343         NLA_PUT_U32(skb, RTA_TABLE, table);
2344         if (rt->rt6i_flags&RTF_REJECT)
2345                 rtm->rtm_type = RTN_UNREACHABLE;
2346         else if (rt->rt6i_flags&RTF_LOCAL)
2347                 rtm->rtm_type = RTN_LOCAL;
2348         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2349                 rtm->rtm_type = RTN_LOCAL;
2350         else
2351                 rtm->rtm_type = RTN_UNICAST;
2352         rtm->rtm_flags = 0;
2353         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2354         rtm->rtm_protocol = rt->rt6i_protocol;
2355         if (rt->rt6i_flags&RTF_DYNAMIC)
2356                 rtm->rtm_protocol = RTPROT_REDIRECT;
2357         else if (rt->rt6i_flags & RTF_ADDRCONF)
2358                 rtm->rtm_protocol = RTPROT_KERNEL;
2359         else if (rt->rt6i_flags&RTF_DEFAULT)
2360                 rtm->rtm_protocol = RTPROT_RA;
2361
2362         if (rt->rt6i_flags&RTF_CACHE)
2363                 rtm->rtm_flags |= RTM_F_CLONED;
2364
2365         if (dst) {
2366                 NLA_PUT(skb, RTA_DST, 16, dst);
2367                 rtm->rtm_dst_len = 128;
2368         } else if (rtm->rtm_dst_len)
2369                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2370 #ifdef CONFIG_IPV6_SUBTREES
2371         if (src) {
2372                 NLA_PUT(skb, RTA_SRC, 16, src);
2373                 rtm->rtm_src_len = 128;
2374         } else if (rtm->rtm_src_len)
2375                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2376 #endif
2377         if (iif) {
2378 #ifdef CONFIG_IPV6_MROUTE
2379                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2380                         int err = ip6mr_get_route(net, skb, rtm, nowait);
2381                         if (err <= 0) {
2382                                 if (!nowait) {
2383                                         if (err == 0)
2384                                                 return 0;
2385                                         goto nla_put_failure;
2386                                 } else {
2387                                         if (err == -EMSGSIZE)
2388                                                 goto nla_put_failure;
2389                                 }
2390                         }
2391                 } else
2392 #endif
2393                         NLA_PUT_U32(skb, RTA_IIF, iif);
2394         } else if (dst) {
2395                 struct in6_addr saddr_buf;
2396                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
2397                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2398         }
2399
2400         if (rt->rt6i_prefsrc.plen) {
2401                 struct in6_addr saddr_buf;
2402                 ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
2403                 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2404         }
2405
2406         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2407                 goto nla_put_failure;
2408
2409         if (rt->dst.neighbour)
2410                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2411
2412         if (rt->dst.dev)
2413                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2414
2415         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2416
2417         if (!(rt->rt6i_flags & RTF_EXPIRES))
2418                 expires = 0;
2419         else if (rt->rt6i_expires - jiffies < INT_MAX)
2420                 expires = rt->rt6i_expires - jiffies;
2421         else
2422                 expires = INT_MAX;
2423
2424         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2425                                expires, rt->dst.error) < 0)
2426                 goto nla_put_failure;
2427
2428         return nlmsg_end(skb, nlh);
2429
2430 nla_put_failure:
2431         nlmsg_cancel(skb, nlh);
2432         return -EMSGSIZE;
2433 }
2434
2435 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2436 {
2437         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2438         int prefix;
2439
2440         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2441                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2442                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2443         } else
2444                 prefix = 0;
2445
2446         return rt6_fill_node(arg->net,
2447                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2448                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2449                      prefix, 0, NLM_F_MULTI);
2450 }
2451
2452 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2453 {
2454         struct net *net = sock_net(in_skb->sk);
2455         struct nlattr *tb[RTA_MAX+1];
2456         struct rt6_info *rt;
2457         struct sk_buff *skb;
2458         struct rtmsg *rtm;
2459         struct flowi6 fl6;
2460         int err, iif = 0;
2461
2462         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2463         if (err < 0)
2464                 goto errout;
2465
2466         err = -EINVAL;
2467         memset(&fl6, 0, sizeof(fl6));
2468
2469         if (tb[RTA_SRC]) {
2470                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2471                         goto errout;
2472
2473                 ipv6_addr_copy(&fl6.saddr, nla_data(tb[RTA_SRC]));
2474         }
2475
2476         if (tb[RTA_DST]) {
2477                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2478                         goto errout;
2479
2480                 ipv6_addr_copy(&fl6.daddr, nla_data(tb[RTA_DST]));
2481         }
2482
2483         if (tb[RTA_IIF])
2484                 iif = nla_get_u32(tb[RTA_IIF]);
2485
2486         if (tb[RTA_OIF])
2487                 fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
2488
2489         if (iif) {
2490                 struct net_device *dev;
2491                 dev = __dev_get_by_index(net, iif);
2492                 if (!dev) {
2493                         err = -ENODEV;
2494                         goto errout;
2495                 }
2496         }
2497
2498         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2499         if (skb == NULL) {
2500                 err = -ENOBUFS;
2501                 goto errout;
2502         }
2503
2504         /* Reserve room for dummy headers, this skb can pass
2505            through good chunk of routing engine.
2506          */
2507         skb_reset_mac_header(skb);
2508         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2509
2510         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2511         skb_dst_set(skb, &rt->dst);
2512
2513         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
2514                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2515                             nlh->nlmsg_seq, 0, 0, 0);
2516         if (err < 0) {
2517                 kfree_skb(skb);
2518                 goto errout;
2519         }
2520
2521         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2522 errout:
2523         return err;
2524 }
2525
2526 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2527 {
2528         struct sk_buff *skb;
2529         struct net *net = info->nl_net;
2530         u32 seq;
2531         int err;
2532
2533         err = -ENOBUFS;
2534         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2535
2536         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2537         if (skb == NULL)
2538                 goto errout;
2539
2540         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2541                                 event, info->pid, seq, 0, 0, 0);
2542         if (err < 0) {
2543                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2544                 WARN_ON(err == -EMSGSIZE);
2545                 kfree_skb(skb);
2546                 goto errout;
2547         }
2548         rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2549                     info->nlh, gfp_any());
2550         return;
2551 errout:
2552         if (err < 0)
2553                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2554 }
2555
2556 static int ip6_route_dev_notify(struct notifier_block *this,
2557                                 unsigned long event, void *data)
2558 {
2559         struct net_device *dev = (struct net_device *)data;
2560         struct net *net = dev_net(dev);
2561
2562         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2563                 net->ipv6.ip6_null_entry->dst.dev = dev;
2564                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2565 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2566                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2567                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2568                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2569                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2570 #endif
2571         }
2572
2573         return NOTIFY_OK;
2574 }
2575
2576 /*
2577  *      /proc
2578  */
2579
2580 #ifdef CONFIG_PROC_FS
2581
/*
 * Buffer bookkeeping for /proc output.
 * NOTE(review): nothing in the visible part of this file uses this
 * structure; presumably a leftover from a pre-seq_file implementation.
 * Confirm before removing.
 */
struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2590
2591 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2592 {
2593         struct seq_file *m = p_arg;
2594
2595         seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2596
2597 #ifdef CONFIG_IPV6_SUBTREES
2598         seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2599 #else
2600         seq_puts(m, "00000000000000000000000000000000 00 ");
2601 #endif
2602
2603         if (rt->rt6i_nexthop) {
2604                 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2605         } else {
2606                 seq_puts(m, "00000000000000000000000000000000");
2607         }
2608         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2609                    rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2610                    rt->dst.__use, rt->rt6i_flags,
2611                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2612         return 0;
2613 }
2614
2615 static int ipv6_route_show(struct seq_file *m, void *v)
2616 {
2617         struct net *net = (struct net *)m->private;
2618         fib6_clean_all(net, rt6_info_route, 0, m);
2619         return 0;
2620 }
2621
2622 static int ipv6_route_open(struct inode *inode, struct file *file)
2623 {
2624         return single_open_net(inode, file, ipv6_route_show);
2625 }
2626
2627 static const struct file_operations ipv6_route_proc_fops = {
2628         .owner          = THIS_MODULE,
2629         .open           = ipv6_route_open,
2630         .read           = seq_read,
2631         .llseek         = seq_lseek,
2632         .release        = single_release_net,
2633 };
2634
2635 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2636 {
2637         struct net *net = (struct net *)seq->private;
2638         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2639                    net->ipv6.rt6_stats->fib_nodes,
2640                    net->ipv6.rt6_stats->fib_route_nodes,
2641                    net->ipv6.rt6_stats->fib_rt_alloc,
2642                    net->ipv6.rt6_stats->fib_rt_entries,
2643                    net->ipv6.rt6_stats->fib_rt_cache,
2644                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2645                    net->ipv6.rt6_stats->fib_discarded_routes);
2646
2647         return 0;
2648 }
2649
2650 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2651 {
2652         return single_open_net(inode, file, rt6_stats_seq_show);
2653 }
2654
2655 static const struct file_operations rt6_stats_seq_fops = {
2656         .owner   = THIS_MODULE,
2657         .open    = rt6_stats_seq_open,
2658         .read    = seq_read,
2659         .llseek  = seq_lseek,
2660         .release = single_release_net,
2661 };
2662 #endif  /* CONFIG_PROC_FS */
2663
2664 #ifdef CONFIG_SYSCTL
2665
2666 static
2667 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2668                               void __user *buffer, size_t *lenp, loff_t *ppos)
2669 {
2670         struct net *net;
2671         int delay;
2672         if (!write)
2673                 return -EINVAL;
2674
2675         net = (struct net *)ctl->extra1;
2676         delay = net->ipv6.sysctl.flush_delay;
2677         proc_dointvec(ctl, write, buffer, lenp, ppos);
2678         fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2679         return 0;
2680 }
2681
2682 ctl_table ipv6_route_table_template[] = {
2683         {
2684                 .procname       =       "flush",
2685                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2686                 .maxlen         =       sizeof(int),
2687                 .mode           =       0200,
2688                 .proc_handler   =       ipv6_sysctl_rtcache_flush
2689         },
2690         {
2691                 .procname       =       "gc_thresh",
2692                 .data           =       &ip6_dst_ops_template.gc_thresh,
2693                 .maxlen         =       sizeof(int),
2694                 .mode           =       0644,
2695                 .proc_handler   =       proc_dointvec,
2696         },
2697         {
2698                 .procname       =       "max_size",
2699                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2700                 .maxlen         =       sizeof(int),
2701                 .mode           =       0644,
2702                 .proc_handler   =       proc_dointvec,
2703         },
2704         {
2705                 .procname       =       "gc_min_interval",
2706                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2707                 .maxlen         =       sizeof(int),
2708                 .mode           =       0644,
2709                 .proc_handler   =       proc_dointvec_jiffies,
2710         },
2711         {
2712                 .procname       =       "gc_timeout",
2713                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2714                 .maxlen         =       sizeof(int),
2715                 .mode           =       0644,
2716                 .proc_handler   =       proc_dointvec_jiffies,
2717         },
2718         {
2719                 .procname       =       "gc_interval",
2720                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2721                 .maxlen         =       sizeof(int),
2722                 .mode           =       0644,
2723                 .proc_handler   =       proc_dointvec_jiffies,
2724         },
2725         {
2726                 .procname       =       "gc_elasticity",
2727                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2728                 .maxlen         =       sizeof(int),
2729                 .mode           =       0644,
2730                 .proc_handler   =       proc_dointvec,
2731         },
2732         {
2733                 .procname       =       "mtu_expires",
2734                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2735                 .maxlen         =       sizeof(int),
2736                 .mode           =       0644,
2737                 .proc_handler   =       proc_dointvec_jiffies,
2738         },
2739         {
2740                 .procname       =       "min_adv_mss",
2741                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2742                 .maxlen         =       sizeof(int),
2743                 .mode           =       0644,
2744                 .proc_handler   =       proc_dointvec,
2745         },
2746         {
2747                 .procname       =       "gc_min_interval_ms",
2748                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2749                 .maxlen         =       sizeof(int),
2750                 .mode           =       0644,
2751                 .proc_handler   =       proc_dointvec_ms_jiffies,
2752         },
2753         { }
2754 };
2755
2756 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2757 {
2758         struct ctl_table *table;
2759
2760         table = kmemdup(ipv6_route_table_template,
2761                         sizeof(ipv6_route_table_template),
2762                         GFP_KERNEL);
2763
2764         if (table) {
2765                 table[0].data = &net->ipv6.sysctl.flush_delay;
2766                 table[0].extra1 = net;
2767                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2768                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2769                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2770                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2771                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2772                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2773                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2774                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2775                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2776         }
2777
2778         return table;
2779 }
2780 #endif
2781
2782 static int __net_init ip6_route_net_init(struct net *net)
2783 {
2784         int ret = -ENOMEM;
2785
2786         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2787                sizeof(net->ipv6.ip6_dst_ops));
2788
2789         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2790                 goto out_ip6_dst_ops;
2791
2792         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2793                                            sizeof(*net->ipv6.ip6_null_entry),
2794                                            GFP_KERNEL);
2795         if (!net->ipv6.ip6_null_entry)
2796                 goto out_ip6_dst_entries;
2797         net->ipv6.ip6_null_entry->dst.path =
2798                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2799         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2800         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2801                          ip6_template_metrics, true);
2802
2803 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2804         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2805                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2806                                                GFP_KERNEL);
2807         if (!net->ipv6.ip6_prohibit_entry)
2808                 goto out_ip6_null_entry;
2809         net->ipv6.ip6_prohibit_entry->dst.path =
2810                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2811         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2812         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2813                          ip6_template_metrics, true);
2814
2815         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2816                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2817                                                GFP_KERNEL);
2818         if (!net->ipv6.ip6_blk_hole_entry)
2819                 goto out_ip6_prohibit_entry;
2820         net->ipv6.ip6_blk_hole_entry->dst.path =
2821                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2822         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2823         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2824                          ip6_template_metrics, true);
2825 #endif
2826
2827         net->ipv6.sysctl.flush_delay = 0;
2828         net->ipv6.sysctl.ip6_rt_max_size = 4096;
2829         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2830         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2831         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2832         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2833         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2834         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2835
2836 #ifdef CONFIG_PROC_FS
2837         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2838         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2839 #endif
2840         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2841
2842         ret = 0;
2843 out:
2844         return ret;
2845
2846 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2847 out_ip6_prohibit_entry:
2848         kfree(net->ipv6.ip6_prohibit_entry);
2849 out_ip6_null_entry:
2850         kfree(net->ipv6.ip6_null_entry);
2851 #endif
2852 out_ip6_dst_entries:
2853         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2854 out_ip6_dst_ops:
2855         goto out;
2856 }
2857
2858 static void __net_exit ip6_route_net_exit(struct net *net)
2859 {
2860 #ifdef CONFIG_PROC_FS
2861         proc_net_remove(net, "ipv6_route");
2862         proc_net_remove(net, "rt6_stats");
2863 #endif
2864         kfree(net->ipv6.ip6_null_entry);
2865 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2866         kfree(net->ipv6.ip6_prohibit_entry);
2867         kfree(net->ipv6.ip6_blk_hole_entry);
2868 #endif
2869         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2870 }
2871
2872 static struct pernet_operations ip6_route_net_ops = {
2873         .init = ip6_route_net_init,
2874         .exit = ip6_route_net_exit,
2875 };
2876
2877 static struct notifier_block ip6_route_dev_notifier = {
2878         .notifier_call = ip6_route_dev_notify,
2879         .priority = 0,
2880 };
2881
2882 int __init ip6_route_init(void)
2883 {
2884         int ret;
2885
2886         ret = -ENOMEM;
2887         ip6_dst_ops_template.kmem_cachep =
2888                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2889                                   SLAB_HWCACHE_ALIGN, NULL);
2890         if (!ip6_dst_ops_template.kmem_cachep)
2891                 goto out;
2892
2893         ret = dst_entries_init(&ip6_dst_blackhole_ops);
2894         if (ret)
2895                 goto out_kmem_cache;
2896
2897         ret = register_pernet_subsys(&ip6_route_net_ops);
2898         if (ret)
2899                 goto out_dst_entries;
2900
2901         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2902
2903         /* Registering of the loopback is done before this portion of code,
2904          * the loopback reference in rt6_info will not be taken, do it
2905          * manually for init_net */
2906         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2907         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2908   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2909         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2910         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2911         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2912         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2913   #endif
2914         ret = fib6_init();
2915         if (ret)
2916                 goto out_register_subsys;
2917
2918         ret = xfrm6_init();
2919         if (ret)
2920                 goto out_fib6_init;
2921
2922         ret = fib6_rules_init();
2923         if (ret)
2924                 goto xfrm6_init;
2925
2926         ret = -ENOBUFS;
2927         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2928             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2929             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2930                 goto fib6_rules_init;
2931
2932         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2933         if (ret)
2934                 goto fib6_rules_init;
2935
2936 out:
2937         return ret;
2938
2939 fib6_rules_init:
2940         fib6_rules_cleanup();
2941 xfrm6_init:
2942         xfrm6_fini();
2943 out_fib6_init:
2944         fib6_gc_cleanup();
2945 out_register_subsys:
2946         unregister_pernet_subsys(&ip6_route_net_ops);
2947 out_dst_entries:
2948         dst_entries_destroy(&ip6_dst_blackhole_ops);
2949 out_kmem_cache:
2950         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2951         goto out;
2952 }
2953
2954 void ip6_route_cleanup(void)
2955 {
2956         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2957         fib6_rules_cleanup();
2958         xfrm6_fini();
2959         fib6_gc_cleanup();
2960         unregister_pernet_subsys(&ip6_route_net_ops);
2961         dst_entries_destroy(&ip6_dst_blackhole_ops);
2962         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2963 }