2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <linux/slab.h>
44 #include <net/net_namespace.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
52 #include <linux/rtnetlink.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
58 #include <asm/uaccess.h>
61 #include <linux/sysctl.h>
/*
 * Debug helpers.  RT6_TRACE() emits a KERN_DEBUG printk when route
 * tracing is compiled in; the #if/#else wrappers are elided in this
 * excerpt, which is why two alternative RT6_TRACE definitions appear.
 */
64 /* Set to 3 to get tracing. */
68 #define RDBG(x) printk x
69 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
/* No-op variant used when tracing is disabled. */
72 #define RT6_TRACE(x...) do { ; } while (0)
/*
 * Forward declarations for the dst_ops callbacks and packet sinks
 * implemented later in this file.
 */
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
79 static unsigned int ip6_default_mtu(const struct dst_entry *dst);
80 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
81 static void ip6_dst_destroy(struct dst_entry *);
82 static void ip6_dst_ifdown(struct dst_entry *,
83 struct net_device *dev, int how);
84 static int ip6_dst_gc(struct dst_ops *ops);
85 static int ip6_pkt_discard(struct sk_buff *skb);
86 static int ip6_pkt_discard_out(struct sk_buff *skb);
87 static void ip6_link_failure(struct sk_buff *skb);
88 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
/* RFC 4191 Route Information option support (prototypes gated below). */
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92 struct in6_addr *prefix, int prefixlen,
93 struct in6_addr *gwaddr, int ifindex,
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96 struct in6_addr *prefix, int prefixlen,
97 struct in6_addr *gwaddr, int ifindex);
/*
 * Copy-on-write the dst metrics into storage owned by the route's
 * inet_peer.  Binds a peer if none is attached yet, copies the old
 * (shared, read-only) metrics array into the peer's, then swings
 * dst->_metrics over with cmpxchg.  Interior lines (peer metric
 * pointer setup, cmpxchg-failure handling) are elided in this excerpt.
 */
100 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
102 struct rt6_info *rt = (struct rt6_info *) dst;
103 struct inet_peer *peer;
107 rt6_bind_peer(rt, 1);
109 peer = rt->rt6i_peer;
111 u32 *old_p = __DST_METRICS_PTR(old);
112 unsigned long prev, new;
/* First writer seeds the peer's metrics from the shared template. */
115 if (inet_metrics_new(peer))
116 memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
118 new = (unsigned long) p;
/* Atomically publish; a concurrent winner's pointer is reused below. */
119 prev = cmpxchg(&dst->_metrics, old, new);
122 p = __DST_METRICS_PTR(prev);
123 if (prev & DST_METRICS_READ_ONLY)
/*
 * dst_ops vtable template for regular IPv6 routes; cloned per-netns.
 * Wires the callbacks declared above into the generic dst layer.
 */
130 static struct dst_ops ip6_dst_ops_template = {
132 .protocol = cpu_to_be16(ETH_P_IPV6),
135 .check = ip6_dst_check,
136 .default_advmss = ip6_default_advmss,
137 .default_mtu = ip6_default_mtu,
138 .cow_metrics = ipv6_cow_metrics,
139 .destroy = ip6_dst_destroy,
140 .ifdown = ip6_dst_ifdown,
141 .negative_advice = ip6_negative_advice,
142 .link_failure = ip6_link_failure,
143 .update_pmtu = ip6_rt_update_pmtu,
144 .local_out = __ip6_local_out,
/*
 * Blackhole variants: used for dsts that must never mutate state.
 * Bodies are elided here; the update_pmtu hook is a no-op by design.
 */
147 static unsigned int ip6_blackhole_default_mtu(const struct dst_entry *dst)
152 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
156 static struct dst_ops ip6_dst_blackhole_ops = {
158 .protocol = cpu_to_be16(ETH_P_IPV6),
159 .destroy = ip6_dst_destroy,
160 .check = ip6_dst_check,
161 .default_mtu = ip6_blackhole_default_mtu,
162 .update_pmtu = ip6_rt_blackhole_update_pmtu,
/* Shared metrics for the template routes below: hop limit 255 only. */
165 static const u32 ip6_template_metrics[RTAX_MAX] = {
166 [RTAX_HOPLIMIT - 1] = 255,
/*
 * Template for the per-netns "null" route: rejects with -ENETUNREACH.
 * Highest possible metric so it always loses to any real route.
 */
169 static struct rt6_info ip6_null_entry_template = {
171 .__refcnt = ATOMIC_INIT(1),
174 .error = -ENETUNREACH,
175 .input = ip6_pkt_discard,
176 .output = ip6_pkt_discard_out,
178 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
179 .rt6i_protocol = RTPROT_KERNEL,
180 .rt6i_metric = ~(u32) 0,
181 .rt6i_ref = ATOMIC_INIT(1),
/*
 * Policy-routing builds add "prohibit" (administratively denied) and
 * "blackhole" (silent drop) templates with the same shape.
 */
184 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
186 static int ip6_pkt_prohibit(struct sk_buff *skb);
187 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
189 static struct rt6_info ip6_prohibit_entry_template = {
191 .__refcnt = ATOMIC_INIT(1),
195 .input = ip6_pkt_prohibit,
196 .output = ip6_pkt_prohibit_out,
198 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
199 .rt6i_protocol = RTPROT_KERNEL,
200 .rt6i_metric = ~(u32) 0,
201 .rt6i_ref = ATOMIC_INIT(1),
/* Blackhole: drop without any ICMP error (dst_discard both ways). */
204 static struct rt6_info ip6_blk_hole_entry_template = {
206 .__refcnt = ATOMIC_INIT(1),
210 .input = dst_discard,
211 .output = dst_discard,
213 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
214 .rt6i_protocol = RTPROT_KERNEL,
215 .rt6i_metric = ~(u32) 0,
216 .rt6i_ref = ATOMIC_INIT(1),
221 /* allocate dst with ip6_dst_ops */
/* Thin typed wrapper over dst_alloc() for rt6_info entries. */
222 static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
224 return (struct rt6_info *)dst_alloc(ops, 0);
/*
 * dst_ops->destroy: drop the route's references to its inet6_dev and
 * inet_peer.  The in6_dev_put()/inet_putpeer() calls sit on elided
 * lines between the NULL assignments shown here.
 */
227 static void ip6_dst_destroy(struct dst_entry *dst)
229 struct rt6_info *rt = (struct rt6_info *)dst;
230 struct inet6_dev *idev = rt->rt6i_idev;
231 struct inet_peer *peer = rt->rt6i_peer;
234 rt->rt6i_idev = NULL;
238 rt->rt6i_peer = NULL;
/*
 * Global generation counter for peer bindings; bumped elsewhere when
 * peer state is invalidated.  rt6_peer_genid() is the read side used
 * by ip6_dst_check() to detect stale rt6i_peer_genid values.
 */
243 static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);
245 static u32 rt6_peer_genid(void)
247 return atomic_read(&__rt6_peer_genid);
/*
 * Attach an inet_peer (keyed by the route's destination) to @rt.
 * The cmpxchg makes binding race-free: if another CPU won, our peer
 * reference is dropped (on an elided line); otherwise the genid is
 * recorded so staleness can be detected later.
 */
250 void rt6_bind_peer(struct rt6_info *rt, int create)
252 struct inet_peer *peer;
254 peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);
255 if (peer && cmpxchg(&rt->rt6i_peer, NULL, peer) != NULL)
258 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->ifdown: the device behind this route is going away.
 * Re-point rt6i_idev at the namespace's loopback inet6_dev so the
 * route keeps a valid idev; the put of the old idev is on an elided
 * line.
 */
261 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
264 struct rt6_info *rt = (struct rt6_info *)dst;
265 struct inet6_dev *idev = rt->rt6i_idev;
266 struct net_device *loopback_dev =
267 dev_net(dev)->loopback_dev;
269 if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
270 struct inet6_dev *loopback_idev =
271 in6_dev_get(loopback_dev);
272 if (loopback_idev != NULL) {
273 rt->rt6i_idev = loopback_idev;
/* True if the route carries RTF_EXPIRES and its deadline has passed. */
279 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
281 return (rt->rt6i_flags & RTF_EXPIRES) &&
282 time_after(jiffies, rt->rt6i_expires);
/*
 * Multicast, link-local and loopback destinations are only meaningful
 * relative to a specific interface, so lookups for them must honour
 * the outgoing interface strictly.
 */
285 static inline int rt6_need_strict(struct in6_addr *daddr)
287 return ipv6_addr_type(daddr) &
288 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
292 * Route lookup. Any table->tb6_lock is implied.
/*
 * Walk the routes on one fib6 node and pick the one matching the
 * requested output interface (@oif) and/or source address (@saddr).
 * Loopback routes are remembered in @local as a fallback.  Several
 * continue/return statements are elided in this excerpt.
 */
295 static inline struct rt6_info *rt6_device_match(struct net *net,
297 struct in6_addr *saddr,
301 struct rt6_info *local = NULL;
302 struct rt6_info *sprt;
/* No constraints at all: the head route is fine (early exit elided). */
304 if (!oif && ipv6_addr_any(saddr))
307 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
308 struct net_device *dev = sprt->rt6i_dev;
311 if (dev->ifindex == oif)
313 if (dev->flags & IFF_LOOPBACK) {
314 if (sprt->rt6i_idev == NULL ||
315 sprt->rt6i_idev->dev->ifindex != oif) {
316 if (flags & RT6_LOOKUP_F_IFACE && oif)
318 if (local && (!oif ||
319 local->rt6i_idev->dev->ifindex == oif))
/* Source-address match: saddr must be assigned on this device. */
325 if (ipv6_chk_addr(net, saddr, dev,
326 flags & RT6_LOOKUP_F_IFACE))
/* Nothing matched under strict interface rules: reject via null entry. */
335 if (flags & RT6_LOOKUP_F_IFACE)
336 return net->ipv6.ip6_null_entry;
342 #ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing (RFC 4191 router preference support):
 * if the next hop's neighbour entry is not in a VALID state and no
 * probe was sent within rtr_probe_interval, send one unicast NS to
 * the router's solicited-node multicast address.  Setting
 * neigh->updated first rate-limits concurrent probers.
 */
343 static void rt6_probe(struct rt6_info *rt)
345 struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
347 * Okay, this does not seem to be appropriate
348 * for now, however, we need to check if it
349 * is really so; aka Router Reachability Probing.
351 * Router Reachability Probe MUST be rate-limited
352 * to no more than one per minute.
354 if (!neigh || (neigh->nud_state & NUD_VALID))
356 read_lock_bh(&neigh->lock);
357 if (!(neigh->nud_state & NUD_VALID) &&
358 time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
359 struct in6_addr mcaddr;
360 struct in6_addr *target;
362 neigh->updated = jiffies;
/* Drop the lock before sending: ndisc_send_ns may sleep/allocate. */
363 read_unlock_bh(&neigh->lock);
365 target = (struct in6_addr *)&neigh->primary_key;
366 addrconf_addr_solict_mult(target, &mcaddr);
367 ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
369 read_unlock_bh(&neigh->lock);
/* !CONFIG_IPV6_ROUTER_PREF stub (the #else line is elided here). */
372 static inline void rt6_probe(struct rt6_info *rt)
378 * Default Router Selection (RFC 2461 6.3.6)
/*
 * Device component of the route score: nonzero when the route's
 * device (or, for loopback routes, its idev's device) satisfies the
 * requested @oif.  The return statements are on elided lines.
 */
380 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
382 struct net_device *dev = rt->rt6i_dev;
383 if (!oif || dev->ifindex == oif)
385 if ((dev->flags & IFF_LOOPBACK) &&
386 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/*
 * Neighbour component of the route score: prefer next hops whose
 * neighbour entry is in a VALID NUD state; with router preferences,
 * NUD_FAILED is scored worse (branch values are on elided lines).
 * Routes without a gateway next hop are treated as reachable.
 */
391 static inline int rt6_check_neigh(struct rt6_info *rt)
393 struct neighbour *neigh = rt->rt6i_nexthop;
395 if (rt->rt6i_flags & RTF_NONEXTHOP ||
396 !(rt->rt6i_flags & RTF_GATEWAY))
399 read_lock_bh(&neigh->lock);
400 if (neigh->nud_state & NUD_VALID)
402 #ifdef CONFIG_IPV6_ROUTER_PREF
403 else if (neigh->nud_state & NUD_FAILED)
408 read_unlock_bh(&neigh->lock);
/*
 * Combine device match, configured router preference (RFC 4191) and
 * neighbour reachability into a single comparable score.  Strict
 * lookups bail out (elided returns) when a mandatory component is 0.
 */
414 static int rt6_score_route(struct rt6_info *rt, int oif,
419 m = rt6_check_dev(rt, oif);
420 if (!m && (strict & RT6_LOOKUP_F_IFACE))
422 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Decoded preference occupies higher bits than the device match. */
423 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
425 n = rt6_check_neigh(rt);
426 if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
/*
 * Score @rt and return the better of it and the running best @match,
 * updating *mpri.  Expired routes are skipped; under REACHABLE-strict
 * lookups a probe is triggered for losing candidates (elided lines).
 */
431 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
432 int *mpri, struct rt6_info *match)
436 if (rt6_check_expired(rt))
439 m = rt6_score_route(rt, oif, strict);
444 if (strict & RT6_LOOKUP_F_REACHABLE)
448 } else if (strict & RT6_LOOKUP_F_REACHABLE) {
/*
 * Round-robin scan of all routes on @fn with the given metric: first
 * from rr_head to the end of the metric group, then from the leaf up
 * to rr_head, keeping the best-scored candidate.
 */
456 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
457 struct rt6_info *rr_head,
458 u32 metric, int oif, int strict)
460 struct rt6_info *rt, *match;
464 for (rt = rr_head; rt && rt->rt6i_metric == metric;
465 rt = rt->dst.rt6_next)
466 match = find_match(rt, oif, strict, &mpri, match);
467 for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
468 rt = rt->dst.rt6_next)
469 match = find_match(rt, oif, strict, &mpri, match);
/*
 * Default router selection for a fib6 node: pick the best route via
 * find_rr_leaf(), and when nothing reachable matched, advance the
 * node's round-robin pointer (fn->rr_ptr) so the next lookup tries
 * the next same-metric route.  Falls back to the null entry.
 */
474 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
476 struct rt6_info *match, *rt0;
479 RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
480 __func__, fn->leaf, oif);
/* Lazily initialise the round-robin cursor to the node's leaf. */
484 fn->rr_ptr = rt0 = fn->leaf;
486 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
489 (strict & RT6_LOOKUP_F_REACHABLE)) {
490 struct rt6_info *next = rt0->dst.rt6_next;
492 /* no entries matched; do round-robin */
493 if (!next || next->rt6i_metric != rt0->rt6i_metric)
500 RT6_TRACE("%s() => %p\n",
503 net = dev_net(rt0->rt6i_dev);
504 return match ? match : net->ipv6.ip6_null_entry;
507 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process an RFC 4191 Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate lengths, then add,
 * update, or (on zero lifetime) delete the corresponding RTF_ROUTEINFO
 * route.  Error-return statements are on elided lines.
 */
508 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
509 struct in6_addr *gwaddr)
511 struct net *net = dev_net(dev);
512 struct route_info *rinfo = (struct route_info *) opt;
513 struct in6_addr prefix_buf, *prefix;
515 unsigned long lifetime;
518 if (len < sizeof(struct route_info)) {
522 /* Sanity check for prefix_len and length */
523 if (rinfo->length > 3) {
525 } else if (rinfo->prefix_len > 128) {
527 } else if (rinfo->prefix_len > 64) {
528 if (rinfo->length < 2) {
531 } else if (rinfo->prefix_len > 0) {
532 if (rinfo->length < 1) {
537 pref = rinfo->route_pref;
538 if (pref == ICMPV6_ROUTER_PREF_INVALID)
541 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length==3 means a full 128-bit prefix is present in the option. */
543 if (rinfo->length == 3)
544 prefix = (struct in6_addr *)rinfo->prefix;
546 /* this function is safe */
547 ipv6_addr_prefix(&prefix_buf,
548 (struct in6_addr *)rinfo->prefix,
550 prefix = &prefix_buf;
553 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
/* Zero lifetime withdraws the route (ip6_del_rt on an elided line). */
556 if (rt && !lifetime) {
562 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
565 rt->rt6i_flags = RTF_ROUTEINFO |
566 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
/* Infinite lifetime: clear expiry; otherwise arm RTF_EXPIRES. */
569 if (!addrconf_finite_timeout(lifetime)) {
570 rt->rt6i_flags &= ~RTF_EXPIRES;
572 rt->rt6i_expires = jiffies + HZ * lifetime;
573 rt->rt6i_flags |= RTF_EXPIRES;
575 dst_release(&rt->dst);
/*
 * Lookup backtracking: when the current node yielded only the null
 * entry, climb toward the tree root (stopping at RTN_TL_ROOT),
 * descending into source-routed subtrees where present, until a node
 * carrying route info (RTN_RTINFO) is found.  Used by the *_pol_route
 * lookup functions; expands in their local scope (rt, fn, goto labels).
 */
581 #define BACKTRACK(__net, saddr) \
583 if (rt == __net->ipv6.ip6_null_entry) { \
584 struct fib6_node *pn; \
586 if (fn->fn_flags & RTN_TL_ROOT) \
589 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
590 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
593 if (fn->fn_flags & RTN_RTINFO) \
/*
 * Simple (non-caching) policy lookup in one table: find the node for
 * the flow's destination/source, match on device, backtrack if needed,
 * and return the route with its use count/timestamp bumped.  Runs
 * entirely under the table's read lock.
 */
599 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
600 struct fib6_table *table,
601 struct flowi *fl, int flags)
603 struct fib6_node *fn;
606 read_lock_bh(&table->tb6_lock);
607 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
610 rt = rt6_device_match(net, rt, &fl->fl6_src, fl->oif, flags);
611 BACKTRACK(net, &fl->fl6_src);
/* Take a reference and refresh lastuse before dropping the lock. */
613 dst_use(&rt->dst, jiffies);
614 read_unlock_bh(&table->tb6_lock);
/*
 * Public route lookup helper: build a flowi from @daddr/@saddr/@oif
 * and resolve it through fib6_rule_lookup() with ip6_pol_route_lookup.
 * @strict requests interface-strict matching.  Returns a referenced
 * rt6_info (caller releases), or NULL on failure (elided branch).
 */
619 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
620 const struct in6_addr *saddr, int oif, int strict)
626 struct dst_entry *dst;
627 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
630 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
631 flags |= RT6_LOOKUP_F_HAS_SADDR;
634 dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
636 return (struct rt6_info *) dst;
643 EXPORT_SYMBOL(rt6_lookup);
645 /* ip6_ins_rt is called with FREE table->tb6_lock.
646 It takes new route entry, the addition fails by any reason the
647 route is freed. In any case, if caller does not hold it, it may
/* Insert @rt into its table under the table write lock. */
651 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
654 struct fib6_table *table;
656 table = rt->rt6i_table;
657 write_lock_bh(&table->tb6_lock);
658 err = fib6_add(&table->tb6_root, rt, info);
659 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: insert with netlink info derived from the dev. */
664 int ip6_ins_rt(struct rt6_info *rt)
666 struct nl_info info = {
667 .nl_net = dev_net(rt->rt6i_dev),
669 return __ip6_ins_rt(rt, &info);
/*
 * Clone @ort into a host (/128) RTF_CACHE route for @daddr/@saddr and
 * resolve its next-hop neighbour.  On neighbour-table overflow,
 * temporarily lowers the GC sysctls and forces a dst GC pass, then
 * retries (outside softirq only).  Several error/retry gotos are on
 * elided lines.
 */
672 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
673 struct in6_addr *saddr)
681 rt = ip6_rt_copy(ort);
684 struct neighbour *neigh;
685 int attempts = !in_softirq();
/* Non-gateway route: next hop is the destination itself; a /128
 * covering its own prefix is an anycast address. */
687 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
688 if (rt->rt6i_dst.plen != 128 &&
689 ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
690 rt->rt6i_flags |= RTF_ANYCAST;
691 ipv6_addr_copy(&rt->rt6i_gateway, daddr);
694 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
695 rt->rt6i_dst.plen = 128;
696 rt->rt6i_flags |= RTF_CACHE;
697 rt->dst.flags |= DST_HOST;
699 #ifdef CONFIG_IPV6_SUBTREES
700 if (rt->rt6i_src.plen && saddr) {
701 ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
702 rt->rt6i_src.plen = 128;
707 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
/* Neighbour allocation failed: force aggressive GC and retry. */
709 struct net *net = dev_net(rt->rt6i_dev);
710 int saved_rt_min_interval =
711 net->ipv6.sysctl.ip6_rt_gc_min_interval;
712 int saved_rt_elasticity =
713 net->ipv6.sysctl.ip6_rt_gc_elasticity;
715 if (attempts-- > 0) {
716 net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
717 net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;
719 ip6_dst_gc(&net->ipv6.ip6_dst_ops);
721 net->ipv6.sysctl.ip6_rt_gc_elasticity =
723 net->ipv6.sysctl.ip6_rt_gc_min_interval =
724 saved_rt_min_interval;
730 "ipv6: Neighbour table overflow.\n");
734 rt->rt6i_nexthop = neigh;
/*
 * Like rt6_alloc_cow() but for routes whose next hop is already known:
 * clone @ort into a /128 RTF_CACHE host route for @daddr, sharing a
 * cloned reference to the original's neighbour entry.
 */
741 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
743 struct rt6_info *rt = ip6_rt_copy(ort);
745 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
746 rt->rt6i_dst.plen = 128;
747 rt->rt6i_flags |= RTF_CACHE;
748 rt->dst.flags |= DST_HOST;
749 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
/*
 * Core caching policy lookup: select the best route (reachability is
 * only required when forwarding is disabled), and if it is not yet an
 * RTF_CACHE entry, drop the lock, clone it (cow for unresolved next
 * hops, plain clone otherwise), insert the clone, and relookup on
 * insertion races.  The retry/relookup gotos are on elided lines.
 */
754 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
755 struct flowi *fl, int flags)
757 struct fib6_node *fn;
758 struct rt6_info *rt, *nrt;
762 int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
764 strict |= flags & RT6_LOOKUP_F_IFACE;
767 read_lock_bh(&table->tb6_lock);
770 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
773 rt = rt6_select(fn, oif, strict | reachable);
775 BACKTRACK(net, &fl->fl6_src);
776 if (rt == net->ipv6.ip6_null_entry ||
777 rt->rt6i_flags & RTF_CACHE)
/* Clone outside the lock; insertion below may race with others. */
781 read_unlock_bh(&table->tb6_lock);
783 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
784 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
786 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
788 dst_release(&rt->dst);
789 rt = nrt ? : net->ipv6.ip6_null_entry;
793 err = ip6_ins_rt(nrt);
802 * Race condition! In the gap, when table->tb6_lock was
803 * released someone could insert this route. Relookup.
805 dst_release(&rt->dst);
814 read_unlock_bh(&table->tb6_lock);
816 rt->dst.lastuse = jiffies;
/* Input-path adapter: route on the flow's incoming interface. */
822 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
823 struct flowi *fl, int flags)
825 return ip6_pol_route(net, table, fl->iif, fl, flags);
/*
 * Route an incoming packet: build a flowi from the IPv6 header and
 * attach the resulting dst to the skb.  Strict-interface destinations
 * are exempted on PIM register devices (decapsulated mcast).
 */
828 void ip6_route_input(struct sk_buff *skb)
830 struct ipv6hdr *iph = ipv6_hdr(skb);
831 struct net *net = dev_net(skb->dev);
832 int flags = RT6_LOOKUP_F_HAS_SADDR;
834 .iif = skb->dev->ifindex,
835 .fl6_dst = iph->daddr,
836 .fl6_src = iph->saddr,
/* Flow label: first 32 bits of the header masked to flowinfo. */
837 .fl6_flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
839 .proto = iph->nexthdr,
842 if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
843 flags |= RT6_LOOKUP_F_IFACE;
845 skb_dst_set(skb, fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input));
/* Output-path adapter: route on the flow's outgoing interface. */
848 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
849 struct flowi *fl, int flags)
851 return ip6_pol_route(net, table, fl->oif, fl, flags);
/*
 * Route locally generated traffic described by @fl.  Interface-strict
 * when the socket is bound to a device or the destination demands it;
 * socket source-address preferences (if @sk) are folded into flags.
 */
854 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
859 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl->fl6_dst))
860 flags |= RT6_LOOKUP_F_IFACE;
862 if (!ipv6_addr_any(&fl->fl6_src))
863 flags |= RT6_LOOKUP_F_HAS_SADDR;
865 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
867 return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
870 EXPORT_SYMBOL(ip6_route_output);
/*
 * Replace *dstp with a blackhole copy of itself: a dst that discards
 * all traffic but preserves the original's metrics, device, idev and
 * rt6 keys.  Used to neutralise a dst without breaking its holders.
 * Returns 0 on success, -ENOMEM if the blackhole dst allocation failed.
 */
872 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
874 struct rt6_info *ort = (struct rt6_info *) *dstp;
875 struct rt6_info *rt = (struct rt6_info *)
876 dst_alloc(&ip6_dst_blackhole_ops, 1);
877 struct dst_entry *new = NULL;
883 new->input = dst_discard;
884 new->output = dst_discard;
886 dst_copy_metrics(new, &ort->dst);
887 new->dev = ort->dst.dev;
890 rt->rt6i_idev = ort->rt6i_idev;
892 in6_dev_hold(rt->rt6i_idev);
893 rt->rt6i_expires = 0;
895 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
/* Copy flags minus RTF_EXPIRES since expiry was cleared above. */
896 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
899 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
900 #ifdef CONFIG_IPV6_SUBTREES
901 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
909 return new ? 0 : -ENOMEM;
911 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
914 * Destination cache support functions
/*
 * dst_ops->check: the dst is still valid iff its fib6 node's serial
 * number matches the caller's @cookie; refresh a stale peer binding
 * while we're here.  Returns the dst or NULL (on elided lines).
 */
917 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
921 rt = (struct rt6_info *) dst;
923 if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) {
924 if (rt->rt6i_peer_genid != rt6_peer_genid()) {
926 rt6_bind_peer(rt, 0);
927 rt->rt6i_peer_genid = rt6_peer_genid();
/*
 * dst_ops->negative_advice: an upper layer reports trouble with this
 * dst.  Expired cache entries are deleted (on elided lines); returns
 * the replacement dst or NULL.
 */
934 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
936 struct rt6_info *rt = (struct rt6_info *) dst;
939 if (rt->rt6i_flags & RTF_CACHE) {
940 if (rt6_check_expired(rt)) {
/*
 * dst_ops->link_failure: send an address-unreachable ICMP error back,
 * then make the route ineligible — expire it immediately if it is a
 * cache entry, or invalidate the fib node's serial for default routes
 * so cached dsts fail their next ip6_dst_check().
 */
952 static void ip6_link_failure(struct sk_buff *skb)
956 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
958 rt = (struct rt6_info *) skb_dst(skb);
960 if (rt->rt6i_flags&RTF_CACHE) {
961 dst_set_expires(&rt->dst, 0);
962 rt->rt6i_flags |= RTF_EXPIRES;
963 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
964 rt->rt6i_node->fn_sernum = -1;
/*
 * dst_ops->update_pmtu: shrink the MTU on a host (/128) route.  Below
 * IPV6_MIN_MTU the effective MTU is clamped (on an elided line) and
 * ALLFRAG is set so every packet carries a fragment header (RFC 2460).
 */
968 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
970 struct rt6_info *rt6 = (struct rt6_info*)dst;
972 if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
973 rt6->rt6i_flags |= RTF_MODIFIED;
974 if (mtu < IPV6_MIN_MTU) {
975 u32 features = dst_metric(dst, RTAX_FEATURES);
977 features |= RTAX_FEATURE_ALLFRAG;
978 dst_metric_set(dst, RTAX_FEATURES, features);
980 dst_metric_set(dst, RTAX_MTU, mtu);
/*
 * dst_ops->default_advmss: advertised MSS = path MTU minus IPv6 and
 * TCP headers, floored at the ip6_rt_min_advmss sysctl and capped so
 * a non-jumbo payload remains expressible (cap applied on elided line).
 */
984 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
986 struct net_device *dev = dst->dev;
987 unsigned int mtu = dst_mtu(dst);
988 struct net *net = dev_net(dev);
990 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
992 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
993 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
996 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
997 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
998 * IPV6_MAXPLEN is also valid and means: "any MSS,
999 * rely only on pmtu discovery"
1001 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/*
 * dst_ops->default_mtu: the device's configured IPv6 MTU (cnf.mtu6),
 * falling back to the protocol minimum (1280) when no inet6_dev.
 */
1006 static unsigned int ip6_default_mtu(const struct dst_entry *dst)
1008 unsigned int mtu = IPV6_MIN_MTU;
1009 struct inet6_dev *idev;
1012 idev = __in6_dev_get(dst->dev);
1014 mtu = idev->cnf.mtu6;
/*
 * Allocate a short-lived dst for an outgoing NDISC/ICMPv6 packet to
 * @addr on @dev.  These dsts are never inserted into the FIB; instead
 * they are chained on icmp6_dst_gc_list (under icmp6_dst_lock) and
 * reaped by icmp6_dst_gc().  Error unwinding is on elided lines.
 */
1020 static struct dst_entry *icmp6_dst_gc_list;
1021 static DEFINE_SPINLOCK(icmp6_dst_lock);
1023 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1024 struct neighbour *neigh,
1025 const struct in6_addr *addr)
1027 struct rt6_info *rt;
1028 struct inet6_dev *idev = in6_dev_get(dev);
1029 struct net *net = dev_net(dev);
1031 if (unlikely(idev == NULL))
1034 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1035 if (unlikely(rt == NULL)) {
/* Resolve a neighbour ourselves when the caller supplied none. */
1044 neigh = ndisc_get_neigh(dev, addr);
1050 rt->rt6i_idev = idev;
1051 rt->rt6i_nexthop = neigh;
1052 atomic_set(&rt->dst.__refcnt, 1);
1053 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
1054 rt->dst.output = ip6_output;
1056 #if 0 /* there's no chance to use these for ndisc */
1057 rt->dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
1060 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1061 rt->rt6i_dst.plen = 128;
/* Chain onto the GC list and kick the fib6 GC timer. */
1064 spin_lock_bh(&icmp6_dst_lock);
1065 rt->dst.next = icmp6_dst_gc_list;
1066 icmp6_dst_gc_list = &rt->dst;
1067 spin_unlock_bh(&icmp6_dst_lock);
1069 fib6_force_start_gc(net);
/*
 * Reap unreferenced entries from icmp6_dst_gc_list (unlink and free
 * happen on elided lines inside the loop); return value indicates
 * whether any entries remain.
 */
1075 int icmp6_dst_gc(void)
1077 struct dst_entry *dst, *next, **pprev;
1082 spin_lock_bh(&icmp6_dst_lock);
1083 pprev = &icmp6_dst_gc_list;
1085 while ((dst = *pprev) != NULL) {
1086 if (!atomic_read(&dst->__refcnt)) {
1095 spin_unlock_bh(&icmp6_dst_lock);
/*
 * Walk icmp6_dst_gc_list and release every entry for which @func
 * returns true (the unlink/release is on elided lines); used for
 * per-device cleanup paths.
 */
1100 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1103 struct dst_entry *dst, **pprev;
1105 spin_lock_bh(&icmp6_dst_lock);
1106 pprev = &icmp6_dst_gc_list;
1107 while ((dst = *pprev) != NULL) {
1108 struct rt6_info *rt = (struct rt6_info *) dst;
1109 if (func(rt, arg)) {
1116 spin_unlock_bh(&icmp6_dst_lock);
/*
 * dst_ops->gc for IPv6: skip GC when the minimum interval has not
 * elapsed and the table is under rt_max_size; otherwise run fib6 GC
 * with an adaptive "expire" aggressiveness that grows each pass and
 * decays by the elasticity sysctl once below gc_thresh.
 * Returns nonzero when the table is still over rt_max_size.
 */
1119 static int ip6_dst_gc(struct dst_ops *ops)
1121 unsigned long now = jiffies;
1122 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1123 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1124 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1125 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1126 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1127 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1130 entries = dst_entries_get_fast(ops);
1131 if (time_after(rt_last_gc + rt_min_interval, now) &&
1132 entries <= rt_max_size)
1135 net->ipv6.ip6_rt_gc_expire++;
1136 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1137 net->ipv6.ip6_rt_last_gc = now;
1138 entries = dst_entries_get_slow(ops);
/* Below threshold: reset aggressiveness; else decay it gradually. */
1139 if (entries < ops->gc_thresh)
1140 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1142 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1143 return entries > rt_max_size;
1146 /* Clean host part of a prefix. Not necessary in radix tree,
1147 but results in cleaner routing tables.
1149 Remove it only when all the things will work!
/*
 * Effective hop limit for a dst: the RTAX_HOPLIMIT metric when set,
 * otherwise the device's (or the namespace-wide) configured hop limit.
 */
1152 int ip6_dst_hoplimit(struct dst_entry *dst)
1154 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1155 if (hoplimit == 0) {
1156 struct net_device *dev = dst->dev;
1157 struct inet6_dev *idev;
1160 idev = __in6_dev_get(dev);
1162 hoplimit = idev->cnf.hop_limit;
1164 hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1169 EXPORT_SYMBOL(ip6_dst_hoplimit);
/*
 * Create and insert a route described by @cfg (typically from a
 * netlink RTM_NEWROUTE or an ioctl).  Validates prefix lengths,
 * resolves the device and gateway, promotes loopback/reject routes,
 * applies metrics, and finally inserts via __ip6_ins_rt().  Error
 * gotos and the cleanup path are on elided lines throughout.
 */
1175 int ip6_route_add(struct fib6_config *cfg)
1178 struct net *net = cfg->fc_nlinfo.nl_net;
1179 struct rt6_info *rt = NULL;
1180 struct net_device *dev = NULL;
1181 struct inet6_dev *idev = NULL;
1182 struct fib6_table *table;
1185 if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1187 #ifndef CONFIG_IPV6_SUBTREES
1188 if (cfg->fc_src_len)
1191 if (cfg->fc_ifindex) {
1193 dev = dev_get_by_index(net, cfg->fc_ifindex);
1196 idev = in6_dev_get(dev);
/* Userspace routes default to IP6_RT_PRIO_USER priority. */
1201 if (cfg->fc_metric == 0)
1202 cfg->fc_metric = IP6_RT_PRIO_USER;
1204 table = fib6_new_table(net, cfg->fc_table);
1205 if (table == NULL) {
1210 rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1217 rt->dst.obsolete = -1;
1218 rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1219 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1222 if (cfg->fc_protocol == RTPROT_UNSPEC)
1223 cfg->fc_protocol = RTPROT_BOOT;
1224 rt->rt6i_protocol = cfg->fc_protocol;
/* Choose the input handler by destination type. */
1226 addr_type = ipv6_addr_type(&cfg->fc_dst);
1228 if (addr_type & IPV6_ADDR_MULTICAST)
1229 rt->dst.input = ip6_mc_input;
1230 else if (cfg->fc_flags & RTF_LOCAL)
1231 rt->dst.input = ip6_input;
1233 rt->dst.input = ip6_forward;
1235 rt->dst.output = ip6_output;
1237 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1238 rt->rt6i_dst.plen = cfg->fc_dst_len;
1239 if (rt->rt6i_dst.plen == 128)
1240 rt->dst.flags = DST_HOST;
1242 #ifdef CONFIG_IPV6_SUBTREES
1243 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1244 rt->rt6i_src.plen = cfg->fc_src_len;
1247 rt->rt6i_metric = cfg->fc_metric;
1249 /* We cannot add true routes via loopback here,
1250 they would result in kernel looping; promote them to reject routes
1252 if ((cfg->fc_flags & RTF_REJECT) ||
1253 (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK)
1254 && !(cfg->fc_flags&RTF_LOCAL))) {
1255 /* hold loopback dev/idev if we haven't done so. */
1256 if (dev != net->loopback_dev) {
1261 dev = net->loopback_dev;
1263 idev = in6_dev_get(dev);
1269 rt->dst.output = ip6_pkt_discard_out;
1270 rt->dst.input = ip6_pkt_discard;
1271 rt->dst.error = -ENETUNREACH;
1272 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* Gateway routes: validate the next hop address and, for non
 * link-local gateways, derive the device from a route to the gw. */
1276 if (cfg->fc_flags & RTF_GATEWAY) {
1277 struct in6_addr *gw_addr;
1280 gw_addr = &cfg->fc_gateway;
1281 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1282 gwa_type = ipv6_addr_type(gw_addr);
1284 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1285 struct rt6_info *grt;
1287 /* IPv6 strictly inhibits using not link-local
1288 addresses as nexthop address.
1289 Otherwise, router will not able to send redirects.
1290 It is very good, but in some (rare!) circumstances
1291 (SIT, PtP, NBMA NOARP links) it is handy to allow
1292 some exceptions. --ANK
1295 if (!(gwa_type&IPV6_ADDR_UNICAST))
1298 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1300 err = -EHOSTUNREACH;
1304 if (dev != grt->rt6i_dev) {
1305 dst_release(&grt->dst);
1309 dev = grt->rt6i_dev;
1310 idev = grt->rt6i_idev;
1312 in6_dev_hold(grt->rt6i_idev);
/* A gateway reached through another gateway is rejected. */
1314 if (!(grt->rt6i_flags&RTF_GATEWAY))
1316 dst_release(&grt->dst);
1322 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1330 if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1331 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1332 if (IS_ERR(rt->rt6i_nexthop)) {
1333 err = PTR_ERR(rt->rt6i_nexthop);
1334 rt->rt6i_nexthop = NULL;
1339 rt->rt6i_flags = cfg->fc_flags;
/* Apply RTA_METRICS attributes (netlink-supplied dst metrics). */
1346 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1347 int type = nla_type(nla);
1350 if (type > RTAX_MAX) {
1355 dst_metric_set(&rt->dst, type, nla_get_u32(nla));
1361 rt->rt6i_idev = idev;
1362 rt->rt6i_table = table;
1364 cfg->fc_nlinfo.nl_net = dev_net(dev);
1366 return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
/*
 * Remove @rt from its table under the write lock and drop the
 * caller's reference.  Deleting the null entry is refused (the error
 * return is on an elided line).
 */
1378 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1381 struct fib6_table *table;
1382 struct net *net = dev_net(rt->rt6i_dev);
1384 if (rt == net->ipv6.ip6_null_entry)
1387 table = rt->rt6i_table;
1388 write_lock_bh(&table->tb6_lock);
1390 err = fib6_del(rt, info);
1391 dst_release(&rt->dst);
1393 write_unlock_bh(&table->tb6_lock);
/* Convenience wrapper: delete with netlink info derived from the dev. */
1398 int ip6_del_rt(struct rt6_info *rt)
1400 struct nl_info info = {
1401 .nl_net = dev_net(rt->rt6i_dev),
1403 return __ip6_del_rt(rt, &info);
/*
 * Delete the route matching @cfg: locate the exact fib6 node for the
 * dst/src prefixes, then scan its routes for one matching the
 * requested ifindex, gateway and metric.  On a match the table lock
 * is dropped (a reference is taken on an elided line) and the route
 * deleted via __ip6_del_rt().  Returns -ESRCH-style errors on elided
 * lines when nothing matched.
 */
1406 static int ip6_route_del(struct fib6_config *cfg)
1408 struct fib6_table *table;
1409 struct fib6_node *fn;
1410 struct rt6_info *rt;
1413 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1417 read_lock_bh(&table->tb6_lock);
1419 fn = fib6_locate(&table->tb6_root,
1420 &cfg->fc_dst, cfg->fc_dst_len,
1421 &cfg->fc_src, cfg->fc_src_len);
1424 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1425 if (cfg->fc_ifindex &&
1426 (rt->rt6i_dev == NULL ||
1427 rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1429 if (cfg->fc_flags & RTF_GATEWAY &&
1430 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1432 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1435 read_unlock_bh(&table->tb6_lock);
1437 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1440 read_unlock_bh(&table->tb6_lock);
/*
 * flowi extended with the redirecting router's address, so the
 * redirect lookup can verify the sender is the route's next hop.
 * (The embedded struct flowi member line is elided in this excerpt.)
 */
1448 struct ip6rd_flowi {
1450 struct in6_addr gateway;
/*
 * Locate the route an ICMPv6 Redirect applies to: the current route
 * to the destination must be non-expired, have a gateway, go out the
 * interface the redirect arrived on, and have the redirecting router
 * as its gateway (RFC 4861: redirects are only accepted from the
 * current next hop).  Falls back to the null entry + BACKTRACK.
 */
1453 static struct rt6_info *__ip6_route_redirect(struct net *net,
1454 struct fib6_table *table,
1458 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1459 struct rt6_info *rt;
1460 struct fib6_node *fn;
1463 * Get the "current" route for this destination and
1464 * check if the redirect has come from approriate router.
1466 * RFC 2461 specifies that redirects should only be
1467 * accepted if they come from the nexthop to the target.
1468 * Due to the way the routes are chosen, this notion
1469 * is a bit fuzzy and one might need to check all possible
1473 read_lock_bh(&table->tb6_lock);
1474 fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1476 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1478 * Current route is on-link; redirect is always invalid.
1480 * Seems, previous statement is not true. It could
1481 * be node, which looks for us as on-link (f.e. proxy ndisc)
1482 * But then router serving it might decide, that we should
1483 * know truth 8)8) --ANK (980726).
1485 if (rt6_check_expired(rt))
1487 if (!(rt->rt6i_flags & RTF_GATEWAY))
1489 if (fl->oif != rt->rt6i_dev->ifindex)
1491 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1497 rt = net->ipv6.ip6_null_entry;
1498 BACKTRACK(net, &fl->fl6_src);
1502 read_unlock_bh(&table->tb6_lock);
/*
 * Build an ip6rd_flowi for a received redirect (dest/src/gateway as
 * seen on @dev) and resolve it through the policy machinery with
 * __ip6_route_redirect as the lookup function.
 */
1507 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1508 struct in6_addr *src,
1509 struct in6_addr *gateway,
1510 struct net_device *dev)
1512 int flags = RT6_LOOKUP_F_HAS_SADDR;
1513 struct net *net = dev_net(dev);
1514 struct ip6rd_flowi rdfl = {
1516 .oif = dev->ifindex,
1522 ipv6_addr_copy(&rdfl.gateway, gateway);
1524 if (rt6_need_strict(dest))
1525 flags |= RT6_LOOKUP_F_IFACE;
1527 return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1528 flags, __ip6_route_redirect);
/*
 * Act on an accepted ICMPv6 Redirect: verify the sender was our next
 * hop for @dest, update the neighbour cache with the new link-layer
 * address, then install an RTF_DYNAMIC|RTF_CACHE host route through
 * the new neighbour, notify netevent listeners, and retire the old
 * cache entry.  Several error gotos are on elided lines.
 */
1531 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1532 struct in6_addr *saddr,
1533 struct neighbour *neigh, u8 *lladdr, int on_link)
1535 struct rt6_info *rt, *nrt = NULL;
1536 struct netevent_redirect netevent;
1537 struct net *net = dev_net(neigh->dev);
1539 rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1541 if (rt == net->ipv6.ip6_null_entry) {
1542 if (net_ratelimit())
1543 printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1544 "for redirect target\n");
1549 * We have finally decided to accept it.
1552 neigh_update(neigh, lladdr, NUD_STALE,
1553 NEIGH_UPDATE_F_WEAK_OVERRIDE|
1554 NEIGH_UPDATE_F_OVERRIDE|
1555 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1556 NEIGH_UPDATE_F_ISROUTER))
1560 * Redirect received -> path was valid.
1561 * Look, redirects are sent only in response to data packets,
1562 * so that this nexthop apparently is reachable. --ANK
1564 dst_confirm(&rt->dst);
1566 /* Duplicate redirect: silently ignore. */
1567 if (neigh == rt->dst.neighbour)
1570 nrt = ip6_rt_copy(rt);
1574 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* On-link redirect: the target itself is the next hop. */
1576 nrt->rt6i_flags &= ~RTF_GATEWAY;
1578 ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1579 nrt->rt6i_dst.plen = 128;
1580 nrt->dst.flags |= DST_HOST;
1582 ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1583 nrt->rt6i_nexthop = neigh_clone(neigh);
1585 if (ip6_ins_rt(nrt))
1588 netevent.old = &rt->dst;
1589 netevent.new = &nrt->dst;
1590 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
/* The superseded cache entry is deleted on elided lines. */
1592 if (rt->rt6i_flags&RTF_CACHE) {
1598 dst_release(&rt->dst);
1602 * Handle ICMP "packet too big" messages
1603 * i.e. Path MTU discovery
/*
 * rt6_do_pmtu_disc - apply an ICMPv6 "Packet Too Big" report to one route.
 *
 * Looks up the route toward @daddr (restricted to @ifindex when non-zero)
 * and, if the reported @pmtu is smaller than the current path MTU, lowers
 * the MTU.  Values below IPV6_MIN_MTU are clamped per RFC 2460 and
 * RTAX_FEATURE_ALLFRAG is set so a fragment header is always emitted.
 * Host (RTF_CACHE) routes are updated in place; otherwise a cloned/COWed
 * host route carries the reduced MTU, expiring after
 * sysctl ip6_rt_mtu_expires so PMTU increases can be rediscovered.
 */
1606 static void rt6_do_pmtu_disc(struct in6_addr *daddr, struct in6_addr *saddr,
1607 struct net *net, u32 pmtu, int ifindex)
1609 struct rt6_info *rt, *nrt;
1612 rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
1616 if (rt6_check_expired(rt)) {
/* Reported MTU not smaller than what we already use: nothing to do. */
1621 if (pmtu >= dst_mtu(&rt->dst))
1624 if (pmtu < IPV6_MIN_MTU) {
1626 * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1627 * MTU (1280) and a fragment header should always be included
1628 * after a node receiving Too Big message reporting PMTU is
1629 * less than the IPv6 Minimum Link MTU.
1631 pmtu = IPV6_MIN_MTU;
1635 /* New mtu received -> path was valid.
1636 They are sent only in response to data packets,
1637 so that this nexthop apparently is reachable. --ANK
1639 dst_confirm(&rt->dst);
1641 /* Host route. If it is static, it would be better
1642 not to override it, but add new one, so that
1643 when cache entry will expire old pmtu
1644 would return automatically.
1646 if (rt->rt6i_flags & RTF_CACHE) {
1647 dst_metric_set(&rt->dst, RTAX_MTU, pmtu);
1649 u32 features = dst_metric(&rt->dst, RTAX_FEATURES);
1650 features |= RTAX_FEATURE_ALLFRAG;
1651 dst_metric_set(&rt->dst, RTAX_FEATURES, features);
1653 dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1654 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1659 Two cases are possible:
1660 1. It is connected route. Action: COW
1661 2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1663 if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1664 nrt = rt6_alloc_cow(rt, daddr, saddr);
1666 nrt = rt6_alloc_clone(rt, daddr);
1669 dst_metric_set(&nrt->dst, RTAX_MTU, pmtu);
1671 u32 features = dst_metric(&nrt->dst, RTAX_FEATURES);
1672 features |= RTAX_FEATURE_ALLFRAG;
1673 dst_metric_set(&nrt->dst, RTAX_FEATURES, features);
1676 /* According to RFC 1981, detecting PMTU increase shouldn't be
1677 * happened within 5 mins, the recommended timer is 10 mins.
1678 * Here this route expiration time is set to ip6_rt_mtu_expires
1679 * which is 10 mins. After 10 mins the decreased pmtu is expired
1680 * and detecting PMTU increase will be automatically happened.
1682 dst_set_expires(&nrt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1683 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1688 dst_release(&rt->dst);
/*
 * rt6_pmtu_discovery - entry point for ICMPv6 Packet Too Big handling.
 *
 * Applies the reported @pmtu twice: once unrestricted (updates the route
 * the stack would normally choose toward @daddr) and once restricted to
 * the receiving @dev, covering packets forced out of that interface
 * (SO_BINDTODEVICE and similar) — see the comment below.
 */
1691 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1692 struct net_device *dev, u32 pmtu)
1694 struct net *net = dev_net(dev);
1697 * RFC 1981 states that a node "MUST reduce the size of the packets it
1698 * is sending along the path" that caused the Packet Too Big message.
1699 * Since it's not possible in the general case to determine which
1700 * interface was used to send the original packet, we update the MTU
1701 * on the interface that will be used to send future packets. We also
1702 * update the MTU on the interface that received the Packet Too Big in
1703 * case the original packet was forced out that interface with
1704 * SO_BINDTODEVICE or similar. This is the next best thing to the
1705 * correct behaviour, which would be to update the MTU on all
1708 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, 0);
1709 rt6_do_pmtu_disc(daddr, saddr, net, pmtu, dev->ifindex);
1713 * Misc support functions
/*
 * ip6_rt_copy - allocate a new rt6_info duplicating @ort.
 *
 * Copies dst callbacks, metrics, device (with a reference), inet6_dev
 * (with a reference), gateway, destination/source keys and table.  The
 * copy starts life unexpired (RTF_EXPIRES cleared, rt6i_expires = 0) and
 * with metric 0; callers (e.g. rt6_redirect) then adjust flags/nexthop.
 * NOTE(review): allocation-failure handling lives on elided lines —
 * presumably returns NULL when ip6_dst_alloc() fails.
 */
1716 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1718 struct net *net = dev_net(ort->rt6i_dev);
1719 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
1722 rt->dst.input = ort->dst.input;
1723 rt->dst.output = ort->dst.output;
1725 dst_copy_metrics(&rt->dst, &ort->dst);
1726 rt->dst.error = ort->dst.error;
1727 rt->dst.dev = ort->dst.dev;
1729 dev_hold(rt->dst.dev);
1730 rt->rt6i_idev = ort->rt6i_idev;
1732 in6_dev_hold(rt->rt6i_idev);
1733 rt->dst.lastuse = jiffies;
1734 rt->rt6i_expires = 0;
1736 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1737 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1738 rt->rt6i_metric = 0;
1740 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1741 #ifdef CONFIG_IPV6_SUBTREES
1742 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1744 rt->rt6i_table = ort->rt6i_table;
1749 #ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * rt6_get_route_info - find an RA route-information route in RT6_TABLE_INFO.
 *
 * Locates the fib node for @prefix/@prefixlen and scans its leaf chain for
 * an entry on @ifindex carrying both RTF_ROUTEINFO and RTF_GATEWAY with a
 * matching @gwaddr.  Returns the entry or NULL.  The elided lines
 * presumably take a reference on the match before the table is unlocked.
 */
1750 static struct rt6_info *rt6_get_route_info(struct net *net,
1751 struct in6_addr *prefix, int prefixlen,
1752 struct in6_addr *gwaddr, int ifindex)
1754 struct fib6_node *fn;
1755 struct rt6_info *rt = NULL;
1756 struct fib6_table *table;
1758 table = fib6_get_table(net, RT6_TABLE_INFO);
1762 write_lock_bh(&table->tb6_lock);
1763 fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1767 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1768 if (rt->rt6i_dev->ifindex != ifindex)
1770 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1772 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1778 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_route_info - install a route learned from an RA Route
 * Information option into RT6_TABLE_INFO.
 *
 * Builds a fib6_config with RTF_ROUTEINFO|RTF_GATEWAY|RTF_ADDRCONF and
 * the router preference @pref, adds it via ip6_route_add(), then returns
 * the freshly-inserted entry by re-looking it up (the add path does not
 * hand back the rt6_info directly).
 */
1782 static struct rt6_info *rt6_add_route_info(struct net *net,
1783 struct in6_addr *prefix, int prefixlen,
1784 struct in6_addr *gwaddr, int ifindex,
1787 struct fib6_config cfg = {
1788 .fc_table = RT6_TABLE_INFO,
1789 .fc_metric = IP6_RT_PRIO_USER,
1790 .fc_ifindex = ifindex,
1791 .fc_dst_len = prefixlen,
1792 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1793 RTF_UP | RTF_PREF(pref),
1795 .fc_nlinfo.nlh = NULL,
1796 .fc_nlinfo.nl_net = net,
1799 ipv6_addr_copy(&cfg.fc_dst, prefix);
1800 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1802 /* We should treat it as a default route if prefix length is 0. */
1804 cfg.fc_flags |= RTF_DEFAULT;
1806 ip6_route_add(&cfg);
1808 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
/*
 * rt6_get_dflt_router - find the RA default route via @addr on @dev.
 *
 * Walks the leaf chain of RT6_TABLE_DFLT's root node for an entry that is
 * both RTF_ADDRCONF and RTF_DEFAULT, bound to @dev, with gateway @addr.
 * Returns the entry or NULL; reference taking happens on elided lines.
 */
1812 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1814 struct rt6_info *rt;
1815 struct fib6_table *table;
1817 table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1821 write_lock_bh(&table->tb6_lock);
1822 for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1823 if (dev == rt->rt6i_dev &&
1824 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1825 ipv6_addr_equal(&rt->rt6i_gateway, addr))
1830 write_unlock_bh(&table->tb6_lock);
/*
 * rt6_add_dflt_router - install an RA-learned default router.
 *
 * Adds a gateway default route (RTF_ADDRCONF|RTF_DEFAULT|RTF_EXPIRES,
 * preference @pref) via @gwaddr on @dev into RT6_TABLE_DFLT, then returns
 * the inserted entry by re-looking it up with rt6_get_dflt_router().
 */
1834 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1835 struct net_device *dev,
1838 struct fib6_config cfg = {
1839 .fc_table = RT6_TABLE_DFLT,
1840 .fc_metric = IP6_RT_PRIO_USER,
1841 .fc_ifindex = dev->ifindex,
1842 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1843 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1845 .fc_nlinfo.nlh = NULL,
1846 .fc_nlinfo.nl_net = dev_net(dev),
1849 ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1851 ip6_route_add(&cfg);
1853 return rt6_get_dflt_router(gwaddr, dev);
/*
 * rt6_purge_dflt_routers - delete every RA default route in @net.
 *
 * Scans RT6_TABLE_DFLT under the read lock; for each RTF_DEFAULT or
 * RTF_ADDRCONF entry the lock is dropped (the elided lines presumably
 * take a reference, delete the route and restart the scan, since the
 * chain cannot be walked safely across the deletion).
 */
1856 void rt6_purge_dflt_routers(struct net *net)
1858 struct rt6_info *rt;
1859 struct fib6_table *table;
1861 /* NOTE: Keep consistent with rt6_get_dflt_router */
1862 table = fib6_get_table(net, RT6_TABLE_DFLT);
1867 read_lock_bh(&table->tb6_lock);
1868 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
1869 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1871 read_unlock_bh(&table->tb6_lock);
1876 read_unlock_bh(&table->tb6_lock);
/*
 * rtmsg_to_fib6_config - translate a legacy ioctl in6_rtmsg into the
 * fib6_config structure used by ip6_route_add()/ip6_route_del().
 *
 * Zeroes @cfg, targets RT6_TABLE_MAIN, and copies interface, metric,
 * expiry, prefix lengths, flags and the three addresses verbatim.
 */
1879 static void rtmsg_to_fib6_config(struct net *net,
1880 struct in6_rtmsg *rtmsg,
1881 struct fib6_config *cfg)
1883 memset(cfg, 0, sizeof(*cfg));
1885 cfg->fc_table = RT6_TABLE_MAIN;
1886 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1887 cfg->fc_metric = rtmsg->rtmsg_metric;
1888 cfg->fc_expires = rtmsg->rtmsg_info;
1889 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1890 cfg->fc_src_len = rtmsg->rtmsg_src_len;
1891 cfg->fc_flags = rtmsg->rtmsg_flags;
1893 cfg->fc_nlinfo.nl_net = net;
1895 ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1896 ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1897 ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
/*
 * ipv6_route_ioctl - SIOCADDRT/SIOCDELRT handler for IPv6.
 *
 * Requires CAP_NET_ADMIN, copies the user's in6_rtmsg, converts it with
 * rtmsg_to_fib6_config() and dispatches to ip6_route_add()/del().
 * Locking (rtnl) and the error/default paths are on elided lines.
 */
1900 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1902 struct fib6_config cfg;
1903 struct in6_rtmsg rtmsg;
1907 case SIOCADDRT: /* Add a route */
1908 case SIOCDELRT: /* Delete a route */
1909 if (!capable(CAP_NET_ADMIN))
1911 err = copy_from_user(&rtmsg, arg,
1912 sizeof(struct in6_rtmsg));
1916 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1921 err = ip6_route_add(&cfg);
1924 err = ip6_route_del(&cfg);
1938 * Drop the packet on the floor
/*
 * ip6_pkt_drop - common drop path for null/prohibit routes.
 *
 * Bumps the appropriate per-netns MIB counter (INADDRERRORS for packets
 * arriving with an unspecified destination, otherwise the supplied
 * no-route counter) and answers with an ICMPv6 Destination Unreachable
 * of the given @code.  The kfree_skb/return lines are elided here.
 */
1941 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1944 struct dst_entry *dst = skb_dst(skb);
1945 switch (ipstats_mib_noroutes) {
1946 case IPSTATS_MIB_INNOROUTES:
1947 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1948 if (type == IPV6_ADDR_ANY) {
1949 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1950 IPSTATS_MIB_INADDRERRORS);
1954 case IPSTATS_MIB_OUTNOROUTES:
1955 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
1956 ipstats_mib_noroutes);
1959 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input handler for the null route: drop with "no route" (input side). */
1964 static int ip6_pkt_discard(struct sk_buff *skb)
1966 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for the null route: drop with "no route" (output side). */
1969 static int ip6_pkt_discard_out(struct sk_buff *skb)
1971 skb->dev = skb_dst(skb)->dev;
1972 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1975 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* dst.input handler for the prohibit route: drop with "administratively
 * prohibited" (input side). */
1977 static int ip6_pkt_prohibit(struct sk_buff *skb)
1979 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output handler for the prohibit route: drop with "administratively
 * prohibited" (output side). */
1982 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1984 skb->dev = skb_dst(skb)->dev;
1985 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1991 * Allocate a dst for local (unicast / anycast) address.
/*
 * addrconf_dst_alloc - build the host (/128) route for a local unicast or
 * anycast address.
 *
 * Allocates an rt6_info bound to the loopback device, resolves the
 * neighbour entry, and places the route in RT6_TABLE_LOCAL with
 * RTF_UP|RTF_NONEXTHOP (plus RTF_ANYCAST or RTF_LOCAL depending on an
 * elided condition — presumably the caller's anycast flag).  Returns the
 * route with one dst reference held, or an ERR_PTR on allocation or
 * neighbour failure.
 */
1994 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1995 const struct in6_addr *addr,
1998 struct net *net = dev_net(idev->dev);
1999 struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops);
2000 struct neighbour *neigh;
2003 if (net_ratelimit())
2004 pr_warning("IPv6: Maximum number of routes reached,"
2005 " consider increasing route/max_size.\n");
2006 return ERR_PTR(-ENOMEM);
2009 dev_hold(net->loopback_dev);
2012 rt->dst.flags = DST_HOST;
2013 rt->dst.input = ip6_input;
2014 rt->dst.output = ip6_output;
2015 rt->rt6i_dev = net->loopback_dev;
2016 rt->rt6i_idev = idev;
/* -1 means "use the per-device default hop limit". */
2017 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, -1);
2018 rt->dst.obsolete = -1;
2020 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2022 rt->rt6i_flags |= RTF_ANYCAST;
2024 rt->rt6i_flags |= RTF_LOCAL;
2025 neigh = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
2026 if (IS_ERR(neigh)) {
2029 /* We are casting this because that is the return
2030 * value type. But an errno encoded pointer is the
2031 * same regardless of the underlying pointer type,
2032 * and that's what we are returning. So this is OK.
2034 return (struct rt6_info *) neigh;
2036 rt->rt6i_nexthop = neigh;
2038 ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
2039 rt->rt6i_dst.plen = 128;
2040 rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
2042 atomic_set(&rt->dst.__refcnt, 1);
/* Argument bundle for fib6_ifdown(): the device going down (NULL means
 * "all devices") plus its network namespace (field elided in this view). */
2047 struct arg_dev_net {
2048 struct net_device *dev;
/*
 * fib6_ifdown - fib6_clean_all callback: select routes to delete when a
 * device goes down.  Matches routes on adn->dev (or every route when the
 * dev is NULL), but never the per-netns null entry.  A non-zero return
 * tells the tree walker to remove the route.
 */
2052 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2054 const struct arg_dev_net *adn = arg;
2055 const struct net_device *dev = adn->dev;
2057 if ((rt->rt6i_dev == dev || dev == NULL) &&
2058 rt != adn->net->ipv6.ip6_null_entry;
2059 RT6_TRACE("deleted by ifdown %p\n", rt);
/*
 * rt6_ifdown - purge all routes (FIB and ICMP-cached) that reference
 * @dev in namespace @net, using fib6_ifdown as the selection predicate.
 */
2065 void rt6_ifdown(struct net *net, struct net_device *dev)
2067 struct arg_dev_net adn = {
2072 fib6_clean_all(net, fib6_ifdown, 0, &adn);
2073 icmp6_clean_all(fib6_ifdown, &adn);
/* Argument bundle for rt6_mtu_change_route(): the device whose MTU
 * changed (the new mtu field is elided in this view). */
2076 struct rt6_mtu_change_arg
2078 struct net_device *dev;
/*
 * rt6_mtu_change_route - fib6_clean_all callback applying a device MTU
 * change to one route.
 *
 * Updates the route's RTAX_MTU when the route uses the changed device,
 * the metric is not administratively locked, and either the route's PMTU
 * shrinks to the new device MTU or it can grow because the old device
 * MTU was the path bottleneck (rt PMTU == idev->cnf.mtu6).
 */
2082 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2084 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2085 struct inet6_dev *idev;
2087 /* In IPv6 pmtu discovery is not optional,
2088 so that RTAX_MTU lock cannot disable it.
2089 We still use this lock to block changes
2090 caused by addrconf/ndisc.
2093 idev = __in6_dev_get(arg->dev);
2097 /* For administrative MTU increase, there is no way to discover
2098 IPv6 PMTU increase, so PMTU increase should be updated here.
2099 Since RFC 1981 doesn't include administrative MTU increase
2100 update PMTU increase is a MUST. (i.e. jumbo frame)
2103 If new MTU is less than route PMTU, this new MTU will be the
2104 lowest MTU in the path, update the route PMTU to reflect PMTU
2105 decreases; if new MTU is greater than route PMTU, and the
2106 old MTU is the lowest MTU in the path, update the route PMTU
2107 to reflect the increase. In this case if the other nodes' MTU
2108 also have the lowest MTU, TOO BIG MESSAGE will be lead to
2111 if (rt->rt6i_dev == arg->dev &&
2112 !dst_metric_locked(&rt->dst, RTAX_MTU) &&
2113 (dst_mtu(&rt->dst) >= arg->mtu ||
2114 (dst_mtu(&rt->dst) < arg->mtu &&
2115 dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2116 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/*
 * rt6_mtu_change - propagate a device MTU change to every IPv6 route in
 * @dev's namespace via rt6_mtu_change_route().
 */
2121 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
2123 struct rt6_mtu_change_arg arg = {
2128 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
/* Netlink attribute validation policy for IPv6 RTM_* route messages:
 * a full in6_addr for the gateway, u32 interface/priority attributes,
 * and a nested block for metrics. */
2131 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2132 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2133 [RTA_OIF] = { .type = NLA_U32 },
2134 [RTA_IIF] = { .type = NLA_U32 },
2135 [RTA_PRIORITY] = { .type = NLA_U32 },
2136 [RTA_METRICS] = { .type = NLA_NESTED },
/*
 * rtm_to_fib6_config - parse an RTM_NEWROUTE/RTM_DELROUTE netlink message
 * into a fib6_config.
 *
 * Validates attributes against rtm_ipv6_policy, copies header fields
 * (table, prefix lengths, protocol), maps RTN_UNREACHABLE/RTN_LOCAL to
 * RTF_REJECT/RTF_LOCAL, and fills addresses, interface, priority and
 * metrics from the present attributes.  Truncated address attributes
 * (shorter than the prefix length implies) are rejected.
 */
2139 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2140 struct fib6_config *cfg)
2143 struct nlattr *tb[RTA_MAX+1];
2146 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2151 rtm = nlmsg_data(nlh);
2152 memset(cfg, 0, sizeof(*cfg));
2154 cfg->fc_table = rtm->rtm_table;
2155 cfg->fc_dst_len = rtm->rtm_dst_len;
2156 cfg->fc_src_len = rtm->rtm_src_len;
2157 cfg->fc_flags = RTF_UP;
2158 cfg->fc_protocol = rtm->rtm_protocol;
2160 if (rtm->rtm_type == RTN_UNREACHABLE)
2161 cfg->fc_flags |= RTF_REJECT;
2163 if (rtm->rtm_type == RTN_LOCAL)
2164 cfg->fc_flags |= RTF_LOCAL;
2166 cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2167 cfg->fc_nlinfo.nlh = nlh;
2168 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2170 if (tb[RTA_GATEWAY]) {
2171 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2172 cfg->fc_flags |= RTF_GATEWAY;
/* Copy only the bytes covered by the prefix length (rounded up). */
2176 int plen = (rtm->rtm_dst_len + 7) >> 3;
2178 if (nla_len(tb[RTA_DST]) < plen)
2181 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2185 int plen = (rtm->rtm_src_len + 7) >> 3;
2187 if (nla_len(tb[RTA_SRC]) < plen)
2190 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2194 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2196 if (tb[RTA_PRIORITY])
2197 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2199 if (tb[RTA_METRICS]) {
2200 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2201 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2205 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
/* RTM_DELROUTE handler: parse the netlink message and delete the route. */
2212 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2214 struct fib6_config cfg;
2217 err = rtm_to_fib6_config(skb, nlh, &cfg);
2221 return ip6_route_del(&cfg);
/* RTM_NEWROUTE handler: parse the netlink message and add the route. */
2224 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2226 struct fib6_config cfg;
2229 err = rtm_to_fib6_config(skb, nlh, &cfg);
2233 return ip6_route_add(&cfg);
/*
 * rt6_nlmsg_size - worst-case netlink message size for one route dump
 * entry; must stay in sync with the attributes rt6_fill_node() emits
 * (WARN_ON(-EMSGSIZE) in inet6_rt_notify catches drift).
 */
2236 static inline size_t rt6_nlmsg_size(void)
2238 return NLMSG_ALIGN(sizeof(struct rtmsg))
2239 + nla_total_size(16) /* RTA_SRC */
2240 + nla_total_size(16) /* RTA_DST */
2241 + nla_total_size(16) /* RTA_GATEWAY */
2242 + nla_total_size(16) /* RTA_PREFSRC */
2243 + nla_total_size(4) /* RTA_TABLE */
2244 + nla_total_size(4) /* RTA_IIF */
2245 + nla_total_size(4) /* RTA_OIF */
2246 + nla_total_size(4) /* RTA_PRIORITY */
2247 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2248 + nla_total_size(sizeof(struct rta_cacheinfo));
/*
 * rt6_fill_node - serialize one rt6_info into a netlink RTM message.
 *
 * @dst/@src: when non-NULL (a specific-route query) the /128 addresses
 *            are reported instead of the route's own prefix.
 * @iif:      input interface for RTA_IIF; when 0, RTA_PREFSRC is derived
 *            from a source-address selection for @dst instead.
 * @prefix:   when set, only RTF_PREFIX_RT routes are emitted (dump filter);
 *            other routes return success without adding anything.
 * @nowait:   passed through to ip6mr_get_route() for multicast routes.
 *
 * Returns the result of nlmsg_end(), or a negative error after
 * nlmsg_cancel() when the skb runs out of room.
 */
2251 static int rt6_fill_node(struct net *net,
2252 struct sk_buff *skb, struct rt6_info *rt,
2253 struct in6_addr *dst, struct in6_addr *src,
2254 int iif, int type, u32 pid, u32 seq,
2255 int prefix, int nowait, unsigned int flags)
2258 struct nlmsghdr *nlh;
2262 if (prefix) { /* user wants prefix routes only */
2263 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2264 /* success since this is not a prefix route */
2269 nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2273 rtm = nlmsg_data(nlh);
2274 rtm->rtm_family = AF_INET6;
2275 rtm->rtm_dst_len = rt->rt6i_dst.plen;
2276 rtm->rtm_src_len = rt->rt6i_src.plen;
2279 table = rt->rt6i_table->tb6_id;
2281 table = RT6_TABLE_UNSPEC;
2282 rtm->rtm_table = table;
2283 NLA_PUT_U32(skb, RTA_TABLE, table);
/* Map route flags / device to the RTN_* route type. */
2284 if (rt->rt6i_flags&RTF_REJECT)
2285 rtm->rtm_type = RTN_UNREACHABLE;
2286 else if (rt->rt6i_flags&RTF_LOCAL)
2287 rtm->rtm_type = RTN_LOCAL;
2288 else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2289 rtm->rtm_type = RTN_LOCAL;
2291 rtm->rtm_type = RTN_UNICAST;
2293 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2294 rtm->rtm_protocol = rt->rt6i_protocol;
2295 if (rt->rt6i_flags&RTF_DYNAMIC)
2296 rtm->rtm_protocol = RTPROT_REDIRECT;
2297 else if (rt->rt6i_flags & RTF_ADDRCONF)
2298 rtm->rtm_protocol = RTPROT_KERNEL;
2299 else if (rt->rt6i_flags&RTF_DEFAULT)
2300 rtm->rtm_protocol = RTPROT_RA;
2302 if (rt->rt6i_flags&RTF_CACHE)
2303 rtm->rtm_flags |= RTM_F_CLONED;
2306 NLA_PUT(skb, RTA_DST, 16, dst);
2307 rtm->rtm_dst_len = 128;
2308 } else if (rtm->rtm_dst_len)
2309 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2310 #ifdef CONFIG_IPV6_SUBTREES
2312 NLA_PUT(skb, RTA_SRC, 16, src);
2313 rtm->rtm_src_len = 128;
2314 } else if (rtm->rtm_src_len)
2315 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2318 #ifdef CONFIG_IPV6_MROUTE
/* Multicast destinations are resolved through the IPv6 mroute code. */
2319 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2320 int err = ip6mr_get_route(net, skb, rtm, nowait);
2325 goto nla_put_failure;
2327 if (err == -EMSGSIZE)
2328 goto nla_put_failure;
2333 NLA_PUT_U32(skb, RTA_IIF, iif);
2335 struct inet6_dev *idev = ip6_dst_idev(&rt->dst);
2336 struct in6_addr saddr_buf;
2337 if (ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
2338 dst, 0, &saddr_buf) == 0)
2339 NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2342 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2343 goto nla_put_failure;
2345 if (rt->dst.neighbour)
2346 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->dst.neighbour->primary_key);
2349 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2351 NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
/* Remaining lifetime in jiffies, clamped into the cacheinfo's int range. */
2353 if (!(rt->rt6i_flags & RTF_EXPIRES))
2355 else if (rt->rt6i_expires - jiffies < INT_MAX)
2356 expires = rt->rt6i_expires - jiffies;
2360 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
2361 expires, rt->dst.error) < 0)
2362 goto nla_put_failure;
2364 return nlmsg_end(skb, nlh);
2367 nlmsg_cancel(skb, nlh);
/*
 * rt6_dump_route - per-route callback for RTM_GETROUTE dumps.
 *
 * Reads the RTM_F_PREFIX flag from the dump request (when the request
 * carries a full rtmsg header) so that rt6_fill_node() can restrict the
 * dump to prefix routes, then emits this route as an RTM_NEWROUTE
 * multipart message.
 */
2371 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2373 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2376 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2377 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2378 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2382 return rt6_fill_node(arg->net,
2383 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2384 NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2385 prefix, 0, NLM_F_MULTI);
/*
 * inet6_rtm_getroute - RTM_GETROUTE handler: resolve a single route query.
 *
 * Parses RTA_SRC/RTA_DST/RTA_IIF/RTA_OIF into a flow key (address
 * attributes must carry a full in6_addr), validates that any requested
 * input interface exists, performs the route lookup, renders the result
 * with rt6_fill_node() into a fresh skb and unicasts it back to the
 * requesting socket.
 */
2388 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2390 struct net *net = sock_net(in_skb->sk);
2391 struct nlattr *tb[RTA_MAX+1];
2392 struct rt6_info *rt;
2393 struct sk_buff *skb;
2398 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2403 memset(&fl, 0, sizeof(fl));
2406 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2409 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2413 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2416 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2420 iif = nla_get_u32(tb[RTA_IIF]);
2423 fl.oif = nla_get_u32(tb[RTA_OIF]);
2426 struct net_device *dev;
2427 dev = __dev_get_by_index(net, iif);
2434 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2440 /* Reserve room for dummy headers, this skb can pass
2441 through good chunk of routing engine.
2443 skb_reset_mac_header(skb);
2444 skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2446 rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
/* Attach the dst so rt6_fill_node's failure path frees it with the skb. */
2447 skb_dst_set(skb, &rt->dst);
2449 err = rt6_fill_node(net, skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2450 RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2451 nlh->nlmsg_seq, 0, 0, 0);
2457 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
/*
 * inet6_rt_notify - broadcast a route change (@event is RTM_NEWROUTE or
 * RTM_DELROUTE) to RTNLGRP_IPV6_ROUTE listeners.
 *
 * Allocates a message sized by rt6_nlmsg_size(); an -EMSGSIZE from
 * rt6_fill_node() indicates that size calculation has drifted, hence the
 * WARN_ON.  On any failure listeners are told via rtnl_set_sk_err().
 */
2462 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2464 struct sk_buff *skb;
2465 struct net *net = info->nl_net;
2470 seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2472 skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2476 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2477 event, info->pid, seq, 0, 0, 0);
2479 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2480 WARN_ON(err == -EMSGSIZE);
2484 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2485 info->nlh, gfp_any());
2489 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/*
 * ip6_route_dev_notify - netdevice notifier: when a namespace's loopback
 * device registers, bind the per-netns special routes (null, and with
 * multiple tables also prohibit/blackhole) to it and take inet6_dev refs.
 */
2492 static int ip6_route_dev_notify(struct notifier_block *this,
2493 unsigned long event, void *data)
2495 struct net_device *dev = (struct net_device *)data;
2496 struct net *net = dev_net(dev);
2498 if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2499 net->ipv6.ip6_null_entry->dst.dev = dev;
2500 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2501 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2502 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2503 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2504 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2505 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2516 #ifdef CONFIG_PROC_FS
/*
 * rt6_info_route - emit one /proc/net/ipv6_route line for @rt:
 * dst/plen, src/plen (zeros unless CONFIG_IPV6_SUBTREES), next hop,
 * metric, refcount, use count, flags and device name.
 */
2527 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2529 struct seq_file *m = p_arg;
2531 seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
2533 #ifdef CONFIG_IPV6_SUBTREES
2534 seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
2536 seq_puts(m, "00000000000000000000000000000000 00 ");
2539 if (rt->rt6i_nexthop) {
2540 seq_printf(m, "%pi6", rt->rt6i_nexthop->primary_key);
2542 seq_puts(m, "00000000000000000000000000000000");
2544 seq_printf(m, " %08x %08x %08x %08x %8s\n",
2545 rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
2546 rt->dst.__use, rt->rt6i_flags,
2547 rt->rt6i_dev ? rt->rt6i_dev->name : "");
/* seq_file show: walk the whole FIB, printing each route via rt6_info_route. */
2551 static int ipv6_route_show(struct seq_file *m, void *v)
2553 struct net *net = (struct net *)m->private;
2554 fib6_clean_all(net, rt6_info_route, 0, m);
2558 static int ipv6_route_open(struct inode *inode, struct file *file)
2560 return single_open_net(inode, file, ipv6_route_show);
/* File operations for /proc/net/ipv6_route. */
2563 static const struct file_operations ipv6_route_proc_fops = {
2564 .owner = THIS_MODULE,
2565 .open = ipv6_route_open,
2567 .llseek = seq_lseek,
2568 .release = single_release_net,
/* seq_file show for /proc/net/rt6_stats: per-netns FIB counters plus the
 * current dst entry count. */
2571 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2573 struct net *net = (struct net *)seq->private;
2574 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2575 net->ipv6.rt6_stats->fib_nodes,
2576 net->ipv6.rt6_stats->fib_route_nodes,
2577 net->ipv6.rt6_stats->fib_rt_alloc,
2578 net->ipv6.rt6_stats->fib_rt_entries,
2579 net->ipv6.rt6_stats->fib_rt_cache,
2580 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2581 net->ipv6.rt6_stats->fib_discarded_routes);
/* open handler for /proc/net/rt6_stats. */
2586 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2588 return single_open_net(inode, file, rt6_stats_seq_show);
/* File operations for /proc/net/rt6_stats. */
2591 static const struct file_operations rt6_stats_seq_fops = {
2592 .owner = THIS_MODULE,
2593 .open = rt6_stats_seq_open,
2595 .llseek = seq_lseek,
2596 .release = single_release_net,
2598 #endif /* CONFIG_PROC_FS */
2600 #ifdef CONFIG_SYSCTL
/*
 * ipv6_sysctl_rtcache_flush - handler for net.ipv6.route.flush.
 *
 * Writing a value runs the fib6 garbage collector: a positive value is
 * the minimum age (in jiffies via flush_delay); <= 0 forces collection of
 * everything (~0UL age threshold).
 */
2603 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
2604 void __user *buffer, size_t *lenp, loff_t *ppos)
2606 struct net *net = current->nsproxy->net_ns;
2607 int delay = net->ipv6.sysctl.flush_delay;
2609 proc_dointvec(ctl, write, buffer, lenp, ppos);
2610 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
/*
 * Template for the per-netns net.ipv6.route sysctl directory.  The .data
 * pointers reference init_net here; ipv6_route_sysctl_init() kmemdup's
 * this table and rewrites each slot (by index — keep the order in sync)
 * to the corresponding per-netns field.
 */
2616 ctl_table ipv6_route_table_template[] = {
2618 .procname = "flush",
2619 .data = &init_net.ipv6.sysctl.flush_delay,
2620 .maxlen = sizeof(int),
2622 .proc_handler = ipv6_sysctl_rtcache_flush
2625 .procname = "gc_thresh",
2626 .data = &ip6_dst_ops_template.gc_thresh,
2627 .maxlen = sizeof(int),
2629 .proc_handler = proc_dointvec,
2632 .procname = "max_size",
2633 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
2634 .maxlen = sizeof(int),
2636 .proc_handler = proc_dointvec,
2639 .procname = "gc_min_interval",
2640 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2641 .maxlen = sizeof(int),
2643 .proc_handler = proc_dointvec_jiffies,
2646 .procname = "gc_timeout",
2647 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2648 .maxlen = sizeof(int),
2650 .proc_handler = proc_dointvec_jiffies,
2653 .procname = "gc_interval",
2654 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2655 .maxlen = sizeof(int),
2657 .proc_handler = proc_dointvec_jiffies,
2660 .procname = "gc_elasticity",
2661 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2662 .maxlen = sizeof(int),
2664 .proc_handler = proc_dointvec,
2667 .procname = "mtu_expires",
2668 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2669 .maxlen = sizeof(int),
2671 .proc_handler = proc_dointvec_jiffies,
2674 .procname = "min_adv_mss",
2675 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2676 .maxlen = sizeof(int),
2678 .proc_handler = proc_dointvec,
2681 .procname = "gc_min_interval_ms",
2682 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2683 .maxlen = sizeof(int),
2685 .proc_handler = proc_dointvec_ms_jiffies,
/*
 * ipv6_route_sysctl_init - clone the sysctl template for namespace @net,
 * repointing every .data slot (indexed to match the template order) at
 * the namespace's own fields.  Returns the copy, or NULL on elided
 * allocation-failure lines.
 */
2690 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
2692 struct ctl_table *table;
2694 table = kmemdup(ipv6_route_table_template,
2695 sizeof(ipv6_route_table_template),
2699 table[0].data = &net->ipv6.sysctl.flush_delay;
2700 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
2701 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2702 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2703 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2704 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2705 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2706 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2707 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2708 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
/*
 * ip6_route_net_init - per-namespace setup for the IPv6 routing layer.
 *
 * Clones the dst_ops template, initializes its entry counter, duplicates
 * the special route templates (null, and with multiple tables also
 * prohibit/blackhole) with self-referencing dst.path and template
 * metrics, seeds the routing sysctl defaults, and creates the proc
 * entries.  Failure unwinds in reverse order through the labels below.
 */
2715 static int __net_init ip6_route_net_init(struct net *net)
2719 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
2720 sizeof(net->ipv6.ip6_dst_ops));
2722 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
2723 goto out_ip6_dst_ops;
2725 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2726 sizeof(*net->ipv6.ip6_null_entry),
2728 if (!net->ipv6.ip6_null_entry)
2729 goto out_ip6_dst_entries;
2730 net->ipv6.ip6_null_entry->dst.path =
2731 (struct dst_entry *)net->ipv6.ip6_null_entry;
2732 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2733 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
2734 ip6_template_metrics, true);
2736 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2737 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2738 sizeof(*net->ipv6.ip6_prohibit_entry),
2740 if (!net->ipv6.ip6_prohibit_entry)
2741 goto out_ip6_null_entry;
2742 net->ipv6.ip6_prohibit_entry->dst.path =
2743 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2744 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2745 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
2746 ip6_template_metrics, true);
2748 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2749 sizeof(*net->ipv6.ip6_blk_hole_entry),
2751 if (!net->ipv6.ip6_blk_hole_entry)
2752 goto out_ip6_prohibit_entry;
2753 net->ipv6.ip6_blk_hole_entry->dst.path =
2754 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2755 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
2756 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
2757 ip6_template_metrics, true);
/* Routing sysctl defaults for this namespace. */
2760 net->ipv6.sysctl.flush_delay = 0;
2761 net->ipv6.sysctl.ip6_rt_max_size = 4096;
2762 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
2763 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
2764 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
2765 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
2766 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
2767 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
2769 #ifdef CONFIG_PROC_FS
2770 proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2771 proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2773 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwinding, newest allocation first. */
2779 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2780 out_ip6_prohibit_entry:
2781 kfree(net->ipv6.ip6_prohibit_entry);
2783 kfree(net->ipv6.ip6_null_entry);
2785 out_ip6_dst_entries:
2786 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
2791 static void __net_exit ip6_route_net_exit(struct net *net)
2793 #ifdef CONFIG_PROC_FS
2794 proc_net_remove(net, "ipv6_route");
2795 proc_net_remove(net, "rt6_stats");
2797 kfree(net->ipv6.ip6_null_entry);
2798 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2799 kfree(net->ipv6.ip6_prohibit_entry);
2800 kfree(net->ipv6.ip6_blk_hole_entry);
2802 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-network-namespace init/exit hooks for the IPv6 routing layer. */
2805 static struct pernet_operations ip6_route_net_ops = {
2806 .init = ip6_route_net_init,
2807 .exit = ip6_route_net_exit,
/* Netdevice notifier used to bind per-netns special routes to loopback. */
2810 static struct notifier_block ip6_route_dev_notifier = {
2811 .notifier_call = ip6_route_dev_notify,
/*
 * ip6_route_init - module init for the IPv6 routing subsystem.
 *
 * Creates the rt6_info slab cache, initializes the blackhole dst counter,
 * registers the pernet operations, patches init_net's special routes to
 * reference the already-registered loopback device, then brings up the
 * fib6 core and policy rules, registers the RTM_{NEW,DEL,GET}ROUTE
 * netlink handlers and the device notifier.  Errors unwind in reverse
 * through the labels at the bottom.
 */
2815 int __init ip6_route_init(void)
2820 ip6_dst_ops_template.kmem_cachep =
2821 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2822 SLAB_HWCACHE_ALIGN, NULL);
2823 if (!ip6_dst_ops_template.kmem_cachep)
2826 ret = dst_entries_init(&ip6_dst_blackhole_ops);
2828 goto out_kmem_cache;
2830 ret = register_pernet_subsys(&ip6_route_net_ops);
2832 goto out_dst_entries;
2834 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
2836 /* Registering of the loopback is done before this portion of code,
2837 * the loopback reference in rt6_info will not be taken, do it
2838 * manually for init_net */
2839 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
2840 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2841 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2842 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
2843 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2844 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
2845 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2849 goto out_register_subsys;
2855 ret = fib6_rules_init();
2860 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2861 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2862 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2863 goto fib6_rules_init;
2865 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2867 goto fib6_rules_init;
/* Error unwinding, newest registration first. */
2873 fib6_rules_cleanup();
2878 out_register_subsys:
2879 unregister_pernet_subsys(&ip6_route_net_ops);
2881 dst_entries_destroy(&ip6_dst_blackhole_ops);
2883 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
/*
 * ip6_route_cleanup - module exit: tear down everything ip6_route_init
 * registered, in reverse order (notifier, fib rules, pernet ops,
 * blackhole dst counter, slab cache).
 */
2887 void ip6_route_cleanup(void)
2889 unregister_netdevice_notifier(&ip6_route_dev_notifier);
2890 fib6_rules_cleanup();
2893 unregister_pernet_subsys(&ip6_route_net_ops);
2894 dst_entries_destroy(&ip6_dst_blackhole_ops);
2895 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);