]> Pileus Git - ~andy/linux/blob - net/ipv4/route.c
baa9b289d7ab4b0a3780872ed1d05d62b9293665
[~andy/linux] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
111
/* Keep only the IPTOS_RT_MASK bits and the RTO_ONLINK flag of a flow's TOS. */
112 #define RT_FL_TOS(oldflp4) \
113         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
114
115 #define IP_MAX_MTU      0xFFF0
116
/* Initial value of ip_rt_gc_timeout below, expressed in jiffies. */
117 #define RT_GC_TIMEOUT (300*HZ)
118
/*
 * Routing tunables.  Only some of them are consumed in the part of the
 * file reviewed here:
 *
 *   ip_rt_redirect_number  - ip_rt_send_redirect() stops sending once a
 *                            peer's rate_tokens reaches this value.
 *   ip_rt_redirect_load    - base delay between redirects; shifted left by
 *                            the peer's rate_tokens for each further one.
 *   ip_rt_redirect_silence - after this long without a redirected packet
 *                            the peer's rate_tokens are reset to zero.
 *   ip_rt_error_cost       - tokens ip_error() spends per ICMP error sent.
 *   ip_rt_error_burst      - cap on the tokens a peer may accumulate in
 *                            ip_error().
 *   ip_rt_min_pmtu         - lower clamp applied to a learned path MTU in
 *                            __ip_rt_update_pmtu().
 *
 * NOTE(review): the remaining knobs (max_size, gc_*, mtu_expires,
 * min_advmss) are presumably used by code further down - confirm there.
 */
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu);
144 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
145                                         struct sk_buff *skb);
146 static void             ipv4_dst_destroy(struct dst_entry *dst);
147
/*
 * dst_ops->ifdown hook for IPv4 routes.  Deliberately empty: none of the
 * arguments is looked at.
 */
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how)
{
}
152
153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
154 {
155         WARN_ON(1);
156         return NULL;
157 }
158
159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
160                                            struct sk_buff *skb,
161                                            const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .protocol =             cpu_to_be16(ETH_P_IP),
166         .check =                ipv4_dst_check,
167         .default_advmss =       ipv4_default_advmss,
168         .mtu =                  ipv4_mtu,
169         .cow_metrics =          ipv4_cow_metrics,
170         .destroy =              ipv4_dst_destroy,
171         .ifdown =               ipv4_dst_ifdown,
172         .negative_advice =      ipv4_negative_advice,
173         .link_failure =         ipv4_link_failure,
174         .update_pmtu =          ip_rt_update_pmtu,
175         .redirect =             ip_do_redirect,
176         .local_out =            __ip_local_out,
177         .neigh_lookup =         ipv4_neigh_lookup,
178 };
179
180 #define ECN_OR_COST(class)      TC_PRIO_##class
181
182 const __u8 ip_tos2prio[16] = {
183         TC_PRIO_BESTEFFORT,
184         ECN_OR_COST(BESTEFFORT),
185         TC_PRIO_BESTEFFORT,
186         ECN_OR_COST(BESTEFFORT),
187         TC_PRIO_BULK,
188         ECN_OR_COST(BULK),
189         TC_PRIO_BULK,
190         ECN_OR_COST(BULK),
191         TC_PRIO_INTERACTIVE,
192         ECN_OR_COST(INTERACTIVE),
193         TC_PRIO_INTERACTIVE,
194         ECN_OR_COST(INTERACTIVE),
195         TC_PRIO_INTERACTIVE_BULK,
196         ECN_OR_COST(INTERACTIVE_BULK),
197         TC_PRIO_INTERACTIVE_BULK,
198         ECN_OR_COST(INTERACTIVE_BULK)
199 };
200 EXPORT_SYMBOL(ip_tos2prio);
201
/*
 * Per-CPU route cache counters.  RT_CACHE_STAT_INC() increments one field
 * of the running CPU's copy; rt_cpu_seq_show() dumps the copy of each
 * possible CPU.
 */
202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
203 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
204
205 #ifdef CONFIG_PROC_FS
206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
207 {
208         if (*pos)
209                 return NULL;
210         return SEQ_START_TOKEN;
211 }
212
213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
214 {
215         ++*pos;
216         return NULL;
217 }
218
/* seq_file ->stop for the "rt_cache" listing; there is nothing to release. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
222
223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 {
225         if (v == SEQ_START_TOKEN)
226                 seq_printf(seq, "%-127s\n",
227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229                            "HHUptod\tSpecDst");
230         return 0;
231 }
232
233 static const struct seq_operations rt_cache_seq_ops = {
234         .start  = rt_cache_seq_start,
235         .next   = rt_cache_seq_next,
236         .stop   = rt_cache_seq_stop,
237         .show   = rt_cache_seq_show,
238 };
239
240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 {
242         return seq_open(file, &rt_cache_seq_ops);
243 }
244
245 static const struct file_operations rt_cache_seq_fops = {
246         .owner   = THIS_MODULE,
247         .open    = rt_cache_seq_open,
248         .read    = seq_read,
249         .llseek  = seq_lseek,
250         .release = seq_release,
251 };
252
253
254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
255 {
256         int cpu;
257
258         if (*pos == 0)
259                 return SEQ_START_TOKEN;
260
261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
262                 if (!cpu_possible(cpu))
263                         continue;
264                 *pos = cpu+1;
265                 return &per_cpu(rt_cache_stat, cpu);
266         }
267         return NULL;
268 }
269
270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
271 {
272         int cpu;
273
274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
275                 if (!cpu_possible(cpu))
276                         continue;
277                 *pos = cpu+1;
278                 return &per_cpu(rt_cache_stat, cpu);
279         }
280         return NULL;
281
282 }
283
/* seq_file ->stop for the per-CPU statistics listing; nothing to undo. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
288
289 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 {
291         struct rt_cache_stat *st = v;
292
293         if (v == SEQ_START_TOKEN) {
294                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
295                 return 0;
296         }
297
298         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
299                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
300                    dst_entries_get_slow(&ipv4_dst_ops),
301                    st->in_hit,
302                    st->in_slow_tot,
303                    st->in_slow_mc,
304                    st->in_no_route,
305                    st->in_brd,
306                    st->in_martian_dst,
307                    st->in_martian_src,
308
309                    st->out_hit,
310                    st->out_slow_tot,
311                    st->out_slow_mc,
312
313                    st->gc_total,
314                    st->gc_ignored,
315                    st->gc_goal_miss,
316                    st->gc_dst_overflow,
317                    st->in_hlist_search,
318                    st->out_hlist_search
319                 );
320         return 0;
321 }
322
323 static const struct seq_operations rt_cpu_seq_ops = {
324         .start  = rt_cpu_seq_start,
325         .next   = rt_cpu_seq_next,
326         .stop   = rt_cpu_seq_stop,
327         .show   = rt_cpu_seq_show,
328 };
329
330
331 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 {
333         return seq_open(file, &rt_cpu_seq_ops);
334 }
335
336 static const struct file_operations rt_cpu_seq_fops = {
337         .owner   = THIS_MODULE,
338         .open    = rt_cpu_seq_open,
339         .read    = seq_read,
340         .llseek  = seq_lseek,
341         .release = seq_release,
342 };
343
344 #ifdef CONFIG_IP_ROUTE_CLASSID
345 static int rt_acct_proc_show(struct seq_file *m, void *v)
346 {
347         struct ip_rt_acct *dst, *src;
348         unsigned int i, j;
349
350         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
351         if (!dst)
352                 return -ENOMEM;
353
354         for_each_possible_cpu(i) {
355                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
356                 for (j = 0; j < 256; j++) {
357                         dst[j].o_bytes   += src[j].o_bytes;
358                         dst[j].o_packets += src[j].o_packets;
359                         dst[j].i_bytes   += src[j].i_bytes;
360                         dst[j].i_packets += src[j].i_packets;
361                 }
362         }
363
364         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365         kfree(dst);
366         return 0;
367 }
368
369 static int rt_acct_proc_open(struct inode *inode, struct file *file)
370 {
371         return single_open(file, rt_acct_proc_show, NULL);
372 }
373
374 static const struct file_operations rt_acct_proc_fops = {
375         .owner          = THIS_MODULE,
376         .open           = rt_acct_proc_open,
377         .read           = seq_read,
378         .llseek         = seq_lseek,
379         .release        = single_release,
380 };
381 #endif
382
383 static int __net_init ip_rt_do_proc_init(struct net *net)
384 {
385         struct proc_dir_entry *pde;
386
387         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
388                         &rt_cache_seq_fops);
389         if (!pde)
390                 goto err1;
391
392         pde = proc_create("rt_cache", S_IRUGO,
393                           net->proc_net_stat, &rt_cpu_seq_fops);
394         if (!pde)
395                 goto err2;
396
397 #ifdef CONFIG_IP_ROUTE_CLASSID
398         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
399         if (!pde)
400                 goto err3;
401 #endif
402         return 0;
403
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 err3:
406         remove_proc_entry("rt_cache", net->proc_net_stat);
407 #endif
408 err2:
409         remove_proc_entry("rt_cache", net->proc_net);
410 err1:
411         return -ENOMEM;
412 }
413
414 static void __net_exit ip_rt_do_proc_exit(struct net *net)
415 {
416         remove_proc_entry("rt_cache", net->proc_net_stat);
417         remove_proc_entry("rt_cache", net->proc_net);
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419         remove_proc_entry("rt_acct", net->proc_net);
420 #endif
421 }
422
423 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
424         .init = ip_rt_do_proc_init,
425         .exit = ip_rt_do_proc_exit,
426 };
427
428 static int __init ip_rt_proc_init(void)
429 {
430         return register_pernet_subsys(&ip_rt_proc_ops);
431 }
432
433 #else
/* Stub for builds without CONFIG_PROC_FS: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	const int ok = 0;

	return ok;
}
438 #endif /* CONFIG_PROC_FS */
439
440 static inline bool rt_is_expired(const struct rtable *rth)
441 {
442         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
443 }
444
/*
 * Invalidate the cached routes of @net by bumping its generation id;
 * rt_is_expired() then reports routes carrying the old id as expired.
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}
449
450 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
451                                            struct sk_buff *skb,
452                                            const void *daddr)
453 {
454         struct net_device *dev = dst->dev;
455         const __be32 *pkey = daddr;
456         const struct rtable *rt;
457         struct neighbour *n;
458
459         rt = (const struct rtable *) dst;
460         if (rt->rt_gateway)
461                 pkey = (const __be32 *) &rt->rt_gateway;
462         else if (skb)
463                 pkey = &ip_hdr(skb)->daddr;
464
465         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
466         if (n)
467                 return n;
468         return neigh_create(&arp_tbl, pkey, dev);
469 }
470
471 /*
472  * Peer allocation may fail only in serious out-of-memory conditions.  However
473  * we still can generate some output.
474  * Random ID selection looks a bit dangerous because we have no chances to
475  * select ID being unique in a reasonable period of time.
476  * But broken packet identifier may be better than no packet at all.
477  */
478 static void ip_select_fb_ident(struct iphdr *iph)
479 {
480         static DEFINE_SPINLOCK(ip_fb_id_lock);
481         static u32 ip_fallback_id;
482         u32 salt;
483
484         spin_lock_bh(&ip_fb_id_lock);
485         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
486         iph->id = htons(salt & 0xFFFF);
487         ip_fallback_id = salt;
488         spin_unlock_bh(&ip_fb_id_lock);
489 }
490
491 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
492 {
493         struct net *net = dev_net(dst->dev);
494         struct inet_peer *peer;
495
496         peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
497         if (peer) {
498                 iph->id = htons(inet_getid(peer, more));
499                 inet_putpeer(peer);
500                 return;
501         }
502
503         ip_select_fb_ident(iph);
504 }
505 EXPORT_SYMBOL(__ip_select_ident);
506
507 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
508                              const struct iphdr *iph,
509                              int oif, u8 tos,
510                              u8 prot, u32 mark, int flow_flags)
511 {
512         if (sk) {
513                 const struct inet_sock *inet = inet_sk(sk);
514
515                 oif = sk->sk_bound_dev_if;
516                 mark = sk->sk_mark;
517                 tos = RT_CONN_FLAGS(sk);
518                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
519         }
520         flowi4_init_output(fl4, oif, mark, tos,
521                            RT_SCOPE_UNIVERSE, prot,
522                            flow_flags,
523                            iph->daddr, iph->saddr, 0, 0);
524 }
525
526 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
527                                const struct sock *sk)
528 {
529         const struct iphdr *iph = ip_hdr(skb);
530         int oif = skb->dev->ifindex;
531         u8 tos = RT_TOS(iph->tos);
532         u8 prot = iph->protocol;
533         u32 mark = skb->mark;
534
535         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
536 }
537
538 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
539 {
540         const struct inet_sock *inet = inet_sk(sk);
541         const struct ip_options_rcu *inet_opt;
542         __be32 daddr = inet->inet_daddr;
543
544         rcu_read_lock();
545         inet_opt = rcu_dereference(inet->inet_opt);
546         if (inet_opt && inet_opt->opt.srr)
547                 daddr = inet_opt->opt.faddr;
548         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
549                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
550                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
551                            inet_sk_flowi_flags(sk),
552                            daddr, inet->inet_saddr, 0, 0);
553         rcu_read_unlock();
554 }
555
/*
 * Build a flow key from @skb when one is supplied, otherwise from the
 * socket @sk alone.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}

	build_skb_flow_key(fl4, skb, sk);
}
564
565 static inline void rt_free(struct rtable *rt)
566 {
567         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
568 }
569
/*
 * Held, with bottom halves disabled, by update_or_create_fnhe() for the
 * whole of each next-hop exception table update.
 */
570 static DEFINE_SPINLOCK(fnhe_lock);
571
572 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
573 {
574         struct fib_nh_exception *fnhe, *oldest;
575         struct rtable *orig;
576
577         oldest = rcu_dereference(hash->chain);
578         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
579              fnhe = rcu_dereference(fnhe->fnhe_next)) {
580                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
581                         oldest = fnhe;
582         }
583         orig = rcu_dereference(oldest->fnhe_rth);
584         if (orig) {
585                 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
586                 rt_free(orig);
587         }
588         return oldest;
589 }
590
591 static inline u32 fnhe_hashfun(__be32 daddr)
592 {
593         u32 hval;
594
595         hval = (__force u32) daddr;
596         hval ^= (hval >> 11) ^ (hval >> 22);
597
598         return hval & (FNHE_HASH_SIZE - 1);
599 }
600
601 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
602                                   u32 pmtu, unsigned long expires)
603 {
604         struct fnhe_hash_bucket *hash;
605         struct fib_nh_exception *fnhe;
606         int depth;
607         u32 hval = fnhe_hashfun(daddr);
608
609         spin_lock_bh(&fnhe_lock);
610
611         hash = nh->nh_exceptions;
612         if (!hash) {
613                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
614                 if (!hash)
615                         goto out_unlock;
616                 nh->nh_exceptions = hash;
617         }
618
619         hash += hval;
620
621         depth = 0;
622         for (fnhe = rcu_dereference(hash->chain); fnhe;
623              fnhe = rcu_dereference(fnhe->fnhe_next)) {
624                 if (fnhe->fnhe_daddr == daddr)
625                         break;
626                 depth++;
627         }
628
629         if (fnhe) {
630                 if (gw)
631                         fnhe->fnhe_gw = gw;
632                 if (pmtu) {
633                         fnhe->fnhe_pmtu = pmtu;
634                         fnhe->fnhe_expires = expires;
635                 }
636         } else {
637                 if (depth > FNHE_RECLAIM_DEPTH)
638                         fnhe = fnhe_oldest(hash);
639                 else {
640                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
641                         if (!fnhe)
642                                 goto out_unlock;
643
644                         fnhe->fnhe_next = hash->chain;
645                         rcu_assign_pointer(hash->chain, fnhe);
646                 }
647                 fnhe->fnhe_daddr = daddr;
648                 fnhe->fnhe_gw = gw;
649                 fnhe->fnhe_pmtu = pmtu;
650                 fnhe->fnhe_expires = expires;
651         }
652
653         fnhe->fnhe_stamp = jiffies;
654
655 out_unlock:
656         spin_unlock_bh(&fnhe_lock);
657         return;
658 }
659
660 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
661                              bool kill_route)
662 {
663         __be32 new_gw = icmp_hdr(skb)->un.gateway;
664         __be32 old_gw = ip_hdr(skb)->saddr;
665         struct net_device *dev = skb->dev;
666         struct in_device *in_dev;
667         struct fib_result res;
668         struct neighbour *n;
669         struct net *net;
670
671         switch (icmp_hdr(skb)->code & 7) {
672         case ICMP_REDIR_NET:
673         case ICMP_REDIR_NETTOS:
674         case ICMP_REDIR_HOST:
675         case ICMP_REDIR_HOSTTOS:
676                 break;
677
678         default:
679                 return;
680         }
681
682         if (rt->rt_gateway != old_gw)
683                 return;
684
685         in_dev = __in_dev_get_rcu(dev);
686         if (!in_dev)
687                 return;
688
689         net = dev_net(dev);
690         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
691             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
692             ipv4_is_zeronet(new_gw))
693                 goto reject_redirect;
694
695         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
696                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
697                         goto reject_redirect;
698                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
699                         goto reject_redirect;
700         } else {
701                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
702                         goto reject_redirect;
703         }
704
705         n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
706         if (n) {
707                 if (!(n->nud_state & NUD_VALID)) {
708                         neigh_event_send(n, NULL);
709                 } else {
710                         if (fib_lookup(net, fl4, &res) == 0) {
711                                 struct fib_nh *nh = &FIB_RES_NH(res);
712
713                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
714                                                       0, 0);
715                         }
716                         if (kill_route)
717                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
718                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
719                 }
720                 neigh_release(n);
721         }
722         return;
723
724 reject_redirect:
725 #ifdef CONFIG_IP_ROUTE_VERBOSE
726         if (IN_DEV_LOG_MARTIANS(in_dev)) {
727                 const struct iphdr *iph = (const struct iphdr *) skb->data;
728                 __be32 daddr = iph->daddr;
729                 __be32 saddr = iph->saddr;
730
731                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
732                                      "  Advised path = %pI4 -> %pI4\n",
733                                      &old_gw, dev->name, &new_gw,
734                                      &saddr, &daddr);
735         }
736 #endif
737         ;
738 }
739
740 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
741 {
742         struct rtable *rt;
743         struct flowi4 fl4;
744
745         rt = (struct rtable *) dst;
746
747         ip_rt_build_flow_key(&fl4, sk, skb);
748         __ip_do_redirect(rt, skb, &fl4, true);
749 }
750
751 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
752 {
753         struct rtable *rt = (struct rtable *)dst;
754         struct dst_entry *ret = dst;
755
756         if (rt) {
757                 if (dst->obsolete > 0) {
758                         ip_rt_put(rt);
759                         ret = NULL;
760                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
761                            rt->dst.expires) {
762                         ip_rt_put(rt);
763                         ret = NULL;
764                 }
765         }
766         return ret;
767 }
768
769 /*
770  * Algorithm:
771  *      1. The first ip_rt_redirect_number redirects are sent
772  *         with exponential backoff, then we stop sending them at all,
773  *         assuming that the host ignores our redirects.
774  *      2. If we did not see packets requiring redirects
775  *         during ip_rt_redirect_silence, we assume that the host
776  *         forgot redirected route and start to send redirects again.
777  *
778  * This algorithm is much cheaper and more intelligent than dumb load limiting
779  * in icmp.c.
780  *
781  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
782  * and "frag. need" (breaks PMTU discovery) in icmp.c.
783  */
784
785 void ip_rt_send_redirect(struct sk_buff *skb)
786 {
787         struct rtable *rt = skb_rtable(skb);
788         struct in_device *in_dev;
789         struct inet_peer *peer;
790         struct net *net;
791         int log_martians;
792
793         rcu_read_lock();
794         in_dev = __in_dev_get_rcu(rt->dst.dev);
795         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
796                 rcu_read_unlock();
797                 return;
798         }
799         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
800         rcu_read_unlock();
801
802         net = dev_net(rt->dst.dev);
803         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
804         if (!peer) {
805                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
806                           rt_nexthop(rt, ip_hdr(skb)->daddr));
807                 return;
808         }
809
810         /* No redirected packets during ip_rt_redirect_silence;
811          * reset the algorithm.
812          */
813         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
814                 peer->rate_tokens = 0;
815
816         /* Too many ignored redirects; do not send anything
817          * set dst.rate_last to the last seen redirected packet.
818          */
819         if (peer->rate_tokens >= ip_rt_redirect_number) {
820                 peer->rate_last = jiffies;
821                 goto out_put_peer;
822         }
823
824         /* Check for load limit; set rate_last to the latest sent
825          * redirect.
826          */
827         if (peer->rate_tokens == 0 ||
828             time_after(jiffies,
829                        (peer->rate_last +
830                         (ip_rt_redirect_load << peer->rate_tokens)))) {
831                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
832
833                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
834                 peer->rate_last = jiffies;
835                 ++peer->rate_tokens;
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837                 if (log_martians &&
838                     peer->rate_tokens == ip_rt_redirect_number)
839                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
840                                              &ip_hdr(skb)->saddr, inet_iif(skb),
841                                              &ip_hdr(skb)->daddr, &gw);
842 #endif
843         }
844 out_put_peer:
845         inet_putpeer(peer);
846 }
847
848 static int ip_error(struct sk_buff *skb)
849 {
850         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
851         struct rtable *rt = skb_rtable(skb);
852         struct inet_peer *peer;
853         unsigned long now;
854         struct net *net;
855         bool send;
856         int code;
857
858         net = dev_net(rt->dst.dev);
859         if (!IN_DEV_FORWARD(in_dev)) {
860                 switch (rt->dst.error) {
861                 case EHOSTUNREACH:
862                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
863                         break;
864
865                 case ENETUNREACH:
866                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
867                         break;
868                 }
869                 goto out;
870         }
871
872         switch (rt->dst.error) {
873         case EINVAL:
874         default:
875                 goto out;
876         case EHOSTUNREACH:
877                 code = ICMP_HOST_UNREACH;
878                 break;
879         case ENETUNREACH:
880                 code = ICMP_NET_UNREACH;
881                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
882                 break;
883         case EACCES:
884                 code = ICMP_PKT_FILTERED;
885                 break;
886         }
887
888         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
889
890         send = true;
891         if (peer) {
892                 now = jiffies;
893                 peer->rate_tokens += now - peer->rate_last;
894                 if (peer->rate_tokens > ip_rt_error_burst)
895                         peer->rate_tokens = ip_rt_error_burst;
896                 peer->rate_last = now;
897                 if (peer->rate_tokens >= ip_rt_error_cost)
898                         peer->rate_tokens -= ip_rt_error_cost;
899                 else
900                         send = false;
901                 inet_putpeer(peer);
902         }
903         if (send)
904                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
905
906 out:    kfree_skb(skb);
907         return 0;
908 }
909
910 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
911 {
912         struct dst_entry *dst = &rt->dst;
913         struct fib_result res;
914
915         if (dst->dev->mtu < mtu)
916                 return;
917
918         if (mtu < ip_rt_min_pmtu)
919                 mtu = ip_rt_min_pmtu;
920
921         if (!rt->rt_pmtu) {
922                 dst->obsolete = DST_OBSOLETE_KILL;
923         } else {
924                 rt->rt_pmtu = mtu;
925                 dst->expires = max(1UL, jiffies + ip_rt_mtu_expires);
926         }
927
928         rcu_read_lock();
929         if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
930                 struct fib_nh *nh = &FIB_RES_NH(res);
931
932                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
933                                       jiffies + ip_rt_mtu_expires);
934         }
935         rcu_read_unlock();
936 }
937
938 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
939                               struct sk_buff *skb, u32 mtu)
940 {
941         struct rtable *rt = (struct rtable *) dst;
942         struct flowi4 fl4;
943
944         ip_rt_build_flow_key(&fl4, sk, skb);
945         __ip_rt_update_pmtu(rt, &fl4, mtu);
946 }
947
948 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
949                       int oif, u32 mark, u8 protocol, int flow_flags)
950 {
951         const struct iphdr *iph = (const struct iphdr *) skb->data;
952         struct flowi4 fl4;
953         struct rtable *rt;
954
955         __build_flow_key(&fl4, NULL, iph, oif,
956                          RT_TOS(iph->tos), protocol, mark, flow_flags);
957         rt = __ip_route_output_key(net, &fl4);
958         if (!IS_ERR(rt)) {
959                 __ip_rt_update_pmtu(rt, &fl4, mtu);
960                 ip_rt_put(rt);
961         }
962 }
963 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
964
965 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
966 {
967         const struct iphdr *iph = (const struct iphdr *) skb->data;
968         struct flowi4 fl4;
969         struct rtable *rt;
970
971         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
972         rt = __ip_route_output_key(sock_net(sk), &fl4);
973         if (!IS_ERR(rt)) {
974                 __ip_rt_update_pmtu(rt, &fl4, mtu);
975                 ip_rt_put(rt);
976         }
977 }
978 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
979
980 void ipv4_redirect(struct sk_buff *skb, struct net *net,
981                    int oif, u32 mark, u8 protocol, int flow_flags)
982 {
983         const struct iphdr *iph = (const struct iphdr *) skb->data;
984         struct flowi4 fl4;
985         struct rtable *rt;
986
987         __build_flow_key(&fl4, NULL, iph, oif,
988                          RT_TOS(iph->tos), protocol, mark, flow_flags);
989         rt = __ip_route_output_key(net, &fl4);
990         if (!IS_ERR(rt)) {
991                 __ip_do_redirect(rt, skb, &fl4, false);
992                 ip_rt_put(rt);
993         }
994 }
995 EXPORT_SYMBOL_GPL(ipv4_redirect);
996
997 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
998 {
999         const struct iphdr *iph = (const struct iphdr *) skb->data;
1000         struct flowi4 fl4;
1001         struct rtable *rt;
1002
1003         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1004         rt = __ip_route_output_key(sock_net(sk), &fl4);
1005         if (!IS_ERR(rt)) {
1006                 __ip_do_redirect(rt, skb, &fl4, false);
1007                 ip_rt_put(rt);
1008         }
1009 }
1010 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1011
1012 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1013 {
1014         struct rtable *rt = (struct rtable *) dst;
1015
1016         /* All IPV4 dsts are created with ->obsolete set to the value
1017          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1018          * into this function always.
1019          *
1020          * When a PMTU/redirect information update invalidates a
1021          * route, this is indicated by setting obsolete to
1022          * DST_OBSOLETE_KILL.
1023          */
1024         if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
1025                 return NULL;
1026         return dst;
1027 }
1028
1029 static void ipv4_link_failure(struct sk_buff *skb)
1030 {
1031         struct rtable *rt;
1032
1033         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1034
1035         rt = skb_rtable(skb);
1036         if (rt)
1037                 dst_set_expires(&rt->dst, 0);
1038 }
1039
1040 static int ip_rt_bug(struct sk_buff *skb)
1041 {
1042         pr_debug("%s: %pI4 -> %pI4, %s\n",
1043                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1044                  skb->dev ? skb->dev->name : "?");
1045         kfree_skb(skb);
1046         WARN_ON(1);
1047         return 0;
1048 }
1049
1050 /*
1051    We do not cache source address of outgoing interface,
1052    because it is used only by IP RR, TS and SRR options,
1053    so that it out of fast path.
1054
1055    BTW remember: "addr" is allowed to be not aligned
1056    in IP options!
1057  */
1058
1059 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1060 {
1061         __be32 src;
1062
1063         if (rt_is_output_route(rt))
1064                 src = ip_hdr(skb)->saddr;
1065         else {
1066                 struct fib_result res;
1067                 struct flowi4 fl4;
1068                 struct iphdr *iph;
1069
1070                 iph = ip_hdr(skb);
1071
1072                 memset(&fl4, 0, sizeof(fl4));
1073                 fl4.daddr = iph->daddr;
1074                 fl4.saddr = iph->saddr;
1075                 fl4.flowi4_tos = RT_TOS(iph->tos);
1076                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1077                 fl4.flowi4_iif = skb->dev->ifindex;
1078                 fl4.flowi4_mark = skb->mark;
1079
1080                 rcu_read_lock();
1081                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
1082                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1083                 else
1084                         src = inet_select_addr(rt->dst.dev,
1085                                                rt_nexthop(rt, iph->daddr),
1086                                                RT_SCOPE_UNIVERSE);
1087                 rcu_read_unlock();
1088         }
1089         memcpy(addr, &src, 4);
1090 }
1091
#ifdef CONFIG_IP_ROUTE_CLASSID
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently, and a half is only filled in when it
 * is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1101
1102 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1103 {
1104         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1105
1106         if (advmss == 0) {
1107                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1108                                ip_rt_min_advmss);
1109                 if (advmss > 65535 - 40)
1110                         advmss = 65535 - 40;
1111         }
1112         return advmss;
1113 }
1114
1115 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1116 {
1117         const struct rtable *rt = (const struct rtable *) dst;
1118         unsigned int mtu = rt->rt_pmtu;
1119
1120         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1121                 mtu = dst_metric_raw(dst, RTAX_MTU);
1122
1123         if (mtu && rt_is_output_route(rt))
1124                 return mtu;
1125
1126         mtu = dst->dev->mtu;
1127
1128         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1129                 if (rt->rt_uses_gateway && mtu > 576)
1130                         mtu = 576;
1131         }
1132
1133         if (mtu > IP_MAX_MTU)
1134                 mtu = IP_MAX_MTU;
1135
1136         return mtu;
1137 }
1138
1139 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1140 {
1141         struct fnhe_hash_bucket *hash = nh->nh_exceptions;
1142         struct fib_nh_exception *fnhe;
1143         u32 hval;
1144
1145         if (!hash)
1146                 return NULL;
1147
1148         hval = fnhe_hashfun(daddr);
1149
1150         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1151              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1152                 if (fnhe->fnhe_daddr == daddr)
1153                         return fnhe;
1154         }
1155         return NULL;
1156 }
1157
1158 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1159                               __be32 daddr)
1160 {
1161         bool ret = false;
1162
1163         spin_lock_bh(&fnhe_lock);
1164
1165         if (daddr == fnhe->fnhe_daddr) {
1166                 struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
1167                 if (orig && rt_is_expired(orig)) {
1168                         fnhe->fnhe_gw = 0;
1169                         fnhe->fnhe_pmtu = 0;
1170                         fnhe->fnhe_expires = 0;
1171                 }
1172                 if (fnhe->fnhe_pmtu) {
1173                         unsigned long expires = fnhe->fnhe_expires;
1174                         unsigned long diff = expires - jiffies;
1175
1176                         if (time_before(jiffies, expires)) {
1177                                 rt->rt_pmtu = fnhe->fnhe_pmtu;
1178                                 dst_set_expires(&rt->dst, diff);
1179                         }
1180                 }
1181                 if (fnhe->fnhe_gw) {
1182                         rt->rt_flags |= RTCF_REDIRECTED;
1183                         rt->rt_gateway = fnhe->fnhe_gw;
1184                         rt->rt_uses_gateway = 1;
1185                 } else if (!rt->rt_gateway)
1186                         rt->rt_gateway = daddr;
1187
1188                 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1189                 if (orig)
1190                         rt_free(orig);
1191
1192                 fnhe->fnhe_stamp = jiffies;
1193                 ret = true;
1194         }
1195         spin_unlock_bh(&fnhe_lock);
1196
1197         return ret;
1198 }
1199
1200 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1201 {
1202         struct rtable *orig, *prev, **p;
1203         bool ret = true;
1204
1205         if (rt_is_input_route(rt)) {
1206                 p = (struct rtable **)&nh->nh_rth_input;
1207         } else {
1208                 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1209         }
1210         orig = *p;
1211
1212         prev = cmpxchg(p, orig, rt);
1213         if (prev == orig) {
1214                 if (orig)
1215                         rt_free(orig);
1216         } else
1217                 ret = false;
1218
1219         return ret;
1220 }
1221
1222 static DEFINE_SPINLOCK(rt_uncached_lock);
1223 static LIST_HEAD(rt_uncached_list);
1224
1225 static void rt_add_uncached_list(struct rtable *rt)
1226 {
1227         spin_lock_bh(&rt_uncached_lock);
1228         list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1229         spin_unlock_bh(&rt_uncached_lock);
1230 }
1231
1232 static void ipv4_dst_destroy(struct dst_entry *dst)
1233 {
1234         struct rtable *rt = (struct rtable *) dst;
1235
1236         if (!list_empty(&rt->rt_uncached)) {
1237                 spin_lock_bh(&rt_uncached_lock);
1238                 list_del(&rt->rt_uncached);
1239                 spin_unlock_bh(&rt_uncached_lock);
1240         }
1241 }
1242
1243 void rt_flush_dev(struct net_device *dev)
1244 {
1245         if (!list_empty(&rt_uncached_list)) {
1246                 struct net *net = dev_net(dev);
1247                 struct rtable *rt;
1248
1249                 spin_lock_bh(&rt_uncached_lock);
1250                 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1251                         if (rt->dst.dev != dev)
1252                                 continue;
1253                         rt->dst.dev = net->loopback_dev;
1254                         dev_hold(rt->dst.dev);
1255                         dev_put(dev);
1256                 }
1257                 spin_unlock_bh(&rt_uncached_lock);
1258         }
1259 }
1260
1261 static bool rt_cache_valid(const struct rtable *rt)
1262 {
1263         return  rt &&
1264                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1265                 !rt_is_expired(rt);
1266 }
1267
/*
 * Fill in the nexthop-dependent parts of a freshly allocated route and
 * try to cache it.
 *
 * @rt:    route being set up
 * @daddr: destination address the route is for
 * @res:   FIB lookup result; only dereferenced for the nexthop when @fi
 *         is non-NULL (and for tclassid under CONFIG_IP_MULTIPLE_TABLES)
 * @fnhe:  nexthop exception to bind the route to, or NULL
 * @fi:    FIB info of the result, or NULL when there is none
 * @type:  route type; not used by this function
 * @itag:  class tag merged in under CONFIG_IP_ROUTE_CLASSID
 *
 * With a FIB info, the gateway and metrics are taken from it and the
 * route is stored either in the exception (if one is given) or in the
 * nexthop cache (unless DST_NOCACHE is set).  Any route that does not end
 * up cached is put on the uncached list.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                /* Only a link-scope gateway is recorded on the route. */
                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                /* An exception takes precedence over the nexthop cache. */
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                /* No FIB info: the route cannot be cached at all. */
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}
1311
1312 static struct rtable *rt_dst_alloc(struct net_device *dev,
1313                                    bool nopolicy, bool noxfrm, bool will_cache)
1314 {
1315         return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1316                          (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1317                          (nopolicy ? DST_NOPOLICY : 0) |
1318                          (noxfrm ? DST_NOXFRM : 0));
1319 }
1320
1321 /* called in rcu_read_lock() section */
1322 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1323                                 u8 tos, struct net_device *dev, int our)
1324 {
1325         struct rtable *rth;
1326         struct in_device *in_dev = __in_dev_get_rcu(dev);
1327         u32 itag = 0;
1328         int err;
1329
1330         /* Primary sanity checks. */
1331
1332         if (in_dev == NULL)
1333                 return -EINVAL;
1334
1335         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1336             skb->protocol != htons(ETH_P_IP))
1337                 goto e_inval;
1338
1339         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1340                 if (ipv4_is_loopback(saddr))
1341                         goto e_inval;
1342
1343         if (ipv4_is_zeronet(saddr)) {
1344                 if (!ipv4_is_local_multicast(daddr))
1345                         goto e_inval;
1346         } else {
1347                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1348                                           in_dev, &itag);
1349                 if (err < 0)
1350                         goto e_err;
1351         }
1352         rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
1353                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1354         if (!rth)
1355                 goto e_nobufs;
1356
1357 #ifdef CONFIG_IP_ROUTE_CLASSID
1358         rth->dst.tclassid = itag;
1359 #endif
1360         rth->dst.output = ip_rt_bug;
1361
1362         rth->rt_genid   = rt_genid(dev_net(dev));
1363         rth->rt_flags   = RTCF_MULTICAST;
1364         rth->rt_type    = RTN_MULTICAST;
1365         rth->rt_is_input= 1;
1366         rth->rt_iif     = 0;
1367         rth->rt_pmtu    = 0;
1368         rth->rt_gateway = 0;
1369         rth->rt_uses_gateway = 0;
1370         INIT_LIST_HEAD(&rth->rt_uncached);
1371         if (our) {
1372                 rth->dst.input= ip_local_deliver;
1373                 rth->rt_flags |= RTCF_LOCAL;
1374         }
1375
1376 #ifdef CONFIG_IP_MROUTE
1377         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1378                 rth->dst.input = ip_mr_input;
1379 #endif
1380         RT_CACHE_STAT_INC(in_slow_mc);
1381
1382         skb_dst_set(skb, &rth->dst);
1383         return 0;
1384
1385 e_nobufs:
1386         return -ENOBUFS;
1387 e_inval:
1388         return -EINVAL;
1389 e_err:
1390         return err;
1391 }
1392
1393
1394 static void ip_handle_martian_source(struct net_device *dev,
1395                                      struct in_device *in_dev,
1396                                      struct sk_buff *skb,
1397                                      __be32 daddr,
1398                                      __be32 saddr)
1399 {
1400         RT_CACHE_STAT_INC(in_martian_src);
1401 #ifdef CONFIG_IP_ROUTE_VERBOSE
1402         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1403                 /*
1404                  *      RFC1812 recommendation, if source is martian,
1405                  *      the only hint is MAC header.
1406                  */
1407                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1408                         &daddr, &saddr, dev->name);
1409                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1410                         print_hex_dump(KERN_WARNING, "ll header: ",
1411                                        DUMP_PREFIX_OFFSET, 16, 1,
1412                                        skb_mac_header(skb),
1413                                        dev->hard_header_len, true);
1414                 }
1415         }
1416 #endif
1417 }
1418
/*
 * Create (or reuse from the nexthop cache) a forwarding input route for
 * @skb according to the FIB result @res and attach it to the skb.
 * Must be called inside an rcu_read_lock() section.
 *
 * Returns 0 on success, -EINVAL if the output device has no in_device or
 * the route is not valid for a non-IP (ARP) packet, -ENOBUFS if the route
 * cannot be allocated, or the negative error from fib_validate_source().
 */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* Look up the in_device of the output interface of the FIB result. */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        /* Only results with a FIB info and no class tag are cacheable. */
        do_cache = res->fi && !itag;
        /* Packet would leave through the interface it arrived on: mark the
         * route so that a redirect can be sent, and keep it out of the cache.
         */
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
                /* Not IP (i.e. ARP). Do not create a route if it is
                 * invalid for proxy ARP. DNAT routes are always valid.
                 *
                 * The proxy ARP feature has been extended to allow ARP
                 * replies back to the same interface, to support
                 * Private VLAN switch technologies. See arp.c.
                 */
                if (out_dev == in_dev &&
                    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
                        err = -EINVAL;
                        goto cleanup;
                }
        }

        /* Fast path: reuse the input route cached on the nexthop. */
        if (do_cache) {
                rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
                if (rt_cache_valid(rth)) {
                        skb_dst_set_noref(skb, &rth->dst);
                        goto out;
                }
        }

        rth = rt_dst_alloc(out_dev->dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY),
                           IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
        if (!rth) {
                err = -ENOBUFS;
                goto cleanup;
        }

        rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
        rth->rt_flags = flags;
        rth->rt_type = res->type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);

        rth->dst.input = ip_forward;
        rth->dst.output = ip_output;

        /* Fills in gateway/metrics and caches the route when allowed. */
        rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
        skb_dst_set(skb, &rth->dst);
out:
        err = 0;
 cleanup:
        return err;
}
1507
1508 static int ip_mkroute_input(struct sk_buff *skb,
1509                             struct fib_result *res,
1510                             const struct flowi4 *fl4,
1511                             struct in_device *in_dev,
1512                             __be32 daddr, __be32 saddr, u32 tos)
1513 {
1514 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1515         if (res->fi && res->fi->fib_nhs > 1)
1516                 fib_select_multipath(res);
1517 #endif
1518
1519         /* create a routing cache entry */
1520         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1521 }
1522
/*
 *      NOTE. We drop all packets that have a local source
 *      address, because every properly looped back packet
 *      must already have the correct destination attached by the
 *      output routine.
 *
 *      This approach solves two big problems:
 *      1. Non-simplex devices are handled properly.
 *      2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *      Called with rcu_read_lock() held.
 *
 *      Resolves the input route for @skb received on @dev and attaches it
 *      to the skb.  Returns 0 on success (including when an error route
 *      handled by ip_error() is attached), -EINVAL for martian addresses
 *      or a device without IP, -ENOBUFS on allocation failure, or the
 *      negative error from fib_validate_source().
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                               u8 tos, struct net_device *dev)
{
        struct fib_result res;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        struct flowi4   fl4;
        unsigned int    flags = 0;
        u32             itag = 0;
        struct rtable   *rth;
        int             err = -EINVAL;
        struct net    *net = dev_net(dev);
        bool do_cache;

        /* IP on this device is disabled. */

        if (!in_dev)
                goto out;

        /* Check for the weirdest martians, which cannot be detected
           by fib_lookup.
         */

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
                goto martian_source;

        /* brd_input/local_input test res.fi, so it must be valid even
         * when fib_lookup() is never reached.
         */
        res.fi = NULL;
        if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
                goto brd_input;

        /* Accept zero addresses only for limited broadcast;
         * it is unclear whether this should be changed. Waiting for
         * complaints :-)
         */
        if (ipv4_is_zeronet(saddr))
                goto martian_source;

        if (ipv4_is_zeronet(daddr))
                goto martian_destination;

        /* The following code tries to avoid calling
         * IN_DEV_NET_ROUTE_LOCALNET(), and calls it at most once if daddr
         * and/or saddr are loopback addresses.
         */
        if (ipv4_is_loopback(daddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_destination;
        } else if (ipv4_is_loopback(saddr)) {
                if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
                        goto martian_source;
        }

        /*
         *      Now we are ready to route packet.
         */
        /* NOTE(review): fl4 is not zeroed; only the fields below are set
         * before fib_lookup().
         */
        fl4.flowi4_oif = 0;
        fl4.flowi4_iif = dev->ifindex;
        fl4.flowi4_mark = skb->mark;
        fl4.flowi4_tos = tos;
        fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
        fl4.daddr = daddr;
        fl4.saddr = saddr;
        err = fib_lookup(net, &fl4, &res);
        if (err != 0)
                goto no_route;

        RT_CACHE_STAT_INC(in_slow_tot);

        if (res.type == RTN_BROADCAST)
                goto brd_input;

        if (res.type == RTN_LOCAL) {
                err = fib_validate_source(skb, saddr, daddr, tos,
                                          LOOPBACK_IFINDEX,
                                          dev, in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
                goto local_input;
        }

        /* NOTE(review): err is 0 here (fib_lookup() succeeded), so the
         * no_route path below stores dst.error = 0 for this case, and
         * res.fi is still the looked-up FIB info, so the error route may
         * be cached - confirm this is intended.
         */
        if (!IN_DEV_FORWARD(in_dev))
                goto no_route;
        if (res.type != RTN_UNICAST)
                goto martian_destination;

        /* Unicast packet to be forwarded. */
        err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:    return err;

brd_input:
        /* Broadcast delivery is only set up for IP packets. */
        if (skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (!ipv4_is_zeronet(saddr)) {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto martian_source_keep_err;
        }
        flags |= RTCF_BROADCAST;
        res.type = RTN_BROADCAST;
        RT_CACHE_STAT_INC(in_brd);

local_input:
        /* Reuse the route cached on the nexthop when there is a FIB info
         * and no class tag; otherwise build a new one and cache it if
         * those same conditions hold.
         */
        do_cache = false;
        if (res.fi) {
                if (!itag) {
                        rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
                        if (rt_cache_valid(rth)) {
                                skb_dst_set_noref(skb, &rth->dst);
                                err = 0;
                                goto out;
                        }
                        do_cache = true;
                }
        }

        rth = rt_dst_alloc(net->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
        if (!rth)
                goto e_nobufs;

        rth->dst.input= ip_local_deliver;
        rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif

        rth->rt_genid = rt_genid(net);
        rth->rt_flags   = flags|RTCF_LOCAL;
        rth->rt_type    = res.type;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        /* Reached via no_route: turn the entry into an error route whose
         * input handler is ip_error() and which carries the positive errno.
         */
        if (res.type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
                rth->rt_flags   &= ~RTCF_LOCAL;
        }
        if (do_cache)
                rt_cache_route(&FIB_RES_NH(res), rth);
        skb_dst_set(skb, &rth->dst);
        err = 0;
        goto out;

no_route:
        RT_CACHE_STAT_INC(in_no_route);
        res.type = RTN_UNREACHABLE;
        /* A failed lookup with -ESRCH is reported as network unreachable. */
        if (err == -ESRCH)
                err = -ENETUNREACH;
        goto local_input;

        /*
         *      Do not cache martian addresses: they should be logged (RFC1812)
         */
martian_destination:
        RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev))
                net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
                                     &daddr, &saddr, dev->name);
#endif

e_inval:
        err = -EINVAL;
        goto out;

e_nobufs:
        err = -ENOBUFS;
        goto out;

martian_source:
        err = -EINVAL;
martian_source_keep_err:
        /* err already holds the value to return; just account and log. */
        ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
        goto out;
}
1710
/*
 * ip_route_input_noref - resolve the input route for a received packet.
 * @skb:   packet being routed; its IP header protocol field is read for the
 *         multicast membership check
 * @daddr: destination address
 * @saddr: source address
 * @tos:   type of service, passed through to the resolver
 * @dev:   device the packet arrived on
 *
 * The whole lookup runs inside an rcu_read_lock()/rcu_read_unlock() pair.
 * Multicast destinations are handled here: ip_route_input_mc() is called
 * only when ip_check_mc_rcu() reports a match or (with CONFIG_IP_MROUTE)
 * the destination is not local multicast and IN_DEV_MFORWARD() is set;
 * otherwise the packet is rejected.  Everything else is handed to
 * ip_route_input_slow().
 *
 * Returns the result of ip_route_input_mc() or ip_route_input_slow(), or
 * -EINVAL for a multicast destination that is not accepted (including the
 * case where @dev has no in_device).
 */
1711 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1712                          u8 tos, struct net_device *dev)
1713 {
1714         int res;
1715
1716         rcu_read_lock();
1717
1718         /* Multicast recognition logic is moved from route cache to here.
1719            The problem was that too many Ethernet cards have broken/missing
1720            hardware multicast filters :-( As result the host on multicasting
1721            network acquires a lot of useless route cache entries, sort of
1722            SDR messages from all the world. Now we try to get rid of them.
1723            Really, provided software IP multicast filter is organized
1724            reasonably (at least, hashed), it does not result in a slowdown
1725            comparing with route cache reject entries.
1726            Note, that multicast routers are not affected, because
1727            route cache entry is created eventually.
1728          */
1729         if (ipv4_is_multicast(daddr)) {
1730                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1731
1732                 if (in_dev) {
1733                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1734                                                   ip_hdr(skb)->protocol);
                             /* Accept if ip_check_mc_rcu() matched, or, when
                              * multicast routing is compiled in, if the group
                              * is not local multicast and the device has
                              * IN_DEV_MFORWARD() set.
                              */
1735                         if (our
1736 #ifdef CONFIG_IP_MROUTE
1737                                 ||
1738                             (!ipv4_is_local_multicast(daddr) &&
1739                              IN_DEV_MFORWARD(in_dev))
1740 #endif
1741                            ) {
                                     /* NOTE: this 'res' shadows the
                                      * function-scope 'res' declared above.
                                      */
1742                                 int res = ip_route_input_mc(skb, daddr, saddr,
1743                                                             tos, dev, our);
1744                                 rcu_read_unlock();
1745                                 return res;
1746                         }
1747                 }
                     /* Multicast destination that was not accepted above. */
1748                 rcu_read_unlock();
1749                 return -EINVAL;
1750         }
1751         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1752         rcu_read_unlock();
1753         return res;
1754 }
1755 EXPORT_SYMBOL(ip_route_input_noref);
1756
1757 /* called with rcu_read_lock() */
/*
 * __mkroute_output - build (or reuse) the output route for a resolved flow.
 * @res:      FIB lookup result; res->fi, res->type and res->prefixlen are read
 * @fl4:      flow key; saddr, daddr, flowi4_proto and flowi4_flags are read
 * @orig_oif: output interface index originally requested by the caller,
 *            stored in the new route's rt_iif
 * @dev_out:  selected output device
 * @flags:    initial RTCF_* flags; more are OR-ed in here
 *
 * Returns a cached route with an extra reference taken via dst_hold() when
 * a valid one is found, otherwise a route freshly allocated with
 * rt_dst_alloc().  On failure returns ERR_PTR():
 *   -EINVAL  @dev_out has no in_device, a loopback source address is used on
 *            a non-loopback device while IN_DEV_ROUTE_LOCALNET() is not set,
 *            or the destination is in the zero network;
 *   -ENOBUFS rt_dst_alloc() failed.
 */
1758 static struct rtable *__mkroute_output(const struct fib_result *res,
1759                                        const struct flowi4 *fl4, int orig_oif,
1760                                        struct net_device *dev_out,
1761                                        unsigned int flags)
1762 {
1763         struct fib_info *fi = res->fi;
1764         struct fib_nh_exception *fnhe;
1765         struct in_device *in_dev;
1766         u16 type = res->type;
1767         struct rtable *rth;
1768         bool do_cache;
1769
1770         in_dev = __in_dev_get_rcu(dev_out);
1771         if (!in_dev)
1772                 return ERR_PTR(-EINVAL);
1773
             /* Loopback source addresses may only be used on a loopback
              * device unless IN_DEV_ROUTE_LOCALNET() is set for the device.
              */
1774         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1775                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1776                         return ERR_PTR(-EINVAL);
1777
             /* The destination address class overrides the FIB result type. */
1778         if (ipv4_is_lbcast(fl4->daddr))
1779                 type = RTN_BROADCAST;
1780         else if (ipv4_is_multicast(fl4->daddr))
1781                 type = RTN_MULTICAST;
1782         else if (ipv4_is_zeronet(fl4->daddr))
1783                 return ERR_PTR(-EINVAL);
1784
1785         if (dev_out->flags & IFF_LOOPBACK)
1786                 flags |= RTCF_LOCAL;
1787
1788         do_cache = true;
1789         if (type == RTN_BROADCAST) {
1790                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1791                 fi = NULL;
1792         } else if (type == RTN_MULTICAST) {
1793                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
                     /* RTCF_LOCAL is kept only when ip_check_mc_rcu() matches;
                      * such routes are not cached.
                      */
1794                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1795                                      fl4->flowi4_proto))
1796                         flags &= ~RTCF_LOCAL;
1797                 else
1798                         do_cache = false;
1799                 /* If multicast route do not exist use
1800                  * default one, but do not gateway in this case.
1801                  * Yes, it is hack.
1802                  */
1803                 if (fi && res->prefixlen < 4)
1804                         fi = NULL;
1805         }
1806
1807         fnhe = NULL;
             /* Caching needs a fib_info; without one the route is uncached. */
1808         do_cache &= fi != NULL;
1809         if (do_cache) {
1810                 struct rtable __rcu **prth;
1811                 struct fib_nh *nh = &FIB_RES_NH(*res);
1812
                     /* Prefer the per-destination exception's route slot; fall
                      * back to the nexthop's per-CPU output route slot.
                      */
1813                 fnhe = find_exception(nh, fl4->daddr);
1814                 if (fnhe)
1815                         prth = &fnhe->fnhe_rth;
1816                 else {
                             /* FLOWI_FLAG_KNOWN_NH without a link-scope gateway
                              * on the nexthop: build an uncached route.
                              */
1817                         if (unlikely(fl4->flowi4_flags &
1818                                      FLOWI_FLAG_KNOWN_NH &&
1819                                      !(nh->nh_gw &&
1820                                        nh->nh_scope == RT_SCOPE_LINK))) {
1821                                 do_cache = false;
1822                                 goto add;
1823                         }
1824                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1825                 }
1826                 rth = rcu_dereference(*prth);
                     /* Reuse the cached route, handing the caller a reference. */
1827                 if (rt_cache_valid(rth)) {
1828                         dst_hold(&rth->dst);
1829                         return rth;
1830                 }
1831         }
1832
1833 add:
1834         rth = rt_dst_alloc(dev_out,
1835                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1836                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1837                            do_cache);
1838         if (!rth)
1839                 return ERR_PTR(-ENOBUFS);
1840
1841         rth->dst.output = ip_output;
1842
1843         rth->rt_genid = rt_genid(dev_net(dev_out));
1844         rth->rt_flags   = flags;
1845         rth->rt_type    = type;
1846         rth->rt_is_input = 0;
1847         rth->rt_iif     = orig_oif ? : 0;
1848         rth->rt_pmtu    = 0;
1849         rth->rt_gateway = 0;
1850         rth->rt_uses_gateway = 0;
1851         INIT_LIST_HEAD(&rth->rt_uncached);
1852
1853         RT_CACHE_STAT_INC(out_slow_tot);
1854
1855         if (flags & RTCF_LOCAL)
1856                 rth->dst.input = ip_local_deliver;
1857         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Locally-relevant broadcast/multicast leaving through a
                      * non-loopback device is sent with ip_mc_output().
                      */
1858                 if (flags & RTCF_LOCAL &&
1859                     !(dev_out->flags & IFF_LOOPBACK)) {
1860                         rth->dst.output = ip_mc_output;
1861                         RT_CACHE_STAT_INC(out_slow_mc);
1862                 }
1863 #ifdef CONFIG_IP_MROUTE
1864                 if (type == RTN_MULTICAST) {
1865                         if (IN_DEV_MFORWARD(in_dev) &&
1866                             !ipv4_is_local_multicast(fl4->daddr)) {
1867                                 rth->dst.input = ip_mr_input;
1868                                 rth->dst.output = ip_mc_output;
1869                         }
1870                 }
1871 #endif
1872         }
1873
1874         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1875
1876         return rth;
1877 }
1878
1879 /*
1880  * Major route resolver routine.
1881  */
1882
1883 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1884 {
1885         struct net_device *dev_out = NULL;
1886         __u8 tos = RT_FL_TOS(fl4);
1887         unsigned int flags = 0;
1888         struct fib_result res;
1889         struct rtable *rth;
1890         int orig_oif;
1891
1892         res.tclassid    = 0;
1893         res.fi          = NULL;
1894         res.table       = NULL;
1895
1896         orig_oif = fl4->flowi4_oif;
1897
1898         fl4->flowi4_iif = LOOPBACK_IFINDEX;
1899         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
1900         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
1901                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
1902
1903         rcu_read_lock();
1904         if (fl4->saddr) {
1905                 rth = ERR_PTR(-EINVAL);
1906                 if (ipv4_is_multicast(fl4->saddr) ||
1907                     ipv4_is_lbcast(fl4->saddr) ||
1908                     ipv4_is_zeronet(fl4->saddr))
1909                         goto out;
1910
1911                 /* I removed check for oif == dev_out->oif here.
1912                    It was wrong for two reasons:
1913                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
1914                       is assigned to multiple interfaces.
1915                    2. Moreover, we are allowed to send packets with saddr
1916                       of another iface. --ANK
1917                  */
1918
1919                 if (fl4->flowi4_oif == 0 &&
1920                     (ipv4_is_multicast(fl4->daddr) ||
1921                      ipv4_is_lbcast(fl4->daddr))) {
1922                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1923                         dev_out = __ip_dev_find(net, fl4->saddr, false);
1924                         if (dev_out == NULL)
1925                                 goto out;
1926
1927                         /* Special hack: user can direct multicasts
1928                            and limited broadcast via necessary interface
1929                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
1930                            This hack is not just for fun, it allows
1931                            vic,vat and friends to work.
1932                            They bind socket to loopback, set ttl to zero
1933                            and expect that it will work.
1934                            From the viewpoint of routing cache they are broken,
1935                            because we are not allowed to build multicast path
1936                            with loopback source addr (look, routing cache
1937                            cannot know, that ttl is zero, so that packet
1938                            will not leave this host and route is valid).
1939                            Luckily, this hack is good workaround.
1940                          */
1941
1942                         fl4->flowi4_oif = dev_out->ifindex;
1943                         goto make_route;
1944                 }
1945
1946                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
1947                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
1948                         if (!__ip_dev_find(net, fl4->saddr, false))
1949                                 goto out;
1950                 }
1951         }
1952
1953
1954         if (fl4->flowi4_oif) {
1955                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
1956                 rth = ERR_PTR(-ENODEV);
1957                 if (dev_out == NULL)
1958                         goto out;
1959
1960                 /* RACE: Check return value of inet_select_addr instead. */
1961                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
1962                         rth = ERR_PTR(-ENETUNREACH);
1963                         goto out;
1964                 }
1965                 if (ipv4_is_local_multicast(fl4->daddr) ||
1966                     ipv4_is_lbcast(fl4->daddr)) {
1967                         if (!fl4->saddr)
1968                                 fl4->saddr = inet_select_addr(dev_out, 0,
1969                                                               RT_SCOPE_LINK);
1970                         goto make_route;
1971                 }
1972                 if (fl4->saddr) {
1973                         if (ipv4_is_multicast(fl4->daddr))
1974                                 fl4->saddr = inet_select_addr(dev_out, 0,
1975                                                               fl4->flowi4_scope);
1976                         else if (!fl4->daddr)
1977                                 fl4->saddr = inet_select_addr(dev_out, 0,
1978                                                               RT_SCOPE_HOST);
1979                 }
1980         }
1981
1982         if (!fl4->daddr) {
1983                 fl4->daddr = fl4->saddr;
1984                 if (!fl4->daddr)
1985                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
1986                 dev_out = net->loopback_dev;
1987                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
1988                 res.type = RTN_LOCAL;
1989                 flags |= RTCF_LOCAL;
1990                 goto make_route;
1991         }
1992
1993         if (fib_lookup(net, fl4, &res)) {
1994                 res.fi = NULL;
1995                 res.table = NULL;
1996                 if (fl4->flowi4_oif) {
1997                         /* Apparently, routing tables are wrong. Assume,
1998                            that the destination is on link.
1999
2000                            WHY? DW.
2001                            Because we are allowed to send to iface
2002                            even if it has NO routes and NO assigned
2003                            addresses. When oif is specified, routing
2004                            tables are looked up with only one purpose:
2005                            to catch if destination is gatewayed, rather than
2006                            direct. Moreover, if MSG_DONTROUTE is set,
2007                            we send packet, ignoring both routing tables
2008                            and ifaddr state. --ANK
2009
2010
2011                            We could make it even if oif is unknown,
2012                            likely IPv6, but we do not.
2013                          */
2014
2015                         if (fl4->saddr == 0)
2016                                 fl4->saddr = inet_select_addr(dev_out, 0,
2017                                                               RT_SCOPE_LINK);
2018                         res.type = RTN_UNICAST;
2019                         goto make_route;
2020                 }
2021                 rth = ERR_PTR(-ENETUNREACH);
2022                 goto out;
2023         }
2024
2025         if (res.type == RTN_LOCAL) {
2026                 if (!fl4->saddr) {
2027                         if (res.fi->fib_prefsrc)
2028                                 fl4->saddr = res.fi->fib_prefsrc;
2029                         else
2030                                 fl4->saddr = fl4->daddr;
2031                 }
2032                 dev_out = net->loopback_dev;
2033                 fl4->flowi4_oif = dev_out->ifindex;
2034                 flags |= RTCF_LOCAL;
2035                 goto make_route;
2036         }
2037
2038 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2039         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2040                 fib_select_multipath(&res);
2041         else
2042 #endif
2043         if (!res.prefixlen &&
2044             res.table->tb_num_default > 1 &&
2045             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2046                 fib_select_default(&res);
2047
2048         if (!fl4->saddr)
2049                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2050
2051         dev_out = FIB_RES_DEV(res);
2052         fl4->flowi4_oif = dev_out->ifindex;
2053
2054
2055 make_route:
2056         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2057
2058 out:
2059         rcu_read_unlock();
2060         return rth;
2061 }
2062 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2063
/*
 * dst_ops .check hook for blackhole routes: always returns NULL, whatever
 * @dst and @cookie are.
 */
2064 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2065 {
2066         return NULL;
2067 }
2068
/*
 * dst_ops .mtu hook for blackhole routes.
 *
 * Returns the raw RTAX_MTU metric of @dst when it is non-zero, otherwise
 * the MTU of the device attached to @dst.
 */
2069 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2070 {
2071         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2072
2073         return mtu ? : dst->dev->mtu;
2074 }
2075
/* dst_ops .update_pmtu hook for blackhole routes: intentionally does nothing. */
2076 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2077                                           struct sk_buff *skb, u32 mtu)
2078 {
2079 }
2080
/* dst_ops .redirect hook for blackhole routes: intentionally does nothing. */
2081 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2082                                        struct sk_buff *skb)
2083 {
2084 }
2085
/*
 * dst_ops .cow_metrics hook for blackhole routes: always returns NULL and
 * ignores both @dst and @old.
 */
2086 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2087                                           unsigned long old)
2088 {
2089         return NULL;
2090 }
2091
/*
 * dst_ops used for routes created by ipv4_blackhole_route().  The check,
 * mtu, update_pmtu, redirect and cow_metrics hooks are the blackhole
 * helpers defined above; default_advmss and neigh_lookup point at the
 * regular IPv4 implementations.
 */
2092 static struct dst_ops ipv4_dst_blackhole_ops = {
2093         .family                 =       AF_INET,
2094         .protocol               =       cpu_to_be16(ETH_P_IP),
2095         .check                  =       ipv4_blackhole_dst_check,
2096         .mtu                    =       ipv4_blackhole_mtu,
2097         .default_advmss         =       ipv4_default_advmss,
2098         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2099         .redirect               =       ipv4_rt_blackhole_redirect,
2100         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2101         .neigh_lookup           =       ipv4_neigh_lookup,
2102 };
2103
/*
 * ipv4_blackhole_route - build a packet-discarding copy of a route.
 * @net:      namespace whose current route generation id is stamped on the
 *            new route
 * @dst_orig: route to copy from; it is released with dst_release() before
 *            returning, whether or not the allocation succeeded
 *
 * The new entry uses ipv4_dst_blackhole_ops, has both input and output set
 * to dst_discard, and copies the device (with dev_hold()), rt_is_input,
 * rt_iif, rt_pmtu, rt_flags, rt_type, rt_gateway and rt_uses_gateway from
 * the original.
 *
 * Returns the new dst_entry, or ERR_PTR(-ENOMEM) if dst_alloc() failed.
 */
2104 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2105 {
2106         struct rtable *ort = (struct rtable *) dst_orig;
2107         struct rtable *rt;
2108
2109         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2110         if (rt) {
2111                 struct dst_entry *new = &rt->dst;
2112
2113                 new->__use = 1;
2114                 new->input = dst_discard;
2115                 new->output = dst_discard;
2116
                     /* dst_alloc() was given no device; take it from the
                      * original route and hold a reference to it.
                      */
2117                 new->dev = ort->dst.dev;
2118                 if (new->dev)
2119                         dev_hold(new->dev);
2120
2121                 rt->rt_is_input = ort->rt_is_input;
2122                 rt->rt_iif = ort->rt_iif;
2123                 rt->rt_pmtu = ort->rt_pmtu;
2124
2125                 rt->rt_genid = rt_genid(net);
2126                 rt->rt_flags = ort->rt_flags;
2127                 rt->rt_type = ort->rt_type;
2128                 rt->rt_gateway = ort->rt_gateway;
2129                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2130
2131                 INIT_LIST_HEAD(&rt->rt_uncached);
2132
                     /* NOTE(review): dst_free() is called on the entry that
                      * is returned below; presumably the entry stays alive
                      * through the value 1 passed to dst_alloc() above --
                      * confirm against the dst core.
                      */
2133                 dst_free(new);
2134         }
2135
2136         dst_release(dst_orig);
2137
2138         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2139 }
2140
/*
 * ip_route_output_flow - resolve an output route and apply the xfrm lookup.
 * @net:  network namespace
 * @flp4: flow key, passed to __ip_route_output_key() (which updates it)
 * @sk:   socket handed to xfrm_lookup()
 *
 * Returns the ERR_PTR from __ip_route_output_key() unchanged on failure.
 * Otherwise, when flp4->flowi4_proto is non-zero, returns whatever
 * xfrm_lookup() yields for the route (cast to struct rtable *); when it is
 * zero, returns the plain route.
 */
2141 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2142                                     struct sock *sk)
2143 {
2144         struct rtable *rt = __ip_route_output_key(net, flp4);
2145
2146         if (IS_ERR(rt))
2147                 return rt;
2148
2149         if (flp4->flowi4_proto)
2150                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2151                                                    flowi4_to_flowi(flp4),
2152                                                    sk, 0);
2153
2154         return rt;
2155 }
2156 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2157
/*
 * rt_fill_info - write a route description into a netlink message.
 * @net:    not referenced in this function body
 * @dst:    destination address reported as RTA_DST
 * @src:    source address; reported as RTA_SRC when non-zero
 * @fl4:    flow the route was resolved for (tos, saddr and mark are read)
 * @skb:    message buffer; the route is taken from skb_rtable(@skb)
 * @portid: netlink port id for the message header
 * @seq:    netlink sequence number for the message header
 * @event:  netlink message type
 * @nowait: not referenced in this function body
 * @flags:  netlink message flags
 *
 * Returns the value of nlmsg_end() on success.  Returns -EMSGSIZE when the
 * header or any attribute does not fit; in the attribute case the partial
 * message is cancelled with nlmsg_cancel() first.
 */
2158 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2159                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2160                         u32 seq, int event, int nowait, unsigned int flags)
2161 {
2162         struct rtable *rt = skb_rtable(skb);
2163         struct rtmsg *r;
2164         struct nlmsghdr *nlh;
2165         unsigned long expires = 0;
2166         u32 error;
2167         u32 metrics[RTAX_MAX];
2168
2169         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2170         if (nlh == NULL)
2171                 return -EMSGSIZE;
2172
2173         r = nlmsg_data(nlh);
2174         r->rtm_family    = AF_INET;
2175         r->rtm_dst_len  = 32;
2176         r->rtm_src_len  = 0;
2177         r->rtm_tos      = fl4->flowi4_tos;
2178         r->rtm_table    = RT_TABLE_MAIN;
2179         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2180                 goto nla_put_failure;
2181         r->rtm_type     = rt->rt_type;
2182         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2183         r->rtm_protocol = RTPROT_UNSPEC;
             /* Only the bits above the low 16 of rt_flags are copied over. */
2184         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2185         if (rt->rt_flags & RTCF_NOTIFY)
2186                 r->rtm_flags |= RTM_F_NOTIFY;
2187
2188         if (nla_put_be32(skb, RTA_DST, dst))
2189                 goto nla_put_failure;
2190         if (src) {
2191                 r->rtm_src_len = 32;
2192                 if (nla_put_be32(skb, RTA_SRC, src))
2193                         goto nla_put_failure;
2194         }
2195         if (rt->dst.dev &&
2196             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2197                 goto nla_put_failure;
2198 #ifdef CONFIG_IP_ROUTE_CLASSID
2199         if (rt->dst.tclassid &&
2200             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2201                 goto nla_put_failure;
2202 #endif
             /* For output routes, report the selected source address when it
              * differs from the one requested.
              */
2203         if (!rt_is_input_route(rt) &&
2204             fl4->saddr != src) {
2205                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2206                         goto nla_put_failure;
2207         }
2208         if (rt->rt_uses_gateway &&
2209             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2210                 goto nla_put_failure;
2211
             /* Turn the absolute expiry time into a remaining time in
              * jiffies; a deadline that has already passed becomes 0.
              */
2212         expires = rt->dst.expires;
2213         if (expires) {
2214                 unsigned long now = jiffies;
2215
2216                 if (time_before(now, expires))
2217                         expires -= now;
2218                 else
2219                         expires = 0;
2220         }
2221
             /* Report a local copy of the metrics, with the MTU slot replaced
              * by the route's rt_pmtu while that value has time left.
              */
2222         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2223         if (rt->rt_pmtu && expires)
2224                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2225         if (rtnetlink_put_metrics(skb, metrics) < 0)
2226                 goto nla_put_failure;
2227
2228         if (fl4->flowi4_mark &&
2229             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2230                 goto nla_put_failure;
2231
2232         error = rt->dst.error;
2233
2234         if (rt_is_input_route(rt)) {
2235                 if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2236                         goto nla_put_failure;
2237         }
2238
2239         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2240                 goto nla_put_failure;
2241
2242         return nlmsg_end(skb, nlh);
2243
2244 nla_put_failure:
2245         nlmsg_cancel(skb, nlh);
2246         return -EMSGSIZE;
2247 }
2248
/*
 * inet_rtm_getroute - answer a netlink route query.
 * @in_skb: request message; supplies the namespace (via its socket) and the
 *          requester's port id
 * @nlh:    request header, parsed against rtm_ipv4_policy
 * @arg:    not referenced in this function body
 *
 * Builds a reply skb, resolves the route, fills the reply with
 * rt_fill_info() and sends it with rtnl_unicast().  With an RTA_IIF
 * attribute the route is resolved as an input route through
 * ip_route_input() on the named device; otherwise as an output route
 * through ip_route_output_key().
 *
 * Returns the rtnl_unicast() result on success.  On failure the reply skb
 * is freed and an error is returned: the nlmsg_parse() error, -ENOBUFS when
 * the skb cannot be allocated, -ENODEV when the input device index is
 * unknown, the route lookup error, or the rt_fill_info() result when that
 * is zero or negative.
 */
2249 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
2250 {
2251         struct net *net = sock_net(in_skb->sk);
2252         struct rtmsg *rtm;
2253         struct nlattr *tb[RTA_MAX+1];
2254         struct rtable *rt = NULL;
2255         struct flowi4 fl4;
2256         __be32 dst = 0;
2257         __be32 src = 0;
2258         u32 iif;
2259         int err;
2260         int mark;
2261         struct sk_buff *skb;
2262
2263         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2264         if (err < 0)
2265                 goto errout;
2266
2267         rtm = nlmsg_data(nlh);
2268
2269         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2270         if (skb == NULL) {
2271                 err = -ENOBUFS;
2272                 goto errout;
2273         }
2274
2275         /* Reserve room for dummy headers, this skb can pass
2276            through good chunk of routing engine.
2277          */
2278         skb_reset_mac_header(skb);
2279         skb_reset_network_header(skb);
2280
2281         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2282         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2283         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2284
             /* Absent attributes default to 0. */
2285         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2286         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2287         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2288         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2289
2290         memset(&fl4, 0, sizeof(fl4));
2291         fl4.daddr = dst;
2292         fl4.saddr = src;
2293         fl4.flowi4_tos = rtm->rtm_tos;
2294         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2295         fl4.flowi4_mark = mark;
2296
2297         if (iif) {
2298                 struct net_device *dev;
2299
2300                 dev = __dev_get_by_index(net, iif);
2301                 if (dev == NULL) {
2302                         err = -ENODEV;
2303                         goto errout_free;
2304                 }
2305
                     /* Dress the reply skb up as a packet received on 'dev'
                      * and run it through the input route lookup with bottom
                      * halves disabled.
                      */
2306                 skb->protocol   = htons(ETH_P_IP);
2307                 skb->dev        = dev;
2308                 skb->mark       = mark;
2309                 local_bh_disable();
2310                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2311                 local_bh_enable();
2312
                     /* A successful lookup may still yield a route that
                      * carries an error; report it negated.
                      */
2313                 rt = skb_rtable(skb);
2314                 if (err == 0 && rt->dst.error)
2315                         err = -rt->dst.error;
2316         } else {
2317                 rt = ip_route_output_key(net, &fl4);
2318
2319                 err = 0;
2320                 if (IS_ERR(rt))
2321                         err = PTR_ERR(rt);
2322         }
2323
2324         if (err)
2325                 goto errout_free;
2326
2327         skb_dst_set(skb, &rt->dst);
2328         if (rtm->rtm_flags & RTM_F_NOTIFY)
2329                 rt->rt_flags |= RTCF_NOTIFY;
2330
2331         err = rt_fill_info(net, dst, src, &fl4, skb,
2332                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2333                            RTM_NEWROUTE, 0, 0);
             /* A zero result is treated as failure too; the function then
              * returns 0 without sending a reply.
              */
2334         if (err <= 0)
2335                 goto errout_free;
2336
2337         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2338 errout:
2339         return err;
2340
2341 errout_free:
2342         kfree_skb(skb);
2343         goto errout;
2344 }
2345
/*
 * ip_rt_dump - netlink dump callback that adds no entries.
 *
 * Writes nothing to @skb, ignores @cb and returns the current skb->len.
 */
2346 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2347 {
2348         return skb->len;
2349 }
2350
/*
 * ip_rt_multicast_event - react to a multicast change on @in_dev by calling
 * rt_cache_flush() for the network namespace of the underlying device.
 */
2351 void ip_rt_multicast_event(struct in_device *in_dev)
2352 {
2353         rt_cache_flush(dev_net(in_dev->dev));
2354 }
2355
2356 #ifdef CONFIG_SYSCTL
/*
 * proc handler for the write-only "flush" sysctl.
 *
 * On a write, calls rt_cache_flush() for the struct net stored in the
 * table entry's extra1 field and returns 0; the written data (@buffer,
 * @lenp, @ppos) is not examined.  A read returns -EINVAL.
 */
2357 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
2358                                         void __user *buffer,
2359                                         size_t *lenp, loff_t *ppos)
2360 {
2361         if (write) {
2362                 rt_cache_flush((struct net *)__ctl->extra1);
2363                 return 0;
2364         }
2365
2366         return -EINVAL;
2367 }
2368
/*
 * Sysctl entries for the IPv4 routing tunables.  Every entry is an
 * int-sized value with mode 0644.  Entries using proc_dointvec_jiffies or
 * proc_dointvec_ms_jiffies go through those conversion handlers; the rest
 * are plain integers.  The table ends with an empty sentinel entry.
 */
2369 static ctl_table ipv4_route_table[] = {
2370         {
2371                 .procname       = "gc_thresh",
2372                 .data           = &ipv4_dst_ops.gc_thresh,
2373                 .maxlen         = sizeof(int),
2374                 .mode           = 0644,
2375                 .proc_handler   = proc_dointvec,
2376         },
2377         {
2378                 .procname       = "max_size",
2379                 .data           = &ip_rt_max_size,
2380                 .maxlen         = sizeof(int),
2381                 .mode           = 0644,
2382                 .proc_handler   = proc_dointvec,
2383         },
2384         {
2385                 /*  Deprecated. Use gc_min_interval_ms */
2386
2387                 .procname       = "gc_min_interval",
2388                 .data           = &ip_rt_gc_min_interval,
2389                 .maxlen         = sizeof(int),
2390                 .mode           = 0644,
2391                 .proc_handler   = proc_dointvec_jiffies,
2392         },
2393         {
                     /* Same variable as "gc_min_interval" above, exposed
                      * through the ms_jiffies handler instead.
                      */
2394                 .procname       = "gc_min_interval_ms",
2395                 .data           = &ip_rt_gc_min_interval,
2396                 .maxlen         = sizeof(int),
2397                 .mode           = 0644,
2398                 .proc_handler   = proc_dointvec_ms_jiffies,
2399         },
2400         {
2401                 .procname       = "gc_timeout",
2402                 .data           = &ip_rt_gc_timeout,
2403                 .maxlen         = sizeof(int),
2404                 .mode           = 0644,
2405                 .proc_handler   = proc_dointvec_jiffies,
2406         },
2407         {
2408                 .procname       = "gc_interval",
2409                 .data           = &ip_rt_gc_interval,
2410                 .maxlen         = sizeof(int),
2411                 .mode           = 0644,
2412                 .proc_handler   = proc_dointvec_jiffies,
2413         },
2414         {
2415                 .procname       = "redirect_load",
2416                 .data           = &ip_rt_redirect_load,
2417                 .maxlen         = sizeof(int),
2418                 .mode           = 0644,
2419                 .proc_handler   = proc_dointvec,
2420         },
2421         {
2422                 .procname       = "redirect_number",
2423                 .data           = &ip_rt_redirect_number,
2424                 .maxlen         = sizeof(int),
2425                 .mode           = 0644,
2426                 .proc_handler   = proc_dointvec,
2427         },
2428         {
2429                 .procname       = "redirect_silence",
2430                 .data           = &ip_rt_redirect_silence,
2431                 .maxlen         = sizeof(int),
2432                 .mode           = 0644,
2433                 .proc_handler   = proc_dointvec,
2434         },
2435         {
2436                 .procname       = "error_cost",
2437                 .data           = &ip_rt_error_cost,
2438                 .maxlen         = sizeof(int),
2439                 .mode           = 0644,
2440                 .proc_handler   = proc_dointvec,
2441         },
2442         {
2443                 .procname       = "error_burst",
2444                 .data           = &ip_rt_error_burst,
2445                 .maxlen         = sizeof(int),
2446                 .mode           = 0644,
2447                 .proc_handler   = proc_dointvec,
2448         },
2449         {
2450                 .procname       = "gc_elasticity",
2451                 .data           = &ip_rt_gc_elasticity,
2452                 .maxlen         = sizeof(int),
2453                 .mode           = 0644,
2454                 .proc_handler   = proc_dointvec,
2455         },
2456         {
2457                 .procname       = "mtu_expires",
2458                 .data           = &ip_rt_mtu_expires,
2459                 .maxlen         = sizeof(int),
2460                 .mode           = 0644,
2461                 .proc_handler   = proc_dointvec_jiffies,
2462         },
2463         {
2464                 .procname       = "min_pmtu",
2465                 .data           = &ip_rt_min_pmtu,
2466                 .maxlen         = sizeof(int),
2467                 .mode           = 0644,
2468                 .proc_handler   = proc_dointvec,
2469         },
2470         {
2471                 .procname       = "min_adv_mss",
2472                 .data           = &ip_rt_min_advmss,
2473                 .maxlen         = sizeof(int),
2474                 .mode           = 0644,
2475                 .proc_handler   = proc_dointvec,
2476         },
2477         { }
2478 };
2479
/*
 * Template for the per-namespace "flush" sysctl: a single write-only
 * (mode 0200) entry handled by ipv4_sysctl_rtcache_flush(), followed by an
 * empty sentinel.  sysctl_route_net_init() registers it and sets extra1.
 */
2480 static struct ctl_table ipv4_route_flush_table[] = {
2481         {
2482                 .procname       = "flush",
2483                 .maxlen         = sizeof(int),
2484                 .mode           = 0200,
2485                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2486         },
2487         { },
2488 };
2489
/*
 * sysctl_route_net_init - register the "net/ipv4/route" flush sysctl for @net.
 *
 * init_net uses the static ipv4_route_flush_table directly; any other
 * namespace gets a kmemdup() copy.  The entry's extra1 is set to @net so
 * the handler knows which namespace to flush, and the registration header
 * is stored in net->ipv4.route_hdr.
 *
 * Returns 0 on success, or -ENOMEM when duplicating or registering the
 * table fails (a duplicated table is freed again in the latter case).
 */
2490 static __net_init int sysctl_route_net_init(struct net *net)
2491 {
2492         struct ctl_table *tbl;
2493
2494         tbl = ipv4_route_flush_table;
2495         if (!net_eq(net, &init_net)) {
2496                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2497                 if (tbl == NULL)
2498                         goto err_dup;
2499
2500                 /* Don't export sysctls to unprivileged users */
2501                 if (net->user_ns != &init_user_ns)
2502                         tbl[0].procname = NULL;
2503         }
2504         tbl[0].extra1 = net;
2505
2506         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2507         if (net->ipv4.route_hdr == NULL)
2508                 goto err_reg;
2509         return 0;
2510
2511 err_reg:
             /* Only a duplicated table may be freed, never the static one. */
2512         if (tbl != ipv4_route_flush_table)
2513                 kfree(tbl);
2514 err_dup:
2515         return -ENOMEM;
2516 }
2517
/*
 * sysctl_route_net_exit - unregister and free the flush sysctl table of @net.
 *
 * The table pointer is read from the registration header before the header
 * is unregistered.  The BUG_ON() fires if the table is the static
 * ipv4_route_flush_table, which is what init_net registers; presumably this
 * exit hook is never run for init_net -- TODO confirm.
 */
2518 static __net_exit void sysctl_route_net_exit(struct net *net)
2519 {
2520         struct ctl_table *tbl;
2521
2522         tbl = net->ipv4.route_hdr->ctl_table_arg;
2523         unregister_net_sysctl_table(net->ipv4.route_hdr);
2524         BUG_ON(tbl == ipv4_route_flush_table);
2525         kfree(tbl);
2526 }
2527
/* Per-namespace init/exit hooks for the route flush sysctl. */
2528 static __net_initdata struct pernet_operations sysctl_route_ops = {
2529         .init = sysctl_route_net_init,
2530         .exit = sysctl_route_net_exit,
2531 };
2532 #endif
2533
/*
 * rt_genid_init - initialise the generation counters of @net.
 *
 * Sets net->rt_genid to 0 and fills net->ipv4.dev_addr_genid with random
 * bytes.  Always returns 0.
 */
2534 static __net_init int rt_genid_init(struct net *net)
2535 {
2536         atomic_set(&net->rt_genid, 0);
2537         get_random_bytes(&net->ipv4.dev_addr_genid,
2538                          sizeof(net->ipv4.dev_addr_genid));
2539         return 0;
2540 }
2541
2542 static __net_initdata struct pernet_operations rt_genid_ops = {
2543         .init = rt_genid_init,
2544 };
2545
2546 static int __net_init ipv4_inetpeer_init(struct net *net)
2547 {
2548         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2549
2550         if (!bp)
2551                 return -ENOMEM;
2552         inet_peer_base_init(bp);
2553         net->ipv4.peers = bp;
2554         return 0;
2555 }
2556
2557 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2558 {
2559         struct inet_peer_base *bp = net->ipv4.peers;
2560
2561         net->ipv4.peers = NULL;
2562         inetpeer_invalidate_tree(bp);
2563         kfree(bp);
2564 }
2565
2566 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2567         .init   =       ipv4_inetpeer_init,
2568         .exit   =       ipv4_inetpeer_exit,
2569 };
2570
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU accounting area; allocated (256 entries) in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
2574
2575 int __init ip_rt_init(void)
2576 {
2577         int rc = 0;
2578
2579 #ifdef CONFIG_IP_ROUTE_CLASSID
2580         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2581         if (!ip_rt_acct)
2582                 panic("IP: failed to allocate ip_rt_acct\n");
2583 #endif
2584
2585         ipv4_dst_ops.kmem_cachep =
2586                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2587                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2588
2589         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2590
2591         if (dst_entries_init(&ipv4_dst_ops) < 0)
2592                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2593
2594         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2595                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2596
2597         ipv4_dst_ops.gc_thresh = ~0;
2598         ip_rt_max_size = INT_MAX;
2599
2600         devinet_init();
2601         ip_fib_init();
2602
2603         if (ip_rt_proc_init())
2604                 pr_err("Unable to create route proc files\n");
2605 #ifdef CONFIG_XFRM
2606         xfrm_init();
2607         xfrm4_init();
2608 #endif
2609         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2610
2611 #ifdef CONFIG_SYSCTL
2612         register_pernet_subsys(&sysctl_route_ops);
2613 #endif
2614         register_pernet_subsys(&rt_genid_ops);
2615         register_pernet_subsys(&ipv4_inetpeer_ops);
2616         return rc;
2617 }
2618
2619 #ifdef CONFIG_SYSCTL
2620 /*
2621  * We really need to sanitize the damn ipv4 init order, then all
2622  * this nonsense will go away.
2623  */
2624 void __init ip_static_sysctl_init(void)
2625 {
2626         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2627 }
2628 #endif