1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <linux/slab.h>
94 #include <net/dst.h>
95 #include <net/net_namespace.h>
96 #include <net/protocol.h>
97 #include <net/ip.h>
98 #include <net/route.h>
99 #include <net/inetpeer.h>
100 #include <net/sock.h>
101 #include <net/ip_fib.h>
102 #include <net/arp.h>
103 #include <net/tcp.h>
104 #include <net/icmp.h>
105 #include <net/xfrm.h>
106 #include <net/netevent.h>
107 #include <net/rtnetlink.h>
108 #ifdef CONFIG_SYSCTL
109 #include <linux/sysctl.h>
110 #endif
111
112 #define RT_FL_TOS(oldflp) \
113     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
114
115 #define IP_MAX_MTU      0xFFF0
116
117 #define RT_GC_TIMEOUT (300*HZ)
118
119 static int ip_rt_max_size;
120 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
121 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
122 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
123 static int ip_rt_redirect_number __read_mostly  = 9;
124 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
125 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
126 static int ip_rt_error_cost __read_mostly       = HZ;
127 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
128 static int ip_rt_gc_elasticity __read_mostly    = 8;
129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
130 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
131 static int ip_rt_min_advmss __read_mostly       = 256;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
143 static unsigned int      ipv4_default_mtu(const struct dst_entry *dst);
144 static void              ipv4_dst_destroy(struct dst_entry *dst);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149
150 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
151                             int how)
152 {
153 }
154
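/*
 * Copy-on-write metrics: bind an inet_peer to the route if necessary and,
 * on first use of the peer, copy the old (read-only) metric array into
 * peer->metrics.  The switch is done with cmpxchg() on dst->_metrics; if
 * another CPU won the race, its pointer is returned instead (or NULL if
 * that pointer is still read-only).  Once the peer owns the metrics, the
 * fib_info reference that kept the old array alive can be dropped.
 */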
155 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
156 {
157         struct rtable *rt = (struct rtable *) dst;
158         struct inet_peer *peer;
159         u32 *p = NULL;
160
161         if (!rt->peer)
162                 rt_bind_peer(rt, 1);
163
164         peer = rt->peer;
165         if (peer) {
166                 u32 *old_p = __DST_METRICS_PTR(old);
167                 unsigned long prev, new;
168
169                 p = peer->metrics;
170                 if (inet_metrics_new(peer))
171                         memcpy(p, old_p, sizeof(u32) * RTAX_MAX);
172
173                 new = (unsigned long) p;
174                 prev = cmpxchg(&dst->_metrics, old, new);
175
176                 if (prev != old) {
177                         p = __DST_METRICS_PTR(prev);
178                         if (prev & DST_METRICS_READ_ONLY)
179                                 p = NULL;
180                 } else {
181                         if (rt->fi) {
182                                 fib_info_put(rt->fi);
183                                 rt->fi = NULL;
184                         }
185                 }
186         }
187         return p;
188 }
189
190 static struct dst_ops ipv4_dst_ops = {
191         .family =               AF_INET,
192         .protocol =             cpu_to_be16(ETH_P_IP),
193         .gc =                   rt_garbage_collect,
194         .check =                ipv4_dst_check,
195         .default_advmss =       ipv4_default_advmss,
196         .default_mtu =          ipv4_default_mtu,
197         .cow_metrics =          ipv4_cow_metrics,
198         .destroy =              ipv4_dst_destroy,
199         .ifdown =               ipv4_dst_ifdown,
200         .negative_advice =      ipv4_negative_advice,
201         .link_failure =         ipv4_link_failure,
202         .update_pmtu =          ip_rt_update_pmtu,
203         .local_out =            __ip_local_out,
204 };
205
206 #define ECN_OR_COST(class)      TC_PRIO_##class
207
208 const __u8 ip_tos2prio[16] = {
209         TC_PRIO_BESTEFFORT,
210         ECN_OR_COST(FILLER),
211         TC_PRIO_BESTEFFORT,
212         ECN_OR_COST(BESTEFFORT),
213         TC_PRIO_BULK,
214         ECN_OR_COST(BULK),
215         TC_PRIO_BULK,
216         ECN_OR_COST(BULK),
217         TC_PRIO_INTERACTIVE,
218         ECN_OR_COST(INTERACTIVE),
219         TC_PRIO_INTERACTIVE,
220         ECN_OR_COST(INTERACTIVE),
221         TC_PRIO_INTERACTIVE_BULK,
222         ECN_OR_COST(INTERACTIVE_BULK),
223         TC_PRIO_INTERACTIVE_BULK,
224         ECN_OR_COST(INTERACTIVE_BULK)
225 };
226
227
228 /*
229  * Route cache.
230  */
231
232 /* The locking scheme is rather straightforward:
233  *
234  * 1) Read-Copy Update protects the buckets of the central route hash.
235  * 2) Only writers remove entries, and they hold the lock
236  *    as they look at rtable reference counts.
237  * 3) Only readers acquire references to rtable entries,
238  *    they do so with atomic increments and with the
239  *    lock held.
240  */
241
242 struct rt_hash_bucket {
243         struct rtable __rcu     *chain;
244 };
245
246 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
247         defined(CONFIG_PROVE_LOCKING)
248 /*
249  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
250  * The size of this table is a power of two and depends on the number of CPUs.
251  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
252  */
253 #ifdef CONFIG_LOCKDEP
254 # define RT_HASH_LOCK_SZ        256
255 #else
256 # if NR_CPUS >= 32
257 #  define RT_HASH_LOCK_SZ       4096
258 # elif NR_CPUS >= 16
259 #  define RT_HASH_LOCK_SZ       2048
260 # elif NR_CPUS >= 8
261 #  define RT_HASH_LOCK_SZ       1024
262 # elif NR_CPUS >= 4
263 #  define RT_HASH_LOCK_SZ       512
264 # else
265 #  define RT_HASH_LOCK_SZ       256
266 # endif
267 #endif
268
269 static spinlock_t       *rt_hash_locks;
270 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
271
272 static __init void rt_hash_lock_init(void)
273 {
274         int i;
275
276         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
277                         GFP_KERNEL);
278         if (!rt_hash_locks)
279                 panic("IP: failed to allocate rt_hash_locks\n");
280
281         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
282                 spin_lock_init(&rt_hash_locks[i]);
283 }
284 #else
285 # define rt_hash_lock_addr(slot) NULL
286
287 static inline void rt_hash_lock_init(void)
288 {
289 }
290 #endif
291
292 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
293 static unsigned                 rt_hash_mask __read_mostly;
294 static unsigned int             rt_hash_log  __read_mostly;
295
296 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
297 #define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)
298
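/*
 * Hash the (daddr, saddr, ifindex) triple into the route cache table.
 * The per-namespace generation id is mixed into the hash, so bumping
 * rt_genid makes lookups miss previously cached entries; the stale
 * entries are later detected via rt_is_expired() and reaped.
 */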
299 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
300                                    int genid)
301 {
302         return jhash_3words((__force u32)daddr, (__force u32)saddr,
303                             idx, genid)
304                 & rt_hash_mask;
305 }
306
307 static inline int rt_genid(struct net *net)
308 {
309         return atomic_read(&net->ipv4.rt_genid);
310 }
311
312 #ifdef CONFIG_PROC_FS
313 struct rt_cache_iter_state {
314         struct seq_net_private p;
315         int bucket;
316         int genid;
317 };
318
319 static struct rtable *rt_cache_get_first(struct seq_file *seq)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         struct rtable *r = NULL;
323
324         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
325                 if (!rcu_dereference_raw(rt_hash_table[st->bucket].chain))
326                         continue;
327                 rcu_read_lock_bh();
328                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
329                 while (r) {
330                         if (dev_net(r->dst.dev) == seq_file_net(seq) &&
331                             r->rt_genid == st->genid)
332                                 return r;
333                         r = rcu_dereference_bh(r->dst.rt_next);
334                 }
335                 rcu_read_unlock_bh();
336         }
337         return r;
338 }
339
340 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
341                                           struct rtable *r)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344
345         r = rcu_dereference_bh(r->dst.rt_next);
346         while (!r) {
347                 rcu_read_unlock_bh();
348                 do {
349                         if (--st->bucket < 0)
350                                 return NULL;
351                 } while (!rcu_dereference_raw(rt_hash_table[st->bucket].chain));
352                 rcu_read_lock_bh();
353                 r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
354         }
355         return r;
356 }
357
358 static struct rtable *rt_cache_get_next(struct seq_file *seq,
359                                         struct rtable *r)
360 {
361         struct rt_cache_iter_state *st = seq->private;
362         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
363                 if (dev_net(r->dst.dev) != seq_file_net(seq))
364                         continue;
365                 if (r->rt_genid == st->genid)
366                         break;
367         }
368         return r;
369 }
370
371 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
372 {
373         struct rtable *r = rt_cache_get_first(seq);
374
375         if (r)
376                 while (pos && (r = rt_cache_get_next(seq, r)))
377                         --pos;
378         return pos ? NULL : r;
379 }
380
381 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
382 {
383         struct rt_cache_iter_state *st = seq->private;
384         if (*pos)
385                 return rt_cache_get_idx(seq, *pos - 1);
386         st->genid = rt_genid(seq_file_net(seq));
387         return SEQ_START_TOKEN;
388 }
389
390 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
391 {
392         struct rtable *r;
393
394         if (v == SEQ_START_TOKEN)
395                 r = rt_cache_get_first(seq);
396         else
397                 r = rt_cache_get_next(seq, v);
398         ++*pos;
399         return r;
400 }
401
402 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
403 {
404         if (v && v != SEQ_START_TOKEN)
405                 rcu_read_unlock_bh();
406 }
407
408 static int rt_cache_seq_show(struct seq_file *seq, void *v)
409 {
410         if (v == SEQ_START_TOKEN)
411                 seq_printf(seq, "%-127s\n",
412                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
413                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
414                            "HHUptod\tSpecDst");
415         else {
416                 struct rtable *r = v;
417                 int len;
418
419                 seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
420                               "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
421                         r->dst.dev ? r->dst.dev->name : "*",
422                         (__force u32)r->rt_dst,
423                         (__force u32)r->rt_gateway,
424                         r->rt_flags, atomic_read(&r->dst.__refcnt),
425                         r->dst.__use, 0, (__force u32)r->rt_src,
426                         dst_metric_advmss(&r->dst) + 40,
427                         dst_metric(&r->dst, RTAX_WINDOW),
428                         (int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
429                               dst_metric(&r->dst, RTAX_RTTVAR)),
430                         r->fl.fl4_tos,
431                         r->dst.hh ? atomic_read(&r->dst.hh->hh_refcnt) : -1,
432                         r->dst.hh ? (r->dst.hh->hh_output ==
433                                        dev_queue_xmit) : 0,
434                         r->rt_spec_dst, &len);
435
436                 seq_printf(seq, "%*s\n", 127 - len, "");
437         }
438         return 0;
439 }
440
441 static const struct seq_operations rt_cache_seq_ops = {
442         .start  = rt_cache_seq_start,
443         .next   = rt_cache_seq_next,
444         .stop   = rt_cache_seq_stop,
445         .show   = rt_cache_seq_show,
446 };
447
448 static int rt_cache_seq_open(struct inode *inode, struct file *file)
449 {
450         return seq_open_net(inode, file, &rt_cache_seq_ops,
451                         sizeof(struct rt_cache_iter_state));
452 }
453
454 static const struct file_operations rt_cache_seq_fops = {
455         .owner   = THIS_MODULE,
456         .open    = rt_cache_seq_open,
457         .read    = seq_read,
458         .llseek  = seq_lseek,
459         .release = seq_release_net,
460 };
461
462
463 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
464 {
465         int cpu;
466
467         if (*pos == 0)
468                 return SEQ_START_TOKEN;
469
470         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
471                 if (!cpu_possible(cpu))
472                         continue;
473                 *pos = cpu+1;
474                 return &per_cpu(rt_cache_stat, cpu);
475         }
476         return NULL;
477 }
478
479 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
480 {
481         int cpu;
482
483         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
484                 if (!cpu_possible(cpu))
485                         continue;
486                 *pos = cpu+1;
487                 return &per_cpu(rt_cache_stat, cpu);
488         }
489         return NULL;
490
491 }
492
493 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
494 {
495
496 }
497
498 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
499 {
500         struct rt_cache_stat *st = v;
501
502         if (v == SEQ_START_TOKEN) {
503                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
504                 return 0;
505         }
506
507         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
508                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
509                    dst_entries_get_slow(&ipv4_dst_ops),
510                    st->in_hit,
511                    st->in_slow_tot,
512                    st->in_slow_mc,
513                    st->in_no_route,
514                    st->in_brd,
515                    st->in_martian_dst,
516                    st->in_martian_src,
517
518                    st->out_hit,
519                    st->out_slow_tot,
520                    st->out_slow_mc,
521
522                    st->gc_total,
523                    st->gc_ignored,
524                    st->gc_goal_miss,
525                    st->gc_dst_overflow,
526                    st->in_hlist_search,
527                    st->out_hlist_search
528                 );
529         return 0;
530 }
531
532 static const struct seq_operations rt_cpu_seq_ops = {
533         .start  = rt_cpu_seq_start,
534         .next   = rt_cpu_seq_next,
535         .stop   = rt_cpu_seq_stop,
536         .show   = rt_cpu_seq_show,
537 };
538
539
540 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
541 {
542         return seq_open(file, &rt_cpu_seq_ops);
543 }
544
545 static const struct file_operations rt_cpu_seq_fops = {
546         .owner   = THIS_MODULE,
547         .open    = rt_cpu_seq_open,
548         .read    = seq_read,
549         .llseek  = seq_lseek,
550         .release = seq_release,
551 };
552
553 #ifdef CONFIG_IP_ROUTE_CLASSID
554 static int rt_acct_proc_show(struct seq_file *m, void *v)
555 {
556         struct ip_rt_acct *dst, *src;
557         unsigned int i, j;
558
559         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
560         if (!dst)
561                 return -ENOMEM;
562
563         for_each_possible_cpu(i) {
564                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
565                 for (j = 0; j < 256; j++) {
566                         dst[j].o_bytes   += src[j].o_bytes;
567                         dst[j].o_packets += src[j].o_packets;
568                         dst[j].i_bytes   += src[j].i_bytes;
569                         dst[j].i_packets += src[j].i_packets;
570                 }
571         }
572
573         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
574         kfree(dst);
575         return 0;
576 }
577
578 static int rt_acct_proc_open(struct inode *inode, struct file *file)
579 {
580         return single_open(file, rt_acct_proc_show, NULL);
581 }
582
583 static const struct file_operations rt_acct_proc_fops = {
584         .owner          = THIS_MODULE,
585         .open           = rt_acct_proc_open,
586         .read           = seq_read,
587         .llseek         = seq_lseek,
588         .release        = single_release,
589 };
590 #endif
591
592 static int __net_init ip_rt_do_proc_init(struct net *net)
593 {
594         struct proc_dir_entry *pde;
595
596         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
597                         &rt_cache_seq_fops);
598         if (!pde)
599                 goto err1;
600
601         pde = proc_create("rt_cache", S_IRUGO,
602                           net->proc_net_stat, &rt_cpu_seq_fops);
603         if (!pde)
604                 goto err2;
605
606 #ifdef CONFIG_IP_ROUTE_CLASSID
607         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
608         if (!pde)
609                 goto err3;
610 #endif
611         return 0;
612
613 #ifdef CONFIG_IP_ROUTE_CLASSID
614 err3:
615         remove_proc_entry("rt_cache", net->proc_net_stat);
616 #endif
617 err2:
618         remove_proc_entry("rt_cache", net->proc_net);
619 err1:
620         return -ENOMEM;
621 }
622
623 static void __net_exit ip_rt_do_proc_exit(struct net *net)
624 {
625         remove_proc_entry("rt_cache", net->proc_net_stat);
626         remove_proc_entry("rt_cache", net->proc_net);
627 #ifdef CONFIG_IP_ROUTE_CLASSID
628         remove_proc_entry("rt_acct", net->proc_net);
629 #endif
630 }
631
632 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
633         .init = ip_rt_do_proc_init,
634         .exit = ip_rt_do_proc_exit,
635 };
636
637 static int __init ip_rt_proc_init(void)
638 {
639         return register_pernet_subsys(&ip_rt_proc_ops);
640 }
641
642 #else
643 static inline int ip_rt_proc_init(void)
644 {
645         return 0;
646 }
647 #endif /* CONFIG_PROC_FS */
648
649 static inline void rt_free(struct rtable *rt)
650 {
651         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
652 }
653
654 static inline void rt_drop(struct rtable *rt)
655 {
656         ip_rt_put(rt);
657         call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
658 }
659
660 static inline int rt_fast_clean(struct rtable *rth)
661 {
662         /* Kill broadcast/multicast entries very aggressively if they
663            collide in the hash table with more useful entries */
664         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
665                 rt_is_input_route(rth) && rth->dst.rt_next;
666 }
667
668 static inline int rt_valuable(struct rtable *rth)
669 {
670         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
671                 rth->dst.expires;
672 }
673
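/*
 * Decide whether a cache entry may be reclaimed.  Entries still holding
 * a reference never may.  Entries whose hard expiry time has passed
 * always may.  Otherwise an entry survives while it is younger than tmo1
 * (unless rt_fast_clean() marks it as a colliding broadcast/multicast
 * entry) or while it is younger than tmo2 and rt_valuable().
 */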
674 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
675 {
676         unsigned long age;
677         int ret = 0;
678
679         if (atomic_read(&rth->dst.__refcnt))
680                 goto out;
681
682         ret = 1;
683         if (rth->dst.expires &&
684             time_after_eq(jiffies, rth->dst.expires))
685                 goto out;
686
687         age = jiffies - rth->dst.lastuse;
688         ret = 0;
689         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
690             (age <= tmo2 && rt_valuable(rth)))
691                 goto out;
692         ret = 1;
693 out:    return ret;
694 }
695
696 /* Bits of score are:
697  * 31: very valuable
698  * 30: not quite useless
699  * 29..0: usage counter
700  */
701 static inline u32 rt_score(struct rtable *rt)
702 {
703         u32 score = jiffies - rt->dst.lastuse;
704
705         score = ~score & ~(3<<30);
706
707         if (rt_valuable(rt))
708                 score |= (1<<31);
709
710         if (rt_is_output_route(rt) ||
711             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
712                 score |= (1<<30);
713
714         return score;
715 }
716
717 static inline bool rt_caching(const struct net *net)
718 {
719         return net->ipv4.current_rt_cache_rebuild_count <=
720                 net->ipv4.sysctl_rt_cache_rebuild_count;
721 }
722
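/*
 * Key comparison helpers: XORing two equal fields yields zero, so ORing
 * the per-field XORs gives zero only when every field matches.
 * compare_hash_inputs() checks just the fields that feed rt_hash();
 * compare_keys() additionally checks mark, tos and oif.
 */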
723 static inline bool compare_hash_inputs(const struct flowi *fl1,
724                                         const struct flowi *fl2)
725 {
726         return ((((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
727                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
728                 (fl1->iif ^ fl2->iif)) == 0);
729 }
730
731 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
732 {
733         return (((__force u32)fl1->fl4_dst ^ (__force u32)fl2->fl4_dst) |
734                 ((__force u32)fl1->fl4_src ^ (__force u32)fl2->fl4_src) |
735                 (fl1->mark ^ fl2->mark) |
736                 (*(u16 *)&fl1->fl4_tos ^ *(u16 *)&fl2->fl4_tos) |
737                 (fl1->oif ^ fl2->oif) |
738                 (fl1->iif ^ fl2->iif)) == 0;
739 }
740
741 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
742 {
743         return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
744 }
745
746 static inline int rt_is_expired(struct rtable *rth)
747 {
748         return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
749 }
750
751 /*
752  * Perform a full scan of the hash table and free all entries.
753  * Can be called by a softirq or a process.
754  * In the latter case, we want to be rescheduled if necessary.
755  */
756 static void rt_do_flush(struct net *net, int process_context)
757 {
758         unsigned int i;
759         struct rtable *rth, *next;
760
761         for (i = 0; i <= rt_hash_mask; i++) {
762                 struct rtable __rcu **pprev;
763                 struct rtable *list;
764
765                 if (process_context && need_resched())
766                         cond_resched();
767                 rth = rcu_dereference_raw(rt_hash_table[i].chain);
768                 if (!rth)
769                         continue;
770
771                 spin_lock_bh(rt_hash_lock_addr(i));
772
773                 list = NULL;
774                 pprev = &rt_hash_table[i].chain;
775                 rth = rcu_dereference_protected(*pprev,
776                         lockdep_is_held(rt_hash_lock_addr(i)));
777
778                 while (rth) {
779                         next = rcu_dereference_protected(rth->dst.rt_next,
780                                 lockdep_is_held(rt_hash_lock_addr(i)));
781
782                         if (!net ||
783                             net_eq(dev_net(rth->dst.dev), net)) {
784                                 rcu_assign_pointer(*pprev, next);
785                                 rcu_assign_pointer(rth->dst.rt_next, list);
786                                 list = rth;
787                         } else {
788                                 pprev = &rth->dst.rt_next;
789                         }
790                         rth = next;
791                 }
792
793                 spin_unlock_bh(rt_hash_lock_addr(i));
794
795                 for (; list; list = next) {
796                         next = rcu_dereference_protected(list->dst.rt_next, 1);
797                         rt_free(list);
798                 }
799         }
800 }
801
802 /*
803  * While freeing expired entries, we compute average chain length
804  * and standard deviation, using fixed-point arithmetic.
805  * This gives an estimate of rt_chain_length_max:
806  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
807  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
808  */
809
810 #define FRACT_BITS 3
811 #define ONE (1UL << FRACT_BITS)
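/*
 * Example: with FRACT_BITS == 3, a chain of 5 distinct entries is counted
 * as 5 * ONE == 40.  The per-bucket sums stay in this scaled form, and
 * rt_check_expire() computes
 *   rt_chain_length_max = max(ip_rt_gc_elasticity, (avg + 4*sd) >> FRACT_BITS)
 * to convert back to plain entry counts.
 */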
812
813 /*
814  * Given a hash chain and an item in this hash chain,
815  * find if a previous entry has the same hash_inputs
816  * (but differs on tos, mark or oif)
817  * Returns 0 if an alias is found.
818  * Returns ONE if rth has no alias before itself.
819  */
820 static int has_noalias(const struct rtable *head, const struct rtable *rth)
821 {
822         const struct rtable *aux = head;
823
824         while (aux != rth) {
825                 if (compare_hash_inputs(&aux->fl, &rth->fl))
826                         return 0;
827                 aux = rcu_dereference_protected(aux->dst.rt_next, 1);
828         }
829         return ONE;
830 }
831
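/*
 * Scan a slice of the hash table proportional to the time elapsed since
 * the previous run (roughly one full pass per ip_rt_gc_timeout), freeing
 * stale entries and refreshing rt_chain_length_max from the chain-length
 * statistics gathered along the way.
 */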
832 static void rt_check_expire(void)
833 {
834         static unsigned int rover;
835         unsigned int i = rover, goal;
836         struct rtable *rth;
837         struct rtable __rcu **rthp;
838         unsigned long samples = 0;
839         unsigned long sum = 0, sum2 = 0;
840         unsigned long delta;
841         u64 mult;
842
843         delta = jiffies - expires_ljiffies;
844         expires_ljiffies = jiffies;
845         mult = ((u64)delta) << rt_hash_log;
846         if (ip_rt_gc_timeout > 1)
847                 do_div(mult, ip_rt_gc_timeout);
848         goal = (unsigned int)mult;
849         if (goal > rt_hash_mask)
850                 goal = rt_hash_mask + 1;
851         for (; goal > 0; goal--) {
852                 unsigned long tmo = ip_rt_gc_timeout;
853                 unsigned long length;
854
855                 i = (i + 1) & rt_hash_mask;
856                 rthp = &rt_hash_table[i].chain;
857
858                 if (need_resched())
859                         cond_resched();
860
861                 samples++;
862
863                 if (rcu_dereference_raw(*rthp) == NULL)
864                         continue;
865                 length = 0;
866                 spin_lock_bh(rt_hash_lock_addr(i));
867                 while ((rth = rcu_dereference_protected(*rthp,
868                                         lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
869                         prefetch(rth->dst.rt_next);
870                         if (rt_is_expired(rth)) {
871                                 *rthp = rth->dst.rt_next;
872                                 rt_free(rth);
873                                 continue;
874                         }
875                         if (rth->dst.expires) {
876                                 /* Entry is expired even if it is in use */
877                                 if (time_before_eq(jiffies, rth->dst.expires)) {
878 nofree:
879                                         tmo >>= 1;
880                                         rthp = &rth->dst.rt_next;
881                                         /*
882                                          * We only count entries on
883                                          * a chain with equal hash inputs once
884                                          * so that entries for different QOS
885                                          * levels, and other non-hash input
886                                          * attributes don't unfairly skew
887                                          * the length computation
888                                          */
889                                         length += has_noalias(rt_hash_table[i].chain, rth);
890                                         continue;
891                                 }
892                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
893                                 goto nofree;
894
895                         /* Cleanup aged off entries. */
896                         *rthp = rth->dst.rt_next;
897                         rt_free(rth);
898                 }
899                 spin_unlock_bh(rt_hash_lock_addr(i));
900                 sum += length;
901                 sum2 += length*length;
902         }
903         if (samples) {
904                 unsigned long avg = sum / samples;
905                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
906                 rt_chain_length_max = max_t(unsigned long,
907                                         ip_rt_gc_elasticity,
908                                         (avg + 4*sd) >> FRACT_BITS);
909         }
910         rover = i;
911 }
912
913 /*
914  * rt_worker_func() is run in process context.
915  * we call rt_check_expire() to scan part of the hash table
916  */
917 static void rt_worker_func(struct work_struct *work)
918 {
919         rt_check_expire();
920         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
921 }
922
923 /*
924  * Perturbation of rt_genid by a small quantity [1..256].
925  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
926  * many times (2^24) without reusing a recent rt_genid.
927  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
928  */
929 static void rt_cache_invalidate(struct net *net)
930 {
931         unsigned char shuffle;
932
933         get_random_bytes(&shuffle, sizeof(shuffle));
934         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
935 }
936
937 /*
938  * delay < 0  : invalidate cache (fast : entries will be deleted later)
939  * delay >= 0 : invalidate & flush cache (can be long)
940  */
941 void rt_cache_flush(struct net *net, int delay)
942 {
943         rt_cache_invalidate(net);
944         if (delay >= 0)
945                 rt_do_flush(net, !in_softirq());
946 }
947
948 /* Flush previously invalidated entries from the cache */
949 void rt_cache_flush_batch(struct net *net)
950 {
951         rt_do_flush(net, !in_softirq());
952 }
953
954 static void rt_emergency_hash_rebuild(struct net *net)
955 {
956         if (net_ratelimit())
957                 printk(KERN_WARNING "Route hash chain too long!\n");
958         rt_cache_invalidate(net);
959 }
960
961 /*
962    Short description of GC goals.
963
964    We want to build an algorithm which keeps the routing cache
965    at an equilibrium point, where the number of aged-off entries
966    is kept approximately equal to the number of newly generated ones.
967
968    The current expiration strength is the variable "expire".
969    We try to adjust it dynamically, so that when the network
970    is idle, expire is large enough to keep enough warm entries,
971    and when load increases it shrinks to limit the cache size.
972  */
973
974 static int rt_garbage_collect(struct dst_ops *ops)
975 {
976         static unsigned long expire = RT_GC_TIMEOUT;
977         static unsigned long last_gc;
978         static int rover;
979         static int equilibrium;
980         struct rtable *rth;
981         struct rtable __rcu **rthp;
982         unsigned long now = jiffies;
983         int goal;
984         int entries = dst_entries_get_fast(&ipv4_dst_ops);
985
986         /*
987          * Garbage collection is pretty expensive,
988          * do not run it too frequently.
989          */
990
991         RT_CACHE_STAT_INC(gc_total);
992
993         if (now - last_gc < ip_rt_gc_min_interval &&
994             entries < ip_rt_max_size) {
995                 RT_CACHE_STAT_INC(gc_ignored);
996                 goto out;
997         }
998
999         entries = dst_entries_get_slow(&ipv4_dst_ops);
1000         /* Calculate the number of entries we want to expire now. */
1001         goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
1002         if (goal <= 0) {
1003                 if (equilibrium < ipv4_dst_ops.gc_thresh)
1004                         equilibrium = ipv4_dst_ops.gc_thresh;
1005                 goal = entries - equilibrium;
1006                 if (goal > 0) {
1007                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1008                         goal = entries - equilibrium;
1009                 }
1010         } else {
1011                 /* We are in a dangerous area. Try to reduce the cache really
1012                  * aggressively.
1013                  */
1014                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
1015                 equilibrium = entries - goal;
1016         }
1017
1018         if (now - last_gc >= ip_rt_gc_min_interval)
1019                 last_gc = now;
1020
1021         if (goal <= 0) {
1022                 equilibrium += goal;
1023                 goto work_done;
1024         }
1025
1026         do {
1027                 int i, k;
1028
1029                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1030                         unsigned long tmo = expire;
1031
1032                         k = (k + 1) & rt_hash_mask;
1033                         rthp = &rt_hash_table[k].chain;
1034                         spin_lock_bh(rt_hash_lock_addr(k));
1035                         while ((rth = rcu_dereference_protected(*rthp,
1036                                         lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
1037                                 if (!rt_is_expired(rth) &&
1038                                         !rt_may_expire(rth, tmo, expire)) {
1039                                         tmo >>= 1;
1040                                         rthp = &rth->dst.rt_next;
1041                                         continue;
1042                                 }
1043                                 *rthp = rth->dst.rt_next;
1044                                 rt_free(rth);
1045                                 goal--;
1046                         }
1047                         spin_unlock_bh(rt_hash_lock_addr(k));
1048                         if (goal <= 0)
1049                                 break;
1050                 }
1051                 rover = k;
1052
1053                 if (goal <= 0)
1054                         goto work_done;
1055
1056                 /* Goal is not achieved. We stop the process if:
1057
1058                    - expire has been reduced to zero (otherwise, expire is halved);
1059                    - the table is not full;
1060                    - we are called from interrupt context;
1061                    - the jiffies check is just a fallback/debug loop breaker.
1062                      We will not spin here for a long time in any case.
1063                  */
1064
1065                 RT_CACHE_STAT_INC(gc_goal_miss);
1066
1067                 if (expire == 0)
1068                         break;
1069
1070                 expire >>= 1;
1071 #if RT_CACHE_DEBUG >= 2
1072                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1073                                 dst_entries_get_fast(&ipv4_dst_ops), goal, i);
1074 #endif
1075
1076                 if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1077                         goto out;
1078         } while (!in_softirq() && time_before_eq(jiffies, now));
1079
1080         if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
1081                 goto out;
1082         if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
1083                 goto out;
1084         if (net_ratelimit())
1085                 printk(KERN_WARNING "dst cache overflow\n");
1086         RT_CACHE_STAT_INC(gc_dst_overflow);
1087         return 1;
1088
1089 work_done:
1090         expire += ip_rt_gc_min_interval;
1091         if (expire > ip_rt_gc_timeout ||
1092             dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
1093             dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
1094                 expire = ip_rt_gc_timeout;
1095 #if RT_CACHE_DEBUG >= 2
1096         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1097                         dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
1098 #endif
1099 out:    return 0;
1100 }
1101
1102 /*
1103  * Returns number of entries in a hash chain that have different hash_inputs
1104  */
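/* has_noalias() contributes ONE (1 << FRACT_BITS) per counted entry, hence the final shift */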
1105 static int slow_chain_length(const struct rtable *head)
1106 {
1107         int length = 0;
1108         const struct rtable *rth = head;
1109
1110         while (rth) {
1111                 length += has_noalias(head, rth);
1112                 rth = rcu_dereference_protected(rth->dst.rt_next, 1);
1113         }
1114         return length >> FRACT_BITS;
1115 }
1116
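/*
 * Insert @rt into the cache chain selected by @hash.  If an entry with an
 * identical key is already cached, it is promoted to the front of the
 * chain and handed back (via *rp or skb_dst_set()) while @rt is dropped.
 * While walking the chain, expired entries are evicted and the
 * lowest-scoring unreferenced entry is remembered as an eviction
 * candidate for when the chain grows beyond ip_rt_gc_elasticity.
 */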
1117 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1118                           struct rtable **rp, struct sk_buff *skb, int ifindex)
1119 {
1120         struct rtable   *rth, *cand;
1121         struct rtable __rcu **rthp, **candp;
1122         unsigned long   now;
1123         u32             min_score;
1124         int             chain_length;
1125         int attempts = !in_softirq();
1126
1127 restart:
1128         chain_length = 0;
1129         min_score = ~(u32)0;
1130         cand = NULL;
1131         candp = NULL;
1132         now = jiffies;
1133
1134         if (!rt_caching(dev_net(rt->dst.dev))) {
1135                 /*
1136                  * If we're not caching, just tell the caller we
1137                  * were successful and don't touch the route.  The
1138                  * caller holds the sole reference to the cache entry, and
1139                  * it will be released when the caller is done with it.
1140                  * If we drop it here, the callers have no way to resolve routes
1141                  * when we're not caching.  Instead, just point *rp at rt, so
1142                  * the caller gets a single use out of the route.
1143                  * Note that we do rt_free on this new route entry, so that
1144                  * once its refcount hits zero, we are still able to reap it
1145                  * (Thanks Alexey)
1146                  * Note: To avoid expensive rcu stuff for this uncached dst,
1147                  * we set DST_NOCACHE so that dst_release() can free dst without
1148                  * waiting a grace period.
1149                  */
1150
1151                 rt->dst.flags |= DST_NOCACHE;
1152                 if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1153                         int err = arp_bind_neighbour(&rt->dst);
1154                         if (err) {
1155                                 if (net_ratelimit())
1156                                         printk(KERN_WARNING
1157                                             "Neighbour table failure & not caching routes.\n");
1158                                 ip_rt_put(rt);
1159                                 return err;
1160                         }
1161                 }
1162
1163                 goto skip_hashing;
1164         }
1165
1166         rthp = &rt_hash_table[hash].chain;
1167
1168         spin_lock_bh(rt_hash_lock_addr(hash));
1169         while ((rth = rcu_dereference_protected(*rthp,
1170                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1171                 if (rt_is_expired(rth)) {
1172                         *rthp = rth->dst.rt_next;
1173                         rt_free(rth);
1174                         continue;
1175                 }
1176                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1177                         /* Put it first */
1178                         *rthp = rth->dst.rt_next;
1179                         /*
1180                          * Since lookup is lockfree, the deletion
1181                          * must be visible to another weakly ordered CPU before
1182                          * the insertion at the start of the hash chain.
1183                          */
1184                         rcu_assign_pointer(rth->dst.rt_next,
1185                                            rt_hash_table[hash].chain);
1186                         /*
1187                          * Since lookup is lockfree, the update writes
1188                          * must be ordered for consistency on SMP.
1189                          */
1190                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1191
1192                         dst_use(&rth->dst, now);
1193                         spin_unlock_bh(rt_hash_lock_addr(hash));
1194
1195                         rt_drop(rt);
1196                         if (rp)
1197                                 *rp = rth;
1198                         else
1199                                 skb_dst_set(skb, &rth->dst);
1200                         return 0;
1201                 }
1202
1203                 if (!atomic_read(&rth->dst.__refcnt)) {
1204                         u32 score = rt_score(rth);
1205
1206                         if (score <= min_score) {
1207                                 cand = rth;
1208                                 candp = rthp;
1209                                 min_score = score;
1210                         }
1211                 }
1212
1213                 chain_length++;
1214
1215                 rthp = &rth->dst.rt_next;
1216         }
1217
1218         if (cand) {
1219                 /* ip_rt_gc_elasticity used to be the average chain
1220                  * length; when it is exceeded, gc becomes really aggressive.
1221                  *
1222                  * The second limit is less certain. At the moment it allows
1223                  * only 2 entries per bucket. We will see.
1224                  */
1225                 if (chain_length > ip_rt_gc_elasticity) {
1226                         *candp = cand->dst.rt_next;
1227                         rt_free(cand);
1228                 }
1229         } else {
1230                 if (chain_length > rt_chain_length_max &&
1231                     slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
1232                         struct net *net = dev_net(rt->dst.dev);
1233                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1234                         if (!rt_caching(net)) {
1235                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1236                                         rt->dst.dev->name, num);
1237                         }
1238                         rt_emergency_hash_rebuild(net);
1239                         spin_unlock_bh(rt_hash_lock_addr(hash));
1240
1241                         hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1242                                         ifindex, rt_genid(net));
1243                         goto restart;
1244                 }
1245         }
1246
1247         /* Try to bind route to arp only if it is output
1248            route or unicast forwarding path.
1249          */
1250         if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
1251                 int err = arp_bind_neighbour(&rt->dst);
1252                 if (err) {
1253                         spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255                         if (err != -ENOBUFS) {
1256                                 rt_drop(rt);
1257                                 return err;
1258                         }
1259
1260                         /* Neighbour tables are full and nothing
1261                            can be released. Try to shrink the route cache;
1262                            it most likely holds some neighbour records.
1263                          */
1264                         if (attempts-- > 0) {
1265                                 int saved_elasticity = ip_rt_gc_elasticity;
1266                                 int saved_int = ip_rt_gc_min_interval;
1267                                 ip_rt_gc_elasticity     = 1;
1268                                 ip_rt_gc_min_interval   = 0;
1269                                 rt_garbage_collect(&ipv4_dst_ops);
1270                                 ip_rt_gc_min_interval   = saved_int;
1271                                 ip_rt_gc_elasticity     = saved_elasticity;
1272                                 goto restart;
1273                         }
1274
1275                         if (net_ratelimit())
1276                                 printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
1277                         rt_drop(rt);
1278                         return -ENOBUFS;
1279                 }
1280         }
1281
1282         rt->dst.rt_next = rt_hash_table[hash].chain;
1283
1284 #if RT_CACHE_DEBUG >= 2
1285         if (rt->dst.rt_next) {
1286                 struct rtable *trt;
1287                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1288                        hash, &rt->rt_dst);
1289                 for (trt = rt->dst.rt_next; trt; trt = trt->dst.rt_next)
1290                         printk(" . %pI4", &trt->rt_dst);
1291                 printk("\n");
1292         }
1293 #endif
1294         /*
1295          * Since lookup is lockfree, we must make sure
1296          * previous writes to rt are committed to memory
1297          * before making rt visible to other CPUs.
1298          */
1299         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1300
1301         spin_unlock_bh(rt_hash_lock_addr(hash));
1302
1303 skip_hashing:
1304         if (rp)
1305                 *rp = rt;
1306         else
1307                 skb_dst_set(skb, &rt->dst);
1308         return 0;
1309 }
1310
1311 void rt_bind_peer(struct rtable *rt, int create)
1312 {
1313         struct inet_peer *peer;
1314
1315         peer = inet_getpeer_v4(rt->rt_dst, create);
1316
1317         if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
1318                 inet_putpeer(peer);
1319 }
1320
1321 /*
1322  * Peer allocation may fail only in serious out-of-memory conditions.  However
1323  * we can still generate some output.
1324  * Random ID selection looks a bit dangerous because we have no chance to
1325  * select an ID that stays unique for a reasonable period of time.
1326  * But a broken packet identifier may be better than no packet at all.
1327  */
1328 static void ip_select_fb_ident(struct iphdr *iph)
1329 {
1330         static DEFINE_SPINLOCK(ip_fb_id_lock);
1331         static u32 ip_fallback_id;
1332         u32 salt;
1333
1334         spin_lock_bh(&ip_fb_id_lock);
1335         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1336         iph->id = htons(salt & 0xFFFF);
1337         ip_fallback_id = salt;
1338         spin_unlock_bh(&ip_fb_id_lock);
1339 }
1340
1341 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1342 {
1343         struct rtable *rt = (struct rtable *) dst;
1344
1345         if (rt) {
1346                 if (rt->peer == NULL)
1347                         rt_bind_peer(rt, 1);
1348
1349                 /* If peer is attached to destination, it is never detached,
1350                    so we need not grab a lock to dereference it.
1351                  */
1352                 if (rt->peer) {
1353                         iph->id = htons(inet_getid(rt->peer, more));
1354                         return;
1355                 }
1356         } else
1357                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1358                        __builtin_return_address(0));
1359
1360         ip_select_fb_ident(iph);
1361 }
1362 EXPORT_SYMBOL(__ip_select_ident);
1363
1364 static void rt_del(unsigned hash, struct rtable *rt)
1365 {
1366         struct rtable __rcu **rthp;
1367         struct rtable *aux;
1368
1369         rthp = &rt_hash_table[hash].chain;
1370         spin_lock_bh(rt_hash_lock_addr(hash));
1371         ip_rt_put(rt);
1372         while ((aux = rcu_dereference_protected(*rthp,
1373                         lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
1374                 if (aux == rt || rt_is_expired(aux)) {
1375                         *rthp = aux->dst.rt_next;
1376                         rt_free(aux);
1377                         continue;
1378                 }
1379                 rthp = &aux->dst.rt_next;
1380         }
1381         spin_unlock_bh(rt_hash_lock_addr(hash));
1382 }
1383
1384 /* called in rcu_read_lock() section */
1385 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1386                     __be32 saddr, struct net_device *dev)
1387 {
1388         int i, k;
1389         struct in_device *in_dev = __in_dev_get_rcu(dev);
1390         struct rtable *rth;
1391         struct rtable __rcu **rthp;
1392         __be32  skeys[2] = { saddr, 0 };
1393         int  ikeys[2] = { dev->ifindex, 0 };
1394         struct netevent_redirect netevent;
1395         struct net *net;
1396
1397         if (!in_dev)
1398                 return;
1399
1400         net = dev_net(dev);
1401         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1402             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1403             ipv4_is_zeronet(new_gw))
1404                 goto reject_redirect;
1405
1406         if (!rt_caching(net))
1407                 goto reject_redirect;
1408
1409         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1410                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1411                         goto reject_redirect;
1412                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1413                         goto reject_redirect;
1414         } else {
1415                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1416                         goto reject_redirect;
1417         }
1418
1419         for (i = 0; i < 2; i++) {
1420                 for (k = 0; k < 2; k++) {
1421                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1422                                                 rt_genid(net));
1423
1424                         rthp = &rt_hash_table[hash].chain;
1425
1426                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1427                                 struct rtable *rt;
1428
1429                                 if (rth->fl.fl4_dst != daddr ||
1430                                     rth->fl.fl4_src != skeys[i] ||
1431                                     rth->fl.oif != ikeys[k] ||
1432                                     rt_is_input_route(rth) ||
1433                                     rt_is_expired(rth) ||
1434                                     !net_eq(dev_net(rth->dst.dev), net)) {
1435                                         rthp = &rth->dst.rt_next;
1436                                         continue;
1437                                 }
1438
1439                                 if (rth->rt_dst != daddr ||
1440                                     rth->rt_src != saddr ||
1441                                     rth->dst.error ||
1442                                     rth->rt_gateway != old_gw ||
1443                                     rth->dst.dev != dev)
1444                                         break;
1445
1446                                 dst_hold(&rth->dst);
1447
1448                                 rt = dst_alloc(&ipv4_dst_ops);
1449                                 if (rt == NULL) {
1450                                         ip_rt_put(rth);
1451                                         return;
1452                                 }
1453
1454                                 /* Copy all the information. */
1455                                 *rt = *rth;
1456                                 rt->dst.__use           = 1;
1457                                 atomic_set(&rt->dst.__refcnt, 1);
1458                                 rt->dst.child           = NULL;
1459                                 if (rt->dst.dev)
1460                                         dev_hold(rt->dst.dev);
1461                                 rt->dst.obsolete        = -1;
1462                                 rt->dst.lastuse = jiffies;
1463                                 rt->dst.path            = &rt->dst;
1464                                 rt->dst.neighbour       = NULL;
1465                                 rt->dst.hh              = NULL;
1466 #ifdef CONFIG_XFRM
1467                                 rt->dst.xfrm            = NULL;
1468 #endif
1469                                 rt->rt_genid            = rt_genid(net);
1470                                 rt->rt_flags            |= RTCF_REDIRECTED;
1471
1472                                 /* Gateway is different ... */
1473                                 rt->rt_gateway          = new_gw;
1474
1475                                 /* Redirect received -> path was valid */
1476                                 dst_confirm(&rth->dst);
1477
1478                                 if (rt->peer)
1479                                         atomic_inc(&rt->peer->refcnt);
1480                                 if (rt->fi)
1481                                         atomic_inc(&rt->fi->fib_clntref);
1482
1483                                 if (arp_bind_neighbour(&rt->dst) ||
1484                                     !(rt->dst.neighbour->nud_state &
1485                                             NUD_VALID)) {
1486                                         if (rt->dst.neighbour)
1487                                                 neigh_event_send(rt->dst.neighbour, NULL);
1488                                         ip_rt_put(rth);
1489                                         rt_drop(rt);
1490                                         goto do_next;
1491                                 }
1492
1493                                 netevent.old = &rth->dst;
1494                                 netevent.new = &rt->dst;
1495                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1496                                                         &netevent);
1497
1498                                 rt_del(hash, rth);
1499                                 if (!rt_intern_hash(hash, rt, &rt, NULL, rt->fl.oif))
1500                                         ip_rt_put(rt);
1501                                 goto do_next;
1502                         }
1503                 do_next:
1504                         ;
1505                 }
1506         }
1507         return;
1508
1509 reject_redirect:
1510 #ifdef CONFIG_IP_ROUTE_VERBOSE
1511         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1512                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1513                         "  Advised path = %pI4 -> %pI4\n",
1514                        &old_gw, dev->name, &new_gw,
1515                        &saddr, &daddr);
1516 #endif
1517         ;
1518 }
1519
1520 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1521 {
1522         struct rtable *rt = (struct rtable *)dst;
1523         struct dst_entry *ret = dst;
1524
1525         if (rt) {
1526                 if (dst->obsolete > 0) {
1527                         ip_rt_put(rt);
1528                         ret = NULL;
1529                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1530                            (rt->dst.expires &&
1531                             time_after_eq(jiffies, rt->dst.expires))) {
1532                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1533                                                 rt->fl.oif,
1534                                                 rt_genid(dev_net(dst->dev)));
1535 #if RT_CACHE_DEBUG >= 1
1536                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1537                                 &rt->rt_dst, rt->fl.fl4_tos);
1538 #endif
1539                         rt_del(hash, rt);
1540                         ret = NULL;
1541                 }
1542         }
1543         return ret;
1544 }
1545
1546 /*
1547  * Algorithm:
1548  *      1. The first ip_rt_redirect_number redirects are sent
1549  *         with exponential backoff, after which we stop sending them
1550  *         altogether, assuming that the host ignores our redirects.
1551  *      2. If we did not see packets requiring redirects
1552  *         during ip_rt_redirect_silence, we assume that the host has
1553  *         forgotten the redirected route and start sending redirects again.
1554  *
1555  * This algorithm is much cheaper and more intelligent than dumb load limiting
1556  * in icmp.c.
1557  *
1558  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1559  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1560  */
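/*
 * A worked sketch of that schedule, assuming the default sysctl values set
 * earlier in this file (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number
 * = 9, ip_rt_redirect_silence = (HZ/50) << 10): the first redirect for a
 * destination is sent immediately; redirect number k is then held back
 * until
 *
 *      time_after(jiffies, rate_last + (ip_rt_redirect_load << k))
 *
 * holds, i.e. roughly 40ms, 80ms, ... ~5.1s after the previous one.  Once
 * rate_tokens reaches 9, nothing more is sent until ip_rt_redirect_silence
 * (~20s) passes without packets wanting a redirect, which resets the count.
 */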
1561
1562 void ip_rt_send_redirect(struct sk_buff *skb)
1563 {
1564         struct rtable *rt = skb_rtable(skb);
1565         struct in_device *in_dev;
1566         int log_martians;
1567
1568         rcu_read_lock();
1569         in_dev = __in_dev_get_rcu(rt->dst.dev);
1570         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1571                 rcu_read_unlock();
1572                 return;
1573         }
1574         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1575         rcu_read_unlock();
1576
1577         /* No redirected packets during ip_rt_redirect_silence;
1578          * reset the algorithm.
1579          */
1580         if (time_after(jiffies, rt->dst.rate_last + ip_rt_redirect_silence))
1581                 rt->dst.rate_tokens = 0;
1582
1583         /* Too many ignored redirects; do not send anything and
1584          * set dst.rate_last to the time of the last seen redirected packet.
1585          */
1586         if (rt->dst.rate_tokens >= ip_rt_redirect_number) {
1587                 rt->dst.rate_last = jiffies;
1588                 return;
1589         }
1590
1591         /* Check for load limit; set rate_last to the latest sent
1592          * redirect.
1593          */
1594         if (rt->dst.rate_tokens == 0 ||
1595             time_after(jiffies,
1596                        (rt->dst.rate_last +
1597                         (ip_rt_redirect_load << rt->dst.rate_tokens)))) {
1598                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1599                 rt->dst.rate_last = jiffies;
1600                 ++rt->dst.rate_tokens;
1601 #ifdef CONFIG_IP_ROUTE_VERBOSE
1602                 if (log_martians &&
1603                     rt->dst.rate_tokens == ip_rt_redirect_number &&
1604                     net_ratelimit())
1605                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1606                                 &rt->rt_src, rt->rt_iif,
1607                                 &rt->rt_dst, &rt->rt_gateway);
1608 #endif
1609         }
1610 }
1611
1612 static int ip_error(struct sk_buff *skb)
1613 {
1614         struct rtable *rt = skb_rtable(skb);
1615         unsigned long now;
1616         int code;
1617
1618         switch (rt->dst.error) {
1619                 case EINVAL:
1620                 default:
1621                         goto out;
1622                 case EHOSTUNREACH:
1623                         code = ICMP_HOST_UNREACH;
1624                         break;
1625                 case ENETUNREACH:
1626                         code = ICMP_NET_UNREACH;
1627                         IP_INC_STATS_BH(dev_net(rt->dst.dev),
1628                                         IPSTATS_MIB_INNOROUTES);
1629                         break;
1630                 case EACCES:
1631                         code = ICMP_PKT_FILTERED;
1632                         break;
1633         }
1634
1635         now = jiffies;
1636         rt->dst.rate_tokens += now - rt->dst.rate_last;
1637         if (rt->dst.rate_tokens > ip_rt_error_burst)
1638                 rt->dst.rate_tokens = ip_rt_error_burst;
1639         rt->dst.rate_last = now;
1640         if (rt->dst.rate_tokens >= ip_rt_error_cost) {
1641                 rt->dst.rate_tokens -= ip_rt_error_cost;
1642                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1643         }
1644
1645 out:    kfree_skb(skb);
1646         return 0;
1647 }
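/*
 * The rate limiting in ip_error() is a simple token bucket: rate_tokens
 * grows by one per jiffy elapsed since rate_last, is capped at
 * ip_rt_error_burst, and each ICMP_DEST_UNREACH sent costs
 * ip_rt_error_cost tokens.  With the defaults set earlier in this file
 * (cost = HZ, burst = 5 * HZ) that works out to roughly one error per
 * second per route, with bursts of up to five after an idle period.
 */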
1648
1649 /*
1650  *      The last two values are not from RFC 1191 but
1651  *      are needed for AMPRnet AX.25 paths.
1652  */
1653
1654 static const unsigned short mtu_plateau[] =
1655 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1656
1657 static inline unsigned short guess_mtu(unsigned short old_mtu)
1658 {
1659         int i;
1660
1661         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1662                 if (old_mtu > mtu_plateau[i])
1663                         return mtu_plateau[i];
1664         return 68;
1665 }
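/*
 * Worked example: guess_mtu() returns the largest plateau strictly below
 * the old value, so repeated "frag needed" messages without a usable
 * next-hop MTU walk a 1500-byte path down through 1492, 576, 296, 216 and
 * 128, bottoming out at the IPv4 minimum of 68.
 */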
1666
1667 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1668                                  unsigned short new_mtu,
1669                                  struct net_device *dev)
1670 {
1671         int i, k;
1672         unsigned short old_mtu = ntohs(iph->tot_len);
1673         struct rtable *rth;
1674         int  ikeys[2] = { dev->ifindex, 0 };
1675         __be32  skeys[2] = { iph->saddr, 0, };
1676         __be32  daddr = iph->daddr;
1677         unsigned short est_mtu = 0;
1678
1679         for (k = 0; k < 2; k++) {
1680                 for (i = 0; i < 2; i++) {
1681                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1682                                                 rt_genid(net));
1683
1684                         rcu_read_lock();
1685                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1686                              rth = rcu_dereference(rth->dst.rt_next)) {
1687                                 unsigned short mtu = new_mtu;
1688
1689                                 if (rth->fl.fl4_dst != daddr ||
1690                                     rth->fl.fl4_src != skeys[i] ||
1691                                     rth->rt_dst != daddr ||
1692                                     rth->rt_src != iph->saddr ||
1693                                     rth->fl.oif != ikeys[k] ||
1694                                     rt_is_input_route(rth) ||
1695                                     dst_metric_locked(&rth->dst, RTAX_MTU) ||
1696                                     !net_eq(dev_net(rth->dst.dev), net) ||
1697                                     rt_is_expired(rth))
1698                                         continue;
1699
1700                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1701
1702                                         /* BSD 4.2 compatibility hack :-( */
1703                                         if (mtu == 0 &&
1704                                             old_mtu >= dst_mtu(&rth->dst) &&
1705                                             old_mtu >= 68 + (iph->ihl << 2))
1706                                                 old_mtu -= iph->ihl << 2;
1707
1708                                         mtu = guess_mtu(old_mtu);
1709                                 }
1710                                 if (mtu <= dst_mtu(&rth->dst)) {
1711                                         if (mtu < dst_mtu(&rth->dst)) {
1712                                                 dst_confirm(&rth->dst);
1713                                                 if (mtu < ip_rt_min_pmtu) {
1714                                                         u32 lock = dst_metric(&rth->dst,
1715                                                                               RTAX_LOCK);
1716                                                         mtu = ip_rt_min_pmtu;
1717                                                         lock |= (1 << RTAX_MTU);
1718                                                         dst_metric_set(&rth->dst, RTAX_LOCK,
1719                                                                        lock);
1720                                                 }
1721                                                 dst_metric_set(&rth->dst, RTAX_MTU, mtu);
1722                                                 dst_set_expires(&rth->dst,
1723                                                         ip_rt_mtu_expires);
1724                                         }
1725                                         est_mtu = mtu;
1726                                 }
1727                         }
1728                         rcu_read_unlock();
1729                 }
1730         }
1731         return est_mtu ? : new_mtu;
1732 }
1733
1734 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1735 {
1736         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1737             !(dst_metric_locked(dst, RTAX_MTU))) {
1738                 if (mtu < ip_rt_min_pmtu) {
1739                         u32 lock = dst_metric(dst, RTAX_LOCK);
1740                         mtu = ip_rt_min_pmtu;
1741                         dst_metric_set(dst, RTAX_LOCK, lock | (1 << RTAX_MTU));
1742                 }
1743                 dst_metric_set(dst, RTAX_MTU, mtu);
1744                 dst_set_expires(dst, ip_rt_mtu_expires);
1745                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1746         }
1747 }
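/*
 * Both ip_rt_frag_needed() and ip_rt_update_pmtu() clamp suspiciously
 * small learned MTUs up to ip_rt_min_pmtu (552 bytes by default, set
 * earlier in this file) and set the RTAX_MTU bit in the RTAX_LOCK metric,
 * so a later, equally bogus ICMP cannot shrink the path MTU any further.
 */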
1748
1749 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1750 {
1751         if (rt_is_expired((struct rtable *)dst))
1752                 return NULL;
1753         return dst;
1754 }
1755
1756 static void ipv4_dst_destroy(struct dst_entry *dst)
1757 {
1758         struct rtable *rt = (struct rtable *) dst;
1759         struct inet_peer *peer = rt->peer;
1760
1761         if (rt->fi) {
1762                 fib_info_put(rt->fi);
1763                 rt->fi = NULL;
1764         }
1765         if (peer) {
1766                 rt->peer = NULL;
1767                 inet_putpeer(peer);
1768         }
1769 }
1770
1771
1772 static void ipv4_link_failure(struct sk_buff *skb)
1773 {
1774         struct rtable *rt;
1775
1776         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1777
1778         rt = skb_rtable(skb);
1779         if (rt)
1780                 dst_set_expires(&rt->dst, 0);
1781 }
1782
1783 static int ip_rt_bug(struct sk_buff *skb)
1784 {
1785         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1786                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1787                 skb->dev ? skb->dev->name : "?");
1788         kfree_skb(skb);
1789         return 0;
1790 }
1791
1792 /*
1793    We do not cache the source address of the outgoing interface,
1794    because it is used only by the IP RR, TS and SRR options,
1795    so it stays out of the fast path.
1796
1797    BTW remember: "addr" is allowed to be unaligned
1798    in IP options!
1799  */
1800
1801 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1802 {
1803         __be32 src;
1804         struct fib_result res;
1805
1806         if (rt_is_output_route(rt))
1807                 src = rt->rt_src;
1808         else {
1809                 rcu_read_lock();
1810                 if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
1811                         src = FIB_RES_PREFSRC(res);
1812                 else
1813                         src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
1814                                         RT_SCOPE_UNIVERSE);
1815                 rcu_read_unlock();
1816         }
1817         memcpy(addr, &src, 4);
1818 }
1819
1820 #ifdef CONFIG_IP_ROUTE_CLASSID
1821 static void set_class_tag(struct rtable *rt, u32 tag)
1822 {
1823         if (!(rt->dst.tclassid & 0xFFFF))
1824                 rt->dst.tclassid |= tag & 0xFFFF;
1825         if (!(rt->dst.tclassid & 0xFFFF0000))
1826                 rt->dst.tclassid |= tag & 0xFFFF0000;
1827 }
1828 #endif
1829
1830 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1831 {
1832         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1833
1834         if (advmss == 0) {
1835                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1836                                ip_rt_min_advmss);
1837                 if (advmss > 65535 - 40)
1838                         advmss = 65535 - 40;
1839         }
1840         return advmss;
1841 }
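/*
 * The arithmetic above: the advertised MSS defaults to the device MTU
 * minus 40 bytes (20-byte IPv4 header plus 20-byte TCP header), bounded
 * below by ip_rt_min_advmss and above by 65535 - 40, the largest value
 * that still fits a 16-bit IP total length.
 */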
1842
1843 static unsigned int ipv4_default_mtu(const struct dst_entry *dst)
1844 {
1845         unsigned int mtu = dst->dev->mtu;
1846
1847         if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
1848                 const struct rtable *rt = (const struct rtable *) dst;
1849
1850                 if (rt->rt_gateway != rt->rt_dst && mtu > 576)
1851                         mtu = 576;
1852         }
1853
1854         if (mtu > IP_MAX_MTU)
1855                 mtu = IP_MAX_MTU;
1856
1857         return mtu;
1858 }
1859
1860 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1861 {
1862         struct dst_entry *dst = &rt->dst;
1863         struct fib_info *fi = res->fi;
1864
1865         if (fi) {
1866                 if (FIB_RES_GW(*res) &&
1867                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1868                         rt->rt_gateway = FIB_RES_GW(*res);
1869                 rt->fi = fi;
1870                 atomic_inc(&fi->fib_clntref);
1871                 dst_init_metrics(dst, fi->fib_metrics, true);
1872 #ifdef CONFIG_IP_ROUTE_CLASSID
1873                 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1874 #endif
1875         }
1876
1877         if (dst_mtu(dst) > IP_MAX_MTU)
1878                 dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
1879         if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1880                 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1881
1882 #ifdef CONFIG_IP_ROUTE_CLASSID
1883 #ifdef CONFIG_IP_MULTIPLE_TABLES
1884         set_class_tag(rt, fib_rules_tclass(res));
1885 #endif
1886         set_class_tag(rt, itag);
1887 #endif
1888         rt->rt_type = res->type;
1889 }
1890
1891 /* called in rcu_read_lock() section */
1892 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1893                                 u8 tos, struct net_device *dev, int our)
1894 {
1895         unsigned int hash;
1896         struct rtable *rth;
1897         __be32 spec_dst;
1898         struct in_device *in_dev = __in_dev_get_rcu(dev);
1899         u32 itag = 0;
1900         int err;
1901
1902         /* Primary sanity checks. */
1903
1904         if (in_dev == NULL)
1905                 return -EINVAL;
1906
1907         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1908             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1909                 goto e_inval;
1910
1911         if (ipv4_is_zeronet(saddr)) {
1912                 if (!ipv4_is_local_multicast(daddr))
1913                         goto e_inval;
1914                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1915         } else {
1916                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1917                                           &itag, 0);
1918                 if (err < 0)
1919                         goto e_err;
1920         }
1921         rth = dst_alloc(&ipv4_dst_ops);
1922         if (!rth)
1923                 goto e_nobufs;
1924
1925         rth->dst.output = ip_rt_bug;
1926         rth->dst.obsolete = -1;
1927
1928         atomic_set(&rth->dst.__refcnt, 1);
1929         rth->dst.flags= DST_HOST;
1930         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1931                 rth->dst.flags |= DST_NOPOLICY;
1932         rth->fl.fl4_dst = daddr;
1933         rth->rt_dst     = daddr;
1934         rth->fl.fl4_tos = tos;
1935         rth->fl.mark    = skb->mark;
1936         rth->fl.fl4_src = saddr;
1937         rth->rt_src     = saddr;
1938 #ifdef CONFIG_IP_ROUTE_CLASSID
1939         rth->dst.tclassid = itag;
1940 #endif
1941         rth->rt_iif     =
1942         rth->fl.iif     = dev->ifindex;
1943         rth->dst.dev    = init_net.loopback_dev;
1944         dev_hold(rth->dst.dev);
1945         rth->fl.oif     = 0;
1946         rth->rt_gateway = daddr;
1947         rth->rt_spec_dst= spec_dst;
1948         rth->rt_genid   = rt_genid(dev_net(dev));
1949         rth->rt_flags   = RTCF_MULTICAST;
1950         rth->rt_type    = RTN_MULTICAST;
1951         if (our) {
1952                 rth->dst.input= ip_local_deliver;
1953                 rth->rt_flags |= RTCF_LOCAL;
1954         }
1955
1956 #ifdef CONFIG_IP_MROUTE
1957         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1958                 rth->dst.input = ip_mr_input;
1959 #endif
1960         RT_CACHE_STAT_INC(in_slow_mc);
1961
1962         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1963         return rt_intern_hash(hash, rth, NULL, skb, dev->ifindex);
1964
1965 e_nobufs:
1966         return -ENOBUFS;
1967 e_inval:
1968         return -EINVAL;
1969 e_err:
1970         return err;
1971 }
1972
1973
1974 static void ip_handle_martian_source(struct net_device *dev,
1975                                      struct in_device *in_dev,
1976                                      struct sk_buff *skb,
1977                                      __be32 daddr,
1978                                      __be32 saddr)
1979 {
1980         RT_CACHE_STAT_INC(in_martian_src);
1981 #ifdef CONFIG_IP_ROUTE_VERBOSE
1982         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1983                 /*
1984                  *      RFC 1812 recommendation: if the source is martian,
1985                  *      the only hint is the MAC header.
1986                  */
1987                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1988                         &daddr, &saddr, dev->name);
1989                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1990                         int i;
1991                         const unsigned char *p = skb_mac_header(skb);
1992                         printk(KERN_WARNING "ll header: ");
1993                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1994                                 printk("%02x", *p);
1995                                 if (i < (dev->hard_header_len - 1))
1996                                         printk(":");
1997                         }
1998                         printk("\n");
1999                 }
2000         }
2001 #endif
2002 }
2003
2004 /* called in rcu_read_lock() section */
2005 static int __mkroute_input(struct sk_buff *skb,
2006                            struct fib_result *res,
2007                            struct in_device *in_dev,
2008                            __be32 daddr, __be32 saddr, u32 tos,
2009                            struct rtable **result)
2010 {
2011         struct rtable *rth;
2012         int err;
2013         struct in_device *out_dev;
2014         unsigned int flags = 0;
2015         __be32 spec_dst;
2016         u32 itag;
2017
2018         /* get a working reference to the output device */
2019         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
2020         if (out_dev == NULL) {
2021                 if (net_ratelimit())
2022                         printk(KERN_CRIT "Bug in ip_route_input" \
2023                                "_slow(). Please, report\n");
2024                 return -EINVAL;
2025         }
2026
2027
2028         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
2029                                   in_dev->dev, &spec_dst, &itag, skb->mark);
2030         if (err < 0) {
2031                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
2032                                          saddr);
2033
2034                 goto cleanup;
2035         }
2036
2037         if (err)
2038                 flags |= RTCF_DIRECTSRC;
2039
2040         if (out_dev == in_dev && err &&
2041             (IN_DEV_SHARED_MEDIA(out_dev) ||
2042              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
2043                 flags |= RTCF_DOREDIRECT;
2044
2045         if (skb->protocol != htons(ETH_P_IP)) {
2046                 /* Not IP (i.e. ARP). Do not create a route if it is
2047                  * invalid for proxy arp. DNAT routes are always valid.
2048                  *
2049                  * The proxy arp feature has been extended to allow ARP
2050                  * replies back out the same interface, to support
2051                  * Private VLAN switch technologies. See arp.c.
2052                  */
2053                 if (out_dev == in_dev &&
2054                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
2055                         err = -EINVAL;
2056                         goto cleanup;
2057                 }
2058         }
2059
2060
2061         rth = dst_alloc(&ipv4_dst_ops);
2062         if (!rth) {
2063                 err = -ENOBUFS;
2064                 goto cleanup;
2065         }
2066
2067         atomic_set(&rth->dst.__refcnt, 1);
2068         rth->dst.flags= DST_HOST;
2069         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2070                 rth->dst.flags |= DST_NOPOLICY;
2071         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2072                 rth->dst.flags |= DST_NOXFRM;
2073         rth->fl.fl4_dst = daddr;
2074         rth->rt_dst     = daddr;
2075         rth->fl.fl4_tos = tos;
2076         rth->fl.mark    = skb->mark;
2077         rth->fl.fl4_src = saddr;
2078         rth->rt_src     = saddr;
2079         rth->rt_gateway = daddr;
2080         rth->rt_iif     =
2081                 rth->fl.iif     = in_dev->dev->ifindex;
2082         rth->dst.dev    = (out_dev)->dev;
2083         dev_hold(rth->dst.dev);
2084         rth->fl.oif     = 0;
2085         rth->rt_spec_dst= spec_dst;
2086
2087         rth->dst.obsolete = -1;
2088         rth->dst.input = ip_forward;
2089         rth->dst.output = ip_output;
2090         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
2091
2092         rt_set_nexthop(rth, res, itag);
2093
2094         rth->rt_flags = flags;
2095
2096         *result = rth;
2097         err = 0;
2098  cleanup:
2099         return err;
2100 }
2101
2102 static int ip_mkroute_input(struct sk_buff *skb,
2103                             struct fib_result *res,
2104                             const struct flowi *fl,
2105                             struct in_device *in_dev,
2106                             __be32 daddr, __be32 saddr, u32 tos)
2107 {
2108         struct rtable* rth = NULL;
2109         int err;
2110         unsigned hash;
2111
2112 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2113         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2114                 fib_select_multipath(fl, res);
2115 #endif
2116
2117         /* create a routing cache entry */
2118         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2119         if (err)
2120                 return err;
2121
2122         /* put it into the cache */
2123         hash = rt_hash(daddr, saddr, fl->iif,
2124                        rt_genid(dev_net(rth->dst.dev)));
2125         return rt_intern_hash(hash, rth, NULL, skb, fl->iif);
2126 }
2127
2128 /*
2129  *      NOTE. We drop all packets that have a local source
2130  *      address, because every properly looped-back packet
2131  *      must already have the correct destination attached by the output routine.
2132  *
2133  *      This approach solves two big problems:
2134  *      1. Non-simplex devices are handled properly.
2135  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2136  *      Called with rcu_read_lock().
2137  */
2138
2139 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2140                                u8 tos, struct net_device *dev)
2141 {
2142         struct fib_result res;
2143         struct in_device *in_dev = __in_dev_get_rcu(dev);
2144         struct flowi fl = { .fl4_dst    = daddr,
2145                             .fl4_src    = saddr,
2146                             .fl4_tos    = tos,
2147                             .fl4_scope  = RT_SCOPE_UNIVERSE,
2148                             .mark = skb->mark,
2149                             .iif = dev->ifindex };
2150         unsigned        flags = 0;
2151         u32             itag = 0;
2152         struct rtable * rth;
2153         unsigned        hash;
2154         __be32          spec_dst;
2155         int             err = -EINVAL;
2156         struct net    * net = dev_net(dev);
2157
2158         /* IP on this device is disabled. */
2159
2160         if (!in_dev)
2161                 goto out;
2162
2163         /* Check for the weirdest martians, which cannot be detected
2164            by fib_lookup.
2165          */
2166
2167         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2168             ipv4_is_loopback(saddr))
2169                 goto martian_source;
2170
2171         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2172                 goto brd_input;
2173
2174         /* Accept zero addresses only for limited broadcast;
2175          * I do not even know whether to fix this or not. Waiting for complaints :-)
2176          */
2177         if (ipv4_is_zeronet(saddr))
2178                 goto martian_source;
2179
2180         if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
2181                 goto martian_destination;
2182
2183         /*
2184          *      Now we are ready to route the packet.
2185          */
2186         err = fib_lookup(net, &fl, &res);
2187         if (err != 0) {
2188                 if (!IN_DEV_FORWARD(in_dev))
2189                         goto e_hostunreach;
2190                 goto no_route;
2191         }
2192
2193         RT_CACHE_STAT_INC(in_slow_tot);
2194
2195         if (res.type == RTN_BROADCAST)
2196                 goto brd_input;
2197
2198         if (res.type == RTN_LOCAL) {
2199                 err = fib_validate_source(saddr, daddr, tos,
2200                                           net->loopback_dev->ifindex,
2201                                           dev, &spec_dst, &itag, skb->mark);
2202                 if (err < 0)
2203                         goto martian_source_keep_err;
2204                 if (err)
2205                         flags |= RTCF_DIRECTSRC;
2206                 spec_dst = daddr;
2207                 goto local_input;
2208         }
2209
2210         if (!IN_DEV_FORWARD(in_dev))
2211                 goto e_hostunreach;
2212         if (res.type != RTN_UNICAST)
2213                 goto martian_destination;
2214
2215         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2216 out:    return err;
2217
2218 brd_input:
2219         if (skb->protocol != htons(ETH_P_IP))
2220                 goto e_inval;
2221
2222         if (ipv4_is_zeronet(saddr))
2223                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2224         else {
2225                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2226                                           &itag, skb->mark);
2227                 if (err < 0)
2228                         goto martian_source_keep_err;
2229                 if (err)
2230                         flags |= RTCF_DIRECTSRC;
2231         }
2232         flags |= RTCF_BROADCAST;
2233         res.type = RTN_BROADCAST;
2234         RT_CACHE_STAT_INC(in_brd);
2235
2236 local_input:
2237         rth = dst_alloc(&ipv4_dst_ops);
2238         if (!rth)
2239                 goto e_nobufs;
2240
2241         rth->dst.output= ip_rt_bug;
2242         rth->dst.obsolete = -1;
2243         rth->rt_genid = rt_genid(net);
2244
2245         atomic_set(&rth->dst.__refcnt, 1);
2246         rth->dst.flags= DST_HOST;
2247         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2248                 rth->dst.flags |= DST_NOPOLICY;
2249         rth->fl.fl4_dst = daddr;
2250         rth->rt_dst     = daddr;
2251         rth->fl.fl4_tos = tos;
2252         rth->fl.mark    = skb->mark;
2253         rth->fl.fl4_src = saddr;
2254         rth->rt_src     = saddr;
2255 #ifdef CONFIG_IP_ROUTE_CLASSID
2256         rth->dst.tclassid = itag;
2257 #endif
2258         rth->rt_iif     =
2259         rth->fl.iif     = dev->ifindex;
2260         rth->dst.dev    = net->loopback_dev;
2261         dev_hold(rth->dst.dev);
2262         rth->rt_gateway = daddr;
2263         rth->rt_spec_dst= spec_dst;
2264         rth->dst.input= ip_local_deliver;
2265         rth->rt_flags   = flags|RTCF_LOCAL;
2266         if (res.type == RTN_UNREACHABLE) {
2267                 rth->dst.input= ip_error;
2268                 rth->dst.error= -err;
2269                 rth->rt_flags   &= ~RTCF_LOCAL;
2270         }
2271         rth->rt_type    = res.type;
2272         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2273         err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
2274         goto out;
2275
2276 no_route:
2277         RT_CACHE_STAT_INC(in_no_route);
2278         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2279         res.type = RTN_UNREACHABLE;
2280         if (err == -ESRCH)
2281                 err = -ENETUNREACH;
2282         goto local_input;
2283
2284         /*
2285          *      Do not cache martian addresses: they should be logged (RFC1812)
2286          */
2287 martian_destination:
2288         RT_CACHE_STAT_INC(in_martian_dst);
2289 #ifdef CONFIG_IP_ROUTE_VERBOSE
2290         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2291                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2292                         &daddr, &saddr, dev->name);
2293 #endif
2294
2295 e_hostunreach:
2296         err = -EHOSTUNREACH;
2297         goto out;
2298
2299 e_inval:
2300         err = -EINVAL;
2301         goto out;
2302
2303 e_nobufs:
2304         err = -ENOBUFS;
2305         goto out;
2306
2307 martian_source:
2308         err = -EINVAL;
2309 martian_source_keep_err:
2310         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2311         goto out;
2312 }
2313
2314 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2315                            u8 tos, struct net_device *dev, bool noref)
2316 {
2317         struct rtable * rth;
2318         unsigned        hash;
2319         int iif = dev->ifindex;
2320         struct net *net;
2321         int res;
2322
2323         net = dev_net(dev);
2324
2325         rcu_read_lock();
2326
2327         if (!rt_caching(net))
2328                 goto skip_cache;
2329
2330         tos &= IPTOS_RT_MASK;
2331         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2332
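        /*
         * The OR of XORs below is zero only when daddr, saddr, iif and tos
         * all match and fl.oif is zero (i.e. the entry is an input route);
         * this folds five field checks into one compare against zero on
         * the cache-hit fast path.
         */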
2333         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2334              rth = rcu_dereference(rth->dst.rt_next)) {
2335                 if ((((__force u32)rth->fl.fl4_dst ^ (__force u32)daddr) |
2336                      ((__force u32)rth->fl.fl4_src ^ (__force u32)saddr) |
2337                      (rth->fl.iif ^ iif) |
2338                      rth->fl.oif |
2339                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2340                     rth->fl.mark == skb->mark &&
2341                     net_eq(dev_net(rth->dst.dev), net) &&
2342                     !rt_is_expired(rth)) {
2343                         if (noref) {
2344                                 dst_use_noref(&rth->dst, jiffies);
2345                                 skb_dst_set_noref(skb, &rth->dst);
2346                         } else {
2347                                 dst_use(&rth->dst, jiffies);
2348                                 skb_dst_set(skb, &rth->dst);
2349                         }
2350                         RT_CACHE_STAT_INC(in_hit);
2351                         rcu_read_unlock();
2352                         return 0;
2353                 }
2354                 RT_CACHE_STAT_INC(in_hlist_search);
2355         }
2356
2357 skip_cache:
2358         /* Multicast recognition logic was moved from the route cache to here.
2359            The problem was that too many Ethernet cards have broken/missing
2360            hardware multicast filters :-( As a result, a host on a multicast
2361            network acquires a lot of useless route cache entries, e.g. for
2362            SDR messages from all over the world. Now we try to get rid of them.
2363            Really, provided the software IP multicast filter is organized
2364            reasonably (at least, hashed), it does not cause a slowdown
2365            compared with route cache reject entries.
2366            Note that multicast routers are not affected, because a
2367            route cache entry is created for them eventually.
2368          */
2369         if (ipv4_is_multicast(daddr)) {
2370                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2371
2372                 if (in_dev) {
2373                         int our = ip_check_mc(in_dev, daddr, saddr,
2374                                               ip_hdr(skb)->protocol);
2375                         if (our
2376 #ifdef CONFIG_IP_MROUTE
2377                                 ||
2378                             (!ipv4_is_local_multicast(daddr) &&
2379                              IN_DEV_MFORWARD(in_dev))
2380 #endif
2381                            ) {
2382                                 int res = ip_route_input_mc(skb, daddr, saddr,
2383                                                             tos, dev, our);
2384                                 rcu_read_unlock();
2385                                 return res;
2386                         }
2387                 }
2388                 rcu_read_unlock();
2389                 return -EINVAL;
2390         }
2391         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2392         rcu_read_unlock();
2393         return res;
2394 }
2395 EXPORT_SYMBOL(ip_route_input_common);
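/*
 * Note on noref above: when noref is true the skb's dst is attached
 * without taking a reference (dst_use_noref/skb_dst_set_noref), so the
 * route is only safe to use while the caller stays inside the
 * rcu_read_lock()-ed receive path; callers that need to hold the route
 * longer are expected to pass noref == false and get a counted reference.
 */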
2396
2397 /* called with rcu_read_lock() */
2398 static int __mkroute_output(struct rtable **result,
2399                             struct fib_result *res,
2400                             const struct flowi *fl,
2401                             const struct flowi *oldflp,
2402                             struct net_device *dev_out,
2403                             unsigned flags)
2404 {
2405         struct rtable *rth;
2406         struct in_device *in_dev;
2407         u32 tos = RT_FL_TOS(oldflp);
2408
2409         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
2410                 return -EINVAL;
2411
2412         if (ipv4_is_lbcast(fl->fl4_dst))
2413                 res->type = RTN_BROADCAST;
2414         else if (ipv4_is_multicast(fl->fl4_dst))
2415                 res->type = RTN_MULTICAST;
2416         else if (ipv4_is_zeronet(fl->fl4_dst))
2417                 return -EINVAL;
2418
2419         if (dev_out->flags & IFF_LOOPBACK)
2420                 flags |= RTCF_LOCAL;
2421
2422         in_dev = __in_dev_get_rcu(dev_out);
2423         if (!in_dev)
2424                 return -EINVAL;
2425
2426         if (res->type == RTN_BROADCAST) {
2427                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2428                 res->fi = NULL;
2429         } else if (res->type == RTN_MULTICAST) {
2430                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2431                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2432                                  oldflp->proto))
2433                         flags &= ~RTCF_LOCAL;
2434                 /* If a multicast route does not exist, use the
2435                  * default one, but do not gateway in this case.
2436                  * Yes, it is a hack.
2437                  */
2438                 if (res->fi && res->prefixlen < 4)
2439                         res->fi = NULL;
2440         }
2441
2442
2443         rth = dst_alloc(&ipv4_dst_ops);
2444         if (!rth)
2445                 return -ENOBUFS;
2446
2447         atomic_set(&rth->dst.__refcnt, 1);
2448         rth->dst.flags= DST_HOST;
2449         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2450                 rth->dst.flags |= DST_NOXFRM;
2451         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2452                 rth->dst.flags |= DST_NOPOLICY;
2453
2454         rth->fl.fl4_dst = oldflp->fl4_dst;
2455         rth->fl.fl4_tos = tos;
2456         rth->fl.fl4_src = oldflp->fl4_src;
2457         rth->fl.oif     = oldflp->oif;
2458         rth->fl.mark    = oldflp->mark;
2459         rth->rt_dst     = fl->fl4_dst;
2460         rth->rt_src     = fl->fl4_src;
2461         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2462         /* get references to the devices that are to be held by the routing
2463            cache entry */
2464         rth->dst.dev    = dev_out;
2465         dev_hold(dev_out);
2466         rth->rt_gateway = fl->fl4_dst;
2467         rth->rt_spec_dst= fl->fl4_src;
2468
2469         rth->dst.output=ip_output;
2470         rth->dst.obsolete = -1;
2471         rth->rt_genid = rt_genid(dev_net(dev_out));
2472
2473         RT_CACHE_STAT_INC(out_slow_tot);
2474
2475         if (flags & RTCF_LOCAL) {
2476                 rth->dst.input = ip_local_deliver;
2477                 rth->rt_spec_dst = fl->fl4_dst;
2478         }
2479         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2480                 rth->rt_spec_dst = fl->fl4_src;
2481                 if (flags & RTCF_LOCAL &&
2482                     !(dev_out->flags & IFF_LOOPBACK)) {
2483                         rth->dst.output = ip_mc_output;
2484                         RT_CACHE_STAT_INC(out_slow_mc);
2485                 }
2486 #ifdef CONFIG_IP_MROUTE
2487                 if (res->type == RTN_MULTICAST) {
2488                         if (IN_DEV_MFORWARD(in_dev) &&
2489                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2490                                 rth->dst.input = ip_mr_input;
2491                                 rth->dst.output = ip_mc_output;
2492                         }
2493                 }
2494 #endif
2495         }
2496
2497         rt_set_nexthop(rth, res, 0);
2498
2499         rth->rt_flags = flags;
2500         *result = rth;
2501         return 0;
2502 }
2503
2504 /* called with rcu_read_lock() */
2505 static int ip_mkroute_output(struct rtable **rp,
2506                              struct fib_result *res,
2507                              const struct flowi *fl,
2508                              const struct flowi *oldflp,
2509                              struct net_device *dev_out,
2510                              unsigned flags)
2511 {
2512         struct rtable *rth = NULL;
2513         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2514         unsigned hash;
2515         if (err == 0) {
2516                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2517                                rt_genid(dev_net(dev_out)));
2518                 err = rt_intern_hash(hash, rth, rp, NULL, oldflp->oif);
2519         }
2520
2521         return err;
2522 }
2523
2524 /*
2525  * Major route resolver routine.
2526  * called with rcu_read_lock();
2527  */
2528
2529 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2530                                 const struct flowi *oldflp)
2531 {
2532         u32 tos = RT_FL_TOS(oldflp);
2533         struct flowi fl = { .fl4_dst = oldflp->fl4_dst,
2534                             .fl4_src = oldflp->fl4_src,
2535                             .fl4_tos = tos & IPTOS_RT_MASK,
2536                             .fl4_scope = ((tos & RTO_ONLINK) ?
2537                                           RT_SCOPE_LINK : RT_SCOPE_UNIVERSE),
2538                             .mark = oldflp->mark,
2539                             .iif = net->loopback_dev->ifindex,
2540                             .oif = oldflp->oif };
2541         struct fib_result res;
2542         unsigned int flags = 0;
2543         struct net_device *dev_out = NULL;
2544         int err;
2545
2546
2547         res.fi          = NULL;
2548 #ifdef CONFIG_IP_MULTIPLE_TABLES
2549         res.r           = NULL;
2550 #endif
2551
2552         if (oldflp->fl4_src) {
2553                 err = -EINVAL;
2554                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2555                     ipv4_is_lbcast(oldflp->fl4_src) ||
2556                     ipv4_is_zeronet(oldflp->fl4_src))
2557                         goto out;
2558
2559                 /* I removed the check for oif == dev_out->oif here.
2560                    It was wrong for two reasons:
2561                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2562                       is assigned to multiple interfaces.
2563                    2. Moreover, we are allowed to send packets with the saddr
2564                       of another iface. --ANK
2565                  */
2566
2567                 if (oldflp->oif == 0 &&
2568                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2569                      ipv4_is_lbcast(oldflp->fl4_dst))) {
2570                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2571                         dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
2572                         if (dev_out == NULL)
2573                                 goto out;
2574
2575                         /* Special hack: the user can direct multicasts
2576                            and limited broadcast via the necessary interface
2577                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2578                            This hack is not just for fun; it allows
2579                            vic, vat and friends to work.
2580                            They bind a socket to loopback, set the ttl to zero
2581                            and expect that it will work.
2582                            From the viewpoint of the routing cache they are broken,
2583                            because we are not allowed to build a multicast path
2584                            with a loopback source addr (look, the routing cache
2585                            cannot know that the ttl is zero, so the packet
2586                            will not leave this host and the route is valid).
2587                            Luckily, this hack is a good workaround.
2588                          */
2589
2590                         fl.oif = dev_out->ifindex;
2591                         goto make_route;
2592                 }
2593
2594                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2595                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2596                         if (!__ip_dev_find(net, oldflp->fl4_src, false))
2597                                 goto out;
2598                 }
2599         }
2600
2601
2602         if (oldflp->oif) {
2603                 dev_out = dev_get_by_index_rcu(net, oldflp->oif);
2604                 err = -ENODEV;
2605                 if (dev_out == NULL)
2606                         goto out;
2607
2608                 /* RACE: Check return value of inet_select_addr instead. */
2609                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2610                         err = -ENETUNREACH;
2611                         goto out;
2612                 }
2613                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2614                     ipv4_is_lbcast(oldflp->fl4_dst)) {
2615                         if (!fl.fl4_src)
2616                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2617                                                               RT_SCOPE_LINK);
2618                         goto make_route;
2619                 }
2620                 if (!fl.fl4_src) {
2621                         if (ipv4_is_multicast(oldflp->fl4_dst))
2622                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2623                                                               fl.fl4_scope);
2624                         else if (!oldflp->fl4_dst)
2625                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2626                                                               RT_SCOPE_HOST);
2627                 }
2628         }
2629
2630         if (!fl.fl4_dst) {
2631                 fl.fl4_dst = fl.fl4_src;
2632                 if (!fl.fl4_dst)
2633                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2634                 dev_out = net->loopback_dev;
2635                 fl.oif = net->loopback_dev->ifindex;
2636                 res.type = RTN_LOCAL;
2637                 flags |= RTCF_LOCAL;
2638                 goto make_route;
2639         }
2640
2641         if (fib_lookup(net, &fl, &res)) {
2642                 res.fi = NULL;
2643                 if (oldflp->oif) {
2644                         /* Apparently, the routing tables are wrong. Assume
2645                            that the destination is on-link.
2646
2647                            WHY? DW.
2648                            Because we are allowed to send to an iface
2649                            even if it has NO routes and NO assigned
2650                            addresses. When oif is specified, the routing
2651                            tables are looked up with only one purpose:
2652                            to check whether the destination is gatewayed,
2653                            rather than direct. Moreover, if MSG_DONTROUTE is
2654                            set, we send the packet, ignoring both routing
2655                            tables and ifaddr state. --ANK
2656
2657
2658                            We could do the same even if oif is unknown
2659                            (likely as IPv6 does), but we do not.
2660                          */
2661
2662                         if (fl.fl4_src == 0)
2663                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2664                                                               RT_SCOPE_LINK);
2665                         res.type = RTN_UNICAST;
2666                         goto make_route;
2667                 }
2668                 err = -ENETUNREACH;
2669                 goto out;
2670         }
2671
2672         if (res.type == RTN_LOCAL) {
2673                 if (!fl.fl4_src) {
2674                         if (res.fi->fib_prefsrc)
2675                                 fl.fl4_src = res.fi->fib_prefsrc;
2676                         else
2677                                 fl.fl4_src = fl.fl4_dst;
2678                 }
2679                 dev_out = net->loopback_dev;
2680                 fl.oif = dev_out->ifindex;
2681                 res.fi = NULL;
2682                 flags |= RTCF_LOCAL;
2683                 goto make_route;
2684         }
2685
2686 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2687         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2688                 fib_select_multipath(&fl, &res);
2689         else
2690 #endif
2691         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2692                 fib_select_default(net, &fl, &res);
2693
2694         if (!fl.fl4_src)
2695                 fl.fl4_src = FIB_RES_PREFSRC(res);
2696
2697         dev_out = FIB_RES_DEV(res);
2698         fl.oif = dev_out->ifindex;
2699
2700
2701 make_route:
2702         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2703
2704 out:    return err;
2705 }
2706
2707 int __ip_route_output_key(struct net *net, struct rtable **rp,
2708                           const struct flowi *flp)
2709 {
2710         unsigned int hash;
2711         int res;
2712         struct rtable *rth;
2713
2714         if (!rt_caching(net))
2715                 goto slow_output;
2716
2717         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2718
2719         rcu_read_lock_bh();
2720         for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
2721                 rth = rcu_dereference_bh(rth->dst.rt_next)) {
2722                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2723                     rth->fl.fl4_src == flp->fl4_src &&
2724                     rt_is_output_route(rth) &&
2725                     rth->fl.oif == flp->oif &&
2726                     rth->fl.mark == flp->mark &&
2727                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2728                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2729                     net_eq(dev_net(rth->dst.dev), net) &&
2730                     !rt_is_expired(rth)) {
2731                         dst_use(&rth->dst, jiffies);
2732                         RT_CACHE_STAT_INC(out_hit);
2733                         rcu_read_unlock_bh();
2734                         *rp = rth;
2735                         return 0;
2736                 }
2737                 RT_CACHE_STAT_INC(out_hlist_search);
2738         }
2739         rcu_read_unlock_bh();
2740
2741 slow_output:
2742         rcu_read_lock();
2743         res = ip_route_output_slow(net, rp, flp);
2744         rcu_read_unlock();
2745         return res;
2746 }
2747 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2748
2749 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2750 {
2751         return NULL;
2752 }
2753
2754 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2755 {
2756 }
2757
2758 static struct dst_ops ipv4_dst_blackhole_ops = {
2759         .family                 =       AF_INET,
2760         .protocol               =       cpu_to_be16(ETH_P_IP),
2761         .destroy                =       ipv4_dst_destroy,
2762         .check                  =       ipv4_blackhole_dst_check,
2763         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2764 };
2765
2766
2767 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2768 {
2769         struct rtable *ort = *rp;
2770         struct rtable *rt = (struct rtable *)
2771                 dst_alloc(&ipv4_dst_blackhole_ops);
2772
2773         if (rt) {
2774                 struct dst_entry *new = &rt->dst;
2775
2776                 atomic_set(&new->__refcnt, 1);
2777                 new->__use = 1;
2778                 new->input = dst_discard;
2779                 new->output = dst_discard;
2780                 dst_copy_metrics(new, &ort->dst);
2781
2782                 new->dev = ort->dst.dev;
2783                 if (new->dev)
2784                         dev_hold(new->dev);
2785
2786                 rt->fl = ort->fl;
2787
2788                 rt->rt_genid = rt_genid(net);
2789                 rt->rt_flags = ort->rt_flags;
2790                 rt->rt_type = ort->rt_type;
2791                 rt->rt_dst = ort->rt_dst;
2792                 rt->rt_src = ort->rt_src;
2793                 rt->rt_iif = ort->rt_iif;
2794                 rt->rt_gateway = ort->rt_gateway;
2795                 rt->rt_spec_dst = ort->rt_spec_dst;
2796                 rt->peer = ort->peer;
2797                 if (rt->peer)
2798                         atomic_inc(&rt->peer->refcnt);
2799                 rt->fi = ort->fi;
2800                 if (rt->fi)
2801                         atomic_inc(&rt->fi->fib_clntref);
2802
2803                 dst_free(new);
2804         }
2805
2806         dst_release(&(*rp)->dst);
2807         *rp = rt;
2808         return rt ? 0 : -ENOMEM;
2809 }
2810
2811 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2812                          struct sock *sk, int flags)
2813 {
2814         int err;
2815
2816         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2817                 return err;
2818
2819         if (flp->proto) {
2820                 if (!flp->fl4_src)
2821                         flp->fl4_src = (*rp)->rt_src;
2822                 if (!flp->fl4_dst)
2823                         flp->fl4_dst = (*rp)->rt_dst;
2824                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2825                                     flags ? XFRM_LOOKUP_WAIT : 0);
2826                 if (err == -EREMOTE)
2827                         err = ipv4_dst_blackhole(net, rp, flp);
2828
2829                 return err;
2830         }
2831
2832         return 0;
2833 }
2834 EXPORT_SYMBOL_GPL(ip_route_output_flow);
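/*
 * When __xfrm_lookup() returns -EREMOTE (typically a non-blocking lookup
 * whose IPsec states are still being negotiated), ip_route_output_flow()
 * swaps the route for a blackhole entry: dst_discard on both input and
 * output, so the caller gets a usable dst whose packets are silently
 * dropped rather than a hard error.
 */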
2835
2836 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2837 {
2838         return ip_route_output_flow(net, rp, flp, NULL, 0);
2839 }
2840 EXPORT_SYMBOL(ip_route_output_key);
2841
2842 static int rt_fill_info(struct net *net,
2843                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2844                         int nowait, unsigned int flags)
2845 {
2846         struct rtable *rt = skb_rtable(skb);
2847         struct rtmsg *r;
2848         struct nlmsghdr *nlh;
2849         long expires;
2850         u32 id = 0, ts = 0, tsage = 0, error;
2851
2852         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2853         if (nlh == NULL)
2854                 return -EMSGSIZE;
2855
2856         r = nlmsg_data(nlh);
2857         r->rtm_family    = AF_INET;
2858         r->rtm_dst_len  = 32;
2859         r->rtm_src_len  = 0;
2860         r->rtm_tos      = rt->fl.fl4_tos;
2861         r->rtm_table    = RT_TABLE_MAIN;
2862         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2863         r->rtm_type     = rt->rt_type;
2864         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2865         r->rtm_protocol = RTPROT_UNSPEC;
2866         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2867         if (rt->rt_flags & RTCF_NOTIFY)
2868                 r->rtm_flags |= RTM_F_NOTIFY;
2869
2870         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2871
2872         if (rt->fl.fl4_src) {
2873                 r->rtm_src_len = 32;
2874                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2875         }
2876         if (rt->dst.dev)
2877                 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2878 #ifdef CONFIG_IP_ROUTE_CLASSID
2879         if (rt->dst.tclassid)
2880                 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2881 #endif
2882         if (rt_is_input_route(rt))
2883                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2884         else if (rt->rt_src != rt->fl.fl4_src)
2885                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2886
2887         if (rt->rt_dst != rt->rt_gateway)
2888                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2889
2890         if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
2891                 goto nla_put_failure;
2892
2893         if (rt->fl.mark)
2894                 NLA_PUT_BE32(skb, RTA_MARK, rt->fl.mark);
2895
2896         error = rt->dst.error;
2897         expires = rt->dst.expires ? rt->dst.expires - jiffies : 0;
2898         if (rt->peer) {
2899                 inet_peer_refcheck(rt->peer);
2900                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2901                 if (rt->peer->tcp_ts_stamp) {
2902                         ts = rt->peer->tcp_ts;
2903                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2904                 }
2905         }
2906
2907         if (rt_is_input_route(rt)) {
2908 #ifdef CONFIG_IP_MROUTE
2909                 __be32 dst = rt->rt_dst;
2910
2911                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2912                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2913                         int err = ipmr_get_route(net, skb, r, nowait);
2914                         if (err <= 0) {
2915                                 if (!nowait) {
2916                                         if (err == 0)
2917                                                 return 0;
2918                                         goto nla_put_failure;
2919                                 } else {
2920                                         if (err == -EMSGSIZE)
2921                                                 goto nla_put_failure;
2922                                         error = err;
2923                                 }
2924                         }
2925                 } else
2926 #endif
2927                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2928         }
2929
2930         if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
2931                                expires, error) < 0)
2932                 goto nla_put_failure;
2933
2934         return nlmsg_end(skb, nlh);
2935
2936 nla_put_failure:
2937         nlmsg_cancel(skb, nlh);
2938         return -EMSGSIZE;
2939 }
2940
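/*
 * RTM_GETROUTE handler: parse the request attributes, fake up a
 * minimal skb, resolve the route either as input (when RTA_IIF is
 * given, via ip_route_input()) or as output (via
 * ip_route_output_key()), and unicast the answer back to the requester
 * through rt_fill_info()/rtnl_unicast().
 */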
2941 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2942 {
2943         struct net *net = sock_net(in_skb->sk);
2944         struct rtmsg *rtm;
2945         struct nlattr *tb[RTA_MAX+1];
2946         struct rtable *rt = NULL;
2947         __be32 dst = 0;
2948         __be32 src = 0;
2949         u32 iif;
2950         int err;
2951         int mark;
2952         struct sk_buff *skb;
2953
2954         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2955         if (err < 0)
2956                 goto errout;
2957
2958         rtm = nlmsg_data(nlh);
2959
2960         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2961         if (skb == NULL) {
2962                 err = -ENOBUFS;
2963                 goto errout;
2964         }
2965
2966         /* Reserve room for dummy headers; this skb can pass
2967            through a good chunk of the routing engine.
2968          */
2969         skb_reset_mac_header(skb);
2970         skb_reset_network_header(skb);
2971
2972         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2973         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2974         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2975
2976         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2977         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2978         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2979         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2980
2981         if (iif) {
2982                 struct net_device *dev;
2983
2984                 dev = __dev_get_by_index(net, iif);
2985                 if (dev == NULL) {
2986                         err = -ENODEV;
2987                         goto errout_free;
2988                 }
2989
2990                 skb->protocol   = htons(ETH_P_IP);
2991                 skb->dev        = dev;
2992                 skb->mark       = mark;
2993                 local_bh_disable();
2994                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2995                 local_bh_enable();
2996
2997                 rt = skb_rtable(skb);
2998                 if (err == 0 && rt->dst.error)
2999                         err = -rt->dst.error;
3000         } else {
3001                 struct flowi fl = {
3002                         .fl4_dst = dst,
3003                         .fl4_src = src,
3004                         .fl4_tos = rtm->rtm_tos,
3005                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
3006                         .mark = mark,
3007                 };
3008                 err = ip_route_output_key(net, &rt, &fl);
3009         }
3010
3011         if (err)
3012                 goto errout_free;
3013
3014         skb_dst_set(skb, &rt->dst);
3015         if (rtm->rtm_flags & RTM_F_NOTIFY)
3016                 rt->rt_flags |= RTCF_NOTIFY;
3017
3018         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
3019                            RTM_NEWROUTE, 0, 0);
3020         if (err <= 0)
3021                 goto errout_free;
3022
3023         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
3024 errout:
3025         return err;
3026
3027 errout_free:
3028         kfree_skb(skb);
3029         goto errout;
3030 }
3031
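/*
 * Netlink dump callback: walk every chain of the route cache hash
 * table and emit one RTM_NEWROUTE message per entry that belongs to
 * the requesting namespace and has not expired.  cb->args[] carries
 * the hash bucket and chain index between dump invocations.
 */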
3032 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
3033 {
3034         struct rtable *rt;
3035         int h, s_h;
3036         int idx, s_idx;
3037         struct net *net;
3038
3039         net = sock_net(skb->sk);
3040
3041         s_h = cb->args[0];
3042         if (s_h < 0)
3043                 s_h = 0;
3044         s_idx = idx = cb->args[1];
3045         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3046                 if (!rt_hash_table[h].chain)
3047                         continue;
3048                 rcu_read_lock_bh();
3049                 for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
3050                      rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
3051                         if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
3052                                 continue;
3053                         if (rt_is_expired(rt))
3054                                 continue;
3055                         skb_dst_set_noref(skb, &rt->dst);
3056                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3057                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3058                                          1, NLM_F_MULTI) <= 0) {
3059                                 skb_dst_drop(skb);
3060                                 rcu_read_unlock_bh();
3061                                 goto done;
3062                         }
3063                         skb_dst_drop(skb);
3064                 }
3065                 rcu_read_unlock_bh();
3066         }
3067
3068 done:
3069         cb->args[0] = h;
3070         cb->args[1] = idx;
3071         return skb->len;
3072 }
3073
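/*
 * Multicast configuration on a device changed; flush the route cache
 * of that device's namespace.
 */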
3074 void ip_rt_multicast_event(struct in_device *in_dev)
3075 {
3076         rt_cache_flush(dev_net(in_dev->dev), 0);
3077 }
3078
3079 #ifdef CONFIG_SYSCTL
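/*
 * Handler for /proc/sys/net/ipv4/route/flush (write-only).  Writing an
 * integer triggers rt_cache_flush() for the owning namespace, with the
 * written value used as the flush delay, e.g. (illustrative):
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */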
3080 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3081                                         void __user *buffer,
3082                                         size_t *lenp, loff_t *ppos)
3083 {
3084         if (write) {
3085                 int flush_delay;
3086                 ctl_table ctl;
3087                 struct net *net;
3088
3089                 memcpy(&ctl, __ctl, sizeof(ctl));
3090                 ctl.data = &flush_delay;
3091                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3092
3093                 net = (struct net *)__ctl->extra1;
3094                 rt_cache_flush(net, flush_delay);
3095                 return 0;
3096         }
3097
3098         return -EINVAL;
3099 }
3100
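/*
 * Global tuning knobs, exported under /proc/sys/net/ipv4/route/ via
 * the "route" entry of ipv4_skeleton below (for example
 * /proc/sys/net/ipv4/route/gc_thresh).
 */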
3101 static ctl_table ipv4_route_table[] = {
3102         {
3103                 .procname       = "gc_thresh",
3104                 .data           = &ipv4_dst_ops.gc_thresh,
3105                 .maxlen         = sizeof(int),
3106                 .mode           = 0644,
3107                 .proc_handler   = proc_dointvec,
3108         },
3109         {
3110                 .procname       = "max_size",
3111                 .data           = &ip_rt_max_size,
3112                 .maxlen         = sizeof(int),
3113                 .mode           = 0644,
3114                 .proc_handler   = proc_dointvec,
3115         },
3116         {
3117                 /*  Deprecated. Use gc_min_interval_ms */
3118
3119                 .procname       = "gc_min_interval",
3120                 .data           = &ip_rt_gc_min_interval,
3121                 .maxlen         = sizeof(int),
3122                 .mode           = 0644,
3123                 .proc_handler   = proc_dointvec_jiffies,
3124         },
3125         {
3126                 .procname       = "gc_min_interval_ms",
3127                 .data           = &ip_rt_gc_min_interval,
3128                 .maxlen         = sizeof(int),
3129                 .mode           = 0644,
3130                 .proc_handler   = proc_dointvec_ms_jiffies,
3131         },
3132         {
3133                 .procname       = "gc_timeout",
3134                 .data           = &ip_rt_gc_timeout,
3135                 .maxlen         = sizeof(int),
3136                 .mode           = 0644,
3137                 .proc_handler   = proc_dointvec_jiffies,
3138         },
3139         {
3140                 .procname       = "gc_interval",
3141                 .data           = &ip_rt_gc_interval,
3142                 .maxlen         = sizeof(int),
3143                 .mode           = 0644,
3144                 .proc_handler   = proc_dointvec_jiffies,
3145         },
3146         {
3147                 .procname       = "redirect_load",
3148                 .data           = &ip_rt_redirect_load,
3149                 .maxlen         = sizeof(int),
3150                 .mode           = 0644,
3151                 .proc_handler   = proc_dointvec,
3152         },
3153         {
3154                 .procname       = "redirect_number",
3155                 .data           = &ip_rt_redirect_number,
3156                 .maxlen         = sizeof(int),
3157                 .mode           = 0644,
3158                 .proc_handler   = proc_dointvec,
3159         },
3160         {
3161                 .procname       = "redirect_silence",
3162                 .data           = &ip_rt_redirect_silence,
3163                 .maxlen         = sizeof(int),
3164                 .mode           = 0644,
3165                 .proc_handler   = proc_dointvec,
3166         },
3167         {
3168                 .procname       = "error_cost",
3169                 .data           = &ip_rt_error_cost,
3170                 .maxlen         = sizeof(int),
3171                 .mode           = 0644,
3172                 .proc_handler   = proc_dointvec,
3173         },
3174         {
3175                 .procname       = "error_burst",
3176                 .data           = &ip_rt_error_burst,
3177                 .maxlen         = sizeof(int),
3178                 .mode           = 0644,
3179                 .proc_handler   = proc_dointvec,
3180         },
3181         {
3182                 .procname       = "gc_elasticity",
3183                 .data           = &ip_rt_gc_elasticity,
3184                 .maxlen         = sizeof(int),
3185                 .mode           = 0644,
3186                 .proc_handler   = proc_dointvec,
3187         },
3188         {
3189                 .procname       = "mtu_expires",
3190                 .data           = &ip_rt_mtu_expires,
3191                 .maxlen         = sizeof(int),
3192                 .mode           = 0644,
3193                 .proc_handler   = proc_dointvec_jiffies,
3194         },
3195         {
3196                 .procname       = "min_pmtu",
3197                 .data           = &ip_rt_min_pmtu,
3198                 .maxlen         = sizeof(int),
3199                 .mode           = 0644,
3200                 .proc_handler   = proc_dointvec,
3201         },
3202         {
3203                 .procname       = "min_adv_mss",
3204                 .data           = &ip_rt_min_advmss,
3205                 .maxlen         = sizeof(int),
3206                 .mode           = 0644,
3207                 .proc_handler   = proc_dointvec,
3208         },
3209         { }
3210 };
3211
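/*
 * Directory skeleton registered by ip_static_sysctl_init() at the end
 * of this file: it exposes ipv4_route_table as net.ipv4.route and an
 * empty placeholder as net.ipv4.neigh.
 */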
3212 static struct ctl_table empty[1];
3213
3214 static struct ctl_table ipv4_skeleton[] =
3215 {
3216         { .procname = "route",
3217           .mode = 0555, .child = ipv4_route_table},
3218         { .procname = "neigh",
3219           .mode = 0555, .child = empty},
3220         { }
3221 };
3222
3223 static __net_initdata struct ctl_path ipv4_path[] = {
3224         { .procname = "net", },
3225         { .procname = "ipv4", },
3226         { },
3227 };
3228
3229 static struct ctl_table ipv4_route_flush_table[] = {
3230         {
3231                 .procname       = "flush",
3232                 .maxlen         = sizeof(int),
3233                 .mode           = 0200,
3234                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3235         },
3236         { },
3237 };
3238
3239 static __net_initdata struct ctl_path ipv4_route_path[] = {
3240         { .procname = "net", },
3241         { .procname = "ipv4", },
3242         { .procname = "route", },
3243         { },
3244 };
3245
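/*
 * Per-namespace registration of the "flush" sysctl.  Namespaces other
 * than init_net get their own copy of ipv4_route_flush_table so that
 * extra1 can point at the owning struct net.
 */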
3246 static __net_init int sysctl_route_net_init(struct net *net)
3247 {
3248         struct ctl_table *tbl;
3249
3250         tbl = ipv4_route_flush_table;
3251         if (!net_eq(net, &init_net)) {
3252                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3253                 if (tbl == NULL)
3254                         goto err_dup;
3255         }
3256         tbl[0].extra1 = net;
3257
3258         net->ipv4.route_hdr =
3259                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3260         if (net->ipv4.route_hdr == NULL)
3261                 goto err_reg;
3262         return 0;
3263
3264 err_reg:
3265         if (tbl != ipv4_route_flush_table)
3266                 kfree(tbl);
3267 err_dup:
3268         return -ENOMEM;
3269 }
3270
3271 static __net_exit void sysctl_route_net_exit(struct net *net)
3272 {
3273         struct ctl_table *tbl;
3274
3275         tbl = net->ipv4.route_hdr->ctl_table_arg;
3276         unregister_net_sysctl_table(net->ipv4.route_hdr);
3277         BUG_ON(tbl == ipv4_route_flush_table);
3278         kfree(tbl);
3279 }
3280
3281 static __net_initdata struct pernet_operations sysctl_route_ops = {
3282         .init = sysctl_route_net_init,
3283         .exit = sysctl_route_net_exit,
3284 };
3285 #endif
3286
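/*
 * Seed the per-namespace route cache generation counter with random
 * bytes; cached entries whose rt_genid no longer matches are treated
 * as expired (see rt_is_expired()).
 */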
3287 static __net_init int rt_genid_init(struct net *net)
3288 {
3289         get_random_bytes(&net->ipv4.rt_genid,
3290                          sizeof(net->ipv4.rt_genid));
3291         return 0;
3292 }
3293
3294 static __net_initdata struct pernet_operations rt_genid_ops = {
3295         .init = rt_genid_init,
3296 };
3297
3298
3299 #ifdef CONFIG_IP_ROUTE_CLASSID
3300 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3301 #endif /* CONFIG_IP_ROUTE_CLASSID */
3302
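/*
 * "rhash_entries=" boot parameter: requested number of route cache
 * hash entries, passed to alloc_large_system_hash() in ip_rt_init()
 * instead of the automatically chosen size.
 */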
3303 static __initdata unsigned long rhash_entries;
3304 static int __init set_rhash_entries(char *str)
3305 {
3306         if (!str)
3307                 return 0;
3308         rhash_entries = simple_strtoul(str, &str, 0);
3309         return 1;
3310 }
3311 __setup("rhash_entries=", set_rhash_entries);
3312
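/*
 * Boot-time initialisation of the IPv4 routing layer: create the dst
 * slab cache, size and allocate the route cache hash table, initialise
 * devinet/fib, start the periodic cache expiry work, and register the
 * proc, xfrm, rtnetlink and sysctl hooks.
 */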
3313 int __init ip_rt_init(void)
3314 {
3315         int rc = 0;
3316
3317 #ifdef CONFIG_IP_ROUTE_CLASSID
3318         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3319         if (!ip_rt_acct)
3320                 panic("IP: failed to allocate ip_rt_acct\n");
3321 #endif
3322
3323         ipv4_dst_ops.kmem_cachep =
3324                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3325                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3326
3327         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3328
3329         if (dst_entries_init(&ipv4_dst_ops) < 0)
3330                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3331
3332         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3333                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3334
3335         rt_hash_table = (struct rt_hash_bucket *)
3336                 alloc_large_system_hash("IP route cache",
3337                                         sizeof(struct rt_hash_bucket),
3338                                         rhash_entries,
3339                                         (totalram_pages >= 128 * 1024) ?
3340                                         15 : 17,
3341                                         0,
3342                                         &rt_hash_log,
3343                                         &rt_hash_mask,
3344                                         rhash_entries ? 0 : 512 * 1024);
3345         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3346         rt_hash_lock_init();
3347
3348         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3349         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3350
3351         devinet_init();
3352         ip_fib_init();
3353
3354         /* All the timers started at system startup tend
3355            to synchronize. Perturb them a bit.
3356          */
3357         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3358         expires_ljiffies = jiffies;
3359         schedule_delayed_work(&expires_work,
3360                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3361
3362         if (ip_rt_proc_init())
3363                 printk(KERN_ERR "Unable to create route proc files\n");
3364 #ifdef CONFIG_XFRM
3365         xfrm_init();
3366         xfrm4_init(ip_rt_max_size);
3367 #endif
3368         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3369
3370 #ifdef CONFIG_SYSCTL
3371         register_pernet_subsys(&sysctl_route_ops);
3372 #endif
3373         register_pernet_subsys(&rt_genid_ops);
3374         return rc;
3375 }
3376
3377 #ifdef CONFIG_SYSCTL
3378 /*
3379  * We really need to sanitize the damn ipv4 init order; then all
3380  * this nonsense will go away.
3381  */
3382 void __init ip_static_sysctl_init(void)
3383 {
3384         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3385 }
3386 #endif