/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD,
 *                                      though our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU      0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

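/* Note: the HZ-scaled defaults above are only boot-time values; they are
 * registered as runtime sysctls (/proc/sys/net/ipv4/route/) further down in
 * this file, so they should not be treated as compile-time constants.
 */
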
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
                            int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .protocol =             cpu_to_be16(ETH_P_IP),
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .ifdown =               ipv4_dst_ifdown,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
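
/* Indexing note: rt_tos2priority() (include/net/route.h) looks this table up
 * with the four RFC 1349 TOS bits, IPTOS_TOS(tos) >> 1, so adjacent entries
 * differ only in the "minimize cost" bit; with ECN_OR_COST() expanding to
 * plain TC_PRIO_##class, that bit currently selects the same band.
 */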

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
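
/* Since the IPv4 routing cache was removed (v3.6), /proc/net/rt_cache has no
 * entries to report; the seq_file stubs above emit only the legacy header
 * line so that existing tools keep parsing the file.
 */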


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   st->in_hit,
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   st->out_hit,
                   st->out_slow_tot,
                   st->out_slow_mc,

                   st->gc_total,
                   st->gc_ignored,
                   st->gc_goal_miss,
                   st->gc_dst_overflow,
                   st->in_hlist_search,
                   st->out_hlist_search
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
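
/* rt_acct is a raw binary dump: 256 struct ip_rt_acct slots (one per routing
 * realm), summed here across all possible CPUs.  Userspace (e.g. iproute2's
 * rtacct) is expected to decode the blob; no text formatting is done.
 */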

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump(net);
}
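
/* Flushing is O(1): bumping the per-netns generation id makes every cached
 * rtable fail the rt_is_expired() check above, so stale entries are dropped
 * lazily at their next lookup instead of being walked and freed here.
 */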

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no way to
 * guarantee that the chosen ID is unique over any reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
        static DEFINE_SPINLOCK(ip_fb_id_lock);
        static u32 ip_fallback_id;
        u32 salt;

        spin_lock_bh(&ip_fb_id_lock);
        salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
        iph->id = htons(salt & 0xFFFF);
        ip_fallback_id = salt;
        spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
        struct net *net = dev_net(dst->dev);
        struct inet_peer *peer;

        peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
        if (peer) {
                iph->id = htons(inet_getid(peer, more));
                inet_putpeer(peer);
                return;
        }

        ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        u32 hval;

        hval = (__force u32) daddr;
        hval ^= (hval >> 11) ^ (hval >> 22);

        return hval & (FNHE_HASH_SIZE - 1);
}
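
/* The xor-fold above relies on FNHE_HASH_SIZE being a power of two: the
 * final mask is what confines hval to a valid bucket index.
 */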

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        unsigned int i;
        int depth;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = nh->nh_exceptions;
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                nh->nh_exceptions = hash;
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_expires = max(1UL, expires);
                }
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
        return;
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
        if (n) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                      0, 0);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent with
 *         exponential backoff, then we stop sending them entirely,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
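
/* Worked example with the defaults above (assuming HZ=1000): the k-th
 * redirect is delayed by ip_rt_redirect_load << k (20ms, 40ms, ... ~5.1s),
 * we go quiet after ip_rt_redirect_number (9) unanswered redirects, and a
 * silence of ip_rt_redirect_silence ((HZ/50) << 10, ~20.5s) resets the
 * token count.
 */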

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
                peer->rate_tokens = 0;
        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->rate_tokens >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->rate_tokens == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->rate_tokens)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->rate_tokens == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
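
/* ip_error() rate-limits ICMP errors with a token bucket in the inet peer:
 * tokens accrue one per jiffy up to ip_rt_error_burst (5*HZ) and each ICMP
 * costs ip_rt_error_cost (HZ), i.e. a burst of about five errors, then
 * roughly one per second, per source address.
 */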

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct fib_result res;

        if (dst_metric_locked(dst, RTAX_MTU))
                return;

        if (dst->dev->mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu)
                mtu = ip_rt_min_pmtu;

        if (rt->rt_pmtu == mtu &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
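
/* Learned PMTU values never touch the FIB itself: they are stored as
 * per-nexthop exceptions (fnhe), clamped below by ip_rt_min_pmtu, and they
 * expire after ip_rt_mtu_expires, at which point the path reverts to the
 * metric or device MTU.
 */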

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *dst;
        bool new = false;

        bh_lock_sock(sk);
        rt = (struct rtable *) __sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !rt) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!__sk_dst_check(sk, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        dst = dst_check(&rt->dst, 0);
        if (!dst) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                __sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it stays out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = dst->dev->mtu;

        if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        if (mtu > IP_MAX_MTU)
                mtu = IP_MAX_MTU;

        return mtu;
}
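
/* MTU precedence in ipv4_mtu(): a live (unexpired) per-destination rt_pmtu
 * wins, then an explicit RTAX_MTU metric, then the device MTU; in the
 * locked-metric case, routes via a gateway are conservatively capped at
 * 576, and everything is bounded by IP_MAX_MTU.
 */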

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = nh->nh_exceptions;
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
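
/* rt_cache_route() is lockless: cmpxchg() either installs rt in the
 * per-nexthop (input) or per-cpu (output) slot, RCU-freeing the previous
 * occupant, or loses the race and returns false so the caller falls back
 * to marking the route DST_NOCACHE.
 */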

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        spin_lock_bh(&rt_uncached_lock);
        list_add_tail(&rt->rt_uncached, &rt_uncached_list);
        spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *) dst;

        if (!list_empty(&rt->rt_uncached)) {
                spin_lock_bh(&rt_uncached_lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&rt_uncached_lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        if (!list_empty(&rt_uncached_list)) {
                struct net *net = dev_net(dev);
                struct rtable *rt;

                spin_lock_bh(&rt_uncached_lock);
                list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&rt_uncached_lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in nexthop exception or
                         * FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache we really need to set it.
                         */
                        rt->dst.flags |= DST_NOCACHE;
                        if (!rt->rt_gateway)
                                rt->rt_gateway = daddr;
                        rt_add_uncached_list(rt);
                }
        } else
                rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
        set_class_tag(rt, res->tclassid);
#endif
        set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
                                   bool nopolicy, bool noxfrm, bool will_cache)
{
        return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
                         (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
                         (nopolicy ? DST_NOPOLICY : 0) |
                         (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                                u8 tos, struct net_device *dev, int our)
{
        struct rtable *rth;
        struct in_device *in_dev = __in_dev_get_rcu(dev);
        u32 itag = 0;
        int err;

        /* Primary sanity checks. */

        if (in_dev == NULL)
                return -EINVAL;

        if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
            skb->protocol != htons(ETH_P_IP))
                goto e_inval;

        if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
                if (ipv4_is_loopback(saddr))
                        goto e_inval;

        if (ipv4_is_zeronet(saddr)) {
                if (!ipv4_is_local_multicast(daddr))
                        goto e_inval;
        } else {
                err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
                                          in_dev, &itag);
                if (err < 0)
                        goto e_err;
        }
        rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
                           IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
        if (!rth)
                goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
        rth->dst.tclassid = itag;
#endif
        rth->dst.output = ip_rt_bug;

        rth->rt_genid   = rt_genid(dev_net(dev));
        rth->rt_flags   = RTCF_MULTICAST;
        rth->rt_type    = RTN_MULTICAST;
        rth->rt_is_input = 1;
        rth->rt_iif     = 0;
        rth->rt_pmtu    = 0;
        rth->rt_gateway = 0;
        rth->rt_uses_gateway = 0;
        INIT_LIST_HEAD(&rth->rt_uncached);
        if (our) {
                rth->dst.input = ip_local_deliver;
                rth->rt_flags |= RTCF_LOCAL;
        }

#ifdef CONFIG_IP_MROUTE
        if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
                rth->dst.input = ip_mr_input;
#endif
        RT_CACHE_STAT_INC(in_slow_mc);

        skb_dst_set(skb, &rth->dst);
        return 0;

e_nobufs:
        return -ENOBUFS;
e_inval:
        return -EINVAL;
e_err:
        return err;
}


static void ip_handle_martian_source(struct net_device *dev,
                                     struct in_device *in_dev,
                                     struct sk_buff *skb,
                                     __be32 daddr,
                                     __be32 saddr)
{
        RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
                /*
                 *      RFC1812 recommendation, if source is martian,
                 *      the only hint is MAC header.
                 */
                pr_warn("martian source %pI4 from %pI4, on dev %s\n",
                        &daddr, &saddr, dev->name);
                if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                        print_hex_dump(KERN_WARNING, "ll header: ",
                                       DUMP_PREFIX_OFFSET, 16, 1,
                                       skb_mac_header(skb),
                                       dev->hard_header_len, true);
                }
        }
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
                           const struct fib_result *res,
                           struct in_device *in_dev,
                           __be32 daddr, __be32 saddr, u32 tos)
{
        struct fib_nh_exception *fnhe;
        struct rtable *rth;
        int err;
        struct in_device *out_dev;
        unsigned int flags = 0;
        bool do_cache;
        u32 itag;

        /* get a working reference to the output device */
        out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
        if (out_dev == NULL) {
                net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
                return -EINVAL;
        }

        err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
                                  in_dev->dev, in_dev, &itag);
        if (err < 0) {
                ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
                                         saddr);

                goto cleanup;
        }

        do_cache = res->fi && !itag;
        if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
            (IN_DEV_SHARED_MEDIA(out_dev) ||
             inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) {
                flags |= RTCF_DOREDIRECT;
                do_cache = false;
        }

        if (skb->protocol != htons(ETH_P_IP)) {
1557                 /* Not IP (i.e. ARP). Do not create route, if it is
1558                  * invalid for proxy arp. DNAT routes are always valid.
1559                  *
1560                  * Proxy arp feature have been extended to allow, ARP
1561                  * replies back to the same interface, to support
1562                  * Private VLAN switch technologies. See arp.c.
1563                  */
1564                 if (out_dev == in_dev &&
1565                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1566                         err = -EINVAL;
1567                         goto cleanup;
1568                 }
1569         }
1570
1571         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1572         if (do_cache) {
1573                 if (fnhe != NULL)
1574                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1575                 else
1576                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1577
1578                 if (rt_cache_valid(rth)) {
1579                         skb_dst_set_noref(skb, &rth->dst);
1580                         goto out;
1581                 }
1582         }
1583
1584         rth = rt_dst_alloc(out_dev->dev,
1585                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1586                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1587         if (!rth) {
1588                 err = -ENOBUFS;
1589                 goto cleanup;
1590         }
1591
1592         rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
1593         rth->rt_flags = flags;
1594         rth->rt_type = res->type;
1595         rth->rt_is_input = 1;
1596         rth->rt_iif     = 0;
1597         rth->rt_pmtu    = 0;
1598         rth->rt_gateway = 0;
1599         rth->rt_uses_gateway = 0;
1600         INIT_LIST_HEAD(&rth->rt_uncached);
1601
1602         rth->dst.input = ip_forward;
1603         rth->dst.output = ip_output;
1604
1605         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1606         skb_dst_set(skb, &rth->dst);
1607 out:
1608         err = 0;
1609  cleanup:
1610         return err;
1611 }
1612
1613 static int ip_mkroute_input(struct sk_buff *skb,
1614                             struct fib_result *res,
1615                             const struct flowi4 *fl4,
1616                             struct in_device *in_dev,
1617                             __be32 daddr, __be32 saddr, u32 tos)
1618 {
1619 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1620         if (res->fi && res->fi->fib_nhs > 1)
1621                 fib_select_multipath(res);
1622 #endif
1623
1624         /* create a routing cache entry */
1625         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1626 }
1627
1628 /*
1629  *      NOTE. We drop all the packets that has local source
1630  *      addresses, because every properly looped back packet
1631  *      must have correct destination already attached by output routine.
1632  *
1633  *      Such approach solves two big problems:
1634  *      1. Not simplex devices are handled properly.
1635  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1636  *      called with rcu_read_lock()
1637  */
1638
1639 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640                                u8 tos, struct net_device *dev)
1641 {
1642         struct fib_result res;
1643         struct in_device *in_dev = __in_dev_get_rcu(dev);
1644         struct flowi4   fl4;
1645         unsigned int    flags = 0;
1646         u32             itag = 0;
1647         struct rtable   *rth;
1648         int             err = -EINVAL;
1649         struct net    *net = dev_net(dev);
1650         bool do_cache;
1651
1652         /* IP on this device is disabled. */
1653
1654         if (!in_dev)
1655                 goto out;
1656
1657         /* Check for the most weird martians, which can be not detected
1658            by fib_lookup.
1659          */
1660
1661         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1662                 goto martian_source;
1663
1664         res.fi = NULL;
1665         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1666                 goto brd_input;
1667
1668         /* Accept zero addresses only to limited broadcast;
1669          * I even do not know to fix it or not. Waiting for complains :-)
1670          */
1671         if (ipv4_is_zeronet(saddr))
1672                 goto martian_source;
1673
1674         if (ipv4_is_zeronet(daddr))
1675                 goto martian_destination;
1676
1677         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1678          * and call it once if daddr or/and saddr are loopback addresses
1679          */
1680         if (ipv4_is_loopback(daddr)) {
1681                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1682                         goto martian_destination;
1683         } else if (ipv4_is_loopback(saddr)) {
1684                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1685                         goto martian_source;
1686         }
1687
1688         /*
1689          *      Now we are ready to route packet.
1690          */
1691         fl4.flowi4_oif = 0;
1692         fl4.flowi4_iif = dev->ifindex;
1693         fl4.flowi4_mark = skb->mark;
1694         fl4.flowi4_tos = tos;
1695         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1696         fl4.daddr = daddr;
1697         fl4.saddr = saddr;
1698         err = fib_lookup(net, &fl4, &res);
1699         if (err != 0)
1700                 goto no_route;
1701
1702         RT_CACHE_STAT_INC(in_slow_tot);
1703
1704         if (res.type == RTN_BROADCAST)
1705                 goto brd_input;
1706
1707         if (res.type == RTN_LOCAL) {
1708                 err = fib_validate_source(skb, saddr, daddr, tos,
1709                                           LOOPBACK_IFINDEX,
1710                                           dev, in_dev, &itag);
1711                 if (err < 0)
1712                         goto martian_source_keep_err;
1713                 goto local_input;
1714         }
1715
1716         if (!IN_DEV_FORWARD(in_dev))
1717                 goto no_route;
1718         if (res.type != RTN_UNICAST)
1719                 goto martian_destination;
1720
1721         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1722 out:    return err;
1723
1724 brd_input:
1725         if (skb->protocol != htons(ETH_P_IP))
1726                 goto e_inval;
1727
1728         if (!ipv4_is_zeronet(saddr)) {
1729                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1730                                           in_dev, &itag);
1731                 if (err < 0)
1732                         goto martian_source_keep_err;
1733         }
1734         flags |= RTCF_BROADCAST;
1735         res.type = RTN_BROADCAST;
1736         RT_CACHE_STAT_INC(in_brd);
1737
1738 local_input:
1739         do_cache = false;
1740         if (res.fi) {
1741                 if (!itag) {
1742                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1743                         if (rt_cache_valid(rth)) {
1744                                 skb_dst_set_noref(skb, &rth->dst);
1745                                 err = 0;
1746                                 goto out;
1747                         }
1748                         do_cache = true;
1749                 }
1750         }
1751
1752         rth = rt_dst_alloc(net->loopback_dev,
1753                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1754         if (!rth)
1755                 goto e_nobufs;
1756
1757         rth->dst.input= ip_local_deliver;
1758         rth->dst.output= ip_rt_bug;
1759 #ifdef CONFIG_IP_ROUTE_CLASSID
1760         rth->dst.tclassid = itag;
1761 #endif
1762
1763         rth->rt_genid = rt_genid(net);
1764         rth->rt_flags   = flags|RTCF_LOCAL;
1765         rth->rt_type    = res.type;
1766         rth->rt_is_input = 1;
1767         rth->rt_iif     = 0;
1768         rth->rt_pmtu    = 0;
1769         rth->rt_gateway = 0;
1770         rth->rt_uses_gateway = 0;
1771         INIT_LIST_HEAD(&rth->rt_uncached);
1772         if (res.type == RTN_UNREACHABLE) {
1773                 rth->dst.input= ip_error;
1774                 rth->dst.error= -err;
1775                 rth->rt_flags   &= ~RTCF_LOCAL;
1776         }
1777         if (do_cache)
1778                 rt_cache_route(&FIB_RES_NH(res), rth);
1779         skb_dst_set(skb, &rth->dst);
1780         err = 0;
1781         goto out;
1782
1783 no_route:
1784         RT_CACHE_STAT_INC(in_no_route);
1785         res.type = RTN_UNREACHABLE;
1786         if (err == -ESRCH)
1787                 err = -ENETUNREACH;
1788         goto local_input;
1789
1790         /*
1791          *      Do not cache martian addresses: they should be logged (RFC1812)
1792          */
1793 martian_destination:
1794         RT_CACHE_STAT_INC(in_martian_dst);
1795 #ifdef CONFIG_IP_ROUTE_VERBOSE
1796         if (IN_DEV_LOG_MARTIANS(in_dev))
1797                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
1798                                      &daddr, &saddr, dev->name);
1799 #endif
1800
1801 e_inval:
1802         err = -EINVAL;
1803         goto out;
1804
1805 e_nobufs:
1806         err = -ENOBUFS;
1807         goto out;
1808
1809 martian_source:
1810         err = -EINVAL;
1811 martian_source_keep_err:
1812         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1813         goto out;
1814 }
1815
1816 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1817                          u8 tos, struct net_device *dev)
1818 {
1819         int res;
1820
1821         rcu_read_lock();
1822
1823         /* Multicast recognition logic is moved from route cache to here.
1824            The problem was that too many Ethernet cards have broken/missing
1825            hardware multicast filters :-( As result the host on multicasting
1826            network acquires a lot of useless route cache entries, sort of
1827            SDR messages from all the world. Now we try to get rid of them.
1828            Really, provided software IP multicast filter is organized
1829            reasonably (at least, hashed), it does not result in a slowdown
1830            comparing with route cache reject entries.
1831            Note, that multicast routers are not affected, because
1832            route cache entry is created eventually.
1833          */
1834         if (ipv4_is_multicast(daddr)) {
1835                 struct in_device *in_dev = __in_dev_get_rcu(dev);
1836
1837                 if (in_dev) {
1838                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
1839                                                   ip_hdr(skb)->protocol);
1840                         if (our
1841 #ifdef CONFIG_IP_MROUTE
1842                                 ||
1843                             (!ipv4_is_local_multicast(daddr) &&
1844                              IN_DEV_MFORWARD(in_dev))
1845 #endif
1846                            ) {
1847                                 int res = ip_route_input_mc(skb, daddr, saddr,
1848                                                             tos, dev, our);
1849                                 rcu_read_unlock();
1850                                 return res;
1851                         }
1852                 }
1853                 rcu_read_unlock();
1854                 return -EINVAL;
1855         }
1856         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
1857         rcu_read_unlock();
1858         return res;
1859 }
1860 EXPORT_SYMBOL(ip_route_input_noref);
1861
1862 /* called with rcu_read_lock() */
1863 static struct rtable *__mkroute_output(const struct fib_result *res,
1864                                        const struct flowi4 *fl4, int orig_oif,
1865                                        struct net_device *dev_out,
1866                                        unsigned int flags)
1867 {
1868         struct fib_info *fi = res->fi;
1869         struct fib_nh_exception *fnhe;
1870         struct in_device *in_dev;
1871         u16 type = res->type;
1872         struct rtable *rth;
1873         bool do_cache;
1874
1875         in_dev = __in_dev_get_rcu(dev_out);
1876         if (!in_dev)
1877                 return ERR_PTR(-EINVAL);
1878
1879         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
1880                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
1881                         return ERR_PTR(-EINVAL);
1882
1883         if (ipv4_is_lbcast(fl4->daddr))
1884                 type = RTN_BROADCAST;
1885         else if (ipv4_is_multicast(fl4->daddr))
1886                 type = RTN_MULTICAST;
1887         else if (ipv4_is_zeronet(fl4->daddr))
1888                 return ERR_PTR(-EINVAL);
1889
1890         if (dev_out->flags & IFF_LOOPBACK)
1891                 flags |= RTCF_LOCAL;
1892
1893         do_cache = true;
1894         if (type == RTN_BROADCAST) {
1895                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
1896                 fi = NULL;
1897         } else if (type == RTN_MULTICAST) {
1898                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
1899                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
1900                                      fl4->flowi4_proto))
1901                         flags &= ~RTCF_LOCAL;
1902                 else
1903                         do_cache = false;
1904                 /* If multicast route do not exist use
1905                  * default one, but do not gateway in this case.
1906                  * Yes, it is hack.
1907                  */
1908                 if (fi && res->prefixlen < 4)
1909                         fi = NULL;
1910         }
1911
1912         fnhe = NULL;
1913         do_cache &= fi != NULL;
1914         if (do_cache) {
1915                 struct rtable __rcu **prth;
1916                 struct fib_nh *nh = &FIB_RES_NH(*res);
1917
1918                 fnhe = find_exception(nh, fl4->daddr);
1919                 if (fnhe)
1920                         prth = &fnhe->fnhe_rth_output;
1921                 else {
1922                         if (unlikely(fl4->flowi4_flags &
1923                                      FLOWI_FLAG_KNOWN_NH &&
1924                                      !(nh->nh_gw &&
1925                                        nh->nh_scope == RT_SCOPE_LINK))) {
1926                                 do_cache = false;
1927                                 goto add;
1928                         }
1929                         prth = __this_cpu_ptr(nh->nh_pcpu_rth_output);
1930                 }
1931                 rth = rcu_dereference(*prth);
1932                 if (rt_cache_valid(rth)) {
1933                         dst_hold(&rth->dst);
1934                         return rth;
1935                 }
1936         }
1937
1938 add:
1939         rth = rt_dst_alloc(dev_out,
1940                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1941                            IN_DEV_CONF_GET(in_dev, NOXFRM),
1942                            do_cache);
1943         if (!rth)
1944                 return ERR_PTR(-ENOBUFS);
1945
1946         rth->dst.output = ip_output;
1947
1948         rth->rt_genid = rt_genid(dev_net(dev_out));
1949         rth->rt_flags   = flags;
1950         rth->rt_type    = type;
1951         rth->rt_is_input = 0;
1952         rth->rt_iif     = orig_oif ? : 0;
1953         rth->rt_pmtu    = 0;
1954         rth->rt_gateway = 0;
1955         rth->rt_uses_gateway = 0;
1956         INIT_LIST_HEAD(&rth->rt_uncached);
1957
1958         RT_CACHE_STAT_INC(out_slow_tot);
1959
1960         if (flags & RTCF_LOCAL)
1961                 rth->dst.input = ip_local_deliver;
1962         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
1963                 if (flags & RTCF_LOCAL &&
1964                     !(dev_out->flags & IFF_LOOPBACK)) {
1965                         rth->dst.output = ip_mc_output;
1966                         RT_CACHE_STAT_INC(out_slow_mc);
1967                 }
1968 #ifdef CONFIG_IP_MROUTE
1969                 if (type == RTN_MULTICAST) {
1970                         if (IN_DEV_MFORWARD(in_dev) &&
1971                             !ipv4_is_local_multicast(fl4->daddr)) {
1972                                 rth->dst.input = ip_mr_input;
1973                                 rth->dst.output = ip_mc_output;
1974                         }
1975                 }
1976 #endif
1977         }
1978
1979         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
1980
1981         return rth;
1982 }
1983
1984 /*
1985  * Major route resolver routine.
1986  */
1987
1988 struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1989 {
1990         struct net_device *dev_out = NULL;
1991         __u8 tos = RT_FL_TOS(fl4);
1992         unsigned int flags = 0;
1993         struct fib_result res;
1994         struct rtable *rth;
1995         int orig_oif;
1996
1997         res.tclassid    = 0;
1998         res.fi          = NULL;
1999         res.table       = NULL;
2000
2001         orig_oif = fl4->flowi4_oif;
2002
2003         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2004         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2005         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2006                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2007
2008         rcu_read_lock();
2009         if (fl4->saddr) {
2010                 rth = ERR_PTR(-EINVAL);
2011                 if (ipv4_is_multicast(fl4->saddr) ||
2012                     ipv4_is_lbcast(fl4->saddr) ||
2013                     ipv4_is_zeronet(fl4->saddr))
2014                         goto out;
2015
2016                 /* I removed check for oif == dev_out->oif here.
2017                    It was wrong for two reasons:
2018                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2019                       is assigned to multiple interfaces.
2020                    2. Moreover, we are allowed to send packets with saddr
2021                       of another iface. --ANK
2022                  */
2023
2024                 if (fl4->flowi4_oif == 0 &&
2025                     (ipv4_is_multicast(fl4->daddr) ||
2026                      ipv4_is_lbcast(fl4->daddr))) {
2027                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2028                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2029                         if (dev_out == NULL)
2030                                 goto out;
2031
2032                         /* Special hack: user can direct multicasts
2033                            and limited broadcast via necessary interface
2034                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2035                            This hack is not just for fun, it allows
2036                            vic,vat and friends to work.
2037                            They bind socket to loopback, set ttl to zero
2038                            and expect that it will work.
2039                            From the viewpoint of routing cache they are broken,
2040                            because we are not allowed to build multicast path
2041                            with loopback source addr (look, routing cache
2042                            cannot know, that ttl is zero, so that packet
2043                            will not leave this host and route is valid).
2044                            Luckily, this hack is good workaround.
2045                          */
2046
2047                         fl4->flowi4_oif = dev_out->ifindex;
2048                         goto make_route;
2049                 }
2050
2051                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2052                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2053                         if (!__ip_dev_find(net, fl4->saddr, false))
2054                                 goto out;
2055                 }
2056         }
2057
2058
2059         if (fl4->flowi4_oif) {
2060                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2061                 rth = ERR_PTR(-ENODEV);
2062                 if (dev_out == NULL)
2063                         goto out;
2064
2065                 /* RACE: Check return value of inet_select_addr instead. */
2066                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2067                         rth = ERR_PTR(-ENETUNREACH);
2068                         goto out;
2069                 }
2070                 if (ipv4_is_local_multicast(fl4->daddr) ||
2071                     ipv4_is_lbcast(fl4->daddr)) {
2072                         if (!fl4->saddr)
2073                                 fl4->saddr = inet_select_addr(dev_out, 0,
2074                                                               RT_SCOPE_LINK);
2075                         goto make_route;
2076                 }
2077                 if (fl4->saddr) {
2078                         if (ipv4_is_multicast(fl4->daddr))
2079                                 fl4->saddr = inet_select_addr(dev_out, 0,
2080                                                               fl4->flowi4_scope);
2081                         else if (!fl4->daddr)
2082                                 fl4->saddr = inet_select_addr(dev_out, 0,
2083                                                               RT_SCOPE_HOST);
2084                 }
2085         }
2086
2087         if (!fl4->daddr) {
2088                 fl4->daddr = fl4->saddr;
2089                 if (!fl4->daddr)
2090                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2091                 dev_out = net->loopback_dev;
2092                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2093                 res.type = RTN_LOCAL;
2094                 flags |= RTCF_LOCAL;
2095                 goto make_route;
2096         }
2097
2098         if (fib_lookup(net, fl4, &res)) {
2099                 res.fi = NULL;
2100                 res.table = NULL;
2101                 if (fl4->flowi4_oif) {
2102                         /* Apparently, routing tables are wrong. Assume,
2103                            that the destination is on link.
2104
2105                            WHY? DW.
2106                            Because we are allowed to send to iface
2107                            even if it has NO routes and NO assigned
2108                            addresses. When oif is specified, routing
2109                            tables are looked up with only one purpose:
2110                            to catch if destination is gatewayed, rather than
2111                            direct. Moreover, if MSG_DONTROUTE is set,
2112                            we send packet, ignoring both routing tables
2113                            and ifaddr state. --ANK
2114
2115
2116                            We could make it even if oif is unknown,
2117                            likely IPv6, but we do not.
2118                          */
2119
2120                         if (fl4->saddr == 0)
2121                                 fl4->saddr = inet_select_addr(dev_out, 0,
2122                                                               RT_SCOPE_LINK);
2123                         res.type = RTN_UNICAST;
2124                         goto make_route;
2125                 }
2126                 rth = ERR_PTR(-ENETUNREACH);
2127                 goto out;
2128         }
2129
2130         if (res.type == RTN_LOCAL) {
2131                 if (!fl4->saddr) {
2132                         if (res.fi->fib_prefsrc)
2133                                 fl4->saddr = res.fi->fib_prefsrc;
2134                         else
2135                                 fl4->saddr = fl4->daddr;
2136                 }
2137                 dev_out = net->loopback_dev;
2138                 fl4->flowi4_oif = dev_out->ifindex;
2139                 flags |= RTCF_LOCAL;
2140                 goto make_route;
2141         }
2142
2143 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2144         if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
2145                 fib_select_multipath(&res);
2146         else
2147 #endif
2148         if (!res.prefixlen &&
2149             res.table->tb_num_default > 1 &&
2150             res.type == RTN_UNICAST && !fl4->flowi4_oif)
2151                 fib_select_default(&res);
2152
2153         if (!fl4->saddr)
2154                 fl4->saddr = FIB_RES_PREFSRC(net, res);
2155
2156         dev_out = FIB_RES_DEV(res);
2157         fl4->flowi4_oif = dev_out->ifindex;
2158
2159
2160 make_route:
2161         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2162
2163 out:
2164         rcu_read_unlock();
2165         return rth;
2166 }
2167 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2168
2169 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2170 {
2171         return NULL;
2172 }
2173
2174 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2175 {
2176         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2177
2178         return mtu ? : dst->dev->mtu;
2179 }
2180
2181 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2182                                           struct sk_buff *skb, u32 mtu)
2183 {
2184 }
2185
2186 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2187                                        struct sk_buff *skb)
2188 {
2189 }
2190
2191 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2192                                           unsigned long old)
2193 {
2194         return NULL;
2195 }
2196
2197 static struct dst_ops ipv4_dst_blackhole_ops = {
2198         .family                 =       AF_INET,
2199         .protocol               =       cpu_to_be16(ETH_P_IP),
2200         .check                  =       ipv4_blackhole_dst_check,
2201         .mtu                    =       ipv4_blackhole_mtu,
2202         .default_advmss         =       ipv4_default_advmss,
2203         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2204         .redirect               =       ipv4_rt_blackhole_redirect,
2205         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2206         .neigh_lookup           =       ipv4_neigh_lookup,
2207 };
2208
2209 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2210 {
2211         struct rtable *ort = (struct rtable *) dst_orig;
2212         struct rtable *rt;
2213
2214         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2215         if (rt) {
2216                 struct dst_entry *new = &rt->dst;
2217
2218                 new->__use = 1;
2219                 new->input = dst_discard;
2220                 new->output = dst_discard;
2221
2222                 new->dev = ort->dst.dev;
2223                 if (new->dev)
2224                         dev_hold(new->dev);
2225
2226                 rt->rt_is_input = ort->rt_is_input;
2227                 rt->rt_iif = ort->rt_iif;
2228                 rt->rt_pmtu = ort->rt_pmtu;
2229
2230                 rt->rt_genid = rt_genid(net);
2231                 rt->rt_flags = ort->rt_flags;
2232                 rt->rt_type = ort->rt_type;
2233                 rt->rt_gateway = ort->rt_gateway;
2234                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2235
2236                 INIT_LIST_HEAD(&rt->rt_uncached);
2237
2238                 dst_free(new);
2239         }
2240
2241         dst_release(dst_orig);
2242
2243         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2244 }
2245
2246 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2247                                     struct sock *sk)
2248 {
2249         struct rtable *rt = __ip_route_output_key(net, flp4);
2250
2251         if (IS_ERR(rt))
2252                 return rt;
2253
2254         if (flp4->flowi4_proto)
2255                 rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
2256                                                    flowi4_to_flowi(flp4),
2257                                                    sk, 0);
2258
2259         return rt;
2260 }
2261 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2262
2263 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src,
2264                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2265                         u32 seq, int event, int nowait, unsigned int flags)
2266 {
2267         struct rtable *rt = skb_rtable(skb);
2268         struct rtmsg *r;
2269         struct nlmsghdr *nlh;
2270         unsigned long expires = 0;
2271         u32 error;
2272         u32 metrics[RTAX_MAX];
2273
2274         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2275         if (nlh == NULL)
2276                 return -EMSGSIZE;
2277
2278         r = nlmsg_data(nlh);
2279         r->rtm_family    = AF_INET;
2280         r->rtm_dst_len  = 32;
2281         r->rtm_src_len  = 0;
2282         r->rtm_tos      = fl4->flowi4_tos;
2283         r->rtm_table    = RT_TABLE_MAIN;
2284         if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
2285                 goto nla_put_failure;
2286         r->rtm_type     = rt->rt_type;
2287         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2288         r->rtm_protocol = RTPROT_UNSPEC;
2289         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2290         if (rt->rt_flags & RTCF_NOTIFY)
2291                 r->rtm_flags |= RTM_F_NOTIFY;
2292
2293         if (nla_put_be32(skb, RTA_DST, dst))
2294                 goto nla_put_failure;
2295         if (src) {
2296                 r->rtm_src_len = 32;
2297                 if (nla_put_be32(skb, RTA_SRC, src))
2298                         goto nla_put_failure;
2299         }
2300         if (rt->dst.dev &&
2301             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2302                 goto nla_put_failure;
2303 #ifdef CONFIG_IP_ROUTE_CLASSID
2304         if (rt->dst.tclassid &&
2305             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2306                 goto nla_put_failure;
2307 #endif
2308         if (!rt_is_input_route(rt) &&
2309             fl4->saddr != src) {
2310                 if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
2311                         goto nla_put_failure;
2312         }
2313         if (rt->rt_uses_gateway &&
2314             nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
2315                 goto nla_put_failure;
2316
2317         expires = rt->dst.expires;
2318         if (expires) {
2319                 unsigned long now = jiffies;
2320
2321                 if (time_before(now, expires))
2322                         expires -= now;
2323                 else
2324                         expires = 0;
2325         }
2326
2327         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2328         if (rt->rt_pmtu && expires)
2329                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2330         if (rtnetlink_put_metrics(skb, metrics) < 0)
2331                 goto nla_put_failure;
2332
2333         if (fl4->flowi4_mark &&
2334             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2335                 goto nla_put_failure;
2336
2337         error = rt->dst.error;
2338
2339         if (rt_is_input_route(rt)) {
2340 #ifdef CONFIG_IP_MROUTE
2341                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2342                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2343                         int err = ipmr_get_route(net, skb,
2344                                                  fl4->saddr, fl4->daddr,
2345                                                  r, nowait);
2346                         if (err <= 0) {
2347                                 if (!nowait) {
2348                                         if (err == 0)
2349                                                 return 0;
2350                                         goto nla_put_failure;
2351                                 } else {
2352                                         if (err == -EMSGSIZE)
2353                                                 goto nla_put_failure;
2354                                         error = err;
2355                                 }
2356                         }
2357                 } else
2358 #endif
2359                         if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
2360                                 goto nla_put_failure;
2361         }
2362
2363         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2364                 goto nla_put_failure;
2365
2366         return nlmsg_end(skb, nlh);
2367
2368 nla_put_failure:
2369         nlmsg_cancel(skb, nlh);
2370         return -EMSGSIZE;
2371 }
2372
2373 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2374 {
2375         struct net *net = sock_net(in_skb->sk);
2376         struct rtmsg *rtm;
2377         struct nlattr *tb[RTA_MAX+1];
2378         struct rtable *rt = NULL;
2379         struct flowi4 fl4;
2380         __be32 dst = 0;
2381         __be32 src = 0;
2382         u32 iif;
2383         int err;
2384         int mark;
2385         struct sk_buff *skb;
2386
2387         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2388         if (err < 0)
2389                 goto errout;
2390
2391         rtm = nlmsg_data(nlh);
2392
2393         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2394         if (skb == NULL) {
2395                 err = -ENOBUFS;
2396                 goto errout;
2397         }
2398
2399         /* Reserve room for dummy headers, this skb can pass
2400            through good chunk of routing engine.
2401          */
2402         skb_reset_mac_header(skb);
2403         skb_reset_network_header(skb);
2404
2405         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2406         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2407         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2408
2409         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2410         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2411         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2412         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2413
2414         memset(&fl4, 0, sizeof(fl4));
2415         fl4.daddr = dst;
2416         fl4.saddr = src;
2417         fl4.flowi4_tos = rtm->rtm_tos;
2418         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2419         fl4.flowi4_mark = mark;
2420
2421         if (iif) {
2422                 struct net_device *dev;
2423
2424                 dev = __dev_get_by_index(net, iif);
2425                 if (dev == NULL) {
2426                         err = -ENODEV;
2427                         goto errout_free;
2428                 }
2429
2430                 skb->protocol   = htons(ETH_P_IP);
2431                 skb->dev        = dev;
2432                 skb->mark       = mark;
2433                 local_bh_disable();
2434                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2435                 local_bh_enable();
2436
2437                 rt = skb_rtable(skb);
2438                 if (err == 0 && rt->dst.error)
2439                         err = -rt->dst.error;
2440         } else {
2441                 rt = ip_route_output_key(net, &fl4);
2442
2443                 err = 0;
2444                 if (IS_ERR(rt))
2445                         err = PTR_ERR(rt);
2446         }
2447
2448         if (err)
2449                 goto errout_free;
2450
2451         skb_dst_set(skb, &rt->dst);
2452         if (rtm->rtm_flags & RTM_F_NOTIFY)
2453                 rt->rt_flags |= RTCF_NOTIFY;
2454
2455         err = rt_fill_info(net, dst, src, &fl4, skb,
2456                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2457                            RTM_NEWROUTE, 0, 0);
2458         if (err <= 0)
2459                 goto errout_free;
2460
2461         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2462 errout:
2463         return err;
2464
2465 errout_free:
2466         kfree_skb(skb);
2467         goto errout;
2468 }
2469
2470 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2471 {
2472         return skb->len;
2473 }
2474
2475 void ip_rt_multicast_event(struct in_device *in_dev)
2476 {
2477         rt_cache_flush(dev_net(in_dev->dev));
2478 }
2479
2480 #ifdef CONFIG_SYSCTL
2481 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
2482 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2483 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2484 static int ip_rt_gc_elasticity __read_mostly    = 8;
2485
2486 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2487                                         void __user *buffer,
2488                                         size_t *lenp, loff_t *ppos)
2489 {
2490         struct net *net = (struct net *)__ctl->extra1;
2491
2492         if (write) {
2493                 rt_cache_flush(net);
2494                 fnhe_genid_bump(net);
2495                 return 0;
2496         }
2497
2498         return -EINVAL;
2499 }
2500
2501 static struct ctl_table ipv4_route_table[] = {
2502         {
2503                 .procname       = "gc_thresh",
2504                 .data           = &ipv4_dst_ops.gc_thresh,
2505                 .maxlen         = sizeof(int),
2506                 .mode           = 0644,
2507                 .proc_handler   = proc_dointvec,
2508         },
2509         {
2510                 .procname       = "max_size",
2511                 .data           = &ip_rt_max_size,
2512                 .maxlen         = sizeof(int),
2513                 .mode           = 0644,
2514                 .proc_handler   = proc_dointvec,
2515         },
2516         {
2517                 /*  Deprecated. Use gc_min_interval_ms */
2518
2519                 .procname       = "gc_min_interval",
2520                 .data           = &ip_rt_gc_min_interval,
2521                 .maxlen         = sizeof(int),
2522                 .mode           = 0644,
2523                 .proc_handler   = proc_dointvec_jiffies,
2524         },
2525         {
2526                 .procname       = "gc_min_interval_ms",
2527                 .data           = &ip_rt_gc_min_interval,
2528                 .maxlen         = sizeof(int),
2529                 .mode           = 0644,
2530                 .proc_handler   = proc_dointvec_ms_jiffies,
2531         },
2532         {
2533                 .procname       = "gc_timeout",
2534                 .data           = &ip_rt_gc_timeout,
2535                 .maxlen         = sizeof(int),
2536                 .mode           = 0644,
2537                 .proc_handler   = proc_dointvec_jiffies,
2538         },
2539         {
2540                 .procname       = "gc_interval",
2541                 .data           = &ip_rt_gc_interval,
2542                 .maxlen         = sizeof(int),
2543                 .mode           = 0644,
2544                 .proc_handler   = proc_dointvec_jiffies,
2545         },
2546         {
2547                 .procname       = "redirect_load",
2548                 .data           = &ip_rt_redirect_load,
2549                 .maxlen         = sizeof(int),
2550                 .mode           = 0644,
2551                 .proc_handler   = proc_dointvec,
2552         },
2553         {
2554                 .procname       = "redirect_number",
2555                 .data           = &ip_rt_redirect_number,
2556                 .maxlen         = sizeof(int),
2557                 .mode           = 0644,
2558                 .proc_handler   = proc_dointvec,
2559         },
2560         {
2561                 .procname       = "redirect_silence",
2562                 .data           = &ip_rt_redirect_silence,
2563                 .maxlen         = sizeof(int),
2564                 .mode           = 0644,
2565                 .proc_handler   = proc_dointvec,
2566         },
2567         {
2568                 .procname       = "error_cost",
2569                 .data           = &ip_rt_error_cost,
2570                 .maxlen         = sizeof(int),
2571                 .mode           = 0644,
2572                 .proc_handler   = proc_dointvec,
2573         },
2574         {
2575                 .procname       = "error_burst",
2576                 .data           = &ip_rt_error_burst,
2577                 .maxlen         = sizeof(int),
2578                 .mode           = 0644,
2579                 .proc_handler   = proc_dointvec,
2580         },
2581         {
2582                 .procname       = "gc_elasticity",
2583                 .data           = &ip_rt_gc_elasticity,
2584                 .maxlen         = sizeof(int),
2585                 .mode           = 0644,
2586                 .proc_handler   = proc_dointvec,
2587         },
2588         {
2589                 .procname       = "mtu_expires",
2590                 .data           = &ip_rt_mtu_expires,
2591                 .maxlen         = sizeof(int),
2592                 .mode           = 0644,
2593                 .proc_handler   = proc_dointvec_jiffies,
2594         },
2595         {
2596                 .procname       = "min_pmtu",
2597                 .data           = &ip_rt_min_pmtu,
2598                 .maxlen         = sizeof(int),
2599                 .mode           = 0644,
2600                 .proc_handler   = proc_dointvec,
2601         },
2602         {
2603                 .procname       = "min_adv_mss",
2604                 .data           = &ip_rt_min_advmss,
2605                 .maxlen         = sizeof(int),
2606                 .mode           = 0644,
2607                 .proc_handler   = proc_dointvec,
2608         },
2609         { }
2610 };
2611
2612 static struct ctl_table ipv4_route_flush_table[] = {
2613         {
2614                 .procname       = "flush",
2615                 .maxlen         = sizeof(int),
2616                 .mode           = 0200,
2617                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2618         },
2619         { },
2620 };
2621
2622 static __net_init int sysctl_route_net_init(struct net *net)
2623 {
2624         struct ctl_table *tbl;
2625
2626         tbl = ipv4_route_flush_table;
2627         if (!net_eq(net, &init_net)) {
2628                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2629                 if (tbl == NULL)
2630                         goto err_dup;
2631
2632                 /* Don't export sysctls to unprivileged users */
2633                 if (net->user_ns != &init_user_ns)
2634                         tbl[0].procname = NULL;
2635         }
2636         tbl[0].extra1 = net;
2637
2638         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2639         if (net->ipv4.route_hdr == NULL)
2640                 goto err_reg;
2641         return 0;
2642
2643 err_reg:
2644         if (tbl != ipv4_route_flush_table)
2645                 kfree(tbl);
2646 err_dup:
2647         return -ENOMEM;
2648 }
2649
2650 static __net_exit void sysctl_route_net_exit(struct net *net)
2651 {
2652         struct ctl_table *tbl;
2653
2654         tbl = net->ipv4.route_hdr->ctl_table_arg;
2655         unregister_net_sysctl_table(net->ipv4.route_hdr);
2656         BUG_ON(tbl == ipv4_route_flush_table);
2657         kfree(tbl);
2658 }
2659
2660 static __net_initdata struct pernet_operations sysctl_route_ops = {
2661         .init = sysctl_route_net_init,
2662         .exit = sysctl_route_net_exit,
2663 };
2664 #endif
2665
2666 static __net_init int rt_genid_init(struct net *net)
2667 {
2668         atomic_set(&net->rt_genid, 0);
2669         atomic_set(&net->fnhe_genid, 0);
2670         get_random_bytes(&net->ipv4.dev_addr_genid,
2671                          sizeof(net->ipv4.dev_addr_genid));
2672         return 0;
2673 }
2674
2675 static __net_initdata struct pernet_operations rt_genid_ops = {
2676         .init = rt_genid_init,
2677 };
2678
2679 static int __net_init ipv4_inetpeer_init(struct net *net)
2680 {
2681         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2682
2683         if (!bp)
2684                 return -ENOMEM;
2685         inet_peer_base_init(bp);
2686         net->ipv4.peers = bp;
2687         return 0;
2688 }
2689
2690 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2691 {
2692         struct inet_peer_base *bp = net->ipv4.peers;
2693
2694         net->ipv4.peers = NULL;
2695         inetpeer_invalidate_tree(bp);
2696         kfree(bp);
2697 }
2698
2699 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2700         .init   =       ipv4_inetpeer_init,
2701         .exit   =       ipv4_inetpeer_exit,
2702 };
2703
2704 #ifdef CONFIG_IP_ROUTE_CLASSID
2705 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2706 #endif /* CONFIG_IP_ROUTE_CLASSID */
2707
2708 int __init ip_rt_init(void)
2709 {
2710         int rc = 0;
2711
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2714         if (!ip_rt_acct)
2715                 panic("IP: failed to allocate ip_rt_acct\n");
2716 #endif
2717
2718         ipv4_dst_ops.kmem_cachep =
2719                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2720                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2721
2722         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2723
2724         if (dst_entries_init(&ipv4_dst_ops) < 0)
2725                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2726
2727         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2728                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2729
2730         ipv4_dst_ops.gc_thresh = ~0;
2731         ip_rt_max_size = INT_MAX;
2732
2733         devinet_init();
2734         ip_fib_init();
2735
2736         if (ip_rt_proc_init())
2737                 pr_err("Unable to create route proc files\n");
2738 #ifdef CONFIG_XFRM
2739         xfrm_init();
2740         xfrm4_init();
2741 #endif
2742         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
2743
2744 #ifdef CONFIG_SYSCTL
2745         register_pernet_subsys(&sysctl_route_ops);
2746 #endif
2747         register_pernet_subsys(&rt_genid_ops);
2748         register_pernet_subsys(&ipv4_inetpeer_ops);
2749         return rc;
2750 }
2751
2752 #ifdef CONFIG_SYSCTL
2753 /*
2754  * We really need to sanitize the damn ipv4 init order, then all
2755  * this nonsense will go away.
2756  */
2757 void __init ip_static_sysctl_init(void)
2758 {
2759         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
2760 }
2761 #endif