]> Pileus Git - ~andy/linux/blob - net/ipv6/ip6_output.c
Merge branch 'x86-reboot-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...
[~andy/linux] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71                        skb_dst(skb)->dev, dst_output);
72 }
73
/*
 *	Send a locally generated packet: run __ip6_local_out() and, when it
 *	returns 1, hand the packet to dst_output().  Any other value from
 *	__ip6_local_out() is returned to the caller unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int verdict = __ip6_local_out(skb);

	if (unlikely(verdict != 1))
		return verdict;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89         skb_reset_mac_header(newskb);
90         __skb_pull(newskb, skb_network_offset(newskb));
91         newskb->pkt_type = PACKET_LOOPBACK;
92         newskb->ip_summed = CHECKSUM_UNNECESSARY;
93         WARN_ON(!skb_dst(newskb));
94
95         netif_rx_ni(newskb);
96         return 0;
97 }
98
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101         struct dst_entry *dst = skb_dst(skb);
102         struct net_device *dev = dst->dev;
103         struct neighbour *neigh;
104
105         skb->protocol = htons(ETH_P_IPV6);
106         skb->dev = dev;
107
108         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110
111                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112                     ((mroute6_socket(dev_net(dev), skb) &&
113                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115                                          &ipv6_hdr(skb)->saddr))) {
116                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117
118                         /* Do not check for IFF_ALLMULTI; multicast routing
119                            is not supported in any case.
120                          */
121                         if (newskb)
122                                 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123                                         newskb, NULL, newskb->dev,
124                                         ip6_dev_loopback_xmit);
125
126                         if (ipv6_hdr(skb)->hop_limit == 0) {
127                                 IP6_INC_STATS(dev_net(dev), idev,
128                                               IPSTATS_MIB_OUTDISCARDS);
129                                 kfree_skb(skb);
130                                 return 0;
131                         }
132                 }
133
134                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135                                 skb->len);
136         }
137
138         rcu_read_lock();
139         neigh = dst_get_neighbour_noref(dst);
140         if (neigh) {
141                 int res = neigh_output(neigh, skb);
142
143                 rcu_read_unlock();
144                 return res;
145         }
146         rcu_read_unlock();
147         IP6_INC_STATS_BH(dev_net(dst->dev),
148                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149         kfree_skb(skb);
150         return -EINVAL;
151 }
152
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156             dst_allfrag(skb_dst(skb)))
157                 return ip6_fragment(skb, ip6_finish_output2);
158         else
159                 return ip6_finish_output2(skb);
160 }
161
162 int ip6_output(struct sk_buff *skb)
163 {
164         struct net_device *dev = skb_dst(skb)->dev;
165         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166         if (unlikely(idev->cnf.disable_ipv6)) {
167                 IP6_INC_STATS(dev_net(dev), idev,
168                               IPSTATS_MIB_OUTDISCARDS);
169                 kfree_skb(skb);
170                 return 0;
171         }
172
173         return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174                             ip6_finish_output,
175                             !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177
178 /*
179  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183              struct ipv6_txoptions *opt, int tclass)
184 {
185         struct net *net = sock_net(sk);
186         struct ipv6_pinfo *np = inet6_sk(sk);
187         struct in6_addr *first_hop = &fl6->daddr;
188         struct dst_entry *dst = skb_dst(skb);
189         struct ipv6hdr *hdr;
190         u8  proto = fl6->flowi6_proto;
191         int seg_len = skb->len;
192         int hlimit = -1;
193         u32 mtu;
194
195         if (opt) {
196                 unsigned int head_room;
197
198                 /* First: exthdrs may take lots of space (~8K for now)
199                    MAX_HEADER is not enough.
200                  */
201                 head_room = opt->opt_nflen + opt->opt_flen;
202                 seg_len += head_room;
203                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204
205                 if (skb_headroom(skb) < head_room) {
206                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207                         if (skb2 == NULL) {
208                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209                                               IPSTATS_MIB_OUTDISCARDS);
210                                 kfree_skb(skb);
211                                 return -ENOBUFS;
212                         }
213                         consume_skb(skb);
214                         skb = skb2;
215                         skb_set_owner_w(skb, sk);
216                 }
217                 if (opt->opt_flen)
218                         ipv6_push_frag_opts(skb, opt, &proto);
219                 if (opt->opt_nflen)
220                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221         }
222
223         skb_push(skb, sizeof(struct ipv6hdr));
224         skb_reset_network_header(skb);
225         hdr = ipv6_hdr(skb);
226
227         /*
228          *      Fill in the IPv6 header
229          */
230         if (np)
231                 hlimit = np->hop_limit;
232         if (hlimit < 0)
233                 hlimit = ip6_dst_hoplimit(dst);
234
235         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236
237         hdr->payload_len = htons(seg_len);
238         hdr->nexthdr = proto;
239         hdr->hop_limit = hlimit;
240
241         hdr->saddr = fl6->saddr;
242         hdr->daddr = *first_hop;
243
244         skb->priority = sk->sk_priority;
245         skb->mark = sk->sk_mark;
246
247         mtu = dst_mtu(dst);
248         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250                               IPSTATS_MIB_OUT, skb->len);
251                 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252                                dst->dev, dst_output);
253         }
254
255         net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256         skb->dev = dst->dev;
257         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259         kfree_skb(skb);
260         return -EMSGSIZE;
261 }
262
263 EXPORT_SYMBOL(ip6_xmit);
264
265 /*
266  *      To avoid extra problems ND packets are send through this
267  *      routine. It's code duplication but I really want to avoid
268  *      extra checks since ipv6_build_header is used by TCP (which
269  *      is for us performance critical)
270  */
271
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273                const struct in6_addr *saddr, const struct in6_addr *daddr,
274                int proto, int len)
275 {
276         struct ipv6_pinfo *np = inet6_sk(sk);
277         struct ipv6hdr *hdr;
278
279         skb->protocol = htons(ETH_P_IPV6);
280         skb->dev = dev;
281
282         skb_reset_network_header(skb);
283         skb_put(skb, sizeof(struct ipv6hdr));
284         hdr = ipv6_hdr(skb);
285
286         *(__be32*)hdr = htonl(0x60000000);
287
288         hdr->payload_len = htons(len);
289         hdr->nexthdr = proto;
290         hdr->hop_limit = np->hop_limit;
291
292         hdr->saddr = *saddr;
293         hdr->daddr = *daddr;
294
295         return 0;
296 }
297
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300         struct ip6_ra_chain *ra;
301         struct sock *last = NULL;
302
303         read_lock(&ip6_ra_lock);
304         for (ra = ip6_ra_chain; ra; ra = ra->next) {
305                 struct sock *sk = ra->sk;
306                 if (sk && ra->sel == sel &&
307                     (!sk->sk_bound_dev_if ||
308                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
309                         if (last) {
310                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311                                 if (skb2)
312                                         rawv6_rcv(last, skb2);
313                         }
314                         last = sk;
315                 }
316         }
317
318         if (last) {
319                 rawv6_rcv(last, skb);
320                 read_unlock(&ip6_ra_lock);
321                 return 1;
322         }
323         read_unlock(&ip6_ra_lock);
324         return 0;
325 }
326
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329         struct ipv6hdr *hdr = ipv6_hdr(skb);
330         u8 nexthdr = hdr->nexthdr;
331         __be16 frag_off;
332         int offset;
333
334         if (ipv6_ext_hdr(nexthdr)) {
335                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336                 if (offset < 0)
337                         return 0;
338         } else
339                 offset = sizeof(struct ipv6hdr);
340
341         if (nexthdr == IPPROTO_ICMPV6) {
342                 struct icmp6hdr *icmp6;
343
344                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
345                                          offset + 1 - skb->data)))
346                         return 0;
347
348                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349
350                 switch (icmp6->icmp6_type) {
351                 case NDISC_ROUTER_SOLICITATION:
352                 case NDISC_ROUTER_ADVERTISEMENT:
353                 case NDISC_NEIGHBOUR_SOLICITATION:
354                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
355                 case NDISC_REDIRECT:
356                         /* For reaction involving unicast neighbor discovery
357                          * message destined to the proxied address, pass it to
358                          * input function.
359                          */
360                         return 1;
361                 default:
362                         break;
363                 }
364         }
365
366         /*
367          * The proxying router can't forward traffic sent to a link-local
368          * address, so signal the sender and discard the packet. This
369          * behavior is clarified by the MIPv6 specification.
370          */
371         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372                 dst_link_failure(skb);
373                 return -1;
374         }
375
376         return 0;
377 }
378
/*
 *	Continuation for the NF_INET_FORWARD hook: send the forwarded
 *	packet out through its route and return dst_output()'s result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
383
384 int ip6_forward(struct sk_buff *skb)
385 {
386         struct dst_entry *dst = skb_dst(skb);
387         struct ipv6hdr *hdr = ipv6_hdr(skb);
388         struct inet6_skb_parm *opt = IP6CB(skb);
389         struct net *net = dev_net(dst->dev);
390         u32 mtu;
391
392         if (net->ipv6.devconf_all->forwarding == 0)
393                 goto error;
394
395         if (skb_warn_if_lro(skb))
396                 goto drop;
397
398         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400                 goto drop;
401         }
402
403         if (skb->pkt_type != PACKET_HOST)
404                 goto drop;
405
406         skb_forward_csum(skb);
407
408         /*
409          *      We DO NOT make any processing on
410          *      RA packets, pushing them to user level AS IS
411          *      without ane WARRANTY that application will be able
412          *      to interpret them. The reason is that we
413          *      cannot make anything clever here.
414          *
415          *      We are not end-node, so that if packet contains
416          *      AH/ESP, we cannot make anything.
417          *      Defragmentation also would be mistake, RA packets
418          *      cannot be fragmented, because there is no warranty
419          *      that different fragments will go along one path. --ANK
420          */
421         if (opt->ra) {
422                 u8 *ptr = skb_network_header(skb) + opt->ra;
423                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424                         return 0;
425         }
426
427         /*
428          *      check and decrement ttl
429          */
430         if (hdr->hop_limit <= 1) {
431                 /* Force OUTPUT device used as source address */
432                 skb->dev = dst->dev;
433                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434                 IP6_INC_STATS_BH(net,
435                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436
437                 kfree_skb(skb);
438                 return -ETIMEDOUT;
439         }
440
441         /* XXX: idev->cnf.proxy_ndp? */
442         if (net->ipv6.devconf_all->proxy_ndp &&
443             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444                 int proxied = ip6_forward_proxy_check(skb);
445                 if (proxied > 0)
446                         return ip6_input(skb);
447                 else if (proxied < 0) {
448                         IP6_INC_STATS(net, ip6_dst_idev(dst),
449                                       IPSTATS_MIB_INDISCARDS);
450                         goto drop;
451                 }
452         }
453
454         if (!xfrm6_route_forward(skb)) {
455                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456                 goto drop;
457         }
458         dst = skb_dst(skb);
459
460         /* IPv6 specs say nothing about it, but it is clear that we cannot
461            send redirects to source routed frames.
462            We don't send redirects to frames decapsulated from IPsec.
463          */
464         if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465                 struct in6_addr *target = NULL;
466                 struct rt6_info *rt;
467
468                 /*
469                  *      incoming and outgoing devices are the same
470                  *      send a redirect.
471                  */
472
473                 rt = (struct rt6_info *) dst;
474                 if (rt->rt6i_flags & RTF_GATEWAY)
475                         target = &rt->rt6i_gateway;
476                 else
477                         target = &hdr->daddr;
478
479                 if (!rt->rt6i_peer)
480                         rt6_bind_peer(rt, 1);
481
482                 /* Limit redirects both by destination (here)
483                    and by source (inside ndisc_send_redirect)
484                  */
485                 if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486                         ndisc_send_redirect(skb, target);
487         } else {
488                 int addrtype = ipv6_addr_type(&hdr->saddr);
489
490                 /* This check is security critical. */
491                 if (addrtype == IPV6_ADDR_ANY ||
492                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493                         goto error;
494                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
495                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496                                     ICMPV6_NOT_NEIGHBOUR, 0);
497                         goto error;
498                 }
499         }
500
501         mtu = dst_mtu(dst);
502         if (mtu < IPV6_MIN_MTU)
503                 mtu = IPV6_MIN_MTU;
504
505         if (skb->len > mtu && !skb_is_gso(skb)) {
506                 /* Again, force OUTPUT device used as source address */
507                 skb->dev = dst->dev;
508                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509                 IP6_INC_STATS_BH(net,
510                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511                 IP6_INC_STATS_BH(net,
512                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513                 kfree_skb(skb);
514                 return -EMSGSIZE;
515         }
516
517         if (skb_cow(skb, dst->dev->hard_header_len)) {
518                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519                 goto drop;
520         }
521
522         hdr = ipv6_hdr(skb);
523
524         /* Mangling hops number delayed to point after skb COW */
525
526         hdr->hop_limit--;
527
528         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529         IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
530         return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
531                        ip6_forward_finish);
532
533 error:
534         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
535 drop:
536         kfree_skb(skb);
537         return -EINVAL;
538 }
539
540 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
541 {
542         to->pkt_type = from->pkt_type;
543         to->priority = from->priority;
544         to->protocol = from->protocol;
545         skb_dst_drop(to);
546         skb_dst_set(to, dst_clone(skb_dst(from)));
547         to->dev = from->dev;
548         to->mark = from->mark;
549
550 #ifdef CONFIG_NET_SCHED
551         to->tc_index = from->tc_index;
552 #endif
553         nf_copy(to, from);
554 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
555     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
556         to->nf_trace = from->nf_trace;
557 #endif
558         skb_copy_secmark(to, from);
559 }
560
561 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
562 {
563         u16 offset = sizeof(struct ipv6hdr);
564         struct ipv6_opt_hdr *exthdr =
565                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
566         unsigned int packet_len = skb->tail - skb->network_header;
567         int found_rhdr = 0;
568         *nexthdr = &ipv6_hdr(skb)->nexthdr;
569
570         while (offset + 1 <= packet_len) {
571
572                 switch (**nexthdr) {
573
574                 case NEXTHDR_HOP:
575                         break;
576                 case NEXTHDR_ROUTING:
577                         found_rhdr = 1;
578                         break;
579                 case NEXTHDR_DEST:
580 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
581                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
582                                 break;
583 #endif
584                         if (found_rhdr)
585                                 return offset;
586                         break;
587                 default :
588                         return offset;
589                 }
590
591                 offset += ipv6_optlen(exthdr);
592                 *nexthdr = &exthdr->nexthdr;
593                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
594                                                  offset);
595         }
596
597         return offset;
598 }
599
600 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
601 {
602         static atomic_t ipv6_fragmentation_id;
603         int old, new;
604
605         if (rt && !(rt->dst.flags & DST_NOPEER)) {
606                 struct inet_peer *peer;
607
608                 if (!rt->rt6i_peer)
609                         rt6_bind_peer(rt, 1);
610                 peer = rt->rt6i_peer;
611                 if (peer) {
612                         fhdr->identification = htonl(inet_getid(peer, 0));
613                         return;
614                 }
615         }
616         do {
617                 old = atomic_read(&ipv6_fragmentation_id);
618                 new = old + 1;
619                 if (!new)
620                         new = 1;
621         } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
622         fhdr->identification = htonl(new);
623 }
624
625 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
626 {
627         struct sk_buff *frag;
628         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
629         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
630         struct ipv6hdr *tmp_hdr;
631         struct frag_hdr *fh;
632         unsigned int mtu, hlen, left, len;
633         int hroom, troom;
634         __be32 frag_id = 0;
635         int ptr, offset = 0, err=0;
636         u8 *prevhdr, nexthdr = 0;
637         struct net *net = dev_net(skb_dst(skb)->dev);
638
639         hlen = ip6_find_1stfragopt(skb, &prevhdr);
640         nexthdr = *prevhdr;
641
642         mtu = ip6_skb_dst_mtu(skb);
643
644         /* We must not fragment if the socket is set to force MTU discovery
645          * or if the skb it not generated by a local socket.
646          */
647         if (unlikely(!skb->local_df && skb->len > mtu)) {
648                 if (skb->sk && dst_allfrag(skb_dst(skb)))
649                         sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
650
651                 skb->dev = skb_dst(skb)->dev;
652                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
653                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
654                               IPSTATS_MIB_FRAGFAILS);
655                 kfree_skb(skb);
656                 return -EMSGSIZE;
657         }
658
659         if (np && np->frag_size < mtu) {
660                 if (np->frag_size)
661                         mtu = np->frag_size;
662         }
663         mtu -= hlen + sizeof(struct frag_hdr);
664
665         if (skb_has_frag_list(skb)) {
666                 int first_len = skb_pagelen(skb);
667                 struct sk_buff *frag2;
668
669                 if (first_len - hlen > mtu ||
670                     ((first_len - hlen) & 7) ||
671                     skb_cloned(skb))
672                         goto slow_path;
673
674                 skb_walk_frags(skb, frag) {
675                         /* Correct geometry. */
676                         if (frag->len > mtu ||
677                             ((frag->len & 7) && frag->next) ||
678                             skb_headroom(frag) < hlen)
679                                 goto slow_path_clean;
680
681                         /* Partially cloned skb? */
682                         if (skb_shared(frag))
683                                 goto slow_path_clean;
684
685                         BUG_ON(frag->sk);
686                         if (skb->sk) {
687                                 frag->sk = skb->sk;
688                                 frag->destructor = sock_wfree;
689                         }
690                         skb->truesize -= frag->truesize;
691                 }
692
693                 err = 0;
694                 offset = 0;
695                 frag = skb_shinfo(skb)->frag_list;
696                 skb_frag_list_init(skb);
697                 /* BUILD HEADER */
698
699                 *prevhdr = NEXTHDR_FRAGMENT;
700                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
701                 if (!tmp_hdr) {
702                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
703                                       IPSTATS_MIB_FRAGFAILS);
704                         return -ENOMEM;
705                 }
706
707                 __skb_pull(skb, hlen);
708                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
709                 __skb_push(skb, hlen);
710                 skb_reset_network_header(skb);
711                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
712
713                 ipv6_select_ident(fh, rt);
714                 fh->nexthdr = nexthdr;
715                 fh->reserved = 0;
716                 fh->frag_off = htons(IP6_MF);
717                 frag_id = fh->identification;
718
719                 first_len = skb_pagelen(skb);
720                 skb->data_len = first_len - skb_headlen(skb);
721                 skb->len = first_len;
722                 ipv6_hdr(skb)->payload_len = htons(first_len -
723                                                    sizeof(struct ipv6hdr));
724
725                 dst_hold(&rt->dst);
726
727                 for (;;) {
728                         /* Prepare header of the next frame,
729                          * before previous one went down. */
730                         if (frag) {
731                                 frag->ip_summed = CHECKSUM_NONE;
732                                 skb_reset_transport_header(frag);
733                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
734                                 __skb_push(frag, hlen);
735                                 skb_reset_network_header(frag);
736                                 memcpy(skb_network_header(frag), tmp_hdr,
737                                        hlen);
738                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
739                                 fh->nexthdr = nexthdr;
740                                 fh->reserved = 0;
741                                 fh->frag_off = htons(offset);
742                                 if (frag->next != NULL)
743                                         fh->frag_off |= htons(IP6_MF);
744                                 fh->identification = frag_id;
745                                 ipv6_hdr(frag)->payload_len =
746                                                 htons(frag->len -
747                                                       sizeof(struct ipv6hdr));
748                                 ip6_copy_metadata(frag, skb);
749                         }
750
751                         err = output(skb);
752                         if(!err)
753                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
754                                               IPSTATS_MIB_FRAGCREATES);
755
756                         if (err || !frag)
757                                 break;
758
759                         skb = frag;
760                         frag = skb->next;
761                         skb->next = NULL;
762                 }
763
764                 kfree(tmp_hdr);
765
766                 if (err == 0) {
767                         IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
768                                       IPSTATS_MIB_FRAGOKS);
769                         dst_release(&rt->dst);
770                         return 0;
771                 }
772
773                 while (frag) {
774                         skb = frag->next;
775                         kfree_skb(frag);
776                         frag = skb;
777                 }
778
779                 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
780                               IPSTATS_MIB_FRAGFAILS);
781                 dst_release(&rt->dst);
782                 return err;
783
784 slow_path_clean:
785                 skb_walk_frags(skb, frag2) {
786                         if (frag2 == frag)
787                                 break;
788                         frag2->sk = NULL;
789                         frag2->destructor = NULL;
790                         skb->truesize += frag2->truesize;
791                 }
792         }
793
794 slow_path:
795         if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
796             skb_checksum_help(skb))
797                 goto fail;
798
799         left = skb->len - hlen;         /* Space per frame */
800         ptr = hlen;                     /* Where to start from */
801
802         /*
803          *      Fragment the datagram.
804          */
805
806         *prevhdr = NEXTHDR_FRAGMENT;
807         hroom = LL_RESERVED_SPACE(rt->dst.dev);
808         troom = rt->dst.dev->needed_tailroom;
809
810         /*
811          *      Keep copying data until we run out.
812          */
813         while(left > 0) {
814                 len = left;
815                 /* IF: it doesn't fit, use 'mtu' - the data space left */
816                 if (len > mtu)
817                         len = mtu;
818                 /* IF: we are not sending up to and including the packet end
819                    then align the next start on an eight byte boundary */
820                 if (len < left) {
821                         len &= ~7;
822                 }
823                 /*
824                  *      Allocate buffer.
825                  */
826
827                 if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
828                                       hroom + troom, GFP_ATOMIC)) == NULL) {
829                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
830                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
831                                       IPSTATS_MIB_FRAGFAILS);
832                         err = -ENOMEM;
833                         goto fail;
834                 }
835
836                 /*
837                  *      Set up data on packet
838                  */
839
840                 ip6_copy_metadata(frag, skb);
841                 skb_reserve(frag, hroom);
842                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
843                 skb_reset_network_header(frag);
844                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
845                 frag->transport_header = (frag->network_header + hlen +
846                                           sizeof(struct frag_hdr));
847
848                 /*
849                  *      Charge the memory for the fragment to any owner
850                  *      it might possess
851                  */
852                 if (skb->sk)
853                         skb_set_owner_w(frag, skb->sk);
854
855                 /*
856                  *      Copy the packet header into the new buffer.
857                  */
858                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
859
860                 /*
861                  *      Build fragment header.
862                  */
863                 fh->nexthdr = nexthdr;
864                 fh->reserved = 0;
865                 if (!frag_id) {
866                         ipv6_select_ident(fh, rt);
867                         frag_id = fh->identification;
868                 } else
869                         fh->identification = frag_id;
870
871                 /*
872                  *      Copy a block of the IP datagram.
873                  */
874                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
875                         BUG();
876                 left -= len;
877
878                 fh->frag_off = htons(offset);
879                 if (left > 0)
880                         fh->frag_off |= htons(IP6_MF);
881                 ipv6_hdr(frag)->payload_len = htons(frag->len -
882                                                     sizeof(struct ipv6hdr));
883
884                 ptr += len;
885                 offset += len;
886
887                 /*
888                  *      Put this fragment into the sending queue.
889                  */
890                 err = output(frag);
891                 if (err)
892                         goto fail;
893
894                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
895                               IPSTATS_MIB_FRAGCREATES);
896         }
897         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
898                       IPSTATS_MIB_FRAGOKS);
899         consume_skb(skb);
900         return err;
901
902 fail:
903         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
904                       IPSTATS_MIB_FRAGFAILS);
905         kfree_skb(skb);
906         return err;
907 }
908
909 static inline int ip6_rt_check(const struct rt6key *rt_key,
910                                const struct in6_addr *fl_addr,
911                                const struct in6_addr *addr_cache)
912 {
913         return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
914                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
915 }
916
917 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
918                                           struct dst_entry *dst,
919                                           const struct flowi6 *fl6)
920 {
921         struct ipv6_pinfo *np = inet6_sk(sk);
922         struct rt6_info *rt = (struct rt6_info *)dst;
923
924         if (!dst)
925                 goto out;
926
927         /* Yes, checking route validity in not connected
928          * case is not very simple. Take into account,
929          * that we do not support routing by source, TOS,
930          * and MSG_DONTROUTE            --ANK (980726)
931          *
932          * 1. ip6_rt_check(): If route was host route,
933          *    check that cached destination is current.
934          *    If it is network route, we still may
935          *    check its validity using saved pointer
936          *    to the last used address: daddr_cache.
937          *    We do not want to save whole address now,
938          *    (because main consumer of this service
939          *    is tcp, which has not this problem),
940          *    so that the last trick works only on connected
941          *    sockets.
942          * 2. oif also should be the same.
943          */
944         if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
945 #ifdef CONFIG_IPV6_SUBTREES
946             ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
947 #endif
948             (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
949                 dst_release(dst);
950                 dst = NULL;
951         }
952
953 out:
954         return dst;
955 }
956
957 static int ip6_dst_lookup_tail(struct sock *sk,
958                                struct dst_entry **dst, struct flowi6 *fl6)
959 {
960         struct net *net = sock_net(sk);
961 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
962         struct neighbour *n;
963 #endif
964         int err;
965
966         if (*dst == NULL)
967                 *dst = ip6_route_output(net, sk, fl6);
968
969         if ((err = (*dst)->error))
970                 goto out_err_release;
971
972         if (ipv6_addr_any(&fl6->saddr)) {
973                 struct rt6_info *rt = (struct rt6_info *) *dst;
974                 err = ip6_route_get_saddr(net, rt, &fl6->daddr,
975                                           sk ? inet6_sk(sk)->srcprefs : 0,
976                                           &fl6->saddr);
977                 if (err)
978                         goto out_err_release;
979         }
980
981 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
982         /*
983          * Here if the dst entry we've looked up
984          * has a neighbour entry that is in the INCOMPLETE
985          * state and the src address from the flow is
986          * marked as OPTIMISTIC, we release the found
987          * dst entry and replace it instead with the
988          * dst entry of the nexthop router
989          */
990         rcu_read_lock();
991         n = dst_get_neighbour_noref(*dst);
992         if (n && !(n->nud_state & NUD_VALID)) {
993                 struct inet6_ifaddr *ifp;
994                 struct flowi6 fl_gw6;
995                 int redirect;
996
997                 rcu_read_unlock();
998                 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
999                                       (*dst)->dev, 1);
1000
1001                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1002                 if (ifp)
1003                         in6_ifa_put(ifp);
1004
1005                 if (redirect) {
1006                         /*
1007                          * We need to get the dst entry for the
1008                          * default router instead
1009                          */
1010                         dst_release(*dst);
1011                         memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1012                         memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1013                         *dst = ip6_route_output(net, sk, &fl_gw6);
1014                         if ((err = (*dst)->error))
1015                                 goto out_err_release;
1016                 }
1017         } else {
1018                 rcu_read_unlock();
1019         }
1020 #endif
1021
1022         return 0;
1023
1024 out_err_release:
1025         if (err == -ENETUNREACH)
1026                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1027         dst_release(*dst);
1028         *dst = NULL;
1029         return err;
1030 }
1031
1032 /**
1033  *      ip6_dst_lookup - perform route lookup on flow
1034  *      @sk: socket which provides route info
1035  *      @dst: pointer to dst_entry * for result
1036  *      @fl6: flow to lookup
1037  *
1038  *      This function performs a route lookup on the given flow.
1039  *
1040  *      It returns zero on success, or a standard errno code on error.
1041  */
1042 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1043 {
1044         *dst = NULL;
1045         return ip6_dst_lookup_tail(sk, dst, fl6);
1046 }
1047 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1048
1049 /**
1050  *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1051  *      @sk: socket which provides route info
1052  *      @fl6: flow to lookup
1053  *      @final_dst: final destination address for ipsec lookup
1054  *      @can_sleep: we are in a sleepable context
1055  *
1056  *      This function performs a route lookup on the given flow.
1057  *
1058  *      It returns a valid dst pointer on success, or a pointer encoded
1059  *      error code.
1060  */
1061 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1062                                       const struct in6_addr *final_dst,
1063                                       bool can_sleep)
1064 {
1065         struct dst_entry *dst = NULL;
1066         int err;
1067
1068         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1069         if (err)
1070                 return ERR_PTR(err);
1071         if (final_dst)
1072                 fl6->daddr = *final_dst;
1073         if (can_sleep)
1074                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1075
1076         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1077 }
1078 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1079
1080 /**
1081  *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1082  *      @sk: socket which provides the dst cache and route info
1083  *      @fl6: flow to lookup
1084  *      @final_dst: final destination address for ipsec lookup
1085  *      @can_sleep: we are in a sleepable context
1086  *
1087  *      This function performs a route lookup on the given flow with the
1088  *      possibility of using the cached route in the socket if it is valid.
1089  *      It will take the socket dst lock when operating on the dst cache.
1090  *      As a result, this function can only be used in process context.
1091  *
1092  *      It returns a valid dst pointer on success, or a pointer encoded
1093  *      error code.
1094  */
1095 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1096                                          const struct in6_addr *final_dst,
1097                                          bool can_sleep)
1098 {
1099         struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1100         int err;
1101
1102         dst = ip6_sk_dst_check(sk, dst, fl6);
1103
1104         err = ip6_dst_lookup_tail(sk, &dst, fl6);
1105         if (err)
1106                 return ERR_PTR(err);
1107         if (final_dst)
1108                 fl6->daddr = *final_dst;
1109         if (can_sleep)
1110                 fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1111
1112         return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1113 }
1114 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1115
1116 static inline int ip6_ufo_append_data(struct sock *sk,
1117                         int getfrag(void *from, char *to, int offset, int len,
1118                         int odd, struct sk_buff *skb),
1119                         void *from, int length, int hh_len, int fragheaderlen,
1120                         int transhdrlen, int mtu,unsigned int flags,
1121                         struct rt6_info *rt)
1122
1123 {
1124         struct sk_buff *skb;
1125         int err;
1126
1127         /* There is support for UDP large send offload by network
1128          * device, so create one single skb packet containing complete
1129          * udp datagram
1130          */
1131         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1132                 skb = sock_alloc_send_skb(sk,
1133                         hh_len + fragheaderlen + transhdrlen + 20,
1134                         (flags & MSG_DONTWAIT), &err);
1135                 if (skb == NULL)
1136                         return err;
1137
1138                 /* reserve space for Hardware header */
1139                 skb_reserve(skb, hh_len);
1140
1141                 /* create space for UDP/IP header */
1142                 skb_put(skb,fragheaderlen + transhdrlen);
1143
1144                 /* initialize network header pointer */
1145                 skb_reset_network_header(skb);
1146
1147                 /* initialize protocol header pointer */
1148                 skb->transport_header = skb->network_header + fragheaderlen;
1149
1150                 skb->ip_summed = CHECKSUM_PARTIAL;
1151                 skb->csum = 0;
1152         }
1153
1154         err = skb_append_datato_frags(sk,skb, getfrag, from,
1155                                       (length - transhdrlen));
1156         if (!err) {
1157                 struct frag_hdr fhdr;
1158
1159                 /* Specify the length of each IPv6 datagram fragment.
1160                  * It has to be a multiple of 8.
1161                  */
1162                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1163                                              sizeof(struct frag_hdr)) & ~7;
1164                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1165                 ipv6_select_ident(&fhdr, rt);
1166                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1167                 __skb_queue_tail(&sk->sk_write_queue, skb);
1168
1169                 return 0;
1170         }
1171         /* There is not enough support do UPD LSO,
1172          * so follow normal path
1173          */
1174         kfree_skb(skb);
1175
1176         return err;
1177 }
1178
1179 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1180                                                gfp_t gfp)
1181 {
1182         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1183 }
1184
1185 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1186                                                 gfp_t gfp)
1187 {
1188         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1189 }
1190
1191 static void ip6_append_data_mtu(int *mtu,
1192                                 int *maxfraglen,
1193                                 unsigned int fragheaderlen,
1194                                 struct sk_buff *skb,
1195                                 struct rt6_info *rt)
1196 {
1197         if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1198                 if (skb == NULL) {
1199                         /* first fragment, reserve header_len */
1200                         *mtu = *mtu - rt->dst.header_len;
1201
1202                 } else {
1203                         /*
1204                          * this fragment is not first, the headers
1205                          * space is regarded as data space.
1206                          */
1207                         *mtu = dst_mtu(rt->dst.path);
1208                 }
1209                 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1210                               + fragheaderlen - sizeof(struct frag_hdr);
1211         }
1212 }
1213
1214 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1215         int offset, int len, int odd, struct sk_buff *skb),
1216         void *from, int length, int transhdrlen,
1217         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1218         struct rt6_info *rt, unsigned int flags, int dontfrag)
1219 {
1220         struct inet_sock *inet = inet_sk(sk);
1221         struct ipv6_pinfo *np = inet6_sk(sk);
1222         struct inet_cork *cork;
1223         struct sk_buff *skb, *skb_prev = NULL;
1224         unsigned int maxfraglen, fragheaderlen;
1225         int exthdrlen;
1226         int dst_exthdrlen;
1227         int hh_len;
1228         int mtu;
1229         int copy;
1230         int err;
1231         int offset = 0;
1232         __u8 tx_flags = 0;
1233
1234         if (flags&MSG_PROBE)
1235                 return 0;
1236         cork = &inet->cork.base;
1237         if (skb_queue_empty(&sk->sk_write_queue)) {
1238                 /*
1239                  * setup for corking
1240                  */
1241                 if (opt) {
1242                         if (WARN_ON(np->cork.opt))
1243                                 return -EINVAL;
1244
1245                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1246                         if (unlikely(np->cork.opt == NULL))
1247                                 return -ENOBUFS;
1248
1249                         np->cork.opt->tot_len = opt->tot_len;
1250                         np->cork.opt->opt_flen = opt->opt_flen;
1251                         np->cork.opt->opt_nflen = opt->opt_nflen;
1252
1253                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1254                                                             sk->sk_allocation);
1255                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1256                                 return -ENOBUFS;
1257
1258                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1259                                                             sk->sk_allocation);
1260                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1261                                 return -ENOBUFS;
1262
1263                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1264                                                            sk->sk_allocation);
1265                         if (opt->hopopt && !np->cork.opt->hopopt)
1266                                 return -ENOBUFS;
1267
1268                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1269                                                             sk->sk_allocation);
1270                         if (opt->srcrt && !np->cork.opt->srcrt)
1271                                 return -ENOBUFS;
1272
1273                         /* need source address above miyazawa*/
1274                 }
1275                 dst_hold(&rt->dst);
1276                 cork->dst = &rt->dst;
1277                 inet->cork.fl.u.ip6 = *fl6;
1278                 np->cork.hop_limit = hlimit;
1279                 np->cork.tclass = tclass;
1280                 if (rt->dst.flags & DST_XFRM_TUNNEL)
1281                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1282                               rt->dst.dev->mtu : dst_mtu(&rt->dst);
1283                 else
1284                         mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1285                               rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1286                 if (np->frag_size < mtu) {
1287                         if (np->frag_size)
1288                                 mtu = np->frag_size;
1289                 }
1290                 cork->fragsize = mtu;
1291                 if (dst_allfrag(rt->dst.path))
1292                         cork->flags |= IPCORK_ALLFRAG;
1293                 cork->length = 0;
1294                 sk->sk_sndmsg_page = NULL;
1295                 sk->sk_sndmsg_off = 0;
1296                 exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1297                 length += exthdrlen;
1298                 transhdrlen += exthdrlen;
1299                 dst_exthdrlen = rt->dst.header_len;
1300         } else {
1301                 rt = (struct rt6_info *)cork->dst;
1302                 fl6 = &inet->cork.fl.u.ip6;
1303                 opt = np->cork.opt;
1304                 transhdrlen = 0;
1305                 exthdrlen = 0;
1306                 dst_exthdrlen = 0;
1307                 mtu = cork->fragsize;
1308         }
1309
1310         hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1311
1312         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1313                         (opt ? opt->opt_nflen : 0);
1314         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1315
1316         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1317                 if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1318                         ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1319                         return -EMSGSIZE;
1320                 }
1321         }
1322
1323         /* For UDP, check if TX timestamp is enabled */
1324         if (sk->sk_type == SOCK_DGRAM) {
1325                 err = sock_tx_timestamp(sk, &tx_flags);
1326                 if (err)
1327                         goto error;
1328         }
1329
1330         /*
1331          * Let's try using as much space as possible.
1332          * Use MTU if total length of the message fits into the MTU.
1333          * Otherwise, we need to reserve fragment header and
1334          * fragment alignment (= 8-15 octects, in total).
1335          *
1336          * Note that we may need to "move" the data from the tail of
1337          * of the buffer to the new fragment when we split
1338          * the message.
1339          *
1340          * FIXME: It may be fragmented into multiple chunks
1341          *        at once if non-fragmentable extension headers
1342          *        are too large.
1343          * --yoshfuji
1344          */
1345
1346         cork->length += length;
1347         if (length > mtu) {
1348                 int proto = sk->sk_protocol;
1349                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1350                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1351                         return -EMSGSIZE;
1352                 }
1353
1354                 if (proto == IPPROTO_UDP &&
1355                     (rt->dst.dev->features & NETIF_F_UFO)) {
1356
1357                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1358                                                   hh_len, fragheaderlen,
1359                                                   transhdrlen, mtu, flags, rt);
1360                         if (err)
1361                                 goto error;
1362                         return 0;
1363                 }
1364         }
1365
1366         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1367                 goto alloc_new_skb;
1368
1369         while (length > 0) {
1370                 /* Check if the remaining data fits into current packet. */
1371                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1372                 if (copy < length)
1373                         copy = maxfraglen - skb->len;
1374
1375                 if (copy <= 0) {
1376                         char *data;
1377                         unsigned int datalen;
1378                         unsigned int fraglen;
1379                         unsigned int fraggap;
1380                         unsigned int alloclen;
1381 alloc_new_skb:
1382                         /* There's no room in the current skb */
1383                         if (skb)
1384                                 fraggap = skb->len - maxfraglen;
1385                         else
1386                                 fraggap = 0;
1387                         /* update mtu and maxfraglen if necessary */
1388                         if (skb == NULL || skb_prev == NULL)
1389                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1390                                                     fragheaderlen, skb, rt);
1391
1392                         skb_prev = skb;
1393
1394                         /*
1395                          * If remaining data exceeds the mtu,
1396                          * we know we need more fragment(s).
1397                          */
1398                         datalen = length + fraggap;
1399
1400                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1401                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1402                         if ((flags & MSG_MORE) &&
1403                             !(rt->dst.dev->features&NETIF_F_SG))
1404                                 alloclen = mtu;
1405                         else
1406                                 alloclen = datalen + fragheaderlen;
1407
1408                         alloclen += dst_exthdrlen;
1409
1410                         if (datalen != length + fraggap) {
1411                                 /*
1412                                  * this is not the last fragment, the trailer
1413                                  * space is regarded as data space.
1414                                  */
1415                                 datalen += rt->dst.trailer_len;
1416                         }
1417
1418                         alloclen += rt->dst.trailer_len;
1419                         fraglen = datalen + fragheaderlen;
1420
1421                         /*
1422                          * We just reserve space for fragment header.
1423                          * Note: this may be overallocation if the message
1424                          * (without MSG_MORE) fits into the MTU.
1425                          */
1426                         alloclen += sizeof(struct frag_hdr);
1427
1428                         if (transhdrlen) {
1429                                 skb = sock_alloc_send_skb(sk,
1430                                                 alloclen + hh_len,
1431                                                 (flags & MSG_DONTWAIT), &err);
1432                         } else {
1433                                 skb = NULL;
1434                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1435                                     2 * sk->sk_sndbuf)
1436                                         skb = sock_wmalloc(sk,
1437                                                            alloclen + hh_len, 1,
1438                                                            sk->sk_allocation);
1439                                 if (unlikely(skb == NULL))
1440                                         err = -ENOBUFS;
1441                                 else {
1442                                         /* Only the initial fragment
1443                                          * is time stamped.
1444                                          */
1445                                         tx_flags = 0;
1446                                 }
1447                         }
1448                         if (skb == NULL)
1449                                 goto error;
1450                         /*
1451                          *      Fill in the control structures
1452                          */
1453                         skb->ip_summed = CHECKSUM_NONE;
1454                         skb->csum = 0;
1455                         /* reserve for fragmentation and ipsec header */
1456                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1457                                     dst_exthdrlen);
1458
1459                         if (sk->sk_type == SOCK_DGRAM)
1460                                 skb_shinfo(skb)->tx_flags = tx_flags;
1461
1462                         /*
1463                          *      Find where to start putting bytes
1464                          */
1465                         data = skb_put(skb, fraglen);
1466                         skb_set_network_header(skb, exthdrlen);
1467                         data += fragheaderlen;
1468                         skb->transport_header = (skb->network_header +
1469                                                  fragheaderlen);
1470                         if (fraggap) {
1471                                 skb->csum = skb_copy_and_csum_bits(
1472                                         skb_prev, maxfraglen,
1473                                         data + transhdrlen, fraggap, 0);
1474                                 skb_prev->csum = csum_sub(skb_prev->csum,
1475                                                           skb->csum);
1476                                 data += fraggap;
1477                                 pskb_trim_unique(skb_prev, maxfraglen);
1478                         }
1479                         copy = datalen - transhdrlen - fraggap;
1480
1481                         if (copy < 0) {
1482                                 err = -EINVAL;
1483                                 kfree_skb(skb);
1484                                 goto error;
1485                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1486                                 err = -EFAULT;
1487                                 kfree_skb(skb);
1488                                 goto error;
1489                         }
1490
1491                         offset += copy;
1492                         length -= datalen - fraggap;
1493                         transhdrlen = 0;
1494                         exthdrlen = 0;
1495                         dst_exthdrlen = 0;
1496
1497                         /*
1498                          * Put the packet on the pending queue
1499                          */
1500                         __skb_queue_tail(&sk->sk_write_queue, skb);
1501                         continue;
1502                 }
1503
1504                 if (copy > length)
1505                         copy = length;
1506
1507                 if (!(rt->dst.dev->features&NETIF_F_SG)) {
1508                         unsigned int off;
1509
1510                         off = skb->len;
1511                         if (getfrag(from, skb_put(skb, copy),
1512                                                 offset, copy, off, skb) < 0) {
1513                                 __skb_trim(skb, off);
1514                                 err = -EFAULT;
1515                                 goto error;
1516                         }
1517                 } else {
1518                         int i = skb_shinfo(skb)->nr_frags;
1519                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1520                         struct page *page = sk->sk_sndmsg_page;
1521                         int off = sk->sk_sndmsg_off;
1522                         unsigned int left;
1523
1524                         if (page && (left = PAGE_SIZE - off) > 0) {
1525                                 if (copy >= left)
1526                                         copy = left;
1527                                 if (page != skb_frag_page(frag)) {
1528                                         if (i == MAX_SKB_FRAGS) {
1529                                                 err = -EMSGSIZE;
1530                                                 goto error;
1531                                         }
1532                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1533                                         skb_frag_ref(skb, i);
1534                                         frag = &skb_shinfo(skb)->frags[i];
1535                                 }
1536                         } else if(i < MAX_SKB_FRAGS) {
1537                                 if (copy > PAGE_SIZE)
1538                                         copy = PAGE_SIZE;
1539                                 page = alloc_pages(sk->sk_allocation, 0);
1540                                 if (page == NULL) {
1541                                         err = -ENOMEM;
1542                                         goto error;
1543                                 }
1544                                 sk->sk_sndmsg_page = page;
1545                                 sk->sk_sndmsg_off = 0;
1546
1547                                 skb_fill_page_desc(skb, i, page, 0, 0);
1548                                 frag = &skb_shinfo(skb)->frags[i];
1549                         } else {
1550                                 err = -EMSGSIZE;
1551                                 goto error;
1552                         }
1553                         if (getfrag(from,
1554                                     skb_frag_address(frag) + skb_frag_size(frag),
1555                                     offset, copy, skb->len, skb) < 0) {
1556                                 err = -EFAULT;
1557                                 goto error;
1558                         }
1559                         sk->sk_sndmsg_off += copy;
1560                         skb_frag_size_add(frag, copy);
1561                         skb->len += copy;
1562                         skb->data_len += copy;
1563                         skb->truesize += copy;
1564                         atomic_add(copy, &sk->sk_wmem_alloc);
1565                 }
1566                 offset += copy;
1567                 length -= copy;
1568         }
1569         return 0;
1570 error:
1571         cork->length -= length;
1572         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1573         return err;
1574 }
1575 EXPORT_SYMBOL_GPL(ip6_append_data);
1576
1577 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1578 {
1579         if (np->cork.opt) {
1580                 kfree(np->cork.opt->dst0opt);
1581                 kfree(np->cork.opt->dst1opt);
1582                 kfree(np->cork.opt->hopopt);
1583                 kfree(np->cork.opt->srcrt);
1584                 kfree(np->cork.opt);
1585                 np->cork.opt = NULL;
1586         }
1587
1588         if (inet->cork.base.dst) {
1589                 dst_release(inet->cork.base.dst);
1590                 inet->cork.base.dst = NULL;
1591                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1592         }
1593         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1594 }
1595
1596 int ip6_push_pending_frames(struct sock *sk)
1597 {
1598         struct sk_buff *skb, *tmp_skb;
1599         struct sk_buff **tail_skb;
1600         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1601         struct inet_sock *inet = inet_sk(sk);
1602         struct ipv6_pinfo *np = inet6_sk(sk);
1603         struct net *net = sock_net(sk);
1604         struct ipv6hdr *hdr;
1605         struct ipv6_txoptions *opt = np->cork.opt;
1606         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1607         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1608         unsigned char proto = fl6->flowi6_proto;
1609         int err = 0;
1610
1611         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1612                 goto out;
1613         tail_skb = &(skb_shinfo(skb)->frag_list);
1614
1615         /* move skb->data to ip header from ext header */
1616         if (skb->data < skb_network_header(skb))
1617                 __skb_pull(skb, skb_network_offset(skb));
1618         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1619                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1620                 *tail_skb = tmp_skb;
1621                 tail_skb = &(tmp_skb->next);
1622                 skb->len += tmp_skb->len;
1623                 skb->data_len += tmp_skb->len;
1624                 skb->truesize += tmp_skb->truesize;
1625                 tmp_skb->destructor = NULL;
1626                 tmp_skb->sk = NULL;
1627         }
1628
1629         /* Allow local fragmentation. */
1630         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1631                 skb->local_df = 1;
1632
1633         *final_dst = fl6->daddr;
1634         __skb_pull(skb, skb_network_header_len(skb));
1635         if (opt && opt->opt_flen)
1636                 ipv6_push_frag_opts(skb, opt, &proto);
1637         if (opt && opt->opt_nflen)
1638                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1639
1640         skb_push(skb, sizeof(struct ipv6hdr));
1641         skb_reset_network_header(skb);
1642         hdr = ipv6_hdr(skb);
1643
1644         *(__be32*)hdr = fl6->flowlabel |
1645                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1646
1647         hdr->hop_limit = np->cork.hop_limit;
1648         hdr->nexthdr = proto;
1649         hdr->saddr = fl6->saddr;
1650         hdr->daddr = *final_dst;
1651
1652         skb->priority = sk->sk_priority;
1653         skb->mark = sk->sk_mark;
1654
1655         skb_dst_set(skb, dst_clone(&rt->dst));
1656         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1657         if (proto == IPPROTO_ICMPV6) {
1658                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1659
1660                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1661                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1662         }
1663
1664         err = ip6_local_out(skb);
1665         if (err) {
1666                 if (err > 0)
1667                         err = net_xmit_errno(err);
1668                 if (err)
1669                         goto error;
1670         }
1671
1672 out:
1673         ip6_cork_release(inet, np);
1674         return err;
1675 error:
1676         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1677         goto out;
1678 }
1679 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1680
1681 void ip6_flush_pending_frames(struct sock *sk)
1682 {
1683         struct sk_buff *skb;
1684
1685         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1686                 if (skb_dst(skb))
1687                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1688                                       IPSTATS_MIB_OUTDISCARDS);
1689                 kfree_skb(skb);
1690         }
1691
1692         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1693 }
1694 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);