/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
        int len;

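        /* Note: payload_len is a 16-bit field, so payloads larger than
         * IPV6_MAXPLEN (65535) cannot be represented directly.  Writing 0
         * here appears to follow the jumbogram convention of RFC 2675,
         * where the real length travels in a Jumbo Payload hop-by-hop
         * option.
         */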
        len = skb->len - sizeof(struct ipv6hdr);
        if (len > IPV6_MAXPLEN)
                len = 0;
        ipv6_hdr(skb)->payload_len = htons(len);

        return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
        int err;

        err = __ip6_local_out(skb);
        if (likely(err == 1))
                err = dst_output(skb);

        return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct rt6_info *rt;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                skb->len);
        }

        rt = (struct rt6_info *) dst;
        neigh = rt->n;
        if (neigh)
                return dst_neigh_output(dst, neigh, skb);

        IP6_INC_STATS_BH(dev_net(dst->dev),
                         ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

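        /* The first 32-bit word of the IPv6 header packs version (4 bits),
         * traffic class (8 bits) and flow label (20 bits).  0x60000000 is
         * version 6 in the top nibble; e.g. tclass 0x28 yields
         * htonl(0x62800000) before the flow label (already in network
         * byte order) is OR-ed in.
         */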
        *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

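        /* Path MTU check: oversized packets may still proceed when local
         * fragmentation is permitted (skb->local_df) or the skb is GSO,
         * since segmentation happens further down the output path.
         */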
        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *      To avoid extra problems ND packets are sent through this
 *      routine. It's code duplication but I really want to avoid
 *      extra checks since ipv6_build_header is used by TCP (which
 *      is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
               const struct in6_addr *saddr, const struct in6_addr *daddr,
               int proto, int len)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct ipv6hdr *hdr;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        skb_reset_network_header(skb);
        skb_put(skb, sizeof(struct ipv6hdr));
        hdr = ipv6_hdr(skb);

        *(__be32 *)hdr = htonl(0x60000000);

        hdr->payload_len = htons(len);
        hdr->nexthdr = proto;
        hdr->hop_limit = np->hop_limit;

        hdr->saddr = *saddr;
        hdr->daddr = *daddr;

        return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a unicast neighbour discovery message
                         * destined to the proxied address, pass it to the
                         * input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        skb_forward_csum(skb);

        /*
         *      We do not do any processing on RA packets; we push them
         *      to user level AS IS, without any warranty that the
         *      application will be able to interpret them. The reason
         *      is that we cannot make anything clever here.
         *
         *      We are not an end node, so if the packet contains
         *      AH/ESP we cannot do anything.
         *      Defragmentation would also be a mistake; RA packets
         *      cannot be fragmented, because there is no warranty
         *      that different fragments will go along one path. --ANK
         */
        if (opt->ra) {
                u8 *ptr = skb_network_header(skb) + opt->ra;
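                /* ptr points at the Router Alert option (RFC 2711:
                 * type, length, then a 16-bit value); bytes 2 and 3
                 * carry that value in network byte order, which selects
                 * the matching sockets in the RA chain.
                 */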
                if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
                        return 0;
        }

        /*
         *      check and decrement hop limit
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(dst),
                                      IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same;
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = dst_mtu(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
            (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net,
                                 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling the hop limit is delayed until after the skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
        u16 offset = sizeof(struct ipv6hdr);
        struct ipv6_opt_hdr *exthdr =
                                (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
        unsigned int packet_len = skb->tail - skb->network_header;
        int found_rhdr = 0;
        *nexthdr = &ipv6_hdr(skb)->nexthdr;

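        /* Walk the extension header chain to find where the unfragmentable
         * part ends: per RFC 2460 it spans the IPv6 header plus the
         * hop-by-hop, routing and (pre-routing-header) destination options
         * that intermediate nodes must see; the fragment header is
         * inserted right after it.
         */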
        while (offset + 1 <= packet_len) {
                switch (**nexthdr) {
                case NEXTHDR_HOP:
                        break;
                case NEXTHDR_ROUTING:
                        found_rhdr = 1;
                        break;
                case NEXTHDR_DEST:
#if IS_ENABLED(CONFIG_IPV6_MIP6)
                        if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
                                break;
#endif
                        if (found_rhdr)
                                return offset;
                        break;
                default:
                        return offset;
                }

                offset += ipv6_optlen(exthdr);
                *nexthdr = &exthdr->nexthdr;
                exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
                                                 offset);
        }

        return offset;
}

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
        static atomic_t ipv6_fragmentation_id;
        int old, new;

        if (rt && !(rt->dst.flags & DST_NOPEER)) {
                struct inet_peer *peer;
                struct net *net;

                net = dev_net(rt->dst.dev);
                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
                if (peer) {
                        fhdr->identification = htonl(inet_getid(peer, 0));
                        inet_putpeer(peer);
                        return;
                }
        }
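        /* Fall back to a global counter: a lock-free compare-and-swap
         * increment that skips 0, so an identification of zero never
         * leaks out of this path.
         */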
        do {
                old = atomic_read(&ipv6_fragmentation_id);
                new = old + 1;
                if (!new)
                        new = 1;
        } while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
        fhdr->identification = htonl(new);
}

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->local_df && skb->len > mtu) ||
                     (IP6CB(skb)->frag_max_size &&
                      IP6CB(skb)->frag_max_size > mtu)) {
                if (skb->sk && dst_allfrag(skb_dst(skb)))
                        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
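                /* First fragment: offset is implicitly 0, so only the
                 * more-fragments (IP6_MF) bit is set in frag_off.
                 */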
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
            skb_checksum_help(skb))
                goto fail;

        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
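                /* All fragments except the last must carry a multiple of
                 * 8 octets of payload, since fragment offsets are expressed
                 * in 8-octet units (RFC 2460); e.g. an effective mtu of
                 * 1396 is rounded down to 1392 here.
                 */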
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      hroom + troom, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

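                /* frag_off holds the offset in its upper 13 bits as a
                 * count of 8-octet units; because 'offset' here is a byte
                 * count that is always a multiple of 8, storing it directly
                 * leaves the low three flag bits clear for IP6_MF.
                 */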
                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt = (struct rt6_info *)dst;

        if (!dst)
                goto out;

        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If the route was a host route,
         *    check that the cached destination is current.
         *    If it is a network route, we still may
         *    check its validity using a saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save the whole address now
         *    (because the main consumer of this service
         *    is TCP, which does not have this problem),
         *    so the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        n = rt->n;
        if (n && !(n->nud_state & NUD_VALID)) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @can_sleep: we are in a sleepable context
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst,
                                      bool can_sleep)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *      @can_sleep: we are in a sleepable context
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst,
                                         bool can_sleep)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
        int err;

        dst = ip6_sk_dst_check(sk, dst, fl6);

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;
        if (can_sleep)
                fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)
{
        struct sk_buff *skb;
        int err;

        /* The network device supports UDP large send offload, so create
         * a single skb containing the complete UDP datagram.
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum = 0;
        }

        err = skb_append_datato_frags(sk, skb, getfrag, from,
                                      (length - transhdrlen));
        if (!err) {
                struct frag_hdr fhdr;

                /* Specify the length of each IPv6 datagram fragment.
                 * It has to be a multiple of 8.
                 */
                skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                             sizeof(struct frag_hdr)) & ~7;
                skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
                ipv6_select_ident(&fhdr, rt);
                skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
                __skb_queue_tail(&sk->sk_write_queue, skb);

                return 0;
        }
        /* There is not enough support to do UDP LSO,
         * so follow the normal path.
         */
        kfree_skb(skb);

        return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (skb == NULL) {
                        /* first fragment, reserve header_len */
                        *mtu = *mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = dst_mtu(rt->dst.path);
                }
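                /* Round the payload down to a multiple of 8 and put the
                 * fragment header reservation back, e.g. with mtu 1500 and
                 * fragheaderlen 40: ((1500 - 40) & ~7) + 40 - 8 = 1488.
                 */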
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
        struct rt6_info *rt, unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_cork *cork;
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen;
        int exthdrlen;
        int dst_exthdrlen;
        int hh_len;
        int mtu;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;

        if (flags & MSG_PROBE)
                return 0;
        cork = &inet->cork.base;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above miyazawa */
                }
                dst_hold(&rt->dst);
                cork->dst = &rt->dst;
                inet->cork.fl.u.ip6 = *fl6;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                if (rt->dst.flags & DST_XFRM_TUNNEL)
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(&rt->dst);
                else
                        mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                cork->fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        cork->flags |= IPCORK_ALLFRAG;
                cork->length = 0;
                exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
                length += exthdrlen;
                transhdrlen += exthdrlen;
                dst_exthdrlen = rt->dst.header_len;
        } else {
                rt = (struct rt6_info *)cork->dst;
                fl6 = &inet->cork.fl.u.ip6;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                dst_exthdrlen = 0;
                mtu = cork->fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
                        ipv6_local_error(sk, EMSGSIZE, fl6, mtu - exthdrlen);
                        return -EMSGSIZE;
                }
        }

        /* For UDP, check if TX timestamp is enabled */
        if (sk->sk_type == SOCK_DGRAM) {
                err = sock_tx_timestamp(sk, &tx_flags);
                if (err)
                        goto error;
        }

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1330
1331         cork->length += length;
1332         if (length > mtu) {
1333                 int proto = sk->sk_protocol;
1334                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1335                         ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1336                         return -EMSGSIZE;
1337                 }
1338
1339                 if (proto == IPPROTO_UDP &&
1340                     (rt->dst.dev->features & NETIF_F_UFO)) {
1341
1342                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1343                                                   hh_len, fragheaderlen,
1344                                                   transhdrlen, mtu, flags, rt);
1345                         if (err)
1346                                 goto error;
1347                         return 0;
1348                 }
1349         }
1350
1351         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1352                 goto alloc_new_skb;
1353
1354         while (length > 0) {
1355                 /* Check if the remaining data fits into current packet. */
1356                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1357                 if (copy < length)
1358                         copy = maxfraglen - skb->len;
1359
1360                 if (copy <= 0) {
1361                         char *data;
1362                         unsigned int datalen;
1363                         unsigned int fraglen;
1364                         unsigned int fraggap;
1365                         unsigned int alloclen;
1366 alloc_new_skb:
1367                         /* There's no room in the current skb */
1368                         if (skb)
1369                                 fraggap = skb->len - maxfraglen;
1370                         else
1371                                 fraggap = 0;
1372                         /* update mtu and maxfraglen if necessary */
1373                         if (skb == NULL || skb_prev == NULL)
1374                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1375                                                     fragheaderlen, skb, rt);
1376
1377                         skb_prev = skb;
1378
1379                         /*
1380                          * If remaining data exceeds the mtu,
1381                          * we know we need more fragment(s).
1382                          */
1383                         datalen = length + fraggap;
1384
1385                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1386                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1387                         if ((flags & MSG_MORE) &&
1388                             !(rt->dst.dev->features&NETIF_F_SG))
1389                                 alloclen = mtu;
1390                         else
1391                                 alloclen = datalen + fragheaderlen;
1392
1393                         alloclen += dst_exthdrlen;
1394
1395                         if (datalen != length + fraggap) {
1396                                 /*
1397                                  * this is not the last fragment, the trailer
1398                                  * space is regarded as data space.
1399                                  */
1400                                 datalen += rt->dst.trailer_len;
1401                         }
1402
1403                         alloclen += rt->dst.trailer_len;
1404                         fraglen = datalen + fragheaderlen;
1405
1406                         /*
1407                          * We just reserve space for fragment header.
1408                          * Note: this may be overallocation if the message
1409                          * (without MSG_MORE) fits into the MTU.
1410                          */
1411                         alloclen += sizeof(struct frag_hdr);
1412
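                             /*
                              * transhdrlen is non-zero only for the first
                              * fragment, which is allocated with
                              * sock_alloc_send_skb() and may block.  Later
                              * fragments use sock_wmalloc(), bounded by
                              * twice the socket send buffer.
                              */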
1413                         if (transhdrlen) {
1414                                 skb = sock_alloc_send_skb(sk,
1415                                                 alloclen + hh_len,
1416                                                 (flags & MSG_DONTWAIT), &err);
1417                         } else {
1418                                 skb = NULL;
1419                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1420                                     2 * sk->sk_sndbuf)
1421                                         skb = sock_wmalloc(sk,
1422                                                            alloclen + hh_len, 1,
1423                                                            sk->sk_allocation);
1424                                 if (unlikely(skb == NULL))
1425                                         err = -ENOBUFS;
1426                                 else {
1427                                         /* Only the initial fragment
1428                                          * is timestamped.
1429                                          */
1430                                         tx_flags = 0;
1431                                 }
1432                         }
1433                         if (skb == NULL)
1434                                 goto error;
1435                         /*
1436                          *      Fill in the control structures
1437                          */
1438                         skb->ip_summed = CHECKSUM_NONE;
1439                         skb->csum = 0;
1440                         /* Reserve room for the fragment header and any IPsec headers. */
1441                         skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1442                                     dst_exthdrlen);
1443
1444                         if (sk->sk_type == SOCK_DGRAM)
1445                                 skb_shinfo(skb)->tx_flags = tx_flags;
1446
1447                         /*
1448                          *      Find where to start putting bytes
1449                          */
1450                         data = skb_put(skb, fraglen);
1451                         skb_set_network_header(skb, exthdrlen);
1452                         data += fragheaderlen;
1453                         skb->transport_header = (skb->network_header +
1454                                                  fragheaderlen);
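                             /*
                              * Pull the overshoot of the previous skb into
                              * this one, keeping both checksums consistent,
                              * then trim the previous skb back to
                              * maxfraglen.
                              */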
1455                         if (fraggap) {
1456                                 skb->csum = skb_copy_and_csum_bits(
1457                                         skb_prev, maxfraglen,
1458                                         data + transhdrlen, fraggap, 0);
1459                                 skb_prev->csum = csum_sub(skb_prev->csum,
1460                                                           skb->csum);
1461                                 data += fraggap;
1462                                 pskb_trim_unique(skb_prev, maxfraglen);
1463                         }
1464                         copy = datalen - transhdrlen - fraggap;
1465
1466                         if (copy < 0) {
1467                                 err = -EINVAL;
1468                                 kfree_skb(skb);
1469                                 goto error;
1470                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1471                                 err = -EFAULT;
1472                                 kfree_skb(skb);
1473                                 goto error;
1474                         }
1475
1476                         offset += copy;
1477                         length -= datalen - fraggap;
1478                         transhdrlen = 0;
1479                         exthdrlen = 0;
1480                         dst_exthdrlen = 0;
1481
1482                         /*
1483                          * Put the packet on the pending queue
1484                          */
1485                         __skb_queue_tail(&sk->sk_write_queue, skb);
1486                         continue;
1487                 }
1488
1489                 if (copy > length)
1490                         copy = length;
1491
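                     /*
                      * Two ways to store the data: copy it into the
                      * linear part of the skb when the device cannot do
                      * scatter-gather, otherwise place it in per-socket
                      * page fragments.
                      */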
1492                 if (!(rt->dst.dev->features & NETIF_F_SG)) {
1493                         unsigned int off;
1494
1495                         off = skb->len;
1496                         if (getfrag(from, skb_put(skb, copy),
1497                                                 offset, copy, off, skb) < 0) {
1498                                 __skb_trim(skb, off);
1499                                 err = -EFAULT;
1500                                 goto error;
1501                         }
1502                 } else {
1503                         int i = skb_shinfo(skb)->nr_frags;
1504                         struct page_frag *pfrag = sk_page_frag(sk);
1505
1506                         err = -ENOMEM;
1507                         if (!sk_page_frag_refill(sk, pfrag))
1508                                 goto error;
1509
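                             /*
                              * Extend the last page fragment if the new
                              * bytes are contiguous with it; otherwise
                              * start a new fragment, failing once
                              * MAX_SKB_FRAGS is reached.
                              */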
1510                         if (!skb_can_coalesce(skb, i, pfrag->page,
1511                                               pfrag->offset)) {
1512                                 err = -EMSGSIZE;
1513                                 if (i == MAX_SKB_FRAGS)
1514                                         goto error;
1515
1516                                 __skb_fill_page_desc(skb, i, pfrag->page,
1517                                                      pfrag->offset, 0);
1518                                 skb_shinfo(skb)->nr_frags = ++i;
1519                                 get_page(pfrag->page);
1520                         }
1521                         copy = min_t(int, copy, pfrag->size - pfrag->offset);
1522                         if (getfrag(from,
1523                                     page_address(pfrag->page) + pfrag->offset,
1524                                     offset, copy, skb->len, skb) < 0)
1525                                 goto error_efault;
1526
1527                         pfrag->offset += copy;
1528                         skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1529                         skb->len += copy;
1530                         skb->data_len += copy;
1531                         skb->truesize += copy;
1532                         atomic_add(copy, &sk->sk_wmem_alloc);
1533                 }
1534                 offset += copy;
1535                 length -= copy;
1536         }
1537
1538         return 0;
1539
1540 error_efault:
1541         err = -EFAULT;
1542 error:
1543         cork->length -= length;
1544         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1545         return err;
1546 }
1547 EXPORT_SYMBOL_GPL(ip6_append_data);
1548
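     /*
      * Drop everything cached on the cork: the duplicated extension-header
      * options, the held route and the saved flow information.
      */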
1549 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1550 {
1551         if (np->cork.opt) {
1552                 kfree(np->cork.opt->dst0opt);
1553                 kfree(np->cork.opt->dst1opt);
1554                 kfree(np->cork.opt->hopopt);
1555                 kfree(np->cork.opt->srcrt);
1556                 kfree(np->cork.opt);
1557                 np->cork.opt = NULL;
1558         }
1559
1560         if (inet->cork.base.dst) {
1561                 dst_release(inet->cork.base.dst);
1562                 inet->cork.base.dst = NULL;
1563                 inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1564         }
1565         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1566 }
1567
1568 int ip6_push_pending_frames(struct sock *sk)
1569 {
1570         struct sk_buff *skb, *tmp_skb;
1571         struct sk_buff **tail_skb;
1572         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1573         struct inet_sock *inet = inet_sk(sk);
1574         struct ipv6_pinfo *np = inet6_sk(sk);
1575         struct net *net = sock_net(sk);
1576         struct ipv6hdr *hdr;
1577         struct ipv6_txoptions *opt = np->cork.opt;
1578         struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1579         struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1580         unsigned char proto = fl6->flowi6_proto;
1581         int err = 0;
1582
1583         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1584                 goto out;
1585         tail_skb = &(skb_shinfo(skb)->frag_list);
1586
1587         /* Move skb->data from the extension header back to the IP header. */
1588         if (skb->data < skb_network_header(skb))
1589                 __skb_pull(skb, skb_network_offset(skb));
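             /*
              * Chain the remaining queued skbs onto the first skb's
              * frag_list and transfer their size to the head;
              * ip6_fragment() can later split the packet along this list.
              * Ownership moves to the head skb, so each piece sheds its
              * destructor and socket pointer.
              */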
1590         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1591                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1592                 *tail_skb = tmp_skb;
1593                 tail_skb = &(tmp_skb->next);
1594                 skb->len += tmp_skb->len;
1595                 skb->data_len += tmp_skb->len;
1596                 skb->truesize += tmp_skb->truesize;
1597                 tmp_skb->destructor = NULL;
1598                 tmp_skb->sk = NULL;
1599         }
1600
1601         /* Allow local fragmentation. */
1602         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1603                 skb->local_df = 1;
1604
1605         *final_dst = fl6->daddr;
1606         __skb_pull(skb, skb_network_header_len(skb));
1607         if (opt && opt->opt_flen)
1608                 ipv6_push_frag_opts(skb, opt, &proto);
1609         if (opt && opt->opt_nflen)
1610                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1611
1612         skb_push(skb, sizeof(struct ipv6hdr));
1613         skb_reset_network_header(skb);
1614         hdr = ipv6_hdr(skb);
1615
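             /*
              * The first 32-bit word of the IPv6 header packs version (6),
              * the corked traffic class and the flow label; write it as a
              * single store.
              */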
1616         *(__be32 *)hdr = fl6->flowlabel |
1617                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1618
1619         hdr->hop_limit = np->cork.hop_limit;
1620         hdr->nexthdr = proto;
1621         hdr->saddr = fl6->saddr;
1622         hdr->daddr = *final_dst;
1623
1624         skb->priority = sk->sk_priority;
1625         skb->mark = sk->sk_mark;
1626
1627         skb_dst_set(skb, dst_clone(&rt->dst));
1628         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1629         if (proto == IPPROTO_ICMPV6) {
1630                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1631
1632                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1633                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1634         }
1635
1636         err = ip6_local_out(skb);
1637         if (err) {
1638                 if (err > 0)
1639                         err = net_xmit_errno(err);
1640                 if (err)
1641                         goto error;
1642         }
1643
1644 out:
1645         ip6_cork_release(inet, np);
1646         return err;
1647 error:
1648         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1649         goto out;
1650 }
1651 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1652
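     /*
      * Throw away whatever is still sitting on the write queue, e.g. after
      * an error from ip6_append_data(), counting each skb as an output
      * discard.
      */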
1653 void ip6_flush_pending_frames(struct sock *sk)
1654 {
1655         struct sk_buff *skb;
1656
1657         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1658                 if (skb_dst(skb))
1659                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1660                                       IPSTATS_MIB_OUTDISCARDS);
1661                 kfree_skb(skb);
1662         }
1663
1664         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1665 }
1666 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);