/*
 *      IPv6 output functions
 *      Linux INET6 implementation
 *
 *      Authors:
 *      Pedro Roque             <roque@di.fc.ul.pt>
 *
 *      Based on linux/net/ipv4/ip_output.c
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Changes:
 *      A.N.Kuznetsov   :       arithmetic in fragmentation.
 *                              extension headers are implemented.
 *                              route changes now work.
 *                              ip6_forward does not confuse sniffers.
 *                              etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *      Imran Patel     :       frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *                      :       add ip6_append_data and related functions
 *                              for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

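/*
 * Finish transmitting one IPv6 packet: resolve the neighbour for the
 * route's nexthop and hand the skb to it.  Multicast packets are looped
 * back to local listeners (and multicast-routing sockets) when required,
 * and node-local-scoped packets are never sent on the wire.
 */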
static int ip6_finish_output2(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct net_device *dev = dst->dev;
        struct neighbour *neigh;
        struct in6_addr *nexthop;
        int ret;

        skb->protocol = htons(ETH_P_IPV6);
        skb->dev = dev;

        if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
                struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

                if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
                    ((mroute6_socket(dev_net(dev), skb) &&
                     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
                     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                                         &ipv6_hdr(skb)->saddr))) {
                        struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

                        /* Do not check for IFF_ALLMULTI; multicast routing
                           is not supported in any case.
                         */
                        if (newskb)
                                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                                        newskb, NULL, newskb->dev,
                                        dev_loopback_xmit);

                        if (ipv6_hdr(skb)->hop_limit == 0) {
                                IP6_INC_STATS(dev_net(dev), idev,
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return 0;
                        }
                }

                IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
                                 skb->len);

                if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
                    IPV6_ADDR_SCOPE_NODELOCAL &&
                    !(dev->flags & IFF_LOOPBACK)) {
                        kfree_skb(skb);
                        return 0;
                }
        }

        rcu_read_lock_bh();
        nexthop = rt6_nexthop((struct rt6_info *)dst);
        neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
        if (unlikely(!neigh))
                neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
        if (!IS_ERR(neigh)) {
                ret = dst_neigh_output(dst, neigh, skb);
                rcu_read_unlock_bh();
                return ret;
        }
        rcu_read_unlock_bh();

        IP6_INC_STATS(dev_net(dst->dev),
                      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
        kfree_skb(skb);
        return -EINVAL;
}

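/*
 * Decide whether the packet must be fragmented before transmission:
 * fragment when it exceeds the path MTU (and is not GSO), when the
 * route demands fragmentation on every packet (dst_allfrag), or when
 * netfilter recorded a smaller frag_max_size during reassembly.
 */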
static int ip6_finish_output(struct sk_buff *skb)
{
        if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
            dst_allfrag(skb_dst(skb)) ||
            (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
                return ip6_fragment(skb, ip6_finish_output2);
        else
                return ip6_finish_output2(skb);
}

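/*
 * Output entry point called via dst_output().  Drops the packet when
 * IPv6 is administratively disabled on the device, otherwise runs the
 * POST_ROUTING netfilter hook (unless the packet was rerouted) and
 * continues in ip6_finish_output().
 */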
int ip6_output(struct sk_buff *skb)
{
        struct net_device *dev = skb_dst(skb)->dev;
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
        if (unlikely(idev->cnf.disable_ipv6)) {
                IP6_INC_STATS(dev_net(dev), idev,
                              IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
        }

        return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
                            ip6_finish_output,
                            !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *      xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
             struct ipv6_txoptions *opt, int tclass)
{
        struct net *net = sock_net(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct in6_addr *first_hop = &fl6->daddr;
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr;
        u8  proto = fl6->flowi6_proto;
        int seg_len = skb->len;
        int hlimit = -1;
        u32 mtu;

        if (opt) {
                unsigned int head_room;

                /* First: exthdrs may take lots of space (~8K for now);
                   MAX_HEADER is not enough.
                 */
                head_room = opt->opt_nflen + opt->opt_flen;
                seg_len += head_room;
                head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

                if (skb_headroom(skb) < head_room) {
                        struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
                        if (skb2 == NULL) {
                                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                              IPSTATS_MIB_OUTDISCARDS);
                                kfree_skb(skb);
                                return -ENOBUFS;
                        }
                        consume_skb(skb);
                        skb = skb2;
                        skb_set_owner_w(skb, sk);
                }
                if (opt->opt_flen)
                        ipv6_push_frag_opts(skb, opt, &proto);
                if (opt->opt_nflen)
                        ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
        }

        skb_push(skb, sizeof(struct ipv6hdr));
        skb_reset_network_header(skb);
        hdr = ipv6_hdr(skb);

        /*
         *      Fill in the IPv6 header
         */
        if (np)
                hlimit = np->hop_limit;
        if (hlimit < 0)
                hlimit = ip6_dst_hoplimit(dst);

        ip6_flow_hdr(hdr, tclass, fl6->flowlabel);

        hdr->payload_len = htons(seg_len);
        hdr->nexthdr = proto;
        hdr->hop_limit = hlimit;

        hdr->saddr = fl6->saddr;
        hdr->daddr = *first_hop;

        skb->protocol = htons(ETH_P_IPV6);
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        mtu = dst_mtu(dst);
        if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
                IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                 IPSTATS_MIB_OUT, skb->len);
                return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
                               dst->dev, dst_output);
        }

        skb->dev = dst->dev;
        ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

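/*
 * Deliver a Router Alert packet to every raw socket registered (via the
 * IPV6_ROUTER_ALERT option) for this RA value, cloning the skb for all
 * but the last listener.  Returns 1 if the packet was consumed.
 */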
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
        struct ip6_ra_chain *ra;
        struct sock *last = NULL;

        read_lock(&ip6_ra_lock);
        for (ra = ip6_ra_chain; ra; ra = ra->next) {
                struct sock *sk = ra->sk;
                if (sk && ra->sel == sel &&
                    (!sk->sk_bound_dev_if ||
                     sk->sk_bound_dev_if == skb->dev->ifindex)) {
                        if (last) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        rawv6_rcv(last, skb2);
                        }
                        last = sk;
                }
        }

        if (last) {
                rawv6_rcv(last, skb);
                read_unlock(&ip6_ra_lock);
                return 1;
        }
        read_unlock(&ip6_ra_lock);
        return 0;
}

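/*
 * Decide what to do with a packet addressed to a proxied neighbour:
 * returns 1 to pass unicast neighbour-discovery ICMPv6 messages to the
 * input path, -1 to drop link-local destinations (signalling the
 * sender), and 0 to let the packet be forwarded.
 */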
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        u8 nexthdr = hdr->nexthdr;
        __be16 frag_off;
        int offset;

        if (ipv6_ext_hdr(nexthdr)) {
                offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
                if (offset < 0)
                        return 0;
        } else
                offset = sizeof(struct ipv6hdr);

        if (nexthdr == IPPROTO_ICMPV6) {
                struct icmp6hdr *icmp6;

                if (!pskb_may_pull(skb, (skb_network_header(skb) +
                                         offset + 1 - skb->data)))
                        return 0;

                icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

                switch (icmp6->icmp6_type) {
                case NDISC_ROUTER_SOLICITATION:
                case NDISC_ROUTER_ADVERTISEMENT:
                case NDISC_NEIGHBOUR_SOLICITATION:
                case NDISC_NEIGHBOUR_ADVERTISEMENT:
                case NDISC_REDIRECT:
                        /* For a reaction involving a unicast neighbor
                         * discovery message destined to the proxied address,
                         * pass it to the input function.
                         */
                        return 1;
                default:
                        break;
                }
        }

        /*
         * The proxying router can't forward traffic sent to a link-local
         * address, so signal the sender and discard the packet. This
         * behavior is clarified by the MIPv6 specification.
         */
        if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
                dst_link_failure(skb);
                return -1;
        }

        return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
        return dst_output(skb);
}

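/*
 * Return the MTU to enforce on the forwarding path: the locked route
 * metric when one is set, otherwise the egress device's IPv6 MTU
 * (falling back to IPV6_MIN_MTU when the device has no inet6_dev).
 */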
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
        unsigned int mtu;
        struct inet6_dev *idev;

        if (dst_metric_locked(dst, RTAX_MTU)) {
                mtu = dst_metric_raw(dst, RTAX_MTU);
                if (mtu)
                        return mtu;
        }

        mtu = IPV6_MIN_MTU;
        rcu_read_lock();
        idev = __in6_dev_get(dst->dev);
        if (idev)
                mtu = idev->cnf.mtu6;
        rcu_read_unlock();

        return mtu;
}

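/*
 * Check whether a forwarded packet exceeds the path MTU, taking into
 * account the local_df override, a frag_max_size recorded during
 * netfilter reassembly, and the per-segment length of GSO packets.
 */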
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
        if (skb->len <= mtu || skb->local_df)
                return false;

        if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
                return true;

        if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
                return false;

        return true;
}

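/*
 * Forward one IPv6 packet: enforce the forwarding sysctl and XFRM
 * policy, hand Router Alert packets to interested sockets, decrement
 * the hop limit (sending Time Exceeded when it runs out), honour
 * neighbour-discovery proxying, emit redirects where appropriate,
 * enforce the path MTU, and finally pass the packet through the
 * NF_INET_FORWARD hook to dst_output().
 */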
int ip6_forward(struct sk_buff *skb)
{
        struct dst_entry *dst = skb_dst(skb);
        struct ipv6hdr *hdr = ipv6_hdr(skb);
        struct inet6_skb_parm *opt = IP6CB(skb);
        struct net *net = dev_net(dst->dev);
        u32 mtu;

        if (net->ipv6.devconf_all->forwarding == 0)
                goto error;

        if (skb_warn_if_lro(skb))
                goto drop;

        if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INDISCARDS);
                goto drop;
        }

        if (skb->pkt_type != PACKET_HOST)
                goto drop;

        skb_forward_csum(skb);

        /*
         *      We DO NOT make any processing on
         *      RA packets, pushing them to user level AS IS
         *      without any WARRANTY that application will be able
         *      to interpret them. The reason is that we
         *      cannot make anything clever here.
         *
         *      We are not end-node, so that if packet contains
         *      AH/ESP, we cannot make anything.
         *      Defragmentation also would be a mistake; RA packets
         *      cannot be fragmented, because there is no guarantee
         *      that different fragments will go along one path. --ANK
         */
        if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
                if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
                        return 0;
        }

        /*
         *      check and decrement ttl
         */
        if (hdr->hop_limit <= 1) {
                /* Force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INHDRERRORS);

                kfree_skb(skb);
                return -ETIMEDOUT;
        }

        /* XXX: idev->cnf.proxy_ndp? */
        if (net->ipv6.devconf_all->proxy_ndp &&
            pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
                int proxied = ip6_forward_proxy_check(skb);
                if (proxied > 0)
                        return ip6_input(skb);
                else if (proxied < 0) {
                        IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                         IPSTATS_MIB_INDISCARDS);
                        goto drop;
                }
        }

        if (!xfrm6_route_forward(skb)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INDISCARDS);
                goto drop;
        }
        dst = skb_dst(skb);

        /* IPv6 specs say nothing about it, but it is clear that we cannot
           send redirects to source routed frames.
           We don't send redirects to frames decapsulated from IPsec.
         */
        if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
                struct in6_addr *target = NULL;
                struct inet_peer *peer;
                struct rt6_info *rt;

                /*
                 *      incoming and outgoing devices are the same
                 *      send a redirect.
                 */

                rt = (struct rt6_info *) dst;
                if (rt->rt6i_flags & RTF_GATEWAY)
                        target = &rt->rt6i_gateway;
                else
                        target = &hdr->daddr;

                peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);

                /* Limit redirects both by destination (here)
                   and by source (inside ndisc_send_redirect)
                 */
                if (inet_peer_xrlim_allow(peer, 1*HZ))
                        ndisc_send_redirect(skb, target);
                if (peer)
                        inet_putpeer(peer);
        } else {
                int addrtype = ipv6_addr_type(&hdr->saddr);

                /* This check is security critical. */
                if (addrtype == IPV6_ADDR_ANY ||
                    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
                        goto error;
                if (addrtype & IPV6_ADDR_LINKLOCAL) {
                        icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                                    ICMPV6_NOT_NEIGHBOUR, 0);
                        goto error;
                }
        }

        mtu = ip6_dst_mtu_forward(dst);
        if (mtu < IPV6_MIN_MTU)
                mtu = IPV6_MIN_MTU;

        if (ip6_pkt_too_big(skb, mtu)) {
                /* Again, force OUTPUT device used as source address */
                skb->dev = dst->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_INTOOBIGERRORS);
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (skb_cow(skb, dst->dev->hard_header_len)) {
                IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
                                 IPSTATS_MIB_OUTDISCARDS);
                goto drop;
        }

        hdr = ipv6_hdr(skb);

        /* Mangling hops number delayed to point after skb COW */

        hdr->hop_limit--;

        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
        IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
        return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
                       ip6_forward_finish);

error:
        IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
        kfree_skb(skb);
        return -EINVAL;
}

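/*
 * Copy per-packet metadata (packet type, priority, protocol, dst, mark,
 * traffic-control index, netfilter state and security mark) from the
 * original skb to a freshly built fragment.
 */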
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
        to->pkt_type = from->pkt_type;
        to->priority = from->priority;
        to->protocol = from->protocol;
        skb_dst_drop(to);
        skb_dst_set(to, dst_clone(skb_dst(from)));
        to->dev = from->dev;
        to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
        to->tc_index = from->tc_index;
#endif
        nf_copy(to, from);
#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
        to->nf_trace = from->nf_trace;
#endif
        skb_copy_secmark(to, from);
}

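/*
 * Split an oversized packet into fragments and feed each one to
 * @output.  A fast path re-uses an existing, well-formed frag_list;
 * otherwise the slow path allocates a fresh skb per fragment and
 * copies the payload.  Packets that may not be fragmented trigger an
 * ICMPV6_PKT_TOOBIG error instead.
 */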
int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
        struct sk_buff *frag;
        struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
        struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
        struct ipv6hdr *tmp_hdr;
        struct frag_hdr *fh;
        unsigned int mtu, hlen, left, len;
        int hroom, troom;
        __be32 frag_id = 0;
        int ptr, offset = 0, err = 0;
        u8 *prevhdr, nexthdr = 0;
        struct net *net = dev_net(skb_dst(skb)->dev);

        hlen = ip6_find_1stfragopt(skb, &prevhdr);
        nexthdr = *prevhdr;

        mtu = ip6_skb_dst_mtu(skb);

        /* We must not fragment if the socket is set to force MTU discovery
         * or if the skb is not generated by a local socket.
         */
        if (unlikely(!skb->local_df && skb->len > mtu) ||
                     (IP6CB(skb)->frag_max_size &&
                      IP6CB(skb)->frag_max_size > mtu)) {
                if (skb->sk && dst_allfrag(skb_dst(skb)))
                        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

                skb->dev = skb_dst(skb)->dev;
                icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGFAILS);
                kfree_skb(skb);
                return -EMSGSIZE;
        }

        if (np && np->frag_size < mtu) {
                if (np->frag_size)
                        mtu = np->frag_size;
        }
        mtu -= hlen + sizeof(struct frag_hdr);

        if (skb_has_frag_list(skb)) {
                int first_len = skb_pagelen(skb);
                struct sk_buff *frag2;

                if (first_len - hlen > mtu ||
                    ((first_len - hlen) & 7) ||
                    skb_cloned(skb))
                        goto slow_path;

                skb_walk_frags(skb, frag) {
                        /* Correct geometry. */
                        if (frag->len > mtu ||
                            ((frag->len & 7) && frag->next) ||
                            skb_headroom(frag) < hlen)
                                goto slow_path_clean;

                        /* Partially cloned skb? */
                        if (skb_shared(frag))
                                goto slow_path_clean;

                        BUG_ON(frag->sk);
                        if (skb->sk) {
                                frag->sk = skb->sk;
                                frag->destructor = sock_wfree;
                        }
                        skb->truesize -= frag->truesize;
                }

                err = 0;
                offset = 0;
                frag = skb_shinfo(skb)->frag_list;
                skb_frag_list_init(skb);
                /* BUILD HEADER */

                *prevhdr = NEXTHDR_FRAGMENT;
                tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
                if (!tmp_hdr) {
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        return -ENOMEM;
                }

                __skb_pull(skb, hlen);
                fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
                __skb_push(skb, hlen);
                skb_reset_network_header(skb);
                memcpy(skb_network_header(skb), tmp_hdr, hlen);

                ipv6_select_ident(fh, rt);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(IP6_MF);
                frag_id = fh->identification;

                first_len = skb_pagelen(skb);
                skb->data_len = first_len - skb_headlen(skb);
                skb->len = first_len;
                ipv6_hdr(skb)->payload_len = htons(first_len -
                                                   sizeof(struct ipv6hdr));

                dst_hold(&rt->dst);

                for (;;) {
                        /* Prepare the header of the next frame,
                         * before the previous one goes down. */
                        if (frag) {
                                frag->ip_summed = CHECKSUM_NONE;
                                skb_reset_transport_header(frag);
                                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                                __skb_push(frag, hlen);
                                skb_reset_network_header(frag);
                                memcpy(skb_network_header(frag), tmp_hdr,
                                       hlen);
                                offset += skb->len - hlen - sizeof(struct frag_hdr);
                                fh->nexthdr = nexthdr;
                                fh->reserved = 0;
                                fh->frag_off = htons(offset);
                                if (frag->next != NULL)
                                        fh->frag_off |= htons(IP6_MF);
                                fh->identification = frag_id;
                                ipv6_hdr(frag)->payload_len =
                                                htons(frag->len -
                                                      sizeof(struct ipv6hdr));
                                ip6_copy_metadata(frag, skb);
                        }

                        err = output(skb);
                        if (!err)
                                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                              IPSTATS_MIB_FRAGCREATES);

                        if (err || !frag)
                                break;

                        skb = frag;
                        frag = skb->next;
                        skb->next = NULL;
                }

                kfree(tmp_hdr);

                if (err == 0) {
                        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                                      IPSTATS_MIB_FRAGOKS);
                        ip6_rt_put(rt);
                        return 0;
                }

                while (frag) {
                        skb = frag->next;
                        kfree_skb(frag);
                        frag = skb;
                }

                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                              IPSTATS_MIB_FRAGFAILS);
                ip6_rt_put(rt);
                return err;

slow_path_clean:
                skb_walk_frags(skb, frag2) {
                        if (frag2 == frag)
                                break;
                        frag2->sk = NULL;
                        frag2->destructor = NULL;
                        skb->truesize += frag2->truesize;
                }
        }

slow_path:
        if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
            skb_checksum_help(skb))
                goto fail;

        left = skb->len - hlen;         /* Space per frame */
        ptr = hlen;                     /* Where to start from */

        /*
         *      Fragment the datagram.
         */

        *prevhdr = NEXTHDR_FRAGMENT;
        hroom = LL_RESERVED_SPACE(rt->dst.dev);
        troom = rt->dst.dev->needed_tailroom;

        /*
         *      Keep copying data until we run out.
         */
        while (left > 0) {
                len = left;
                /* IF: it doesn't fit, use 'mtu' - the data space left */
                if (len > mtu)
                        len = mtu;
                /* IF: we are not sending up to and including the packet end
                   then align the next start on an eight byte boundary */
                if (len < left) {
                        len &= ~7;
                }
                /*
                 *      Allocate buffer.
                 */

                if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                                      hroom + troom, GFP_ATOMIC)) == NULL) {
                        NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
                        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                                      IPSTATS_MIB_FRAGFAILS);
                        err = -ENOMEM;
                        goto fail;
                }

                /*
                 *      Set up data on packet
                 */

                ip6_copy_metadata(frag, skb);
                skb_reserve(frag, hroom);
                skb_put(frag, len + hlen + sizeof(struct frag_hdr));
                skb_reset_network_header(frag);
                fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
                frag->transport_header = (frag->network_header + hlen +
                                          sizeof(struct frag_hdr));

                /*
                 *      Charge the memory for the fragment to any owner
                 *      it might possess
                 */
                if (skb->sk)
                        skb_set_owner_w(frag, skb->sk);

                /*
                 *      Copy the packet header into the new buffer.
                 */
                skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

                /*
                 *      Build fragment header.
                 */
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                if (!frag_id) {
                        ipv6_select_ident(fh, rt);
                        frag_id = fh->identification;
                } else
                        fh->identification = frag_id;

                /*
                 *      Copy a block of the IP datagram.
                 */
                if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
                        BUG();
                left -= len;

                fh->frag_off = htons(offset);
                if (left > 0)
                        fh->frag_off |= htons(IP6_MF);
                ipv6_hdr(frag)->payload_len = htons(frag->len -
                                                    sizeof(struct ipv6hdr));

                ptr += len;
                offset += len;

                /*
                 *      Put this fragment into the sending queue.
                 */
                err = output(frag);
                if (err)
                        goto fail;

                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                              IPSTATS_MIB_FRAGCREATES);
        }
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGOKS);
        consume_skb(skb);
        return err;

fail:
        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return err;
}

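/*
 * Returns nonzero when the cached route can no longer be validated for
 * this flow: the route key is not a host route matching the flow
 * address, and the socket's cached address doesn't match it either.
 */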
static inline int ip6_rt_check(const struct rt6key *rt_key,
                               const struct in6_addr *fl_addr,
                               const struct in6_addr *addr_cache)
{
        return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
                (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

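/*
 * Validate a socket's cached dst against the flow about to be sent.
 * Returns the dst if it is still usable, otherwise releases it and
 * returns NULL so the caller performs a fresh route lookup.
 */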
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                                          struct dst_entry *dst,
                                          const struct flowi6 *fl6)
{
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct rt6_info *rt;

        if (!dst)
                goto out;

        if (dst->ops->family != AF_INET6) {
                dst_release(dst);
                return NULL;
        }

        rt = (struct rt6_info *)dst;
        /* Yes, checking route validity in the not-connected
         * case is not very simple. Take into account
         * that we do not support routing by source, TOS,
         * and MSG_DONTROUTE            --ANK (980726)
         *
         * 1. ip6_rt_check(): If route was host route,
         *    check that cached destination is current.
         *    If it is network route, we still may
         *    check its validity using saved pointer
         *    to the last used address: daddr_cache.
         *    We do not want to save whole address now,
         *    (because main consumer of this service
         *    is tcp, which does not have this problem),
         *    so that the last trick works only on connected
         *    sockets.
         * 2. oif also should be the same.
         */
        if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
            ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
            (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
                dst_release(dst);
                dst = NULL;
        }

out:
        return dst;
}

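/*
 * Common tail of the dst lookup helpers: resolve the route if none was
 * supplied, pick a source address when the flow has none, and (with
 * optimistic DAD) fall back to the default router's dst while our own
 * source address is still unconfirmed.
 */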
static int ip6_dst_lookup_tail(struct sock *sk,
                               struct dst_entry **dst, struct flowi6 *fl6)
{
        struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        struct neighbour *n;
        struct rt6_info *rt;
#endif
        int err;

        if (*dst == NULL)
                *dst = ip6_route_output(net, sk, fl6);

        if ((err = (*dst)->error))
                goto out_err_release;

        if (ipv6_addr_any(&fl6->saddr)) {
                struct rt6_info *rt = (struct rt6_info *) *dst;
                err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                                          sk ? inet6_sk(sk)->srcprefs : 0,
                                          &fl6->saddr);
                if (err)
                        goto out_err_release;
        }

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
        /*
         * Here if the dst entry we've looked up
         * has a neighbour entry that is in the INCOMPLETE
         * state and the src address from the flow is
         * marked as OPTIMISTIC, we release the found
         * dst entry and replace it instead with the
         * dst entry of the nexthop router
         */
        rt = (struct rt6_info *) *dst;
        rcu_read_lock_bh();
        n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
        err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
        rcu_read_unlock_bh();

        if (err) {
                struct inet6_ifaddr *ifp;
                struct flowi6 fl_gw6;
                int redirect;

                ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                                      (*dst)->dev, 1);

                redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
                if (ifp)
                        in6_ifa_put(ifp);

                if (redirect) {
                        /*
                         * We need to get the dst entry for the
                         * default router instead
                         */
                        dst_release(*dst);
                        memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
                        memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
                        *dst = ip6_route_output(net, sk, &fl_gw6);
                        if ((err = (*dst)->error))
                                goto out_err_release;
                }
        }
#endif

        return 0;

out_err_release:
        if (err == -ENETUNREACH)
                IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
        dst_release(*dst);
        *dst = NULL;
        return err;
}

/**
 *      ip6_dst_lookup - perform route lookup on flow
 *      @sk: socket which provides route info
 *      @dst: pointer to dst_entry * for result
 *      @fl6: flow to lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
        *dst = NULL;
        return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *      ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *      @sk: socket which provides route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                      const struct in6_addr *final_dst)
{
        struct dst_entry *dst = NULL;
        int err;

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *      ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *      @sk: socket which provides the dst cache and route info
 *      @fl6: flow to lookup
 *      @final_dst: final destination address for ipsec lookup
 *
 *      This function performs a route lookup on the given flow with the
 *      possibility of using the cached route in the socket if it is valid.
 *      It will take the socket dst lock when operating on the dst cache.
 *      As a result, this function can only be used in process context.
 *
 *      It returns a valid dst pointer on success, or a pointer encoded
 *      error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                                         const struct in6_addr *final_dst)
{
        struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
        int err;

        dst = ip6_sk_dst_check(sk, dst, fl6);

        err = ip6_dst_lookup_tail(sk, &dst, fl6);
        if (err)
                return ERR_PTR(err);
        if (final_dst)
                fl6->daddr = *final_dst;

        return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

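/*
 * Queue data for UDP fragmentation offload: build (or extend) a single
 * large skb carrying the complete datagram and record the GSO size and
 * fragment id so the device can segment it into 8-byte-aligned
 * fragments on transmit.
 */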
static inline int ip6_ufo_append_data(struct sock *sk,
                        int getfrag(void *from, char *to, int offset, int len,
                        int odd, struct sk_buff *skb),
                        void *from, int length, int hh_len, int fragheaderlen,
                        int transhdrlen, int mtu, unsigned int flags,
                        struct rt6_info *rt)
{
        struct sk_buff *skb;
        struct frag_hdr fhdr;
        int err;

        /* There is support for UDP large send offload by the network
         * device, so create one single skb packet containing the complete
         * udp datagram
         */
        if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
                skb = sock_alloc_send_skb(sk,
                        hh_len + fragheaderlen + transhdrlen + 20,
                        (flags & MSG_DONTWAIT), &err);
                if (skb == NULL)
                        return err;

                /* reserve space for Hardware header */
                skb_reserve(skb, hh_len);

                /* create space for UDP/IP header */
                skb_put(skb, fragheaderlen + transhdrlen);

                /* initialize network header pointer */
                skb_reset_network_header(skb);

                /* initialize protocol header pointer */
                skb->transport_header = skb->network_header + fragheaderlen;

                skb->protocol = htons(ETH_P_IPV6);
                skb->csum = 0;

                __skb_queue_tail(&sk->sk_write_queue, skb);
        } else if (skb_is_gso(skb)) {
                goto append;
        }

        skb->ip_summed = CHECKSUM_PARTIAL;
        /* Specify the length of each IPv6 datagram fragment.
         * It has to be a multiple of 8.
         */
        skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                                     sizeof(struct frag_hdr)) & ~7;
        skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
        ipv6_select_ident(&fhdr, rt);
        skb_shinfo(skb)->ip6_frag_id = fhdr.identification;

append:
        return skb_append_datato_frags(sk, skb, getfrag, from,
                                       (length - transhdrlen));
}


static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                                               gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                                                gfp_t gfp)
{
        return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

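/*
 * Recompute mtu and maxfraglen while appending data: the first
 * fragment must reserve the route's header_len, while later fragments
 * may treat that header space as data (unless the dst is an XFRM
 * tunnel, whose MTU already accounts for it).
 */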
static void ip6_append_data_mtu(unsigned int *mtu,
                                int *maxfraglen,
                                unsigned int fragheaderlen,
                                struct sk_buff *skb,
                                struct rt6_info *rt,
                                bool pmtuprobe)
{
        if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
                if (skb == NULL) {
                        /* first fragment, reserve header_len */
                        *mtu = *mtu - rt->dst.header_len;

                } else {
                        /*
                         * this fragment is not the first; the header
                         * space is regarded as data space.
                         */
                        *mtu = min(*mtu, pmtuprobe ?
                                   rt->dst.dev->mtu :
                                   dst_mtu(rt->dst.path));
                }
                *maxfraglen = ((*mtu - fragheaderlen) & ~7)
                              + fragheaderlen - sizeof(struct frag_hdr);
        }
}

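/*
 * Append data to the pending (corked) output queue for a socket.  On
 * the first call it sets up the cork state (duplicated options, route,
 * hop limit, traffic class and MTU); subsequent calls reuse it.  Data
 * is packed into MTU-sized skbs, handed to UFO when the device
 * supports it, and transmitted when the pending frames are pushed
 * (via ip6_push_pending_frames()).
 */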
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
        int offset, int len, int odd, struct sk_buff *skb),
        void *from, int length, int transhdrlen,
        int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
        struct rt6_info *rt, unsigned int flags, int dontfrag)
{
        struct inet_sock *inet = inet_sk(sk);
        struct ipv6_pinfo *np = inet6_sk(sk);
        struct inet_cork *cork;
        struct sk_buff *skb, *skb_prev = NULL;
        unsigned int maxfraglen, fragheaderlen, mtu;
        int exthdrlen;
        int dst_exthdrlen;
        int hh_len;
        int copy;
        int err;
        int offset = 0;
        __u8 tx_flags = 0;

        if (flags & MSG_PROBE)
                return 0;
        cork = &inet->cork.base;
        if (skb_queue_empty(&sk->sk_write_queue)) {
                /*
                 * setup for corking
                 */
                if (opt) {
                        if (WARN_ON(np->cork.opt))
                                return -EINVAL;

                        np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
                        if (unlikely(np->cork.opt == NULL))
                                return -ENOBUFS;

                        np->cork.opt->tot_len = opt->tot_len;
                        np->cork.opt->opt_flen = opt->opt_flen;
                        np->cork.opt->opt_nflen = opt->opt_nflen;

                        np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
                                                            sk->sk_allocation);
                        if (opt->dst0opt && !np->cork.opt->dst0opt)
                                return -ENOBUFS;

                        np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
                                                            sk->sk_allocation);
                        if (opt->dst1opt && !np->cork.opt->dst1opt)
                                return -ENOBUFS;

                        np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
                                                           sk->sk_allocation);
                        if (opt->hopopt && !np->cork.opt->hopopt)
                                return -ENOBUFS;

                        np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
                                                            sk->sk_allocation);
                        if (opt->srcrt && !np->cork.opt->srcrt)
                                return -ENOBUFS;

                        /* need source address above miyazawa */
                }
                dst_hold(&rt->dst);
                cork->dst = &rt->dst;
                inet->cork.fl.u.ip6 = *fl6;
                np->cork.hop_limit = hlimit;
                np->cork.tclass = tclass;
                if (rt->dst.flags & DST_XFRM_TUNNEL)
                        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(&rt->dst);
                else
                        mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
                              rt->dst.dev->mtu : dst_mtu(rt->dst.path);
                if (np->frag_size < mtu) {
                        if (np->frag_size)
                                mtu = np->frag_size;
                }
                cork->fragsize = mtu;
                if (dst_allfrag(rt->dst.path))
                        cork->flags |= IPCORK_ALLFRAG;
                cork->length = 0;
                exthdrlen = (opt ? opt->opt_flen : 0);
                length += exthdrlen;
                transhdrlen += exthdrlen;
                dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
        } else {
                rt = (struct rt6_info *)cork->dst;
                fl6 = &inet->cork.fl.u.ip6;
                opt = np->cork.opt;
                transhdrlen = 0;
                exthdrlen = 0;
                dst_exthdrlen = 0;
                mtu = cork->fragsize;
        }

        hh_len = LL_RESERVED_SPACE(rt->dst.dev);

        fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
                        (opt ? opt->opt_nflen : 0);
        maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
                     sizeof(struct frag_hdr);

        if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
                unsigned int maxnonfragsize, headersize;

                headersize = sizeof(struct ipv6hdr) +
                             (opt ? opt->tot_len : 0) +
                             (dst_allfrag(&rt->dst) ?
                              sizeof(struct frag_hdr) : 0) +
                             rt->rt6i_nfheader_len;

                maxnonfragsize = (np->pmtudisc >= IPV6_PMTUDISC_DO) ?
                                 mtu : sizeof(struct ipv6hdr) + IPV6_MAXPLEN;

                /* dontfrag active */
                if ((cork->length + length > mtu - headersize) && dontfrag &&
                    (sk->sk_protocol == IPPROTO_UDP ||
                     sk->sk_protocol == IPPROTO_RAW)) {
                        ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
                                                   sizeof(struct ipv6hdr));
                        goto emsgsize;
                }

                if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
                        ipv6_local_error(sk, EMSGSIZE, fl6,
                                         mtu - headersize +
                                         sizeof(struct ipv6hdr));
                        return -EMSGSIZE;
                }
        }

        /* For UDP, check if TX timestamp is enabled */
        if (sk->sk_type == SOCK_DGRAM)
                sock_tx_timestamp(sk, &tx_flags);

        /*
         * Let's try using as much space as possible.
         * Use MTU if total length of the message fits into the MTU.
         * Otherwise, we need to reserve fragment header and
         * fragment alignment (= 8-15 octets, in total).
         *
         * Note that we may need to "move" the data from the tail
         * of the buffer to the new fragment when we split
         * the message.
         *
         * FIXME: It may be fragmented into multiple chunks
         *        at once if non-fragmentable extension headers
         *        are too large.
         * --yoshfuji
         */
1277
1278         skb = skb_peek_tail(&sk->sk_write_queue);
1279         cork->length += length;
1280         if (((length > mtu) ||
1281              (skb && skb_is_gso(skb))) &&
1282             (sk->sk_protocol == IPPROTO_UDP) &&
1283             (rt->dst.dev->features & NETIF_F_UFO)) {
1284                 err = ip6_ufo_append_data(sk, getfrag, from, length,
1285                                           hh_len, fragheaderlen,
1286                                           transhdrlen, mtu, flags, rt);
1287                 if (err)
1288                         goto error;
1289                 return 0;
1290         }
1291
1292         if (!skb)
1293                 goto alloc_new_skb;
1294
1295         while (length > 0) {
1296                 /* Check if the remaining data fits into current packet. */
1297                 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1298                 if (copy < length)
1299                         copy = maxfraglen - skb->len;
1300
1301                 if (copy <= 0) {
1302                         char *data;
1303                         unsigned int datalen;
1304                         unsigned int fraglen;
1305                         unsigned int fraggap;
1306                         unsigned int alloclen;
1307 alloc_new_skb:
1308                         /* There's no room in the current skb */
1309                         if (skb)
1310                                 fraggap = skb->len - maxfraglen;
1311                         else
1312                                 fraggap = 0;
1313                         /* update mtu and maxfraglen if necessary */
1314                         if (skb == NULL || skb_prev == NULL)
1315                                 ip6_append_data_mtu(&mtu, &maxfraglen,
1316                                                     fragheaderlen, skb, rt,
1317                                                     np->pmtudisc >=
1318                                                     IPV6_PMTUDISC_PROBE);
1319
1320                         skb_prev = skb;
1321
1322                         /*
1323                          * If remaining data exceeds the mtu,
1324                          * we know we need more fragment(s).
1325                          */
1326                         datalen = length + fraggap;
1327
1328                         if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1329                                 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1330                         if ((flags & MSG_MORE) &&
1331                             !(rt->dst.dev->features&NETIF_F_SG))
1332                                 alloclen = mtu;
1333                         else
1334                                 alloclen = datalen + fragheaderlen;
1335
1336                         alloclen += dst_exthdrlen;
1337
1338                         if (datalen != length + fraggap) {
1339                                 /*
1340                                  * this is not the last fragment, the trailer
1341                                  * space is regarded as data space.
1342                                  */
1343                                 datalen += rt->dst.trailer_len;
1344                         }
1345
1346                         alloclen += rt->dst.trailer_len;
1347                         fraglen = datalen + fragheaderlen;
1348
1349                         /*
1350                          * We just reserve space for fragment header.
1351                          * Note: this may be overallocation if the message
1352                          * (without MSG_MORE) fits into the MTU.
1353                          */
1354                         alloclen += sizeof(struct frag_hdr);
1355
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
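			/* The previous fragment was filled past maxfraglen by
			 * fraggap bytes that break the 8-byte alignment rule;
			 * move those bytes into this fragment, transfer their
			 * checksum, and trim the previous skb back to
			 * maxfraglen.
			 */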
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

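			/* A negative copy means the length bookkeeping went
			 * inconsistent; fail hard rather than build a corrupt
			 * fragment.
			 */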
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 &&
				   getfrag(from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

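		/* Append to the current skb: copy into the linear area when
		 * the device cannot do scatter-gather, otherwise land the
		 * bytes in per-socket page fragments.
		 */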
		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			struct page_frag *pfrag = sk_page_frag(sk);

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

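			/* Extend the last page fragment when the new bytes
			 * are contiguous with it; otherwise start a new,
			 * empty fragment on the current page (bounded by
			 * MAX_SKB_FRAGS).
			 */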
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

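			/* Account the copied bytes in the fragment, the skb
			 * length fields, and the socket's write-memory
			 * charge.
			 */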
			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_append_data);

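/* Free everything held by the cork: the individually allocated extension
 * header options, the cached route, and the stored flow key.
 */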
static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

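/* Turn the skbs queued by ip6_append_data() into a single packet: chain
 * them on the head skb's frag_list, push the IPv6 header and any extension
 * headers, and hand the result to ip6_local_out().
 */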
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	skb = __skb_dequeue(&sk->sk_write_queue);
	if (skb == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
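	/* Chain the remaining queued skbs onto the head skb's frag_list,
	 * stripping their per-fragment network headers and moving their
	 * length and truesize accounting to the head skb.
	 */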
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
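	/* Push any extension headers in front of the payload: fragmentable
	 * options (opt_flen) first, then non-fragmentable ones (opt_nflen).
	 * Each push updates proto to keep the next-header chain intact, and
	 * a routing header may rewrite the wire destination via final_dst.
	 */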
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

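/* Drop anything still sitting on the socket's write queue, charging each
 * discarded skb that already has a route to OUTDISCARDS, then release the
 * cork state.
 */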
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);