/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted
 *					by the output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after a year-long coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
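/*
 * Added note: the checksum field itself is covered by the sum, which is
 * why it must be zeroed before ip_fast_csum() runs.  A quick sanity check
 * (illustrative only, not part of the original file): recomputing the sum
 * over a header whose check field has just been filled in must yield 0,
 * which is exactly how the receive path validates incoming headers.
 *
 *	ip_send_check(iph);
 *	WARN_ON(ip_fast_csum((unsigned char *)iph, iph->ihl) != 0);
 */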
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}
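/*
 * Added note: nf_hook() returns 1 when every NF_INET_LOCAL_OUT hook
 * accepted the packet without stealing or queueing it; the caller must
 * then push it on to dst_output() itself, as ip_local_out() does below.
 * Any other value (0 or a negative errno) is the netfilter verdict and
 * is propagated unchanged.
 */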
int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}
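/*
 * Added note: inet->uc_ttl defaults to -1 and only becomes non-negative
 * when the application sets the IP_TTL socket option, so unicast packets
 * normally inherit the per-route RTAX_HOPLIMIT metric instead.
 */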
/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
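/*
 * Usage sketch (added, illustrative): this helper suits callers that
 * already hold a routed skb and just need a header put on the front.
 * TCP's SYN-ACK path is the classic user, roughly:
 *
 *	err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 *				    ireq->rmt_addr, ireq->opt);
 *
 * (field names as in struct inet_request_sock; see tcp_v4_send_synack()).
 */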
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}
static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}
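/*
 * Added note: with IP_PMTUDISC_PROBE the socket deliberately ignores the
 * cached path MTU and sizes packets to the device MTU, so that userspace
 * can probe for a larger path MTU (in the spirit of RFC 4821) without the
 * kernel clamping its packets.
 */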
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * f.e. by something like SCTP.
	 */
	rcu_read_lock();
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .mark = sk->sk_mark,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->inet_saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .flags = inet_sk_flowi_flags(sk),
					    .uli_u = { .ports =
						       { .sport = inet->inet_sport,
							 .dport = inet->inet_dport } } };

			/* If this fails, the retransmit mechanism of the
			 * transport layer will keep trying until the route
			 * appears or the connection times itself out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* The transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}
/*
 *	This IP datagram is too large to be sent in one piece.  Break it up
 *	into smaller pieces (each carrying the IP header plus a block of the
 *	original payload), each small enough to fit in a single device frame,
 *	and queue such frames for sending.
 */
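/*
 * Worked example (added for illustration): with a 1500-byte MTU and a
 * plain 20-byte header (hlen = 20), each fragment can carry 1480 bytes
 * of payload, which is already a multiple of 8 as required for all but
 * the last fragment.  A 4020-byte datagram (4000 bytes of payload) thus
 * becomes three fragments carrying payload offsets 0, 1480 and 2960,
 * encoded in iph->frag_off as 0, 185 and 370 (8-byte units), with IP_MF
 * set on the first two.
 */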
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs, pad;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}
	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it.  First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited.  In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frags(skb)) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			truesizes += frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* for bridged IP traffic encapsulated inside f.e. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	pad = nf_bridge_pad(skb);
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, pad);
	mtu -= pad;

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess.
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's
		 *		   not the last fragment then keep the MF bit
		 *		   set on each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}
EXPORT_SYMBOL(ip_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;

	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}
static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload (UFO), so
	 * create one single skb holding the complete UDP datagram and let
	 * the device split it.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
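/*
 * Added note (illustrative): with UFO the single large skb is handed to
 * the device, which splits it into on-the-wire fragments of gso_size
 * payload bytes each.  For a 1500-byte MTU and a 20-byte IP header,
 * gso_size is 1480, so the hardware emits the same fragments that
 * ip_fragment() would otherwise have built in software.
 */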
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data.  Each piece is held on the socket until
 *	ip_push_pending_frames() is called.  Each piece can be a page or
 *	non-page data.
 *
 *	Besides UDP, other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
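/*
 * Typical call pattern (added; a sketch, not verbatim from any one
 * caller): a corked UDP socket appends pieces and then flushes them as
 * one datagram.
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     flags);
 *	if (!err)
 *		err = ip_push_pending_frames(sk);
 *	else
 *		ip_flush_pending_frames(sk);
 *
 * udp_sendmsg() follows roughly this shape, with the UDP header filled
 * in just before the push.
 */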
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt;
	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		rt = *rtp;
		if (unlikely(!rt))
			return -EFAULT;
		/*
		 * We steal a reference to this route; the caller should
		 * not release it.
		 */
		*rtp = NULL;
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->dst.dev->mtu :
					    dst_mtu(rt->dst.path);
		inet->cork.dst = &rt->dst;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = (struct rtable *)inet->cork.dst;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
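	/*
	 * Worked example (added): for mtu = 1500 and a plain 20-byte header,
	 * fragheaderlen = 20 and maxfraglen = ((1500 - 20) & ~7) + 20 = 1500,
	 * since 1480 is already a multiple of 8.  With 12 bytes of IP
	 * options, fragheaderlen = 32 and maxfraglen = (1468 & ~7) + 32 =
	 * 1496: each non-final fragment's payload is rounded down to an
	 * 8-byte multiple, as the fragment offset encoding requires.
	 */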
	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment, and we
	 * wish it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(&sk->sk_write_queue);

	inet->cork.length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}
	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of skbs;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header has been added.
	 */
	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;

			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea what fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					ipc->shtx.flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			*skb_tx(skb) = ipc->shtx;

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}

	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
static void ip_cork_release(struct inet_sock *inet)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	dst_release(inet->cork.dst);
	inet->cork.dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)inet->cork.dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * the frame generated here to be fragmented.  No matter how
	 * transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on dst refcount
	 */
	inet->cork.dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(inet_sk(sk));
}
/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far.  ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.shtx.flags = 0;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol,
				    .flags = ip_reply_arg_flowi_flags(arg) };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sock_net(sk), &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.
	 *
	 * This chunk is not reentrant, hence the spinlock.  Note that it
	 * relies on the fact that this function is called with BHs locally
	 * disabled and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);