net/ipv4/tcp_ipv4.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Implementation of the Transmission Control Protocol(TCP).
7  *
8  *              IPv4 specific functions
9  *
10  *
11  *              code split from:
12  *              linux/ipv4/tcp.c
13  *              linux/ipv4/tcp_input.c
14  *              linux/ipv4/tcp_output.c
15  *
16  *              See tcp.c for author information
17  *
18  *      This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23
24 /*
25  * Changes:
26  *              David S. Miller :       New socket lookup architecture.
27  *                                      This code is dedicated to John Dyson.
28  *              David S. Miller :       Change semantics of established hash,
29  *                                      half is devoted to TIME_WAIT sockets
30  *                                      and the rest go in the other half.
31  *              Andi Kleen :            Add support for syncookies and fixed
32  *                                      some bugs: ip options weren't passed to
33  *                                      the TCP layer, missed a check for an
34  *                                      ACK bit.
35  *              Andi Kleen :            Implemented fast path mtu discovery.
36  *                                      Fixed many serious bugs in the
37  *                                      request_sock handling and moved
38  *                                      most of it into the af independent code.
39  *                                      Added tail drop and some other bugfixes.
40  *                                      Added new listen semantics.
41  *              Mike McLagan    :       Routing by source
42  *      Juan Jose Ciarlante:            ip_dynaddr bits
43  *              Andi Kleen:             various fixes.
44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
45  *                                      coma.
46  *      Andi Kleen              :       Fix new listen.
47  *      Andi Kleen              :       Fix accept error reporting.
48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
50  *                                      a single port at the same time.
51  */
52
53 #define pr_fmt(fmt) "TCP: " fmt
54
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91
92
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95                                __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104                                           ip_hdr(skb)->saddr,
105                                           tcp_hdr(skb)->dest,
106                                           tcp_hdr(skb)->source);
107 }
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112         struct tcp_sock *tp = tcp_sk(sk);
113
114         /* With PAWS, it is safe from the viewpoint
115            of data integrity. Even without PAWS it is safe provided sequence
116            spaces do not overlap, i.e. at data rates <= 80Mbit/sec.
117
118            Actually, the idea is close to VJ's: the timestamp cache is held
119            not per host but per port pair, and the TW bucket is used as the
120            state holder.
121
122            If the TW bucket has already been destroyed we fall back to VJ's
123            scheme and use the initial timestamp retrieved from the peer table.
124          */
125         if (tcptw->tw_ts_recent_stamp &&
126             (twp == NULL || (sysctl_tcp_tw_reuse &&
127                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129                 if (tp->write_seq == 0)
130                         tp->write_seq = 1;
131                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
132                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133                 sock_hold(sktw);
134                 return 1;
135         }
136
137         return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
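
/* A brief usage sketch: sysctl_tcp_tw_reuse above is the net.ipv4.tcp_tw_reuse
 * knob.  When it is non-zero, the check in tcp_twsk_unique() lets a new
 * outgoing connection take over a TIME-WAIT port pair once the cached
 * timestamp is more than one second old; an administrator would enable it
 * with something like:
 *
 *     echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 */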
140
141 static int tcp_repair_connect(struct sock *sk)
142 {
143         tcp_connect_init(sk);
144         tcp_finish_connect(sk, NULL);
145
146         return 0;
147 }
148
149 /* This will initiate an outgoing connection. */
150 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
151 {
152         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
153         struct inet_sock *inet = inet_sk(sk);
154         struct tcp_sock *tp = tcp_sk(sk);
155         __be16 orig_sport, orig_dport;
156         __be32 daddr, nexthop;
157         struct flowi4 *fl4;
158         struct rtable *rt;
159         int err;
160         struct ip_options_rcu *inet_opt;
161
162         if (addr_len < sizeof(struct sockaddr_in))
163                 return -EINVAL;
164
165         if (usin->sin_family != AF_INET)
166                 return -EAFNOSUPPORT;
167
168         nexthop = daddr = usin->sin_addr.s_addr;
169         inet_opt = rcu_dereference_protected(inet->inet_opt,
170                                              sock_owned_by_user(sk));
171         if (inet_opt && inet_opt->opt.srr) {
172                 if (!daddr)
173                         return -EINVAL;
174                 nexthop = inet_opt->opt.faddr;
175         }
176
177         orig_sport = inet->inet_sport;
178         orig_dport = usin->sin_port;
179         fl4 = &inet->cork.fl.u.ip4;
180         rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
181                               RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
182                               IPPROTO_TCP,
183                               orig_sport, orig_dport, sk, true);
184         if (IS_ERR(rt)) {
185                 err = PTR_ERR(rt);
186                 if (err == -ENETUNREACH)
187                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
188                 return err;
189         }
190
191         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
192                 ip_rt_put(rt);
193                 return -ENETUNREACH;
194         }
195
196         if (!inet_opt || !inet_opt->opt.srr)
197                 daddr = fl4->daddr;
198
199         if (!inet->inet_saddr)
200                 inet->inet_saddr = fl4->saddr;
201         inet->inet_rcv_saddr = inet->inet_saddr;
202
203         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
204                 /* Reset inherited state */
205                 tp->rx_opt.ts_recent       = 0;
206                 tp->rx_opt.ts_recent_stamp = 0;
207                 if (likely(!tp->repair))
208                         tp->write_seq      = 0;
209         }
210
211         if (tcp_death_row.sysctl_tw_recycle &&
212             !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
213                 tcp_fetch_timewait_stamp(sk, &rt->dst);
214
215         inet->inet_dport = usin->sin_port;
216         inet->inet_daddr = daddr;
217
218         inet_csk(sk)->icsk_ext_hdr_len = 0;
219         if (inet_opt)
220                 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
221
222         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
223
224         /* Socket identity is still unknown (sport may be zero).
225          * However, we set the state to SYN-SENT and, without releasing the
226          * socket lock, select a source port, enter ourselves into the hash
227          * tables and complete initialization after this.
228          */
229         tcp_set_state(sk, TCP_SYN_SENT);
230         err = inet_hash_connect(&tcp_death_row, sk);
231         if (err)
232                 goto failure;
233
234         rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
235                                inet->inet_sport, inet->inet_dport, sk);
236         if (IS_ERR(rt)) {
237                 err = PTR_ERR(rt);
238                 rt = NULL;
239                 goto failure;
240         }
241         /* OK, now commit destination to socket.  */
242         sk->sk_gso_type = SKB_GSO_TCPV4;
243         sk_setup_caps(sk, &rt->dst);
244
245         if (!tp->write_seq && likely(!tp->repair))
246                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
247                                                            inet->inet_daddr,
248                                                            inet->inet_sport,
249                                                            usin->sin_port);
250
251         inet->inet_id = tp->write_seq ^ jiffies;
252
253         if (likely(!tp->repair))
254                 err = tcp_connect(sk);
255         else
256                 err = tcp_repair_connect(sk);
257
258         rt = NULL;
259         if (err)
260                 goto failure;
261
262         return 0;
263
264 failure:
265         /*
266          * This unhashes the socket and releases the local port,
267          * if necessary.
268          */
269         tcp_set_state(sk, TCP_CLOSE);
270         ip_rt_put(rt);
271         sk->sk_route_caps = 0;
272         inet->inet_dport = 0;
273         return err;
274 }
275 EXPORT_SYMBOL(tcp_v4_connect);
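
/* A minimal user-space sketch (illustration only) of the call that ends up
 * in tcp_v4_connect(); uaddr/addr_len correspond to the sockaddr and length
 * passed to connect(2), and the source port may still be zero here, in which
 * case inet_hash_connect() picks an ephemeral one:
 *
 *     int fd = socket(AF_INET, SOCK_STREAM, 0);
 *     struct sockaddr_in dst = {
 *             .sin_family = AF_INET,
 *             .sin_port   = htons(80),
 *     };
 *
 *     inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *     connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 */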
276
277 /*
278  * This routine reacts to ICMP_FRAG_NEEDED MTU indications as defined in RFC 1191.
279  * It can be called through tcp_release_cb() if the socket was owned by the user
280  * at the time tcp_v4_err() was called to handle the ICMP message.
281  */
282 static void tcp_v4_mtu_reduced(struct sock *sk)
283 {
284         struct dst_entry *dst;
285         struct inet_sock *inet = inet_sk(sk);
286         u32 mtu = tcp_sk(sk)->mtu_info;
287
288         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
289          * sent out by Linux are always < 576 bytes, so they should go through
290          * unfragmented).
291          */
292         if (sk->sk_state == TCP_LISTEN)
293                 return;
294
295         dst = inet_csk_update_pmtu(sk, mtu);
296         if (!dst)
297                 return;
298
299         /* Something is about to go wrong... Remember the soft error
300          * in case this connection is not able to recover.
301          */
302         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
303                 sk->sk_err_soft = EMSGSIZE;
304
305         mtu = dst_mtu(dst);
306
307         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
308             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
309                 tcp_sync_mss(sk, mtu);
310
311                 /* Resend the TCP packet because it's
312                  * clear that the old packet has been
313                  * dropped. This is the new "fast" path mtu
314                  * discovery.
315                  */
316                 tcp_simple_retransmit(sk);
317         } /* else let the usual retransmit timer handle it */
318 }
319
320 static void do_redirect(struct sk_buff *skb, struct sock *sk)
321 {
322         struct dst_entry *dst = __sk_dst_check(sk, 0);
323
324         if (dst)
325                 dst->ops->redirect(dst, sk, skb);
326 }
327
328 /*
329  * This routine is called by the ICMP module when it gets some
330  * sort of error condition.  If err < 0 then the socket should
331  * be closed and the error returned to the user.  If err > 0
332  * it's just the icmp type << 8 | icmp code.  After adjustment
333  * header points to the first 8 bytes of the tcp header.  We need
334  * to find the appropriate port.
335  *
336  * The locking strategy used here is very "optimistic". When
337  * someone else accesses the socket the ICMP is just dropped
338  * and for some paths there is no check at all.
339  * A more general error queue to queue errors for later handling
340  * is probably better.
341  *
342  */
343
344 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
345 {
346         const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
347         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
348         struct inet_connection_sock *icsk;
349         struct tcp_sock *tp;
350         struct inet_sock *inet;
351         const int type = icmp_hdr(icmp_skb)->type;
352         const int code = icmp_hdr(icmp_skb)->code;
353         struct sock *sk;
354         struct sk_buff *skb;
355         struct request_sock *req;
356         __u32 seq;
357         __u32 remaining;
358         int err;
359         struct net *net = dev_net(icmp_skb->dev);
360
361         if (icmp_skb->len < (iph->ihl << 2) + 8) {
362                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
363                 return;
364         }
365
366         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
367                         iph->saddr, th->source, inet_iif(icmp_skb));
368         if (!sk) {
369                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
370                 return;
371         }
372         if (sk->sk_state == TCP_TIME_WAIT) {
373                 inet_twsk_put(inet_twsk(sk));
374                 return;
375         }
376
377         bh_lock_sock(sk);
378         /* If too many ICMPs get dropped on busy
379          * servers this needs to be solved differently.
380          * We do take care of the PMTU discovery (RFC 1191) special case:
381          * we can receive locally generated ICMP messages while the socket is held.
382          */
383         if (sock_owned_by_user(sk) &&
384             type != ICMP_DEST_UNREACH &&
385             code != ICMP_FRAG_NEEDED)
386                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
387
388         if (sk->sk_state == TCP_CLOSE)
389                 goto out;
390
391         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
392                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
393                 goto out;
394         }
395
396         icsk = inet_csk(sk);
397         tp = tcp_sk(sk);
398         req = tp->fastopen_rsk;
399         seq = ntohl(th->seq);
400         if (sk->sk_state != TCP_LISTEN &&
401             !between(seq, tp->snd_una, tp->snd_nxt) &&
402             (req == NULL || seq != tcp_rsk(req)->snt_isn)) {
403                 /* For a Fast Open socket, allow seq to be snt_isn. */
404                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
405                 goto out;
406         }
407
408         switch (type) {
409         case ICMP_REDIRECT:
410                 do_redirect(icmp_skb, sk);
411                 goto out;
412         case ICMP_SOURCE_QUENCH:
413                 /* Just silently ignore these. */
414                 goto out;
415         case ICMP_PARAMETERPROB:
416                 err = EPROTO;
417                 break;
418         case ICMP_DEST_UNREACH:
419                 if (code > NR_ICMP_UNREACH)
420                         goto out;
421
422                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
423                         tp->mtu_info = info;
424                         if (!sock_owned_by_user(sk)) {
425                                 tcp_v4_mtu_reduced(sk);
426                         } else {
427                                 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
428                                         sock_hold(sk);
429                         }
430                         goto out;
431                 }
432
433                 err = icmp_err_convert[code].errno;
434                 /* check if icmp_skb allows revert of backoff
435                  * (see draft-zimmermann-tcp-lcd) */
436                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
437                         break;
438                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
439                     !icsk->icsk_backoff)
440                         break;
441
442                 /* XXX (TFO) - revisit the following logic for TFO */
443
444                 if (sock_owned_by_user(sk))
445                         break;
446
447                 icsk->icsk_backoff--;
448                 inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
449                         TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
450                 tcp_bound_rto(sk);
451
452                 skb = tcp_write_queue_head(sk);
453                 BUG_ON(!skb);
454
455                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
456                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
457
458                 if (remaining) {
459                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
460                                                   remaining, TCP_RTO_MAX);
461                 } else {
462                         /* RTO revert clocked out retransmission.
463                          * Will retransmit now */
464                         tcp_retransmit_timer(sk);
465                 }
466
467                 break;
468         case ICMP_TIME_EXCEEDED:
469                 err = EHOSTUNREACH;
470                 break;
471         default:
472                 goto out;
473         }
474
475         /* XXX (TFO) - if it's a TFO socket and has been accepted, rather
476          * than following the TCP_SYN_RECV case and closing the socket,
477          * we ignore the ICMP error and keep trying like a fully established
478          * socket. Is this the right thing to do?
479          */
480         if (req && req->sk == NULL)
481                 goto out;
482
483         switch (sk->sk_state) {
484                 struct request_sock *req, **prev;
485         case TCP_LISTEN:
486                 if (sock_owned_by_user(sk))
487                         goto out;
488
489                 req = inet_csk_search_req(sk, &prev, th->dest,
490                                           iph->daddr, iph->saddr);
491                 if (!req)
492                         goto out;
493
494                 /* ICMPs are not backlogged, hence we cannot get
495                    an established socket here.
496                  */
497                 WARN_ON(req->sk);
498
499                 if (seq != tcp_rsk(req)->snt_isn) {
500                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
501                         goto out;
502                 }
503
504                 /*
505                  * Still in SYN_RECV, just remove it silently.
506                  * There is no good way to pass the error to the newly
507                  * created socket, and POSIX does not want network
508                  * errors returned from accept().
509                  */
510                 inet_csk_reqsk_queue_drop(sk, req, prev);
511                 goto out;
512
513         case TCP_SYN_SENT:
514         case TCP_SYN_RECV:  /* Normally cannot happen.
515                                It can, e.g., if SYNs crossed
516                                or with Fast Open.
517                              */
518                 if (!sock_owned_by_user(sk)) {
519                         sk->sk_err = err;
520
521                         sk->sk_error_report(sk);
522
523                         tcp_done(sk);
524                 } else {
525                         sk->sk_err_soft = err;
526                 }
527                 goto out;
528         }
529
530         /* If we've already connected we will keep trying
531          * until we time out, or the user gives up.
532          *
533          * RFC 1122 4.2.3.9 allows only PROTO_UNREACH and PORT_UNREACH
534          * to be considered hard errors (well, FRAG_FAILED too,
535          * but it is obsoleted by PMTU discovery).
536          *
537          * Note that in the modern internet, where routing is unreliable
538          * and broken firewalls sit in every dark corner sending random
539          * errors as ordered by their masters, even these two messages
540          * finally lose their original sense (even Linux sends invalid PORT_UNREACHs).
541          *
542          * Now we are in compliance with RFCs.
543          *                                                      --ANK (980905)
544          */
545
546         inet = inet_sk(sk);
547         if (!sock_owned_by_user(sk) && inet->recverr) {
548                 sk->sk_err = err;
549                 sk->sk_error_report(sk);
550         } else  { /* Only an error on timeout */
551                 sk->sk_err_soft = err;
552         }
553
554 out:
555         bh_unlock_sock(sk);
556         sock_put(sk);
557 }
558
559 static void __tcp_v4_send_check(struct sk_buff *skb,
560                                 __be32 saddr, __be32 daddr)
561 {
562         struct tcphdr *th = tcp_hdr(skb);
563
564         if (skb->ip_summed == CHECKSUM_PARTIAL) {
565                 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
566                 skb->csum_start = skb_transport_header(skb) - skb->head;
567                 skb->csum_offset = offsetof(struct tcphdr, check);
568         } else {
569                 th->check = tcp_v4_check(skb->len, saddr, daddr,
570                                          csum_partial(th,
571                                                       th->doff << 2,
572                                                       skb->csum));
573         }
574 }
575
576 /* This routine computes an IPv4 TCP checksum. */
577 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
578 {
579         const struct inet_sock *inet = inet_sk(sk);
580
581         __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
582 }
583 EXPORT_SYMBOL(tcp_v4_send_check);
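
/* Note on the two branches above: both produce the standard RFC 793 checksum,
 * i.e. the 16-bit one's complement sum over the IPv4 pseudo-header (saddr,
 * daddr, zero, IPPROTO_TCP, TCP length) followed by the TCP header and
 * payload.  With CHECKSUM_PARTIAL only the pseudo-header is folded in here
 * (tcp_v4_check() with a zero base) and the device finishes the sum from
 * csum_start/csum_offset; otherwise csum_partial() walks the header and the
 * accumulated skb checksum in software.
 */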
584
585 int tcp_v4_gso_send_check(struct sk_buff *skb)
586 {
587         const struct iphdr *iph;
588         struct tcphdr *th;
589
590         if (!pskb_may_pull(skb, sizeof(*th)))
591                 return -EINVAL;
592
593         iph = ip_hdr(skb);
594         th = tcp_hdr(skb);
595
596         th->check = 0;
597         skb->ip_summed = CHECKSUM_PARTIAL;
598         __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
599         return 0;
600 }
601
602 /*
603  *      This routine will send an RST to the other tcp.
604  *
605  *      Someone asks: why do we NEVER use socket parameters (TOS, TTL etc.)
606  *                    for the reset?
607  *      Answer: if a packet caused an RST, it is not for a socket
608  *              existing in our system; if it is matched to a socket,
609  *              it is just a duplicate segment or a bug in the other side's TCP.
610  *              So we build the reply based only on the parameters that
611  *              arrived with the segment.
612  *      Exception: precedence violation. We do not implement it in any case.
613  */
614
615 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
616 {
617         const struct tcphdr *th = tcp_hdr(skb);
618         struct {
619                 struct tcphdr th;
620 #ifdef CONFIG_TCP_MD5SIG
621                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
622 #endif
623         } rep;
624         struct ip_reply_arg arg;
625 #ifdef CONFIG_TCP_MD5SIG
626         struct tcp_md5sig_key *key;
627         const __u8 *hash_location = NULL;
628         unsigned char newhash[16];
629         int genhash;
630         struct sock *sk1 = NULL;
631 #endif
632         struct net *net;
633
634         /* Never send a reset in response to a reset. */
635         if (th->rst)
636                 return;
637
638         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
639                 return;
640
641         /* Swap the send and the receive. */
642         memset(&rep, 0, sizeof(rep));
643         rep.th.dest   = th->source;
644         rep.th.source = th->dest;
645         rep.th.doff   = sizeof(struct tcphdr) / 4;
646         rep.th.rst    = 1;
647
648         if (th->ack) {
649                 rep.th.seq = th->ack_seq;
650         } else {
651                 rep.th.ack = 1;
652                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
653                                        skb->len - (th->doff << 2));
654         }
655
656         memset(&arg, 0, sizeof(arg));
657         arg.iov[0].iov_base = (unsigned char *)&rep;
658         arg.iov[0].iov_len  = sizeof(rep.th);
659
660 #ifdef CONFIG_TCP_MD5SIG
661         hash_location = tcp_parse_md5sig_option(th);
662         if (!sk && hash_location) {
663                 /*
664                  * The active side is lost. Try to find the listening socket
665                  * through the source port, then find the md5 key through it.
666                  * We do not lose security here: the incoming packet is
667                  * checked against the md5 hash using the key we find, and
668                  * no RST is generated if the md5 hash doesn't match.
669                  */
670                 sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
671                                              &tcp_hashinfo, ip_hdr(skb)->daddr,
672                                              ntohs(th->source), inet_iif(skb));
673                 /* don't send rst if it can't find key */
674                 if (!sk1)
675                         return;
676                 rcu_read_lock();
677                 key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
678                                         &ip_hdr(skb)->saddr, AF_INET);
679                 if (!key)
680                         goto release_sk1;
681
682                 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
683                 if (genhash || memcmp(hash_location, newhash, 16) != 0)
684                         goto release_sk1;
685         } else {
686                 key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
687                                              &ip_hdr(skb)->saddr,
688                                              AF_INET) : NULL;
689         }
690
691         if (key) {
692                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
693                                    (TCPOPT_NOP << 16) |
694                                    (TCPOPT_MD5SIG << 8) |
695                                    TCPOLEN_MD5SIG);
696                 /* Update length and the length the header thinks exists */
697                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
698                 rep.th.doff = arg.iov[0].iov_len / 4;
699
700                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
701                                      key, ip_hdr(skb)->saddr,
702                                      ip_hdr(skb)->daddr, &rep.th);
703         }
704 #endif
705         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
706                                       ip_hdr(skb)->saddr, /* XXX */
707                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
708         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
709         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
710         /* When the socket is gone, all binding information is lost.
711          * Routing might fail in this case. Use iif for oif to
712          * make sure we can deliver it.
713          */
714         arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb);
715
716         net = dev_net(skb_dst(skb)->dev);
717         arg.tos = ip_hdr(skb)->tos;
718         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
719                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
720
721         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
722         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
723
724 #ifdef CONFIG_TCP_MD5SIG
725 release_sk1:
726         if (sk1) {
727                 rcu_read_unlock();
728                 sock_put(sk1);
729         }
730 #endif
731 }
732
733 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
734    outside of socket context, is certainly ugly. What can I do?
735  */
736
737 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
738                             u32 win, u32 ts, int oif,
739                             struct tcp_md5sig_key *key,
740                             int reply_flags, u8 tos)
741 {
742         const struct tcphdr *th = tcp_hdr(skb);
743         struct {
744                 struct tcphdr th;
745                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
746 #ifdef CONFIG_TCP_MD5SIG
747                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
748 #endif
749                         ];
750         } rep;
751         struct ip_reply_arg arg;
752         struct net *net = dev_net(skb_dst(skb)->dev);
753
754         memset(&rep.th, 0, sizeof(struct tcphdr));
755         memset(&arg, 0, sizeof(arg));
756
757         arg.iov[0].iov_base = (unsigned char *)&rep;
758         arg.iov[0].iov_len  = sizeof(rep.th);
759         if (ts) {
760                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
761                                    (TCPOPT_TIMESTAMP << 8) |
762                                    TCPOLEN_TIMESTAMP);
763                 rep.opt[1] = htonl(tcp_time_stamp);
764                 rep.opt[2] = htonl(ts);
765                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
766         }
767
768         /* Swap the send and the receive. */
769         rep.th.dest    = th->source;
770         rep.th.source  = th->dest;
771         rep.th.doff    = arg.iov[0].iov_len / 4;
772         rep.th.seq     = htonl(seq);
773         rep.th.ack_seq = htonl(ack);
774         rep.th.ack     = 1;
775         rep.th.window  = htons(win);
776
777 #ifdef CONFIG_TCP_MD5SIG
778         if (key) {
779                 int offset = (ts) ? 3 : 0;
780
781                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
782                                           (TCPOPT_NOP << 16) |
783                                           (TCPOPT_MD5SIG << 8) |
784                                           TCPOLEN_MD5SIG);
785                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
786                 rep.th.doff = arg.iov[0].iov_len/4;
787
788                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
789                                     key, ip_hdr(skb)->saddr,
790                                     ip_hdr(skb)->daddr, &rep.th);
791         }
792 #endif
793         arg.flags = reply_flags;
794         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
795                                       ip_hdr(skb)->saddr, /* XXX */
796                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
797         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
798         if (oif)
799                 arg.bound_dev_if = oif;
800         arg.tos = tos;
801         ip_send_unicast_reply(net, skb, ip_hdr(skb)->saddr,
802                               ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len);
803
804         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
805 }
806
807 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
808 {
809         struct inet_timewait_sock *tw = inet_twsk(sk);
810         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
811
812         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
813                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
814                         tcptw->tw_ts_recent,
815                         tw->tw_bound_dev_if,
816                         tcp_twsk_md5_key(tcptw),
817                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
818                         tw->tw_tos
819                         );
820
821         inet_twsk_put(tw);
822 }
823
824 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
825                                   struct request_sock *req)
826 {
827         /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
828          * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
829          */
830         tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
831                         tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
832                         tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
833                         req->ts_recent,
834                         0,
835                         tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
836                                           AF_INET),
837                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
838                         ip_hdr(skb)->tos);
839 }
840
841 /*
842  *      Send a SYN-ACK after having received a SYN.
843  *      This still operates on a request_sock only, not on a big
844  *      socket.
845  */
846 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
847                               struct request_sock *req,
848                               struct request_values *rvp,
849                               u16 queue_mapping,
850                               bool nocache)
851 {
852         const struct inet_request_sock *ireq = inet_rsk(req);
853         struct flowi4 fl4;
854         int err = -1;
855         struct sk_buff *skb;
856
857         /* First, grab a route. */
858         if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
859                 return -1;
860
861         skb = tcp_make_synack(sk, dst, req, rvp, NULL);
862
863         if (skb) {
864                 __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
865
866                 skb_set_queue_mapping(skb, queue_mapping);
867                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
868                                             ireq->rmt_addr,
869                                             ireq->opt);
870                 err = net_xmit_eval(err);
871                 if (!tcp_rsk(req)->snt_synack && !err)
872                         tcp_rsk(req)->snt_synack = tcp_time_stamp;
873         }
874
875         return err;
876 }
877
878 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
879                               struct request_values *rvp)
880 {
881         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
882         return tcp_v4_send_synack(sk, NULL, req, rvp, 0, false);
883 }
884
885 /*
886  *      IPv4 request_sock destructor.
887  */
888 static void tcp_v4_reqsk_destructor(struct request_sock *req)
889 {
890         kfree(inet_rsk(req)->opt);
891 }
892
893 /*
894  * Return true if a syncookie should be sent
895  */
896 bool tcp_syn_flood_action(struct sock *sk,
897                          const struct sk_buff *skb,
898                          const char *proto)
899 {
900         const char *msg = "Dropping request";
901         bool want_cookie = false;
902         struct listen_sock *lopt;
903
904
905
906 #ifdef CONFIG_SYN_COOKIES
907         if (sysctl_tcp_syncookies) {
908                 msg = "Sending cookies";
909                 want_cookie = true;
910                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
911         } else
912 #endif
913                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
914
915         lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
916         if (!lopt->synflood_warned) {
917                 lopt->synflood_warned = 1;
918                 pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
919                         proto, ntohs(tcp_hdr(skb)->dest), msg);
920         }
921         return want_cookie;
922 }
923 EXPORT_SYMBOL(tcp_syn_flood_action);
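
/* A brief usage sketch: the sysctl_tcp_syncookies knob tested above is
 * net.ipv4.tcp_syncookies (only meaningful with CONFIG_SYN_COOKIES).  A host
 * whose listen queue is being flooded logs the message above and, with e.g.
 *
 *     echo 1 > /proc/sys/net/ipv4/tcp_syncookies
 *
 * answers the excess SYNs with cookies instead of simply dropping them.
 */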
924
925 /*
926  * Save and compile IPv4 options into the request_sock if needed.
927  */
928 static struct ip_options_rcu *tcp_v4_save_options(struct sk_buff *skb)
929 {
930         const struct ip_options *opt = &(IPCB(skb)->opt);
931         struct ip_options_rcu *dopt = NULL;
932
933         if (opt && opt->optlen) {
934                 int opt_size = sizeof(*dopt) + opt->optlen;
935
936                 dopt = kmalloc(opt_size, GFP_ATOMIC);
937                 if (dopt) {
938                         if (ip_options_echo(&dopt->opt, skb)) {
939                                 kfree(dopt);
940                                 dopt = NULL;
941                         }
942                 }
943         }
944         return dopt;
945 }
946
947 #ifdef CONFIG_TCP_MD5SIG
948 /*
949  * RFC2385 MD5 checksumming requires a mapping of
950  * IP address->MD5 Key.
951  * We need to maintain these in the sk structure.
952  */
953
954 /* Find the Key structure for an address.  */
955 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
956                                          const union tcp_md5_addr *addr,
957                                          int family)
958 {
959         struct tcp_sock *tp = tcp_sk(sk);
960         struct tcp_md5sig_key *key;
961         struct hlist_node *pos;
962         unsigned int size = sizeof(struct in_addr);
963         struct tcp_md5sig_info *md5sig;
964
965         /* caller either holds rcu_read_lock() or socket lock */
966         md5sig = rcu_dereference_check(tp->md5sig_info,
967                                        sock_owned_by_user(sk) ||
968                                        lockdep_is_held(&sk->sk_lock.slock));
969         if (!md5sig)
970                 return NULL;
971 #if IS_ENABLED(CONFIG_IPV6)
972         if (family == AF_INET6)
973                 size = sizeof(struct in6_addr);
974 #endif
975         hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
976                 if (key->family != family)
977                         continue;
978                 if (!memcmp(&key->addr, addr, size))
979                         return key;
980         }
981         return NULL;
982 }
983 EXPORT_SYMBOL(tcp_md5_do_lookup);
984
985 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
986                                          struct sock *addr_sk)
987 {
988         union tcp_md5_addr *addr;
989
990         addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
991         return tcp_md5_do_lookup(sk, addr, AF_INET);
992 }
993 EXPORT_SYMBOL(tcp_v4_md5_lookup);
994
995 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
996                                                       struct request_sock *req)
997 {
998         union tcp_md5_addr *addr;
999
1000         addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
1001         return tcp_md5_do_lookup(sk, addr, AF_INET);
1002 }
1003
1004 /* This can be called on a newly created socket, from other files */
1005 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1006                    int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
1007 {
1008         /* Add Key to the list */
1009         struct tcp_md5sig_key *key;
1010         struct tcp_sock *tp = tcp_sk(sk);
1011         struct tcp_md5sig_info *md5sig;
1012
1013         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1014         if (key) {
1015                 /* Pre-existing entry - just update that one. */
1016                 memcpy(key->key, newkey, newkeylen);
1017                 key->keylen = newkeylen;
1018                 return 0;
1019         }
1020
1021         md5sig = rcu_dereference_protected(tp->md5sig_info,
1022                                            sock_owned_by_user(sk));
1023         if (!md5sig) {
1024                 md5sig = kmalloc(sizeof(*md5sig), gfp);
1025                 if (!md5sig)
1026                         return -ENOMEM;
1027
1028                 sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1029                 INIT_HLIST_HEAD(&md5sig->head);
1030                 rcu_assign_pointer(tp->md5sig_info, md5sig);
1031         }
1032
1033         key = sock_kmalloc(sk, sizeof(*key), gfp);
1034         if (!key)
1035                 return -ENOMEM;
1036         if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1037                 sock_kfree_s(sk, key, sizeof(*key));
1038                 return -ENOMEM;
1039         }
1040
1041         memcpy(key->key, newkey, newkeylen);
1042         key->keylen = newkeylen;
1043         key->family = family;
1044         memcpy(&key->addr, addr,
1045                (family == AF_INET6) ? sizeof(struct in6_addr) :
1046                                       sizeof(struct in_addr));
1047         hlist_add_head_rcu(&key->node, &md5sig->head);
1048         return 0;
1049 }
1050 EXPORT_SYMBOL(tcp_md5_do_add);
1051
1052 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1053 {
1054         struct tcp_sock *tp = tcp_sk(sk);
1055         struct tcp_md5sig_key *key;
1056         struct tcp_md5sig_info *md5sig;
1057
1058         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&addr, AF_INET);
1059         if (!key)
1060                 return -ENOENT;
1061         hlist_del_rcu(&key->node);
1062         atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1063         kfree_rcu(key, rcu);
1064         md5sig = rcu_dereference_protected(tp->md5sig_info,
1065                                            sock_owned_by_user(sk));
1066         if (hlist_empty(&md5sig->head))
1067                 tcp_free_md5sig_pool();
1068         return 0;
1069 }
1070 EXPORT_SYMBOL(tcp_md5_do_del);
1071
1072 void tcp_clear_md5_list(struct sock *sk)
1073 {
1074         struct tcp_sock *tp = tcp_sk(sk);
1075         struct tcp_md5sig_key *key;
1076         struct hlist_node *pos, *n;
1077         struct tcp_md5sig_info *md5sig;
1078
1079         md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1080
1081         if (!hlist_empty(&md5sig->head))
1082                 tcp_free_md5sig_pool();
1083         hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1084                 hlist_del_rcu(&key->node);
1085                 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1086                 kfree_rcu(key, rcu);
1087         }
1088 }
1089
1090 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1091                                  int optlen)
1092 {
1093         struct tcp_md5sig cmd;
1094         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1095
1096         if (optlen < sizeof(cmd))
1097                 return -EINVAL;
1098
1099         if (copy_from_user(&cmd, optval, sizeof(cmd)))
1100                 return -EFAULT;
1101
1102         if (sin->sin_family != AF_INET)
1103                 return -EINVAL;
1104
1105         if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1106                 return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1107                                       AF_INET);
1108
1109         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1110                 return -EINVAL;
1111
1112         return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1113                               AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1114                               GFP_KERNEL);
1115 }
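
/* A minimal user-space sketch (illustration only) of the TCP_MD5SIG socket
 * option handled above, using struct tcp_md5sig from the uapi <linux/tcp.h>;
 * a zero tcpm_keylen deletes the key for that peer:
 *
 *     struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *     struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *     peer->sin_family = AF_INET;
 *     inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *     memcpy(md5.tcpm_key, "secret", 6);
 *     setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 */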
1116
1117 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1118                                         __be32 daddr, __be32 saddr, int nbytes)
1119 {
1120         struct tcp4_pseudohdr *bp;
1121         struct scatterlist sg;
1122
1123         bp = &hp->md5_blk.ip4;
1124
1125         /*
1126          * 1. the TCP pseudo-header (in the order: source IP address,
1127          * destination IP address, zero-padded protocol number, and
1128          * segment length)
1129          */
1130         bp->saddr = saddr;
1131         bp->daddr = daddr;
1132         bp->pad = 0;
1133         bp->protocol = IPPROTO_TCP;
1134         bp->len = cpu_to_be16(nbytes);
1135
1136         sg_init_one(&sg, bp, sizeof(*bp));
1137         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1138 }
1139
1140 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1141                                __be32 daddr, __be32 saddr, const struct tcphdr *th)
1142 {
1143         struct tcp_md5sig_pool *hp;
1144         struct hash_desc *desc;
1145
1146         hp = tcp_get_md5sig_pool();
1147         if (!hp)
1148                 goto clear_hash_noput;
1149         desc = &hp->md5_desc;
1150
1151         if (crypto_hash_init(desc))
1152                 goto clear_hash;
1153         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1154                 goto clear_hash;
1155         if (tcp_md5_hash_header(hp, th))
1156                 goto clear_hash;
1157         if (tcp_md5_hash_key(hp, key))
1158                 goto clear_hash;
1159         if (crypto_hash_final(desc, md5_hash))
1160                 goto clear_hash;
1161
1162         tcp_put_md5sig_pool();
1163         return 0;
1164
1165 clear_hash:
1166         tcp_put_md5sig_pool();
1167 clear_hash_noput:
1168         memset(md5_hash, 0, 16);
1169         return 1;
1170 }
1171
1172 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1173                         const struct sock *sk, const struct request_sock *req,
1174                         const struct sk_buff *skb)
1175 {
1176         struct tcp_md5sig_pool *hp;
1177         struct hash_desc *desc;
1178         const struct tcphdr *th = tcp_hdr(skb);
1179         __be32 saddr, daddr;
1180
1181         if (sk) {
1182                 saddr = inet_sk(sk)->inet_saddr;
1183                 daddr = inet_sk(sk)->inet_daddr;
1184         } else if (req) {
1185                 saddr = inet_rsk(req)->loc_addr;
1186                 daddr = inet_rsk(req)->rmt_addr;
1187         } else {
1188                 const struct iphdr *iph = ip_hdr(skb);
1189                 saddr = iph->saddr;
1190                 daddr = iph->daddr;
1191         }
1192
1193         hp = tcp_get_md5sig_pool();
1194         if (!hp)
1195                 goto clear_hash_noput;
1196         desc = &hp->md5_desc;
1197
1198         if (crypto_hash_init(desc))
1199                 goto clear_hash;
1200
1201         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1202                 goto clear_hash;
1203         if (tcp_md5_hash_header(hp, th))
1204                 goto clear_hash;
1205         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1206                 goto clear_hash;
1207         if (tcp_md5_hash_key(hp, key))
1208                 goto clear_hash;
1209         if (crypto_hash_final(desc, md5_hash))
1210                 goto clear_hash;
1211
1212         tcp_put_md5sig_pool();
1213         return 0;
1214
1215 clear_hash:
1216         tcp_put_md5sig_pool();
1217 clear_hash_noput:
1218         memset(md5_hash, 0, 16);
1219         return 1;
1220 }
1221 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1222
1223 static bool tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1224 {
1225         /*
1226          * This gets called for each TCP segment that arrives
1227          * so we want to be efficient.
1228          * We have 3 drop cases:
1229          * o No MD5 hash and one expected.
1230          * o MD5 hash and we're not expecting one.
1231          * o MD5 hash and it's wrong.
1232          */
1233         const __u8 *hash_location = NULL;
1234         struct tcp_md5sig_key *hash_expected;
1235         const struct iphdr *iph = ip_hdr(skb);
1236         const struct tcphdr *th = tcp_hdr(skb);
1237         int genhash;
1238         unsigned char newhash[16];
1239
1240         hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1241                                           AF_INET);
1242         hash_location = tcp_parse_md5sig_option(th);
1243
1244         /* We've parsed the options - do we have a hash? */
1245         if (!hash_expected && !hash_location)
1246                 return false;
1247
1248         if (hash_expected && !hash_location) {
1249                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1250                 return true;
1251         }
1252
1253         if (!hash_expected && hash_location) {
1254                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1255                 return true;
1256         }
1257
1258         /* Okay, so this is hash_expected and hash_location -
1259          * so we need to calculate the checksum.
1260          */
1261         genhash = tcp_v4_md5_hash_skb(newhash,
1262                                       hash_expected,
1263                                       NULL, NULL, skb);
1264
1265         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1266                 net_info_ratelimited("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1267                                      &iph->saddr, ntohs(th->source),
1268                                      &iph->daddr, ntohs(th->dest),
1269                                      genhash ? " tcp_v4_calc_md5_hash failed"
1270                                      : "");
1271                 return true;
1272         }
1273         return false;
1274 }
1275
1276 #endif
1277
1278 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1279         .family         =       PF_INET,
1280         .obj_size       =       sizeof(struct tcp_request_sock),
1281         .rtx_syn_ack    =       tcp_v4_rtx_synack,
1282         .send_ack       =       tcp_v4_reqsk_send_ack,
1283         .destructor     =       tcp_v4_reqsk_destructor,
1284         .send_reset     =       tcp_v4_send_reset,
1285         .syn_ack_timeout =      tcp_syn_ack_timeout,
1286 };
1287
1288 #ifdef CONFIG_TCP_MD5SIG
1289 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1290         .md5_lookup     =       tcp_v4_reqsk_md5_lookup,
1291         .calc_md5_hash  =       tcp_v4_md5_hash_skb,
1292 };
1293 #endif
1294
1295 static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb,
1296                                struct request_sock *req,
1297                                struct tcp_fastopen_cookie *foc,
1298                                struct tcp_fastopen_cookie *valid_foc)
1299 {
1300         bool skip_cookie = false;
1301         struct fastopen_queue *fastopenq;
1302
1303         if (likely(!fastopen_cookie_present(foc))) {
1304                 /* See include/net/tcp.h for the meaning of these knobs */
1305                 if ((sysctl_tcp_fastopen & TFO_SERVER_ALWAYS) ||
1306                     ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_REQD) &&
1307                     (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1)))
1308                         skip_cookie = true; /* no cookie to validate */
1309                 else
1310                         return false;
1311         }
1312         fastopenq = inet_csk(sk)->icsk_accept_queue.fastopenq;
1313         /* A FO option is present; bump the counter. */
1314         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVE);
1315
1316         /* Make sure the listener has enabled fastopen, and we don't
1317          * exceed the max # of pending TFO requests allowed before trying
1318          * to validate the cookie in order to avoid burning CPU cycles
1319          * unnecessarily.
1320          *
1321          * XXX (TFO) - The implication of checking the max_qlen before
1322          * processing a cookie request is that clients can't differentiate
1323          * between qlen overflow causing Fast Open to be disabled
1324          * temporarily vs a server not supporting Fast Open at all.
1325          */
1326         if ((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) == 0 ||
1327             fastopenq == NULL || fastopenq->max_qlen == 0)
1328                 return false;
1329
1330         if (fastopenq->qlen >= fastopenq->max_qlen) {
1331                 struct request_sock *req1;
1332                 spin_lock(&fastopenq->lock);
1333                 req1 = fastopenq->rskq_rst_head;
1334                 if ((req1 == NULL) || time_after(req1->expires, jiffies)) {
1335                         spin_unlock(&fastopenq->lock);
1336                         NET_INC_STATS_BH(sock_net(sk),
1337                             LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
1338                         /* Avoid bumping LINUX_MIB_TCPFASTOPENPASSIVEFAIL */
1339                         foc->len = -1;
1340                         return false;
1341                 }
1342                 fastopenq->rskq_rst_head = req1->dl_next;
1343                 fastopenq->qlen--;
1344                 spin_unlock(&fastopenq->lock);
1345                 reqsk_free(req1);
1346         }
1347         if (skip_cookie) {
1348                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1349                 return true;
1350         }
1351         if (foc->len == TCP_FASTOPEN_COOKIE_SIZE) {
1352                 if ((sysctl_tcp_fastopen & TFO_SERVER_COOKIE_NOT_CHKED) == 0) {
1353                         tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1354                         if ((valid_foc->len != TCP_FASTOPEN_COOKIE_SIZE) ||
1355                             memcmp(&foc->val[0], &valid_foc->val[0],
1356                             TCP_FASTOPEN_COOKIE_SIZE) != 0)
1357                                 return false;
1358                         valid_foc->len = -1;
1359                 }
1360                 /* Acknowledge the data received from the peer. */
1361                 tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1362                 return true;
1363         } else if (foc->len == 0) { /* Client requesting a cookie */
1364                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1365                 NET_INC_STATS_BH(sock_net(sk),
1366                     LINUX_MIB_TCPFASTOPENCOOKIEREQD);
1367         } else {
1368                 /* Client sent a cookie with the wrong size. Treat it
1369                  * the same as invalid and return a valid one.
1370                  */
1371                 tcp_fastopen_cookie_gen(ip_hdr(skb)->saddr, valid_foc);
1372         }
1373         return false;
1374 }
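
/* A brief usage sketch for the gates checked above (illustration only):
 * server-side Fast Open needs the TFO_SERVER_ENABLE bit (0x2) in
 * net.ipv4.tcp_fastopen and a non-zero queue length on the listening
 * socket, e.g.
 *
 *     echo 2 > /proc/sys/net/ipv4/tcp_fastopen
 *
 * and in the server program, on the listening socket:
 *
 *     int qlen = 16;
 *     setsockopt(fd, SOL_TCP, TCP_FASTOPEN, &qlen, sizeof(qlen));
 *
 * Otherwise the function returns false and the SYN is processed as an
 * ordinary (non-TFO) connection request.
 */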
1375
1376 static int tcp_v4_conn_req_fastopen(struct sock *sk,
1377                                     struct sk_buff *skb,
1378                                     struct sk_buff *skb_synack,
1379                                     struct request_sock *req,
1380                                     struct request_values *rvp)
1381 {
1382         struct tcp_sock *tp = tcp_sk(sk);
1383         struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
1384         const struct inet_request_sock *ireq = inet_rsk(req);
1385         struct sock *child;
1386         int err;
1387
1388         req->retrans = 0;
1389         req->sk = NULL;
1390
1391         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
1392         if (child == NULL) {
1393                 NET_INC_STATS_BH(sock_net(sk),
1394                                  LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1395                 kfree_skb(skb_synack);
1396                 return -1;
1397         }
1398         err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1399                                     ireq->rmt_addr, ireq->opt);
1400         err = net_xmit_eval(err);
1401         if (!err)
1402                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1403         /* XXX (TFO) - is it ok to ignore error and continue? */
1404
1405         spin_lock(&queue->fastopenq->lock);
1406         queue->fastopenq->qlen++;
1407         spin_unlock(&queue->fastopenq->lock);
1408
1409         /* Initialize the child socket. Some values have to be fixed up to
1410          * take into account that the child is a Fast Open socket, created
1411          * only out of the bits carried in the SYN packet.
1412          */
1413         tp = tcp_sk(child);
1414
1415         tp->fastopen_rsk = req;
1416         /* Hold a reference on the listener sk so that if the listener is
1417          * being closed, the child that has been accepted can live on and
1418          * still access listen_lock.
1419          */
1420         sock_hold(sk);
1421         tcp_rsk(req)->listener = sk;
1422
1423         /* RFC1323: The window in SYN & SYN/ACK segments is never
1424          * scaled. So correct it appropriately.
1425          */
1426         tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
1427
1428         /* Activate the retrans timer so that SYNACK can be retransmitted.
1429          * The request socket is not added to the SYN table of the parent
1430          * because it's been added to the accept queue directly.
1431          */
1432         inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
1433             TCP_TIMEOUT_INIT, TCP_RTO_MAX);
1434
1435         /* Add the child socket directly into the accept queue */
1436         inet_csk_reqsk_queue_add(sk, req, child);
1437
1438         /* Now finish processing the fastopen child socket. */
1439         inet_csk(child)->icsk_af_ops->rebuild_header(child);
1440         tcp_init_congestion_control(child);
1441         tcp_mtup_init(child);
1442         tcp_init_buffer_space(child);
1443         tcp_init_metrics(child);
1444
1445         /* Queue the data carried in the SYN packet. We need to first
1446          * bump skb's refcnt because the caller will attempt to free it.
1447          *
1448          * XXX (TFO) - we honor a zero-payload TFO request for now.
1449          * (Any reason not to?)
1450          */
1451         if (TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq + 1) {
1452                 /* Don't queue the skb if there is no payload in SYN.
1453                  * XXX (TFO) - How about SYN+FIN?
1454                  */
1455                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1456         } else {
1457                 skb = skb_get(skb);
1458                 skb_dst_drop(skb);
1459                 __skb_pull(skb, tcp_hdr(skb)->doff * 4);
1460                 skb_set_owner_r(skb, child);
1461                 __skb_queue_tail(&child->sk_receive_queue, skb);
1462                 tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
1463         }
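             /* Wake up the listener so a blocked accept() can pick up the
              * child, which already sits in the accept queue.
              */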
1464         sk->sk_data_ready(sk, 0);
1465         bh_unlock_sock(child);
1466         sock_put(child);
1467         WARN_ON(req->sk == NULL);
1468         return 0;
1469 }
1470
1471 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1472 {
1473         struct tcp_extend_values tmp_ext;
1474         struct tcp_options_received tmp_opt;
1475         const u8 *hash_location;
1476         struct request_sock *req;
1477         struct inet_request_sock *ireq;
1478         struct tcp_sock *tp = tcp_sk(sk);
1479         struct dst_entry *dst = NULL;
1480         __be32 saddr = ip_hdr(skb)->saddr;
1481         __be32 daddr = ip_hdr(skb)->daddr;
1482         __u32 isn = TCP_SKB_CB(skb)->when;
1483         bool want_cookie = false;
1484         struct flowi4 fl4;
1485         struct tcp_fastopen_cookie foc = { .len = -1 };
1486         struct tcp_fastopen_cookie valid_foc = { .len = -1 };
1487         struct sk_buff *skb_synack;
1488         int do_fastopen;
1489
1490         /* Never answer SYNs sent to broadcast or multicast addresses */
1491         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1492                 goto drop;
1493
1494         /* TW buckets are converted to open requests without
1495          * limitation: they conserve resources and the peer is
1496          * evidently a real one.
1497          */
1498         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1499                 want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1500                 if (!want_cookie)
1501                         goto drop;
1502         }
1503
1504         /* Accept backlog is full. If we have already queued enough
1505          * warm entries in the SYN queue, drop the request. That is better
1506          * than clogging the SYN queue with openreqs with exponentially
1507          * increasing timeouts.
1508          */
1509         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1510                 goto drop;
1511
1512         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1513         if (!req)
1514                 goto drop;
1515
1516 #ifdef CONFIG_TCP_MD5SIG
1517         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1518 #endif
1519
1520         tcp_clear_options(&tmp_opt);
1521         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1522         tmp_opt.user_mss  = tp->rx_opt.user_mss;
1523         tcp_parse_options(skb, &tmp_opt, &hash_location, 0,
1524             want_cookie ? NULL : &foc);
1525
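             /* TCP Cookie Transactions (experimental TCPCT option): mix the
              * connection's addresses and the initiator's cookie into the
              * cookie bakery used to build the cookie option in the SYN-ACK.
              */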
1526         if (tmp_opt.cookie_plus > 0 &&
1527             tmp_opt.saw_tstamp &&
1528             !tp->rx_opt.cookie_out_never &&
1529             (sysctl_tcp_cookie_size > 0 ||
1530              (tp->cookie_values != NULL &&
1531               tp->cookie_values->cookie_desired > 0))) {
1532                 u8 *c;
1533                 u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1534                 int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1535
1536                 if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1537                         goto drop_and_release;
1538
1539                 /* Secret recipe starts with IP addresses */
1540                 *mess++ ^= (__force u32)daddr;
1541                 *mess++ ^= (__force u32)saddr;
1542
1543                 /* plus variable length Initiator Cookie */
1544                 c = (u8 *)mess;
1545                 while (l-- > 0)
1546                         *c++ ^= *hash_location++;
1547
1548                 want_cookie = false;    /* not our kind of cookie */
1549                 tmp_ext.cookie_out_never = 0; /* false */
1550                 tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1551         } else if (!tp->rx_opt.cookie_in_always) {
1552                 /* redundant indications, but ensure initialization. */
1553                 tmp_ext.cookie_out_never = 1; /* true */
1554                 tmp_ext.cookie_plus = 0;
1555         } else {
1556                 goto drop_and_release;
1557         }
1558         tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1559
1560         if (want_cookie && !tmp_opt.saw_tstamp)
1561                 tcp_clear_options(&tmp_opt);
1562
1563         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1564         tcp_openreq_init(req, &tmp_opt, skb);
1565
1566         ireq = inet_rsk(req);
1567         ireq->loc_addr = daddr;
1568         ireq->rmt_addr = saddr;
1569         ireq->no_srccheck = inet_sk(sk)->transparent;
1570         ireq->opt = tcp_v4_save_options(skb);
1571
1572         if (security_inet_conn_request(sk, skb, req))
1573                 goto drop_and_free;
1574
1575         if (!want_cookie || tmp_opt.tstamp_ok)
1576                 TCP_ECN_create_request(req, skb);
1577
1578         if (want_cookie) {
1579                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1580                 req->cookie_ts = tmp_opt.tstamp_ok;
1581         } else if (!isn) {
1582                 /* VJ's idea. We save the last timestamp seen from the
1583                  * destination in the peer table when entering TIME-WAIT
1584                  * state, and check against it before accepting a new
1585                  * connection request.
1586                  *
1587                  * If "isn" is not zero, this request hit an alive
1588                  * timewait bucket, so all the necessary checks have
1589                  * already been made while processing the timewait state.
1590                  */
1591                 if (tmp_opt.saw_tstamp &&
1592                     tcp_death_row.sysctl_tw_recycle &&
1593                     (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1594                     fl4.daddr == saddr) {
1595                         if (!tcp_peer_is_proven(req, dst, true)) {
1596                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1597                                 goto drop_and_release;
1598                         }
1599                 }
1600                 /* Kill the following clause if you dislike this approach. */
1601                 else if (!sysctl_tcp_syncookies &&
1602                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1603                           (sysctl_max_syn_backlog >> 2)) &&
1604                          !tcp_peer_is_proven(req, dst, false)) {
1605                         /* Without syncookies the last quarter of the
1606                          * backlog is filled with destinations proven
1607                          * to be alive.
1608                          * This means we keep communicating with
1609                          * destinations that were already remembered
1610                          * by the moment of the synflood.
1611                          */
1612                         LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1613                                        &saddr, ntohs(tcp_hdr(skb)->source));
1614                         goto drop_and_release;
1615                 }
1616
1617                 isn = tcp_v4_init_sequence(skb);
1618         }
1619         tcp_rsk(req)->snt_isn = isn;
1620
1621         if (dst == NULL) {
1622                 dst = inet_csk_route_req(sk, &fl4, req);
1623                 if (dst == NULL)
1624                         goto drop_and_free;
1625         }
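             /* Validate any Fast Open cookie carried in the SYN (or generate
              * one to return in valid_foc); the result tells us whether the
              * child socket should be created right away.
              */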
1626         do_fastopen = tcp_fastopen_check(sk, skb, req, &foc, &valid_foc);
1627
1628         /* We don't call tcp_v4_send_synack() directly because we need
1629          * to make sure a child socket can be created successfully before
1630          * sending back the SYN-ACK!
1631          *
1632          * XXX (TFO) - Ideally one would simply call tcp_v4_send_synack()
1633          * (or better yet, call tcp_send_synack() in the child context
1634          * directly, but a bunch of other code will have to be fixed first)
1635          * after syn_recv_sock(), except one will first need to fix the
1636          * latter to remove its dependency on the current implementation
1637          * of tcp_v4_send_synack()->tcp_select_initial_window().
1638          */
1639         skb_synack = tcp_make_synack(sk, dst, req,
1640             (struct request_values *)&tmp_ext,
1641             fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL);
1642
1643         if (skb_synack) {
1644                 __tcp_v4_send_check(skb_synack, ireq->loc_addr, ireq->rmt_addr);
1645                 skb_set_queue_mapping(skb_synack, skb_get_queue_mapping(skb));
1646         } else
1647                 goto drop_and_free;
1648
1649         if (likely(!do_fastopen)) {
1650                 int err;
1651                 err = ip_build_and_send_pkt(skb_synack, sk, ireq->loc_addr,
1652                      ireq->rmt_addr, ireq->opt);
1653                 err = net_xmit_eval(err);
1654                 if (err || want_cookie)
1655                         goto drop_and_free;
1656
1657                 tcp_rsk(req)->snt_synack = tcp_time_stamp;
1658                 tcp_rsk(req)->listener = NULL;
1659                 /* Add the request_sock to the SYN table */
1660                 inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1661                 if (fastopen_cookie_present(&foc) && foc.len != 0)
1662                         NET_INC_STATS_BH(sock_net(sk),
1663                             LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
1664         } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req,
1665             (struct request_values *)&tmp_ext))
1666                 goto drop_and_free;
1667
1668         return 0;
1669
1670 drop_and_release:
1671         dst_release(dst);
1672 drop_and_free:
1673         reqsk_free(req);
1674 drop:
1675         return 0;
1676 }
1677 EXPORT_SYMBOL(tcp_v4_conn_request);
1678
1679
1680 /*
1681  * The three-way handshake has completed - we received a valid ACK -
1682  * now create the new socket.
1683  */
1684 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1685                                   struct request_sock *req,
1686                                   struct dst_entry *dst)
1687 {
1688         struct inet_request_sock *ireq;
1689         struct inet_sock *newinet;
1690         struct tcp_sock *newtp;
1691         struct sock *newsk;
1692 #ifdef CONFIG_TCP_MD5SIG
1693         struct tcp_md5sig_key *key;
1694 #endif
1695         struct ip_options_rcu *inet_opt;
1696
1697         if (sk_acceptq_is_full(sk))
1698                 goto exit_overflow;
1699
1700         newsk = tcp_create_openreq_child(sk, req, skb);
1701         if (!newsk)
1702                 goto exit_nonewsk;
1703
1704         newsk->sk_gso_type = SKB_GSO_TCPV4;
1705         inet_sk_rx_dst_set(newsk, skb);
1706
1707         newtp                 = tcp_sk(newsk);
1708         newinet               = inet_sk(newsk);
1709         ireq                  = inet_rsk(req);
1710         newinet->inet_daddr   = ireq->rmt_addr;
1711         newinet->inet_rcv_saddr = ireq->loc_addr;
1712         newinet->inet_saddr           = ireq->loc_addr;
1713         inet_opt              = ireq->opt;
1714         rcu_assign_pointer(newinet->inet_opt, inet_opt);
1715         ireq->opt             = NULL;
1716         newinet->mc_index     = inet_iif(skb);
1717         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1718         newinet->rcv_tos      = ip_hdr(skb)->tos;
1719         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1720         if (inet_opt)
1721                 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1722         newinet->inet_id = newtp->write_seq ^ jiffies;
1723
1724         if (!dst) {
1725                 dst = inet_csk_route_child_sock(sk, newsk, req);
1726                 if (!dst)
1727                         goto put_and_exit;
1728         } else {
1729                 /* syncookie case : see end of cookie_v4_check() */
1730         }
1731         sk_setup_caps(newsk, dst);
1732
1733         tcp_mtup_init(newsk);
1734         tcp_sync_mss(newsk, dst_mtu(dst));
1735         newtp->advmss = dst_metric_advmss(dst);
1736         if (tcp_sk(sk)->rx_opt.user_mss &&
1737             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1738                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1739
1740         tcp_initialize_rcv_mss(newsk);
1741         tcp_synack_rtt_meas(newsk, req);
1742         newtp->total_retrans = req->retrans;
1743
1744 #ifdef CONFIG_TCP_MD5SIG
1745         /* Copy over the MD5 key from the original socket */
1746         key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1747                                 AF_INET);
1748         if (key != NULL) {
1749                 /*
1750                  * We're using one, so create a matching key
1751                  * on the newsk structure. If we fail to get
1752                  * memory, then we end up not copying the key
1753                  * across. Shucks.
1754                  */
1755                 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1756                                AF_INET, key->key, key->keylen, GFP_ATOMIC);
1757                 sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1758         }
1759 #endif
1760
1761         if (__inet_inherit_port(sk, newsk) < 0)
1762                 goto put_and_exit;
1763         __inet_hash_nolisten(newsk, NULL);
1764
1765         return newsk;
1766
1767 exit_overflow:
1768         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1769 exit_nonewsk:
1770         dst_release(dst);
1771 exit:
1772         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1773         return NULL;
1774 put_and_exit:
1775         tcp_clear_xmit_timers(newsk);
1776         tcp_cleanup_congestion_control(newsk);
1777         bh_unlock_sock(newsk);
1778         sock_put(newsk);
1779         goto exit;
1780 }
1781 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1782
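     /* For a segment arriving on a listening socket: first look for a
      * matching open request (possibly completing the handshake via
      * tcp_check_req()), then for an already established connection with the
      * same 4-tuple, and finally give syncookies a chance on bare ACKs.
      */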
1783 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1784 {
1785         struct tcphdr *th = tcp_hdr(skb);
1786         const struct iphdr *iph = ip_hdr(skb);
1787         struct sock *nsk;
1788         struct request_sock **prev;
1789         /* Find possible connection requests. */
1790         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1791                                                        iph->saddr, iph->daddr);
1792         if (req)
1793                 return tcp_check_req(sk, skb, req, prev, false);
1794
1795         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1796                         th->source, iph->daddr, th->dest, inet_iif(skb));
1797
1798         if (nsk) {
1799                 if (nsk->sk_state != TCP_TIME_WAIT) {
1800                         bh_lock_sock(nsk);
1801                         return nsk;
1802                 }
1803                 inet_twsk_put(inet_twsk(nsk));
1804                 return NULL;
1805         }
1806
1807 #ifdef CONFIG_SYN_COOKIES
1808         if (!th->syn)
1809                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1810 #endif
1811         return sk;
1812 }
1813
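     /* Validate the TCP checksum as cheaply as possible: if the device
      * supplied CHECKSUM_COMPLETE, verify it now; otherwise seed skb->csum
      * with the pseudo-header and defer the full check, except for short
      * packets which are verified immediately.
      */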
1814 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1815 {
1816         const struct iphdr *iph = ip_hdr(skb);
1817
1818         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1819                 if (!tcp_v4_check(skb->len, iph->saddr,
1820                                   iph->daddr, skb->csum)) {
1821                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1822                         return 0;
1823                 }
1824         }
1825
1826         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1827                                        skb->len, IPPROTO_TCP, 0);
1828
1829         if (skb->len <= 76) {
1830                 return __skb_checksum_complete(skb);
1831         }
1832         return 0;
1833 }
1834
1835
1836 /* The socket must have its spinlock held when we get
1837  * here.
1838  *
1839  * We have a potential double-lock case here, so even when
1840  * doing backlog processing we use the BH locking scheme.
1841  * This is because we cannot sleep with the original spinlock
1842  * held.
1843  */
1844 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1845 {
1846         struct sock *rsk;
1847 #ifdef CONFIG_TCP_MD5SIG
1848         /*
1849          * We really want to reject the packet as early as possible
1850          * if:
1851          *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1852          *  o There is an MD5 option and we're not expecting one
1853          */
1854         if (tcp_v4_inbound_md5_hash(sk, skb))
1855                 goto discard;
1856 #endif
1857
1858         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1859                 struct dst_entry *dst = sk->sk_rx_dst;
1860
1861                 sock_rps_save_rxhash(sk, skb);
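                     /* Invalidate the cached rx route if the packet arrived
                      * on a different interface or the dst entry is no
                      * longer valid.
                      */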
1862                 if (dst) {
1863                         if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1864                             dst->ops->check(dst, 0) == NULL) {
1865                                 dst_release(dst);
1866                                 sk->sk_rx_dst = NULL;
1867                         }
1868                 }
1869                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1870                         rsk = sk;
1871                         goto reset;
1872                 }
1873                 return 0;
1874         }
1875
1876         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1877                 goto csum_err;
1878
1879         if (sk->sk_state == TCP_LISTEN) {
1880                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1881                 if (!nsk)
1882                         goto discard;
1883
1884                 if (nsk != sk) {
1885                         sock_rps_save_rxhash(nsk, skb);
1886                         if (tcp_child_process(sk, nsk, skb)) {
1887                                 rsk = nsk;
1888                                 goto reset;
1889                         }
1890                         return 0;
1891                 }
1892         } else
1893                 sock_rps_save_rxhash(sk, skb);
1894
1895         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1896                 rsk = sk;
1897                 goto reset;
1898         }
1899         return 0;
1900
1901 reset:
1902         tcp_v4_send_reset(rsk, skb);
1903 discard:
1904         kfree_skb(skb);
1905         /* Be careful here. If this function gets more complicated and
1906          * gcc suffers from register pressure on the x86, sk (in %ebx)
1907          * might be destroyed here. This current version compiles correctly,
1908          * but you have been warned.
1909          */
1910         return 0;
1911
1912 csum_err:
1913         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1914         goto discard;
1915 }
1916 EXPORT_SYMBOL(tcp_v4_do_rcv);
1917
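     /* Early demux: on the receive fast path, try to match the segment to an
      * established socket before routing, so the socket's cached rx dst can
      * be used instead of a full route lookup.
      */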
1918 void tcp_v4_early_demux(struct sk_buff *skb)
1919 {
1920         struct net *net = dev_net(skb->dev);
1921         const struct iphdr *iph;
1922         const struct tcphdr *th;
1923         struct sock *sk;
1924
1925         if (skb->pkt_type != PACKET_HOST)
1926                 return;
1927
1928         if (!pskb_may_pull(skb, ip_hdrlen(skb) + sizeof(struct tcphdr)))
1929                 return;
1930
1931         iph = ip_hdr(skb);
1932         th = (struct tcphdr *) ((char *)iph + ip_hdrlen(skb));
1933
1934         if (th->doff < sizeof(struct tcphdr) / 4)
1935                 return;
1936
1937         sk = __inet_lookup_established(net, &tcp_hashinfo,
1938                                        iph->saddr, th->source,
1939                                        iph->daddr, ntohs(th->dest),
1940                                        skb->skb_iif);
1941         if (sk) {
1942                 skb->sk = sk;
1943                 skb->destructor = sock_edemux;
1944                 if (sk->sk_state != TCP_TIME_WAIT) {
1945                         struct dst_entry *dst = sk->sk_rx_dst;
1946
1947                         if (dst)
1948                                 dst = dst_check(dst, 0);
1949                         if (dst &&
1950                             inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1951                                 skb_dst_set_noref(skb, dst);
1952                 }
1953         }
1954 }
1955
1956 /*
1957  *      From tcp_input.c
1958  */
1959
1960 int tcp_v4_rcv(struct sk_buff *skb)
1961 {
1962         const struct iphdr *iph;
1963         const struct tcphdr *th;
1964         struct sock *sk;
1965         int ret;
1966         struct net *net = dev_net(skb->dev);
1967
1968         if (skb->pkt_type != PACKET_HOST)
1969                 goto discard_it;
1970
1971         /* Count it even if it's bad */
1972         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1973
1974         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1975                 goto discard_it;
1976
1977         th = tcp_hdr(skb);
1978
1979         if (th->doff < sizeof(struct tcphdr) / 4)
1980                 goto bad_packet;
1981         if (!pskb_may_pull(skb, th->doff * 4))
1982                 goto discard_it;
1983
1984         /* An explanation is required here, I think.
1985          * Packet length and doff are validated by header prediction,
1986          * provided the case of th->doff==0 is eliminated.
1987          * So, we defer the checks. */
1988         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1989                 goto bad_packet;
1990
1991         th = tcp_hdr(skb);
1992         iph = ip_hdr(skb);
1993         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1994         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1995                                     skb->len - th->doff * 4);
1996         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1997         TCP_SKB_CB(skb)->when    = 0;
1998         TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1999         TCP_SKB_CB(skb)->sacked  = 0;
2000
2001         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
2002         if (!sk)
2003                 goto no_tcp_socket;
2004
2005 process:
2006         if (sk->sk_state == TCP_TIME_WAIT)
2007                 goto do_time_wait;
2008
2009         if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
2010                 NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
2011                 goto discard_and_relse;
2012         }
2013
2014         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
2015                 goto discard_and_relse;
2016         nf_reset(skb);
2017
2018         if (sk_filter(sk, skb))
2019                 goto discard_and_relse;
2020
2021         skb->dev = NULL;
2022
2023         bh_lock_sock_nested(sk);
2024         ret = 0;
2025         if (!sock_owned_by_user(sk)) {
2026 #ifdef CONFIG_NET_DMA
2027                 struct tcp_sock *tp = tcp_sk(sk);
2028                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
2029                         tp->ucopy.dma_chan = net_dma_find_channel();
2030                 if (tp->ucopy.dma_chan)
2031                         ret = tcp_v4_do_rcv(sk, skb);
2032                 else
2033 #endif
2034                 {
2035                         if (!tcp_prequeue(sk, skb))
2036                                 ret = tcp_v4_do_rcv(sk, skb);
2037                 }
2038         } else if (unlikely(sk_add_backlog(sk, skb,
2039                                            sk->sk_rcvbuf + sk->sk_sndbuf))) {
2040                 bh_unlock_sock(sk);
2041                 NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
2042                 goto discard_and_relse;
2043         }
2044         bh_unlock_sock(sk);
2045
2046         sock_put(sk);
2047
2048         return ret;
2049
2050 no_tcp_socket:
2051         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2052                 goto discard_it;
2053
2054         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2055 bad_packet:
2056                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2057         } else {
2058                 tcp_v4_send_reset(NULL, skb);
2059         }
2060
2061 discard_it:
2062         /* Discard frame. */
2063         kfree_skb(skb);
2064         return 0;
2065
2066 discard_and_relse:
2067         sock_put(sk);
2068         goto discard_it;
2069
2070 do_time_wait:
2071         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2072                 inet_twsk_put(inet_twsk(sk));
2073                 goto discard_it;
2074         }
2075
2076         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
2077                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
2078                 inet_twsk_put(inet_twsk(sk));
2079                 goto discard_it;
2080         }
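             /* Let the timewait state machine decide: a valid new SYN may
              * reopen the connection on a listener, otherwise we ACK it,
              * answer with a RST, or silently drop the segment.
              */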
2081         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2082         case TCP_TW_SYN: {
2083                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2084                                                         &tcp_hashinfo,
2085                                                         iph->daddr, th->dest,
2086                                                         inet_iif(skb));
2087                 if (sk2) {
2088                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
2089                         inet_twsk_put(inet_twsk(sk));
2090                         sk = sk2;
2091                         goto process;
2092                 }
2093                 /* Fall through to ACK */
2094         }
2095         case TCP_TW_ACK:
2096                 tcp_v4_timewait_ack(sk, skb);
2097                 break;
2098         case TCP_TW_RST:
2099                 goto no_tcp_socket;
2100         case TCP_TW_SUCCESS:;
2101         }
2102         goto discard_it;
2103 }
2104
2105 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2106         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
2107         .twsk_unique    = tcp_twsk_unique,
2108         .twsk_destructor= tcp_twsk_destructor,
2109 };
2110
2111 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2112 {
2113         struct dst_entry *dst = skb_dst(skb);
2114
2115         dst_hold(dst);
2116         sk->sk_rx_dst = dst;
2117         inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
2118 }
2119 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2120
2121 const struct inet_connection_sock_af_ops ipv4_specific = {
2122         .queue_xmit        = ip_queue_xmit,
2123         .send_check        = tcp_v4_send_check,
2124         .rebuild_header    = inet_sk_rebuild_header,
2125         .sk_rx_dst_set     = inet_sk_rx_dst_set,
2126         .conn_request      = tcp_v4_conn_request,
2127         .syn_recv_sock     = tcp_v4_syn_recv_sock,
2128         .net_header_len    = sizeof(struct iphdr),
2129         .setsockopt        = ip_setsockopt,
2130         .getsockopt        = ip_getsockopt,
2131         .addr2sockaddr     = inet_csk_addr2sockaddr,
2132         .sockaddr_len      = sizeof(struct sockaddr_in),
2133         .bind_conflict     = inet_csk_bind_conflict,
2134 #ifdef CONFIG_COMPAT
2135         .compat_setsockopt = compat_ip_setsockopt,
2136         .compat_getsockopt = compat_ip_getsockopt,
2137 #endif
2138 };
2139 EXPORT_SYMBOL(ipv4_specific);
2140
2141 #ifdef CONFIG_TCP_MD5SIG
2142 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2143         .md5_lookup             = tcp_v4_md5_lookup,
2144         .calc_md5_hash          = tcp_v4_md5_hash_skb,
2145         .md5_parse              = tcp_v4_parse_md5_keys,
2146 };
2147 #endif
2148
2149 /* NOTE: A lot of things are set to zero explicitly by the call to
2150  *       sk_alloc(), so they need not be done here.
2151  */
2152 static int tcp_v4_init_sock(struct sock *sk)
2153 {
2154         struct inet_connection_sock *icsk = inet_csk(sk);
2155
2156         tcp_init_sock(sk);
2157
2158         icsk->icsk_af_ops = &ipv4_specific;
2159
2160 #ifdef CONFIG_TCP_MD5SIG
2161         tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2162 #endif
2163
2164         return 0;
2165 }
2166
2167 void tcp_v4_destroy_sock(struct sock *sk)
2168 {
2169         struct tcp_sock *tp = tcp_sk(sk);
2170
2171         tcp_clear_xmit_timers(sk);
2172
2173         tcp_cleanup_congestion_control(sk);
2174
2175         /* Clean up the write buffer. */
2176         tcp_write_queue_purge(sk);
2177
2178         /* Cleans up our, hopefully empty, out_of_order_queue. */
2179         __skb_queue_purge(&tp->out_of_order_queue);
2180
2181 #ifdef CONFIG_TCP_MD5SIG
2182         /* Clean up the MD5 key list, if any */
2183         if (tp->md5sig_info) {
2184                 tcp_clear_md5_list(sk);
2185                 kfree_rcu(tp->md5sig_info, rcu);
2186                 tp->md5sig_info = NULL;
2187         }
2188 #endif
2189
2190 #ifdef CONFIG_NET_DMA
2191         /* Cleans up our sk_async_wait_queue */
2192         __skb_queue_purge(&sk->sk_async_wait_queue);
2193 #endif
2194
2195         /* Clean up the prequeue; it really must be empty */
2196         __skb_queue_purge(&tp->ucopy.prequeue);
2197
2198         /* Clean up a referenced TCP bind bucket. */
2199         if (inet_csk(sk)->icsk_bind_hash)
2200                 inet_put_port(sk);
2201
2202         /* TCP Cookie Transactions */
2203         if (tp->cookie_values != NULL) {
2204                 kref_put(&tp->cookie_values->kref,
2205                          tcp_cookie_values_release);
2206                 tp->cookie_values = NULL;
2207         }
2208         BUG_ON(tp->fastopen_rsk != NULL);
2209
2210         /* If the socket is aborted during a connect operation */
2211         tcp_free_fastopen_req(tp);
2212
2213         sk_sockets_allocated_dec(sk);
2214         sock_release_memcg(sk);
2215 }
2216 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2217
2218 #ifdef CONFIG_PROC_FS
2219 /* Proc filesystem TCP sock list dumping. */
2220
2221 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
2222 {
2223         return hlist_nulls_empty(head) ? NULL :
2224                 list_entry(head->first, struct inet_timewait_sock, tw_node);
2225 }
2226
2227 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2228 {
2229         return !is_a_nulls(tw->tw_node.next) ?
2230                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2231 }
2232
2233 /*
2234  * Get the next listener socket following cur.  If cur is NULL, get the
2235  * first socket starting from the bucket given in st->bucket; when
2236  * st->bucket is zero the very first socket in the hash table is returned.
2237  */
2238 static void *listening_get_next(struct seq_file *seq, void *cur)
2239 {
2240         struct inet_connection_sock *icsk;
2241         struct hlist_nulls_node *node;
2242         struct sock *sk = cur;
2243         struct inet_listen_hashbucket *ilb;
2244         struct tcp_iter_state *st = seq->private;
2245         struct net *net = seq_file_net(seq);
2246
2247         if (!sk) {
2248                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2249                 spin_lock_bh(&ilb->lock);
2250                 sk = sk_nulls_head(&ilb->head);
2251                 st->offset = 0;
2252                 goto get_sk;
2253         }
2254         ilb = &tcp_hashinfo.listening_hash[st->bucket];
2255         ++st->num;
2256         ++st->offset;
2257
2258         if (st->state == TCP_SEQ_STATE_OPENREQ) {
2259                 struct request_sock *req = cur;
2260
2261                 icsk = inet_csk(st->syn_wait_sk);
2262                 req = req->dl_next;
2263                 while (1) {
2264                         while (req) {
2265                                 if (req->rsk_ops->family == st->family) {
2266                                         cur = req;
2267                                         goto out;
2268                                 }
2269                                 req = req->dl_next;
2270                         }
2271                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2272                                 break;
2273 get_req:
2274                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2275                 }
2276                 sk        = sk_nulls_next(st->syn_wait_sk);
2277                 st->state = TCP_SEQ_STATE_LISTENING;
2278                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2279         } else {
2280                 icsk = inet_csk(sk);
2281                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2282                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
2283                         goto start_req;
2284                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2285                 sk = sk_nulls_next(sk);
2286         }
2287 get_sk:
2288         sk_nulls_for_each_from(sk, node) {
2289                 if (!net_eq(sock_net(sk), net))
2290                         continue;
2291                 if (sk->sk_family == st->family) {
2292                         cur = sk;
2293                         goto out;
2294                 }
2295                 icsk = inet_csk(sk);
2296                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2297                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2298 start_req:
2299                         st->uid         = sock_i_uid(sk);
2300                         st->syn_wait_sk = sk;
2301                         st->state       = TCP_SEQ_STATE_OPENREQ;
2302                         st->sbucket     = 0;
2303                         goto get_req;
2304                 }
2305                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2306         }
2307         spin_unlock_bh(&ilb->lock);
2308         st->offset = 0;
2309         if (++st->bucket < INET_LHTABLE_SIZE) {
2310                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
2311                 spin_lock_bh(&ilb->lock);
2312                 sk = sk_nulls_head(&ilb->head);
2313                 goto get_sk;
2314         }
2315         cur = NULL;
2316 out:
2317         return cur;
2318 }
2319
2320 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2321 {
2322         struct tcp_iter_state *st = seq->private;
2323         void *rc;
2324
2325         st->bucket = 0;
2326         st->offset = 0;
2327         rc = listening_get_next(seq, NULL);
2328
2329         while (rc && *pos) {
2330                 rc = listening_get_next(seq, rc);
2331                 --*pos;
2332         }
2333         return rc;
2334 }
2335
2336 static inline bool empty_bucket(struct tcp_iter_state *st)
2337 {
2338         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2339                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2340 }
2341
2342 /*
2343  * Get first established socket starting from bucket given in st->bucket.
2344  * If st->bucket is zero, the very first socket in the hash is returned.
2345  */
2346 static void *established_get_first(struct seq_file *seq)
2347 {
2348         struct tcp_iter_state *st = seq->private;
2349         struct net *net = seq_file_net(seq);
2350         void *rc = NULL;
2351
2352         st->offset = 0;
2353         for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2354                 struct sock *sk;
2355                 struct hlist_nulls_node *node;
2356                 struct inet_timewait_sock *tw;
2357                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2358
2359                 /* Lockless fast path for the common case of empty buckets */
2360                 if (empty_bucket(st))
2361                         continue;
2362
2363                 spin_lock_bh(lock);
2364                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2365                         if (sk->sk_family != st->family ||
2366                             !net_eq(sock_net(sk), net)) {
2367                                 continue;
2368                         }
2369                         rc = sk;
2370                         goto out;
2371                 }
2372                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2373                 inet_twsk_for_each(tw, node,
2374                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2375                         if (tw->tw_family != st->family ||
2376                             !net_eq(twsk_net(tw), net)) {
2377                                 continue;
2378                         }
2379                         rc = tw;
2380                         goto out;
2381                 }
2382                 spin_unlock_bh(lock);
2383                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2384         }
2385 out:
2386         return rc;
2387 }
2388
2389 static void *established_get_next(struct seq_file *seq, void *cur)
2390 {
2391         struct sock *sk = cur;
2392         struct inet_timewait_sock *tw;
2393         struct hlist_nulls_node *node;
2394         struct tcp_iter_state *st = seq->private;
2395         struct net *net = seq_file_net(seq);
2396
2397         ++st->num;
2398         ++st->offset;
2399
2400         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2401                 tw = cur;
2402                 tw = tw_next(tw);
2403 get_tw:
2404                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2405                         tw = tw_next(tw);
2406                 }
2407                 if (tw) {
2408                         cur = tw;
2409                         goto out;
2410                 }
2411                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2412                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2413
2414                 /* Look for the next non-empty bucket */
2415                 st->offset = 0;
2416                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2417                                 empty_bucket(st))
2418                         ;
2419                 if (st->bucket > tcp_hashinfo.ehash_mask)
2420                         return NULL;
2421
2422                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2423                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2424         } else
2425                 sk = sk_nulls_next(sk);
2426
2427         sk_nulls_for_each_from(sk, node) {
2428                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2429                         goto found;
2430         }
2431
2432         st->state = TCP_SEQ_STATE_TIME_WAIT;
2433         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2434         goto get_tw;
2435 found:
2436         cur = sk;
2437 out:
2438         return cur;
2439 }
2440
2441 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2442 {
2443         struct tcp_iter_state *st = seq->private;
2444         void *rc;
2445
2446         st->bucket = 0;
2447         rc = established_get_first(seq);
2448
2449         while (rc && pos) {
2450                 rc = established_get_next(seq, rc);
2451                 --pos;
2452         }
2453         return rc;
2454 }
2455
2456 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2457 {
2458         void *rc;
2459         struct tcp_iter_state *st = seq->private;
2460
2461         st->state = TCP_SEQ_STATE_LISTENING;
2462         rc        = listening_get_idx(seq, &pos);
2463
2464         if (!rc) {
2465                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2466                 rc        = established_get_idx(seq, pos);
2467         }
2468
2469         return rc;
2470 }
2471
2472 static void *tcp_seek_last_pos(struct seq_file *seq)
2473 {
2474         struct tcp_iter_state *st = seq->private;
2475         int offset = st->offset;
2476         int orig_num = st->num;
2477         void *rc = NULL;
2478
2479         switch (st->state) {
2480         case TCP_SEQ_STATE_OPENREQ:
2481         case TCP_SEQ_STATE_LISTENING:
2482                 if (st->bucket >= INET_LHTABLE_SIZE)
2483                         break;
2484                 st->state = TCP_SEQ_STATE_LISTENING;
2485                 rc = listening_get_next(seq, NULL);
2486                 while (offset-- && rc)
2487                         rc = listening_get_next(seq, rc);
2488                 if (rc)
2489                         break;
2490                 st->bucket = 0;
2491                 /* Fallthrough */
2492         case TCP_SEQ_STATE_ESTABLISHED:
2493         case TCP_SEQ_STATE_TIME_WAIT:
2494                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2495                 if (st->bucket > tcp_hashinfo.ehash_mask)
2496                         break;
2497                 rc = established_get_first(seq);
2498                 while (offset-- && rc)
2499                         rc = established_get_next(seq, rc);
2500         }
2501
2502         st->num = orig_num;
2503
2504         return rc;
2505 }
2506
2507 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2508 {
2509         struct tcp_iter_state *st = seq->private;
2510         void *rc;
2511
2512         if (*pos && *pos == st->last_pos) {
2513                 rc = tcp_seek_last_pos(seq);
2514                 if (rc)
2515                         goto out;
2516         }
2517
2518         st->state = TCP_SEQ_STATE_LISTENING;
2519         st->num = 0;
2520         st->bucket = 0;
2521         st->offset = 0;
2522         rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2523
2524 out:
2525         st->last_pos = *pos;
2526         return rc;
2527 }
2528
2529 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2530 {
2531         struct tcp_iter_state *st = seq->private;
2532         void *rc = NULL;
2533
2534         if (v == SEQ_START_TOKEN) {
2535                 rc = tcp_get_idx(seq, 0);
2536                 goto out;
2537         }
2538
2539         switch (st->state) {
2540         case TCP_SEQ_STATE_OPENREQ:
2541         case TCP_SEQ_STATE_LISTENING:
2542                 rc = listening_get_next(seq, v);
2543                 if (!rc) {
2544                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2545                         st->bucket = 0;
2546                         st->offset = 0;
2547                         rc        = established_get_first(seq);
2548                 }
2549                 break;
2550         case TCP_SEQ_STATE_ESTABLISHED:
2551         case TCP_SEQ_STATE_TIME_WAIT:
2552                 rc = established_get_next(seq, v);
2553                 break;
2554         }
2555 out:
2556         ++*pos;
2557         st->last_pos = *pos;
2558         return rc;
2559 }
2560
2561 static void tcp_seq_stop(struct seq_file *seq, void *v)
2562 {
2563         struct tcp_iter_state *st = seq->private;
2564
2565         switch (st->state) {
2566         case TCP_SEQ_STATE_OPENREQ:
2567                 if (v) {
2568                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2569                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2570                 }
2571         case TCP_SEQ_STATE_LISTENING:
2572                 if (v != SEQ_START_TOKEN)
2573                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2574                 break;
2575         case TCP_SEQ_STATE_TIME_WAIT:
2576         case TCP_SEQ_STATE_ESTABLISHED:
2577                 if (v)
2578                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2579                 break;
2580         }
2581 }
2582
2583 int tcp_seq_open(struct inode *inode, struct file *file)
2584 {
2585         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2586         struct tcp_iter_state *s;
2587         int err;
2588
2589         err = seq_open_net(inode, file, &afinfo->seq_ops,
2590                           sizeof(struct tcp_iter_state));
2591         if (err < 0)
2592                 return err;
2593
2594         s = ((struct seq_file *)file->private_data)->private;
2595         s->family               = afinfo->family;
2596         s->last_pos             = 0;
2597         return 0;
2598 }
2599 EXPORT_SYMBOL(tcp_seq_open);
2600
2601 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2602 {
2603         int rc = 0;
2604         struct proc_dir_entry *p;
2605
2606         afinfo->seq_ops.start           = tcp_seq_start;
2607         afinfo->seq_ops.next            = tcp_seq_next;
2608         afinfo->seq_ops.stop            = tcp_seq_stop;
2609
2610         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2611                              afinfo->seq_fops, afinfo);
2612         if (!p)
2613                 rc = -ENOMEM;
2614         return rc;
2615 }
2616 EXPORT_SYMBOL(tcp_proc_register);
2617
2618 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2619 {
2620         proc_net_remove(net, afinfo->name);
2621 }
2622 EXPORT_SYMBOL(tcp_proc_unregister);
2623
2624 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2625                          struct seq_file *f, int i, kuid_t uid, int *len)
2626 {
2627         const struct inet_request_sock *ireq = inet_rsk(req);
2628         long delta = req->expires - jiffies;
2629
2630         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2631                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2632                 i,
2633                 ireq->loc_addr,
2634                 ntohs(inet_sk(sk)->inet_sport),
2635                 ireq->rmt_addr,
2636                 ntohs(ireq->rmt_port),
2637                 TCP_SYN_RECV,
2638                 0, 0, /* could print option size, but that is af dependent. */
2639                 1,    /* timers active (only the expire timer) */
2640                 jiffies_delta_to_clock_t(delta),
2641                 req->retrans,
2642                 from_kuid_munged(seq_user_ns(f), uid),
2643                 0,  /* non standard timer */
2644                 0, /* open_requests have no inode */
2645                 atomic_read(&sk->sk_refcnt),
2646                 req,
2647                 len);
2648 }
2649
2650 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2651 {
2652         int timer_active;
2653         unsigned long timer_expires;
2654         const struct tcp_sock *tp = tcp_sk(sk);
2655         const struct inet_connection_sock *icsk = inet_csk(sk);
2656         const struct inet_sock *inet = inet_sk(sk);
2657         struct fastopen_queue *fastopenq = icsk->icsk_accept_queue.fastopenq;
2658         __be32 dest = inet->inet_daddr;
2659         __be32 src = inet->inet_rcv_saddr;
2660         __u16 destp = ntohs(inet->inet_dport);
2661         __u16 srcp = ntohs(inet->inet_sport);
2662         int rx_queue;
2663
2664         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2665                 timer_active    = 1;
2666                 timer_expires   = icsk->icsk_timeout;
2667         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2668                 timer_active    = 4;
2669                 timer_expires   = icsk->icsk_timeout;
2670         } else if (timer_pending(&sk->sk_timer)) {
2671                 timer_active    = 2;
2672                 timer_expires   = sk->sk_timer.expires;
2673         } else {
2674                 timer_active    = 0;
2675                 timer_expires = jiffies;
2676         }
2677
2678         if (sk->sk_state == TCP_LISTEN)
2679                 rx_queue = sk->sk_ack_backlog;
2680         else
2681                 /*
2682                  * Because we don't lock the socket, we might find a transient negative value.
2683                  */
2684                 rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2685
2686         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2687                         "%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2688                 i, src, srcp, dest, destp, sk->sk_state,
2689                 tp->write_seq - tp->snd_una,
2690                 rx_queue,
2691                 timer_active,
2692                 jiffies_delta_to_clock_t(timer_expires - jiffies),
2693                 icsk->icsk_retransmits,
2694                 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2695                 icsk->icsk_probes_out,
2696                 sock_i_ino(sk),
2697                 atomic_read(&sk->sk_refcnt), sk,
2698                 jiffies_to_clock_t(icsk->icsk_rto),
2699                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2700                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2701                 tp->snd_cwnd,
2702                 sk->sk_state == TCP_LISTEN ?
2703                     (fastopenq ? fastopenq->max_qlen : 0) :
2704                     (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh),
2705                 len);
2706 }
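     /*
      * Together with the header emitted by tcp4_seq_show(), this produces the
      * familiar /proc/net/tcp layout. A purely illustrative row (addresses and
      * values made up) looks roughly like:
      *
      *   0: 0100007F:0016 0100007F:A3D2 01 00000000:00000000 00:00000000 00000000  1000  0 54321 1 ffff88003d3af3c0 20 4 27 10 -1
      */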
2707
2708 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2709                                struct seq_file *f, int i, int *len)
2710 {
2711         __be32 dest, src;
2712         __u16 destp, srcp;
2713         long delta = tw->tw_ttd - jiffies;
2714
2715         dest  = tw->tw_daddr;
2716         src   = tw->tw_rcv_saddr;
2717         destp = ntohs(tw->tw_dport);
2718         srcp  = ntohs(tw->tw_sport);
2719
2720         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2721                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2722                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2723                 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2724                 atomic_read(&tw->tw_refcnt), tw, len);
2725 }
2726
2727 #define TMPSZ 150
2728
2729 static int tcp4_seq_show(struct seq_file *seq, void *v)
2730 {
2731         struct tcp_iter_state *st;
2732         int len;
2733
2734         if (v == SEQ_START_TOKEN) {
2735                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2736                            "  sl  local_address rem_address   st tx_queue "
2737                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2738                            "inode");
2739                 goto out;
2740         }
2741         st = seq->private;
2742
2743         switch (st->state) {
2744         case TCP_SEQ_STATE_LISTENING:
2745         case TCP_SEQ_STATE_ESTABLISHED:
2746                 get_tcp4_sock(v, seq, st->num, &len);
2747                 break;
2748         case TCP_SEQ_STATE_OPENREQ:
2749                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2750                 break;
2751         case TCP_SEQ_STATE_TIME_WAIT:
2752                 get_timewait4_sock(v, seq, st->num, &len);
2753                 break;
2754         }
2755         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2756 out:
2757         return 0;
2758 }
2759
2760 static const struct file_operations tcp_afinfo_seq_fops = {
2761         .owner   = THIS_MODULE,
2762         .open    = tcp_seq_open,
2763         .read    = seq_read,
2764         .llseek  = seq_lseek,
2765         .release = seq_release_net
2766 };
2767
2768 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2769         .name           = "tcp",
2770         .family         = AF_INET,
2771         .seq_fops       = &tcp_afinfo_seq_fops,
2772         .seq_ops        = {
2773                 .show           = tcp4_seq_show,
2774         },
2775 };
2776
2777 static int __net_init tcp4_proc_init_net(struct net *net)
2778 {
2779         return tcp_proc_register(net, &tcp4_seq_afinfo);
2780 }
2781
2782 static void __net_exit tcp4_proc_exit_net(struct net *net)
2783 {
2784         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2785 }
2786
2787 static struct pernet_operations tcp4_net_ops = {
2788         .init = tcp4_proc_init_net,
2789         .exit = tcp4_proc_exit_net,
2790 };
2791
2792 int __init tcp4_proc_init(void)
2793 {
2794         return register_pernet_subsys(&tcp4_net_ops);
2795 }
2796
2797 void tcp4_proc_exit(void)
2798 {
2799         unregister_pernet_subsys(&tcp4_net_ops);
2800 }
2801 #endif /* CONFIG_PROC_FS */
2802
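     /* GRO receive for IPv4/TCP: verify (or defer) the TCP checksum before
      * handing the segment to the generic TCP GRO engine; on checksum failure
      * the segment is flagged for flush so it is never aggregated.
      */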
2803 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2804 {
2805         const struct iphdr *iph = skb_gro_network_header(skb);
2806         __wsum wsum;
2807         __sum16 sum;
2808
2809         switch (skb->ip_summed) {
2810         case CHECKSUM_COMPLETE:
2811                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2812                                   skb->csum)) {
2813                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2814                         break;
2815                 }
2816 flush:
2817                 NAPI_GRO_CB(skb)->flush = 1;
2818                 return NULL;
2819
2820         case CHECKSUM_NONE:
2821                 wsum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
2822                                           skb_gro_len(skb), IPPROTO_TCP, 0);
2823                 sum = csum_fold(skb_checksum(skb,
2824                                              skb_gro_offset(skb),
2825                                              skb_gro_len(skb),
2826                                              wsum));
2827                 if (sum)
2828                         goto flush;
2829
2830                 skb->ip_summed = CHECKSUM_UNNECESSARY;
2831                 break;
2832         }
2833
2834         return tcp_gro_receive(head, skb);
2835 }
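     /* GRO complete for IPv4/TCP: seed th->check with the pseudo-header sum
      * and mark the merged skb as TCPv4 GSO so it can be resegmented later if
      * needed.
      */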
2836
2837 int tcp4_gro_complete(struct sk_buff *skb)
2838 {
2839         const struct iphdr *iph = ip_hdr(skb);
2840         struct tcphdr *th = tcp_hdr(skb);
2841
2842         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2843                                   iph->saddr, iph->daddr, 0);
2844         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2845
2846         return tcp_gro_complete(skb);
2847 }
2848
2849 struct proto tcp_prot = {
2850         .name                   = "TCP",
2851         .owner                  = THIS_MODULE,
2852         .close                  = tcp_close,
2853         .connect                = tcp_v4_connect,
2854         .disconnect             = tcp_disconnect,
2855         .accept                 = inet_csk_accept,
2856         .ioctl                  = tcp_ioctl,
2857         .init                   = tcp_v4_init_sock,
2858         .destroy                = tcp_v4_destroy_sock,
2859         .shutdown               = tcp_shutdown,
2860         .setsockopt             = tcp_setsockopt,
2861         .getsockopt             = tcp_getsockopt,
2862         .recvmsg                = tcp_recvmsg,
2863         .sendmsg                = tcp_sendmsg,
2864         .sendpage               = tcp_sendpage,
2865         .backlog_rcv            = tcp_v4_do_rcv,
2866         .release_cb             = tcp_release_cb,
2867         .mtu_reduced            = tcp_v4_mtu_reduced,
2868         .hash                   = inet_hash,
2869         .unhash                 = inet_unhash,
2870         .get_port               = inet_csk_get_port,
2871         .enter_memory_pressure  = tcp_enter_memory_pressure,
2872         .sockets_allocated      = &tcp_sockets_allocated,
2873         .orphan_count           = &tcp_orphan_count,
2874         .memory_allocated       = &tcp_memory_allocated,
2875         .memory_pressure        = &tcp_memory_pressure,
2876         .sysctl_wmem            = sysctl_tcp_wmem,
2877         .sysctl_rmem            = sysctl_tcp_rmem,
2878         .max_header             = MAX_TCP_HEADER,
2879         .obj_size               = sizeof(struct tcp_sock),
2880         .slab_flags             = SLAB_DESTROY_BY_RCU,
2881         .twsk_prot              = &tcp_timewait_sock_ops,
2882         .rsk_prot               = &tcp_request_sock_ops,
2883         .h.hashinfo             = &tcp_hashinfo,
2884         .no_autobind            = true,
2885 #ifdef CONFIG_COMPAT
2886         .compat_setsockopt      = compat_tcp_setsockopt,
2887         .compat_getsockopt      = compat_tcp_getsockopt,
2888 #endif
2889 #ifdef CONFIG_MEMCG_KMEM
2890         .init_cgroup            = tcp_init_cgroup,
2891         .destroy_cgroup         = tcp_destroy_cgroup,
2892         .proto_cgroup           = tcp_proto_cgroup,
2893 #endif
2894 };
2895 EXPORT_SYMBOL(tcp_prot);
2896
2897 static int __net_init tcp_sk_init(struct net *net)
2898 {
2899         return 0;
2900 }
2901
2902 static void __net_exit tcp_sk_exit(struct net *net)
2903 {
2904 }
2905
2906 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2907 {
2908         inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2909 }
2910
2911 static struct pernet_operations __net_initdata tcp_sk_ops = {
2912        .init       = tcp_sk_init,
2913        .exit       = tcp_sk_exit,
2914        .exit_batch = tcp_sk_exit_batch,
2915 };
2916
2917 void __init tcp_v4_init(void)
2918 {
2919         inet_hashinfo_init(&tcp_hashinfo);
2920         if (register_pernet_subsys(&tcp_sk_ops))
2921                 panic("Failed to create the TCP control socket.\n");
2922 }