Pileus Git - ~andy/linux/blobdiff - net/ipv4/tcp.c
tcp: tsq: restore minimal amount of queueing
[~andy/linux] / net / ipv4 / tcp.c
index b2f6c74861af6d8e1209c65823bef34f806609c2..3dc0c6cf02a896e66071cd5f66b0a93f0dd3fc06 100644 (file)
 
 int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
 
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
+long sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
+EXPORT_SYMBOL(sysctl_tcp_mem);
 EXPORT_SYMBOL(sysctl_tcp_rmem);
 EXPORT_SYMBOL(sysctl_tcp_wmem);
 
@@ -410,10 +414,6 @@ void tcp_init_sock(struct sock *sk)
 
        icsk->icsk_sync_mss = tcp_sync_mss;
 
-       /* Presumed zeroed, in order of appearance:
-        *      cookie_in_always, cookie_out_never,
-        *      s_data_constant, s_data_in, s_data_out
-        */
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
@@ -499,7 +499,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                        mask |= POLLIN | POLLRDNORM;
 
                if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
-                       if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+                       if (sk_stream_is_writeable(sk)) {
                                mask |= POLLOUT | POLLWRNORM;
                        } else {  /* send SIGIO later */
                                set_bit(SOCK_ASYNC_NOSPACE,
@@ -510,7 +510,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
                                 * wspace test but before the flags are set,
                                 * IO signal will be lost.
                                 */
-                               if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+                               if (sk_stream_is_writeable(sk))
                                        mask |= POLLOUT | POLLWRNORM;
                        }
                } else
@@ -789,14 +789,24 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
        xmit_size_goal = mss_now;
 
        if (large_allowed && sk_can_gso(sk)) {
-               xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-                                 inet_csk(sk)->icsk_af_ops->net_header_len -
-                                 inet_csk(sk)->icsk_ext_hdr_len -
-                                 tp->tcp_header_len);
+               u32 gso_size, hlen;
+
+               /* Maybe we should/could use sk->sk_prot->max_header here ? */
+               hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+                      inet_csk(sk)->icsk_ext_hdr_len +
+                      tp->tcp_header_len;
 
-               /* TSQ : try to have two TSO segments in flight */
-               xmit_size_goal = min_t(u32, xmit_size_goal,
-                                      sysctl_tcp_limit_output_bytes >> 1);
+               /* Goal is to send at least one packet per ms,
+                * not one big TSO packet every 100 ms.
+                * This preserves ACK clocking and is consistent
+                * with tcp_tso_should_defer() heuristic.
+                */
+               gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+               gso_size = max_t(u32, gso_size,
+                                sysctl_tcp_min_tso_segs * mss_now);
+
+               xmit_size_goal = min_t(u32, gso_size,
+                                      sk->sk_gso_max_size - 1 - hlen);
 
                xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
 
@@ -2454,10 +2464,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
        case TCP_THIN_DUPACK:
                if (val < 0 || val > 1)
                        err = -EINVAL;
-               else
+               else {
                        tp->thin_dupack = val;
                        if (tp->thin_dupack)
                                tcp_disable_early_retrans(tp);
+               }
                break;
 
        case TCP_REPAIR:
@@ -2638,6 +2649,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
                else
                        tp->tsoffset = val - tcp_time_stamp;
                break;
+       case TCP_NOTSENT_LOWAT:
+               tp->notsent_lowat = val;
+               sk->sk_write_space(sk);
+               break;
        default:
                err = -ENOPROTOOPT;
                break;
@@ -2854,6 +2869,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
        case TCP_TIMESTAMP:
                val = tcp_time_stamp + tp->tsoffset;
                break;
+       case TCP_NOTSENT_LOWAT:
+               val = tp->notsent_lowat;
+               break;
        default:
                return -ENOPROTOOPT;
        }
@@ -3075,13 +3093,13 @@ static int __init set_thash_entries(char *str)
 }
 __setup("thash_entries=", set_thash_entries);
 
-void tcp_init_mem(struct net *net)
+static void tcp_init_mem(void)
 {
        unsigned long limit = nr_free_buffer_pages() / 8;
        limit = max(limit, 128UL);
-       net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3;
-       net->ipv4.sysctl_tcp_mem[1] = limit;
-       net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2;
+       sysctl_tcp_mem[0] = limit / 4 * 3;
+       sysctl_tcp_mem[1] = limit;
+       sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2;
 }
 
 void __init tcp_init(void)
@@ -3115,10 +3133,9 @@ void __init tcp_init(void)
                                        &tcp_hashinfo.ehash_mask,
                                        0,
                                        thash_entries ? 0 : 512 * 1024);
-       for (i = 0; i <= tcp_hashinfo.ehash_mask; i++) {
+       for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
                INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);
-               INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i);
-       }
+
        if (inet_ehash_locks_alloc(&tcp_hashinfo))
                panic("TCP: failed to alloc ehash_locks");
        tcp_hashinfo.bhash =
@@ -3144,7 +3161,7 @@ void __init tcp_init(void)
        sysctl_tcp_max_orphans = cnt / 2;
        sysctl_max_syn_backlog = max(128, cnt / 256);
 
-       tcp_init_mem(&init_net);
+       tcp_init_mem();
        /* Set per-socket limits to no more than 1/128 the pressure threshold */
        limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
        max_wshare = min(4UL*1024*1024, limit);