tcp: TSO packets automatic sizing

author Eric Dumazet <edumazet@google.com>

Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)

committer David S. Miller <davem@davemloft.net>

Thu, 29 Aug 2013 19:50:06 +0000 (15:50 -0400)
author Eric Dumazet <edumazet@google.com>
Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)
committer David S. Miller <davem@davemloft.net>
Thu, 29 Aug 2013 19:50:06 +0000 (15:50 -0400)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt

index a2be556032c9e6b5f4009709cdc8563df6a699a3..1cb3aeb4baff919a5d4ee040af880eaf73c3daae 100644 (file)
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -482,6 +482,15 @@ tcp_syn_retries - INTEGER
  tcp_timestamps - BOOLEAN
         Enable timestamps as defined in RFC1323.
  
+tcp_min_tso_segs - INTEGER
+       Minimal number of segments per TSO frame.
+       Since linux-3.12, TCP does an automatic sizing of TSO frames,
+       depending on flow rate, instead of filling 64Kbytes packets.
+       For specific usages, it's possible to force TCP to build big
+       TSO frames by raising this value. Note that the TCP stack might
+       still split TSO packets that are too big for the available window.
+       Default: 2
+
  tcp_tso_win_divisor - INTEGER
         This allows control over what percentage of the congestion window
         can be consumed by a single TSO frame.
diff --git a/include/net/sock.h b/include/net/sock.h

index e4bbcbfd07ea4d15f0ef63f3d765d261d0c25a68..6ba2e7b0e2b1300f3983ef4d3c845e257d46c579 100644 (file)
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -232,6 +232,7 @@ struct cg_proto;
    *    @sk_napi_id: id of the last napi context to receive data for sk
    *    @sk_ll_usec: usecs to busypoll when there is no data
    *    @sk_allocation: allocation mode
+  *    @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
    *    @sk_sndbuf: size of send buffer in bytes
    *    @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
    *               %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
@@ -361,6 +362,7 @@ struct sock {
         kmemcheck_bitfield_end(flags);
         int                     sk_wmem_queued;
         gfp_t                   sk_allocation;
+       u32                     sk_pacing_rate; /* bytes per second */
         netdev_features_t       sk_route_caps;
         netdev_features_t       sk_route_nocaps;
         int                     sk_gso_type;
diff --git a/include/net/tcp.h b/include/net/tcp.h

index dd5e16f66f84809b63fde1019f83901fc39671c1..6a6a88db462d53b72ae24c7d324fc34f4c2f2562 100644 (file)
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -281,6 +281,7 @@ extern int sysctl_tcp_early_retrans;
  extern int sysctl_tcp_limit_output_bytes;
  extern int sysctl_tcp_challenge_ack_limit;
  extern unsigned int sysctl_tcp_notsent_lowat;
+extern int sysctl_tcp_min_tso_segs;
  
  extern atomic_long_t tcp_memory_allocated;
  extern struct percpu_counter tcp_sockets_allocated;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c

index 8ed7c32ae28e47bbb73052f4686643d0f5cf7eb6..540279f4c531be079e45c027aa11d37920c14a88 100644 (file)
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -29,6 +29,7 @@
  static int zero;
  static int one = 1;
  static int four = 4;
+static int gso_max_segs = GSO_MAX_SEGS;
  static int tcp_retr1_max = 255;
  static int ip_local_port_range_min[] = { 1, 1 };
  static int ip_local_port_range_max[] = { 65535, 65535 };
@@ -760,6 +761,15 @@ static struct ctl_table ipv4_table[] = {
                 .extra1         = &zero,
                 .extra2         = &four,
         },
+       {
+               .procname       = "tcp_min_tso_segs",
+               .data           = &sysctl_tcp_min_tso_segs,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &gso_max_segs,
+       },
         {
                 .procname       = "udp_mem",
                 .data           = &sysctl_udp_mem,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c

index 4e42c03859f46e9b8067494710b3db71bf803593..fdf74090a00134709dea6ad67760a5dcaa0141a7 100644 (file)
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -283,6 +283,8 @@
  
  int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
  
+int sysctl_tcp_min_tso_segs __read_mostly = 2;
+
  struct percpu_counter tcp_orphan_count;
  EXPORT_SYMBOL_GPL(tcp_orphan_count);
  
@@ -785,12 +787,28 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
         xmit_size_goal = mss_now;
  
         if (large_allowed && sk_can_gso(sk)) {
-               xmit_size_goal = ((sk->sk_gso_max_size - 1) -
-                                 inet_csk(sk)->icsk_af_ops->net_header_len -
-                                 inet_csk(sk)->icsk_ext_hdr_len -
-                                 tp->tcp_header_len);
+               u32 gso_size, hlen;
+
+               /* Maybe we should/could use sk->sk_prot->max_header here ? */
+               hlen = inet_csk(sk)->icsk_af_ops->net_header_len +
+                      inet_csk(sk)->icsk_ext_hdr_len +
+                      tp->tcp_header_len;
+
+               /* Goal is to send at least one packet per ms,
+                * not one big TSO packet every 100 ms.
+                * This preserves ACK clocking and is consistent
+                * with tcp_tso_should_defer() heuristic.
+                */
+               gso_size = sk->sk_pacing_rate / (2 * MSEC_PER_SEC);
+               gso_size = max_t(u32, gso_size,
+                                sysctl_tcp_min_tso_segs * mss_now);
+
+               xmit_size_goal = min_t(u32, gso_size,
+                                      sk->sk_gso_max_size - 1 - hlen);
  
-               /* TSQ : try to have two TSO segments in flight */
+               /* TSQ : try to have at least two segments in flight
+                * (one in NIC TX ring, another in Qdisc)
+                */
                 xmit_size_goal = min_t(u32, xmit_size_goal,
                                        sysctl_tcp_limit_output_bytes >> 1);
  
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c

index ec492eae0cd75684cc83d0bce697eb39fb37fdd5..1a84fffe6993de90202021b708747ba9cf5b8782 100644 (file)
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -688,6 +688,34 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
         }
  }
  
+/* Set sk_pacing_rate (bytes/sec) to twice mss * max(cwnd, packets_out) /
+ * srtt, to allow proper sizing of TSO packets. TCP itself does not pace
+ * yet; the FQ packet scheduler can be used to implement cheap but
+ * effective TCP pacing, to smooth the burst on large writes when
+ * packets in flight is significantly lower than cwnd (or rwin).
+ */
+static void tcp_update_pacing_rate(struct sock *sk)
+{
+       const struct tcp_sock *tp = tcp_sk(sk);
+       u64 rate;
+
+       /* 200 % of mss * cwnd / srtt; srtt is jiffies << 3, hence HZ << 3 */
+       rate = (u64)tp->mss_cache * 2 * (HZ << 3);
+
+       rate *= max(tp->snd_cwnd, tp->packets_out);
+
+       /* Correction for small srtt: minimum srtt being 8 (1 jiffy << 3),
+        * skip the division when srtt <= 10, i.e. conservatively assume
+        * srtt = 1 (125 us instead of 1.25 ms); usec resolution is
+        * probably needed in the future. This also avoids dividing by
+        * zero when tcp_rtt_estimator() was not yet called (srtt = 0).
+        */
+       if (tp->srtt > 8 + 2)
+               do_div(rate, tp->srtt);
+
+       sk->sk_pacing_rate = min_t(u64, rate, ~0U); /* saturate: field is u32 */
+}
+
  /* Calculate rto without backoff.  This is the second half of Van Jacobson's
   * routine referred to above.
   */
@@ -3278,7 +3306,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
         u32 ack_seq = TCP_SKB_CB(skb)->seq;
         u32 ack = TCP_SKB_CB(skb)->ack_seq;
         bool is_dupack = false;
-       u32 prior_in_flight;
+       u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
         u32 prior_fackets;
         int prior_packets = tp->packets_out;
         const int prior_unsacked = tp->packets_out - tp->sacked_out;
@@ -3383,6 +3411,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
  
         if (icsk->icsk_pending == ICSK_TIME_RETRANS)
                 tcp_schedule_loss_probe(sk);
+       if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
+               tcp_update_pacing_rate(sk);
         return 1;
  
  no_queue:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c

index 884efff5b531f9c6177a789ea5013c0492939afa..e63ae4c9691ddf886e6ef4a343665821a0b000f2 100644 (file)
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1631,7 +1631,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
  
         /* If a full-sized TSO skb can be sent, do it. */
         if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
-                          sk->sk_gso_max_segs * tp->mss_cache))
+                          tp->xmit_size_goal_segs * tp->mss_cache))
                 goto send_now;
  
         /* Middle in queue won't get any more data, full sendable already? */
author	Eric Dumazet <edumazet@google.com>
	Tue, 27 Aug 2013 12:46:32 +0000 (05:46 -0700)
committer	David S. Miller <davem@davemloft.net>
	Thu, 29 Aug 2013 19:50:06 +0000 (15:50 -0400)
Documentation/networking/ip-sysctl.txt		patch \| blob \| history
include/net/sock.h		patch \| blob \| history
include/net/tcp.h		patch \| blob \| history
net/ipv4/sysctl_net_ipv4.c		patch \| blob \| history
net/ipv4/tcp.c		patch \| blob \| history
net/ipv4/tcp_input.c		patch \| blob \| history
net/ipv4/tcp_output.c		patch \| blob \| history