summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/networking/ip-sysctl.rst23
-rw-r--r--include/net/netns/ipv4.h3
-rw-r--r--net/ipv4/sysctl_net_ipv4.c7
-rw-r--r--net/ipv4/tcp_ipv4.c1
-rw-r--r--net/ipv4/tcp_output.c33
5 files changed, 54 insertions, 13 deletions
diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 2572eecc3e86..b0024aa7b051 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -878,6 +878,29 @@ tcp_min_tso_segs - INTEGER
Default: 2
+tcp_tso_rtt_log - INTEGER
+ Adjustment of TSO packet sizes based on min_rtt
+
+ Starting from linux-5.18, TCP autosizing can be tweaked
+ for flows having small RTT.
+
+ Old autosizing was splitting the pacing budget to send 1024 TSO
+ per second.
+
+ tso_packet_size = sk->sk_pacing_rate / 1024;
+
+ With the new mechanism, we increase this TSO sizing using:
+
+ distance = min_rtt_usec / (2^tcp_tso_rtt_log)
+ tso_packet_size += gso_max_size >> distance;
+
+ This means that flows between very close hosts can use bigger
+ TSO packets, reducing their cpu costs.
+
+ If you want to use the old autosizing, set this sysctl to 0.
+
+ Default: 9 (2^9 = 512 usec)
+
tcp_pacing_ss_ratio - INTEGER
sk->sk_pacing_rate is set by TCP stack using a ratio applied
to current rate. (current_rate = cwnd * mss / srtt)
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index f0687867b5cd..ce0cc4e8d8c7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -127,6 +127,7 @@ struct netns_ipv4 {
u8 sysctl_tcp_synack_retries;
u8 sysctl_tcp_syncookies;
u8 sysctl_tcp_migrate_req;
+ u8 sysctl_tcp_comp_sack_nr;
int sysctl_tcp_reordering;
u8 sysctl_tcp_retries1;
u8 sysctl_tcp_retries2;
@@ -160,9 +161,9 @@ struct netns_ipv4 {
int sysctl_tcp_challenge_ack_limit;
int sysctl_tcp_min_rtt_wlen;
u8 sysctl_tcp_min_tso_segs;
+ u8 sysctl_tcp_tso_rtt_log;
u8 sysctl_tcp_autocorking;
u8 sysctl_tcp_reflect_tos;
- u8 sysctl_tcp_comp_sack_nr;
int sysctl_tcp_invalid_ratelimit;
int sysctl_tcp_pacing_ss_ratio;
int sysctl_tcp_pacing_ca_ratio;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 1cae27b5dcd8..ad80d180b60b 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1272,6 +1272,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = SYSCTL_ONE,
},
{
+ .procname = "tcp_tso_rtt_log",
+ .data = &init_net.ipv4.sysctl_tcp_tso_rtt_log,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
+ {
.procname = "tcp_min_rtt_wlen",
.data = &init_net.ipv4.sysctl_tcp_min_rtt_wlen,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 81694a354110..f9cec624068d 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -3137,6 +3137,7 @@ static int __net_init tcp_sk_init(struct net *net)
/* rfc5961 challenge ack rate limiting */
net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
net->ipv4.sysctl_tcp_min_tso_segs = 2;
+ net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
net->ipv4.sysctl_tcp_autocorking = 1;
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 2319531267c6..81aaa7da3e8c 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1951,25 +1951,34 @@ static bool tcp_nagle_check(bool partial, const struct tcp_sock *tp,
}
/* Return how many segs we'd like on a TSO packet,
- * to send one TSO packet per ms
+ * depending on current pacing rate, and how close the peer is.
+ *
+ * Rationale is:
+ * - For close peers, we rather send bigger packets to reduce
+ * cpu costs, because occasional losses will be repaired fast.
+ * - For long distance/rtt flows, we would like to get ACK clocking
+ * with 1 ACK per ms.
+ *
+ * Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting
+ * in bigger TSO bursts. We we cut the RTT-based allowance in half
+ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance
+ * is below 1500 bytes after 6 * ~500 usec = 3ms.
*/
static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now,
int min_tso_segs)
{
- u32 bytes, segs;
+ unsigned long bytes;
+ u32 r;
- bytes = min_t(unsigned long,
- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift),
- sk->sk_gso_max_size);
+ bytes = sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift);
- /* Goal is to send at least one packet per ms,
- * not one big TSO packet every 100 ms.
- * This preserves ACK clocking and is consistent
- * with tcp_tso_should_defer() heuristic.
- */
- segs = max_t(u32, bytes / mss_now, min_tso_segs);
+ r = tcp_min_rtt(tcp_sk(sk)) >> sock_net(sk)->ipv4.sysctl_tcp_tso_rtt_log;
+ if (r < BITS_PER_TYPE(sk->sk_gso_max_size))
+ bytes += sk->sk_gso_max_size >> r;
+
+ bytes = min_t(unsigned long, bytes, sk->sk_gso_max_size);
- return segs;
+ return max_t(u32, bytes / mss_now, min_tso_segs);
}
/* Return the number of segments we want in the skb we are transmitting.