mirror of
https://github.com/Fishwaldo/Star64_linux.git
synced 2025-06-20 13:41:30 +00:00
tcp: switch rtt estimations to usec resolution
Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days. FQ/pacing in DC environments also require this change for finer control and removal of bimodal behavior due to the current hack in tcp_update_pacing_rate() for 'small rtt' TCP_CONG_RTT_STAMP is no longer needed. As Julian Anastasov pointed out, we need to keep user compatibility : tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel. In this example ss command displays a srtt of 32 usecs (10Gbit link) lpk51:~# ./ss -i dst lpk52 Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614 cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559 Updated iproute2 ip command displays : lpk51:~# ./ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51 Old binary displays : lpk51:~# ip tcp_metrics | grep 10.246.11.52 10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51 With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Cc: Stephen Hemminger <stephen@networkplumber.org> Cc: Yuchung Cheng <ycheng@google.com> Cc: Larry Brakmo <brakmo@google.com> Cc: Julian Anastasov <ja@ssi.bg> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
363ec39235
commit
740b0f1841
17 changed files with 175 additions and 170 deletions
|
@ -33,6 +33,11 @@ struct tcp_fastopen_metrics {
|
|||
struct tcp_fastopen_cookie cookie;
|
||||
};
|
||||
|
||||
/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
|
||||
* Kernel only stores RTT and RTTVAR in usec resolution
|
||||
*/
|
||||
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
|
||||
|
||||
struct tcp_metrics_block {
|
||||
struct tcp_metrics_block __rcu *tcpm_next;
|
||||
struct inetpeer_addr tcpm_saddr;
|
||||
|
@ -41,7 +46,7 @@ struct tcp_metrics_block {
|
|||
u32 tcpm_ts;
|
||||
u32 tcpm_ts_stamp;
|
||||
u32 tcpm_lock;
|
||||
u32 tcpm_vals[TCP_METRIC_MAX + 1];
|
||||
u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
|
||||
struct tcp_fastopen_metrics tcpm_fastopen;
|
||||
|
||||
struct rcu_head rcu_head;
|
||||
|
@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
|
|||
return tm->tcpm_vals[idx];
|
||||
}
|
||||
|
||||
static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
|
||||
enum tcp_metric_index idx)
|
||||
{
|
||||
return msecs_to_jiffies(tm->tcpm_vals[idx]);
|
||||
}
|
||||
|
||||
static void tcp_metric_set(struct tcp_metrics_block *tm,
|
||||
enum tcp_metric_index idx,
|
||||
u32 val)
|
||||
|
@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
|
|||
tm->tcpm_vals[idx] = val;
|
||||
}
|
||||
|
||||
static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
|
||||
enum tcp_metric_index idx,
|
||||
u32 val)
|
||||
{
|
||||
tm->tcpm_vals[idx] = jiffies_to_msecs(val);
|
||||
}
|
||||
|
||||
static bool addr_same(const struct inetpeer_addr *a,
|
||||
const struct inetpeer_addr *b)
|
||||
{
|
||||
|
@ -101,9 +93,11 @@ struct tcpm_hash_bucket {
|
|||
|
||||
static DEFINE_SPINLOCK(tcp_metrics_lock);
|
||||
|
||||
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
|
||||
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
|
||||
const struct dst_entry *dst,
|
||||
bool fastopen_clear)
|
||||
{
|
||||
u32 msval;
|
||||
u32 val;
|
||||
|
||||
tm->tcpm_stamp = jiffies;
|
||||
|
@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
|
|||
val |= 1 << TCP_METRIC_REORDERING;
|
||||
tm->tcpm_lock = val;
|
||||
|
||||
tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
|
||||
tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
|
||||
msval = dst_metric_raw(dst, RTAX_RTT);
|
||||
tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
|
||||
|
||||
msval = dst_metric_raw(dst, RTAX_RTTVAR);
|
||||
tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
|
||||
tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
|
||||
tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
|
||||
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
|
||||
|
@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
|
|||
dst_confirm(dst);
|
||||
|
||||
rcu_read_lock();
|
||||
if (icsk->icsk_backoff || !tp->srtt) {
|
||||
if (icsk->icsk_backoff || !tp->srtt_us) {
|
||||
/* This session failed to estimate rtt. Why?
|
||||
* Probably, no packets returned in time. Reset our
|
||||
* results.
|
||||
|
@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
|
|||
if (!tm)
|
||||
goto out_unlock;
|
||||
|
||||
rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
|
||||
m = rtt - tp->srtt;
|
||||
rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
|
||||
m = rtt - tp->srtt_us;
|
||||
|
||||
/* If newly calculated rtt larger than stored one, store new
|
||||
* one. Otherwise, use EWMA. Remember, rtt overestimation is
|
||||
|
@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
|
|||
*/
|
||||
if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
|
||||
if (m <= 0)
|
||||
rtt = tp->srtt;
|
||||
rtt = tp->srtt_us;
|
||||
else
|
||||
rtt -= (m >> 3);
|
||||
tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
|
||||
tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
|
||||
}
|
||||
|
||||
if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
|
||||
|
@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
|
|||
|
||||
/* Scale deviation to rttvar fixed point */
|
||||
m >>= 1;
|
||||
if (m < tp->mdev)
|
||||
m = tp->mdev;
|
||||
if (m < tp->mdev_us)
|
||||
m = tp->mdev_us;
|
||||
|
||||
var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
|
||||
var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
|
||||
if (m >= var)
|
||||
var = m;
|
||||
else
|
||||
var -= (var - m) >> 2;
|
||||
|
||||
tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
|
||||
tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
|
||||
}
|
||||
|
||||
if (tcp_in_initial_slowstart(tp)) {
|
||||
|
@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
|
|||
tp->reordering = val;
|
||||
}
|
||||
|
||||
crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
|
||||
crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
|
||||
rcu_read_unlock();
|
||||
reset:
|
||||
/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
|
||||
|
@ -551,18 +548,20 @@ reset:
|
|||
* to low value, and then abruptly stops to do it and starts to delay
|
||||
* ACKs, wait for troubles.
|
||||
*/
|
||||
if (crtt > tp->srtt) {
|
||||
if (crtt > tp->srtt_us) {
|
||||
/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
|
||||
crtt >>= 3;
|
||||
crtt /= 8 * USEC_PER_MSEC;
|
||||
inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
|
||||
} else if (tp->srtt == 0) {
|
||||
} else if (tp->srtt_us == 0) {
|
||||
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
|
||||
* 3WHS. This is most likely due to retransmission,
|
||||
* including spurious one. Reset the RTO back to 3secs
|
||||
* from the more aggressive 1sec to avoid more spurious
|
||||
* retransmission.
|
||||
*/
|
||||
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
|
||||
tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
|
||||
tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
|
||||
|
||||
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
|
||||
}
|
||||
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
|
||||
|
@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
|
|||
nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
|
||||
if (!nest)
|
||||
goto nla_put_failure;
|
||||
for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
|
||||
if (!tm->tcpm_vals[i])
|
||||
for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
|
||||
u32 val = tm->tcpm_vals[i];
|
||||
|
||||
if (!val)
|
||||
continue;
|
||||
if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
|
||||
if (i == TCP_METRIC_RTT) {
|
||||
if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
|
||||
val) < 0)
|
||||
goto nla_put_failure;
|
||||
n++;
|
||||
val = max(val / 1000, 1U);
|
||||
}
|
||||
if (i == TCP_METRIC_RTTVAR) {
|
||||
if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
|
||||
val) < 0)
|
||||
goto nla_put_failure;
|
||||
n++;
|
||||
val = max(val / 1000, 1U);
|
||||
}
|
||||
if (nla_put_u32(msg, i + 1, val) < 0)
|
||||
goto nla_put_failure;
|
||||
n++;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue