tcp: switch rtt estimations to usec resolution

Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also require this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility :
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.

In this example ss command displays a srtt of 32 usecs (10Gbit link)

lpk51:~# ./ss -i dst lpk52
Netid  State      Recv-Q Send-Q   Local Address:Port       Peer
Address:Port
tcp    ESTAB      0      1         10.246.11.51:42959
10.246.11.52:64614
         cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

Updated iproute2 ip command displays :

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51

Old binary displays :

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2014-02-26 14:02:48 -08:00 committed by David S. Miller
parent 363ec39235
commit 740b0f1841
17 changed files with 175 additions and 170 deletions

View file

@ -33,6 +33,11 @@ struct tcp_fastopen_metrics {
struct tcp_fastopen_cookie cookie;
};
/* TCP_METRIC_MAX includes 2 extra fields for userspace compatibility
* Kernel only stores RTT and RTTVAR in usec resolution
*/
#define TCP_METRIC_MAX_KERNEL (TCP_METRIC_MAX - 2)
struct tcp_metrics_block {
struct tcp_metrics_block __rcu *tcpm_next;
struct inetpeer_addr tcpm_saddr;
@ -41,7 +46,7 @@ struct tcp_metrics_block {
u32 tcpm_ts;
u32 tcpm_ts_stamp;
u32 tcpm_lock;
u32 tcpm_vals[TCP_METRIC_MAX + 1];
u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1];
struct tcp_fastopen_metrics tcpm_fastopen;
struct rcu_head rcu_head;
@ -59,12 +64,6 @@ static u32 tcp_metric_get(struct tcp_metrics_block *tm,
return tm->tcpm_vals[idx];
}
static u32 tcp_metric_get_jiffies(struct tcp_metrics_block *tm,
enum tcp_metric_index idx)
{
return msecs_to_jiffies(tm->tcpm_vals[idx]);
}
static void tcp_metric_set(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
@ -72,13 +71,6 @@ static void tcp_metric_set(struct tcp_metrics_block *tm,
tm->tcpm_vals[idx] = val;
}
static void tcp_metric_set_msecs(struct tcp_metrics_block *tm,
enum tcp_metric_index idx,
u32 val)
{
tm->tcpm_vals[idx] = jiffies_to_msecs(val);
}
static bool addr_same(const struct inetpeer_addr *a,
const struct inetpeer_addr *b)
{
@ -101,9 +93,11 @@ struct tcpm_hash_bucket {
static DEFINE_SPINLOCK(tcp_metrics_lock);
static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
static void tcpm_suck_dst(struct tcp_metrics_block *tm,
const struct dst_entry *dst,
bool fastopen_clear)
{
u32 msval;
u32 val;
tm->tcpm_stamp = jiffies;
@ -121,8 +115,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
val |= 1 << TCP_METRIC_REORDERING;
tm->tcpm_lock = val;
tm->tcpm_vals[TCP_METRIC_RTT] = dst_metric_raw(dst, RTAX_RTT);
tm->tcpm_vals[TCP_METRIC_RTTVAR] = dst_metric_raw(dst, RTAX_RTTVAR);
msval = dst_metric_raw(dst, RTAX_RTT);
tm->tcpm_vals[TCP_METRIC_RTT] = msval * USEC_PER_MSEC;
msval = dst_metric_raw(dst, RTAX_RTTVAR);
tm->tcpm_vals[TCP_METRIC_RTTVAR] = msval * USEC_PER_MSEC;
tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH);
tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND);
tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
@ -384,7 +381,7 @@ void tcp_update_metrics(struct sock *sk)
dst_confirm(dst);
rcu_read_lock();
if (icsk->icsk_backoff || !tp->srtt) {
if (icsk->icsk_backoff || !tp->srtt_us) {
/* This session failed to estimate rtt. Why?
* Probably, no packets returned in time. Reset our
* results.
@ -399,8 +396,8 @@ void tcp_update_metrics(struct sock *sk)
if (!tm)
goto out_unlock;
rtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
m = rtt - tp->srtt;
rtt = tcp_metric_get(tm, TCP_METRIC_RTT);
m = rtt - tp->srtt_us;
/* If newly calculated rtt larger than stored one, store new
* one. Otherwise, use EWMA. Remember, rtt overestimation is
@ -408,10 +405,10 @@ void tcp_update_metrics(struct sock *sk)
*/
if (!tcp_metric_locked(tm, TCP_METRIC_RTT)) {
if (m <= 0)
rtt = tp->srtt;
rtt = tp->srtt_us;
else
rtt -= (m >> 3);
tcp_metric_set_msecs(tm, TCP_METRIC_RTT, rtt);
tcp_metric_set(tm, TCP_METRIC_RTT, rtt);
}
if (!tcp_metric_locked(tm, TCP_METRIC_RTTVAR)) {
@ -422,16 +419,16 @@ void tcp_update_metrics(struct sock *sk)
/* Scale deviation to rttvar fixed point */
m >>= 1;
if (m < tp->mdev)
m = tp->mdev;
if (m < tp->mdev_us)
m = tp->mdev_us;
var = tcp_metric_get_jiffies(tm, TCP_METRIC_RTTVAR);
var = tcp_metric_get(tm, TCP_METRIC_RTTVAR);
if (m >= var)
var = m;
else
var -= (var - m) >> 2;
tcp_metric_set_msecs(tm, TCP_METRIC_RTTVAR, var);
tcp_metric_set(tm, TCP_METRIC_RTTVAR, var);
}
if (tcp_in_initial_slowstart(tp)) {
@ -528,7 +525,7 @@ void tcp_init_metrics(struct sock *sk)
tp->reordering = val;
}
crtt = tcp_metric_get_jiffies(tm, TCP_METRIC_RTT);
crtt = tcp_metric_get(tm, TCP_METRIC_RTT);
rcu_read_unlock();
reset:
/* The initial RTT measurement from the SYN/SYN-ACK is not ideal
@ -551,18 +548,20 @@ reset:
* to low value, and then abruptly stops to do it and starts to delay
* ACKs, wait for troubles.
*/
if (crtt > tp->srtt) {
if (crtt > tp->srtt_us) {
/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
crtt >>= 3;
crtt /= 8 * USEC_PER_MSEC;
inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
} else if (tp->srtt == 0) {
} else if (tp->srtt_us == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
* 3WHS. This is most likely due to retransmission,
* including spurious one. Reset the RTO back to 3secs
* from the more aggressive 1sec to avoid more spurious
* retransmission.
*/
tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_FALLBACK;
tp->rttvar_us = jiffies_to_usecs(TCP_TIMEOUT_FALLBACK);
tp->mdev_us = tp->mdev_max_us = tp->rttvar_us;
inet_csk(sk)->icsk_rto = TCP_TIMEOUT_FALLBACK;
}
/* Cut cwnd down to 1 per RFC5681 if SYN or SYN-ACK has been
@ -809,10 +808,26 @@ static int tcp_metrics_fill_info(struct sk_buff *msg,
nest = nla_nest_start(msg, TCP_METRICS_ATTR_VALS);
if (!nest)
goto nla_put_failure;
for (i = 0; i < TCP_METRIC_MAX + 1; i++) {
if (!tm->tcpm_vals[i])
for (i = 0; i < TCP_METRIC_MAX_KERNEL + 1; i++) {
u32 val = tm->tcpm_vals[i];
if (!val)
continue;
if (nla_put_u32(msg, i + 1, tm->tcpm_vals[i]) < 0)
if (i == TCP_METRIC_RTT) {
if (nla_put_u32(msg, TCP_METRIC_RTT_US + 1,
val) < 0)
goto nla_put_failure;
n++;
val = max(val / 1000, 1U);
}
if (i == TCP_METRIC_RTTVAR) {
if (nla_put_u32(msg, TCP_METRIC_RTTVAR_US + 1,
val) < 0)
goto nla_put_failure;
n++;
val = max(val / 1000, 1U);
}
if (nla_put_u32(msg, i + 1, val) < 0)
goto nla_put_failure;
n++;
}