tcp: switch rtt estimations to usec resolution

Upcoming congestion controls for TCP require usec resolution for RTT
estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also require this change for finer control
and removal of bimodal behavior due to the current hack in
tcp_update_pacing_rate() for 'small rtt'

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility :
tcp_metrics used to export RTT and RTTVAR in msec resolution,
so we added RTT_US and RTTVAR_US. An iproute2 patch is needed
to use the new attributes if provided by the kernel.

In this example ss command displays a srtt of 32 usecs (10Gbit link)

lpk51:~# ./ss -i dst lpk52
Netid  State      Recv-Q Send-Q   Local Address:Port       Peer
Address:Port
tcp    ESTAB      0      1         10.246.11.51:42959
10.246.11.52:64614
         cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448
cwnd:10 send
3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

Updated iproute2 ip command displays :

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source
10.246.11.51

Old binary displays :

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source
10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet 2014-02-26 14:02:48 -08:00 committed by David S. Miller
parent 363ec39235
commit 740b0f1841
17 changed files with 175 additions and 170 deletions

View file

@ -667,11 +667,11 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
static void tcp_rtt_estimator(struct sock *sk, long mrtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
long m = mrtt; /* RTT */
u32 srtt = tp->srtt;
long m = mrtt_us; /* RTT */
u32 srtt = tp->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@ -694,7 +694,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
m -= (tp->mdev >> 2); /* similar update on mdev */
m -= (tp->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@ -706,28 +706,29 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt)
if (m > 0)
m >>= 3;
} else {
m -= (tp->mdev >> 2); /* similar update on mdev */
m -= (tp->mdev_us >> 2); /* similar update on mdev */
}
tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */
if (tp->mdev > tp->mdev_max) {
tp->mdev_max = tp->mdev;
if (tp->mdev_max > tp->rttvar)
tp->rttvar = tp->mdev_max;
tp->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
if (tp->mdev_us > tp->mdev_max_us) {
tp->mdev_max_us = tp->mdev_us;
if (tp->mdev_max_us > tp->rttvar_us)
tp->rttvar_us = tp->mdev_max_us;
}
if (after(tp->snd_una, tp->rtt_seq)) {
if (tp->mdev_max < tp->rttvar)
tp->rttvar -= (tp->rttvar - tp->mdev_max) >> 2;
if (tp->mdev_max_us < tp->rttvar_us)
tp->rttvar_us -= (tp->rttvar_us - tp->mdev_max_us) >> 2;
tp->rtt_seq = tp->snd_nxt;
tp->mdev_max = tcp_rto_min(sk);
tp->mdev_max_us = tcp_rto_min_us(sk);
}
} else {
/* no previous measure. */
srtt = m << 3; /* take the measured time to be rtt */
tp->mdev = m << 1; /* make sure rto = 3*rtt */
tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk));
tp->mdev_us = m << 1; /* make sure rto = 3*rtt */
tp->rttvar_us = max(tp->mdev_us, tcp_rto_min_us(sk));
tp->mdev_max_us = tp->rttvar_us;
tp->rtt_seq = tp->snd_nxt;
}
tp->srtt = max(1U, srtt);
tp->srtt_us = max(1U, srtt);
}
/* Set the sk_pacing_rate to allow proper sizing of TSO packets.
@ -742,20 +743,12 @@ static void tcp_update_pacing_rate(struct sock *sk)
u64 rate;
/* set sk_pacing_rate to 200 % of current rate (mss * cwnd / srtt) */
rate = (u64)tp->mss_cache * 2 * (HZ << 3);
rate = (u64)tp->mss_cache * 2 * (USEC_PER_SEC << 3);
rate *= max(tp->snd_cwnd, tp->packets_out);
/* Correction for small srtt and scheduling constraints.
* For small rtt, consider noise is too high, and use
* the minimal value (srtt = 1 -> 125 us for HZ=1000)
*
* We probably need usec resolution in the future.
* Note: This also takes care of possible srtt=0 case,
* when tcp_rtt_estimator() was not yet called.
*/
if (tp->srtt > 8 + 2)
do_div(rate, tp->srtt);
if (likely(tp->srtt_us))
do_div(rate, tp->srtt_us);
/* ACCESS_ONCE() is needed because sch_fq fetches sk_pacing_rate
* without any lock. We want to make sure compiler wont store
@ -1122,10 +1115,10 @@ static bool tcp_check_dsack(struct sock *sk, const struct sk_buff *ack_skb,
}
struct tcp_sacktag_state {
int reord;
int fack_count;
int flag;
s32 rtt; /* RTT measured by SACKing never-retransmitted data */
int reord;
int fack_count;
long rtt_us; /* RTT measured by SACKing never-retransmitted data */
int flag;
};
/* Check if skb is fully within the SACK block. In presence of GSO skbs,
@ -1186,7 +1179,8 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
int dup_sack, int pcount, u32 xmit_time)
int dup_sack, int pcount,
const struct skb_mstamp *xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
int fack_count = state->fack_count;
@ -1227,8 +1221,13 @@ static u8 tcp_sacktag_one(struct sock *sk,
if (!after(end_seq, tp->high_seq))
state->flag |= FLAG_ORIG_SACK_ACKED;
/* Pick the earliest sequence sacked for RTT */
if (state->rtt < 0)
state->rtt = tcp_time_stamp - xmit_time;
if (state->rtt_us < 0) {
struct skb_mstamp now;
skb_mstamp_get(&now);
state->rtt_us = skb_mstamp_us_delta(&now,
xmit_time);
}
}
if (sacked & TCPCB_LOST) {
@ -1287,7 +1286,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
start_seq, end_seq, dup_sack, pcount,
TCP_SKB_CB(skb)->when);
&skb->skb_mstamp);
if (skb == tp->lost_skb_hint)
tp->lost_cnt_hint += pcount;
@ -1565,7 +1564,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
TCP_SKB_CB(skb)->when);
&skb->skb_mstamp);
if (!before(TCP_SKB_CB(skb)->seq,
tcp_highest_sack_seq(tp)))
@ -1622,7 +1621,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
u32 prior_snd_una, s32 *sack_rtt)
u32 prior_snd_una, long *sack_rtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@ -1640,7 +1639,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.flag = 0;
state.reord = tp->packets_out;
state.rtt = -1;
state.rtt_us = -1L;
if (!tp->sacked_out) {
if (WARN_ON(tp->fackets_out))
@ -1824,7 +1823,7 @@ out:
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
#endif
*sack_rtt = state.rtt;
*sack_rtt_us = state.rtt_us;
return state.flag;
}
@ -2034,10 +2033,12 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag)
* available, or RTO is scheduled to fire first.
*/
if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 ||
(flag & FLAG_ECE) || !tp->srtt)
(flag & FLAG_ECE) || !tp->srtt_us)
return false;
delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2));
delay = max(usecs_to_jiffies(tp->srtt_us >> 5),
msecs_to_jiffies(2));
if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay)))
return false;
@ -2884,7 +2885,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
s32 seq_rtt, s32 sack_rtt)
long seq_rtt_us, long sack_rtt_us)
{
const struct tcp_sock *tp = tcp_sk(sk);
@ -2894,10 +2895,10 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* is acked (RFC6298).
*/
if (flag & FLAG_RETRANS_DATA_ACKED)
seq_rtt = -1;
seq_rtt_us = -1L;
if (seq_rtt < 0)
seq_rtt = sack_rtt;
if (seq_rtt_us < 0)
seq_rtt_us = sack_rtt_us;
/* RTTM Rule: A TSecr value received in a segment is used to
* update the averaged RTT measurement only if the segment
@ -2905,14 +2906,14 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
* left edge of the send window.
* See draft-ietf-tcplw-high-performance-00, section 3.3.
*/
if (seq_rtt < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
if (seq_rtt_us < 0 && tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
flag & FLAG_ACKED)
seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr;
seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - tp->rx_opt.rcv_tsecr);
if (seq_rtt < 0)
if (seq_rtt_us < 0)
return false;
tcp_rtt_estimator(sk, seq_rtt);
tcp_rtt_estimator(sk, seq_rtt_us);
tcp_set_rto(sk);
/* RFC6298: only reset backoff on valid RTT measurement. */
@ -2924,16 +2925,16 @@ static inline bool tcp_ack_update_rtt(struct sock *sk, const int flag,
static void tcp_synack_rtt_meas(struct sock *sk, const u32 synack_stamp)
{
struct tcp_sock *tp = tcp_sk(sk);
s32 seq_rtt = -1;
long seq_rtt_us = -1L;
if (synack_stamp && !tp->total_retrans)
seq_rtt = tcp_time_stamp - synack_stamp;
seq_rtt_us = jiffies_to_usecs(tcp_time_stamp - synack_stamp);
/* If the ACK acks both the SYNACK and the (Fast Open'd) data packets
* sent in SYN_RECV, SYNACK RTT is the smooth RTT computed in tcp_ack()
*/
if (!tp->srtt)
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt, -1);
if (!tp->srtt_us)
tcp_ack_update_rtt(sk, FLAG_SYN_ACKED, seq_rtt_us, -1L);
}
static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 acked, u32 in_flight)
@ -3022,26 +3023,27 @@ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
u32 prior_snd_una, s32 sack_rtt)
u32 prior_snd_una, long sack_rtt_us)
{
struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb;
u32 now = tcp_time_stamp;
bool fully_acked = true;
int flag = 0;
u32 pkts_acked = 0;
u32 reord = tp->packets_out;
struct skb_mstamp first_ackt, last_ackt, now;
struct tcp_sock *tp = tcp_sk(sk);
u32 prior_sacked = tp->sacked_out;
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
u32 reord = tp->packets_out;
bool fully_acked = true;
long ca_seq_rtt_us = -1L;
long seq_rtt_us = -1L;
struct sk_buff *skb;
u32 pkts_acked = 0;
bool rtt_update;
int flag = 0;
first_ackt.v64 = 0;
while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
u32 acked_pcount;
u8 sacked = scb->sacked;
u32 acked_pcount;
/* Determine how many packets and what bytes were acked, tso and else */
if (after(scb->end_seq, tp->snd_una)) {
@ -3063,11 +3065,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->retrans_out -= acked_pcount;
flag |= FLAG_RETRANS_DATA_ACKED;
} else {
ca_seq_rtt = now - scb->when;
last_ackt = skb->tstamp;
if (seq_rtt < 0) {
seq_rtt = ca_seq_rtt;
}
last_ackt = skb->skb_mstamp;
if (!first_ackt.v64)
first_ackt = last_ackt;
if (!(sacked & TCPCB_SACKED_ACKED))
reord = min(pkts_acked, reord);
if (!after(scb->end_seq, tp->high_seq))
@ -3113,7 +3114,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
if (skb && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
flag |= FLAG_SACK_RENEGING;
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt, sack_rtt);
skb_mstamp_get(&now);
if (first_ackt.v64) {
seq_rtt_us = skb_mstamp_us_delta(&now, &first_ackt);
ca_seq_rtt_us = skb_mstamp_us_delta(&now, &last_ackt);
}
rtt_update = tcp_ack_update_rtt(sk, flag, seq_rtt_us, sack_rtt_us);
if (flag & FLAG_ACKED) {
const struct tcp_congestion_ops *ca_ops
@ -3141,25 +3148,11 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
tp->fackets_out -= min(pkts_acked, tp->fackets_out);
if (ca_ops->pkts_acked) {
s32 rtt_us = -1;
if (ca_ops->pkts_acked)
ca_ops->pkts_acked(sk, pkts_acked, ca_seq_rtt_us);
/* Is the ACK triggering packet unambiguous? */
if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
/* High resolution needed and available? */
if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
!ktime_equal(last_ackt,
net_invalid_timestamp()))
rtt_us = ktime_us_delta(ktime_get_real(),
last_ackt);
else if (ca_seq_rtt >= 0)
rtt_us = jiffies_to_usecs(ca_seq_rtt);
}
ca_ops->pkts_acked(sk, pkts_acked, rtt_us);
}
} else if (skb && rtt_update && sack_rtt >= 0 &&
sack_rtt > (s32)(now - TCP_SKB_CB(skb)->when)) {
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > skb_mstamp_us_delta(&now, &skb->skb_mstamp)) {
/* Do not re-arm RTO if the sack RTT is measured from data sent
* after when the head was last (re)transmitted. Otherwise the
* timeout may continue to extend in loss recovery.
@ -3369,12 +3362,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 ack_seq = TCP_SKB_CB(skb)->seq;
u32 ack = TCP_SKB_CB(skb)->ack_seq;
bool is_dupack = false;
u32 prior_in_flight, prior_cwnd = tp->snd_cwnd, prior_rtt = tp->srtt;
u32 prior_in_flight;
u32 prior_fackets;
int prior_packets = tp->packets_out;
const int prior_unsacked = tp->packets_out - tp->sacked_out;
int acked = 0; /* Number of packets newly acked */
s32 sack_rtt = -1;
long sack_rtt_us = -1L;
/* If the ack is older than previous acks
* then we can probably ignore it.
@ -3432,7 +3425,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
&sack_rtt_us);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
@ -3451,7 +3444,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* See if we can take anything off of the retransmit queue. */
acked = tp->packets_out;
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, sack_rtt);
flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
sack_rtt_us);
acked -= tp->packets_out;
/* Advance cwnd if state allows */
@ -3474,8 +3468,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
if (tp->srtt != prior_rtt || tp->snd_cwnd != prior_cwnd)
tcp_update_pacing_rate(sk);
tcp_update_pacing_rate(sk);
return 1;
no_queue:
@ -3504,7 +3497,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_rtt);
&sack_rtt_us);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}