D15337.id.diff

Index: sys/amd64/include/cpufunc.h
===================================================================
--- sys/amd64/include/cpufunc.h
+++ sys/amd64/include/cpufunc.h
@@ -386,6 +386,16 @@
return (low | ((uint64_t)high << 32));
}
+static __inline uint64_t
+rdtscp(void)
+{
+ uint64_t low, high;
+ uint32_t aux;
+
+ __asm __volatile("rdtscp" : "=a" (low), "=d" (high), "=c" (aux));
+ return (low | (high << 32));
+}
+
static __inline uint32_t
rdtsc32(void)
{
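Unlike rdtsc, rdtscp waits for all prior instructions to complete before reading the counter and also returns IA32_TSC_AUX (normally the CPU id) in %ecx, which is what the extra "=c" output above captures. A minimal userland sketch, not part of the patch, that mirrors the inline above and converts a TSC delta into sbintime units; the 2.4 GHz frequency is an assumption, the kernel learns the real tsc_freq at boot:

    #include <stdint.h>
    #include <stdio.h>

    static inline uint64_t
    rdtscp_aux(uint32_t *aux)
    {
        uint32_t lo, hi;

        __asm__ __volatile__("rdtscp" : "=a" (lo), "=d" (hi), "=c" (*aux));
        return (lo | ((uint64_t)hi << 32));
    }

    int
    main(void)
    {
        uint64_t tsc_freq = 2400000000ULL;      /* assumed, not measured */
        uint32_t aux0, aux1;
        uint64_t t0, t1;
        int64_t sbt;

        t0 = rdtscp_aux(&aux0);
        t1 = rdtscp_aux(&aux1);
        /* sbintime_t keeps seconds in the top 32 bits, fraction below. */
        sbt = (int64_t)(((unsigned __int128)(t1 - t0) << 32) / tsc_freq);
        printf("cpu %u -> %u: %ju cycles = %jd sbt units\n",
            aux0, aux1, (uintmax_t)(t1 - t0), (intmax_t)sbt);
        return (0);
    }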
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c
+++ sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1808,7 +1808,7 @@
if (tp->snd_una != snd_una) {
tp->snd_una = snd_una;
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = tcp_ts_getsbintime();
}
}
Index: sys/netinet/khelp/h_ertt.c
===================================================================
--- sys/netinet/khelp/h_ertt.c
+++ sys/netinet/khelp/h_ertt.c
@@ -153,12 +153,12 @@
*prtt_bytes_adjust += *pmeasurenext_len;
} else {
if (mflag & FORCED_MEASUREMENT) {
- e_t->markedpkt_rtt = tcp_ts_getticks() -
+ e_t->markedpkt_rtt = tcp_ts_getsbintime() -
*pmeasurenext + 1;
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
*pmeasurenext_len - *prtt_bytes_adjust;
} else {
- e_t->markedpkt_rtt = tcp_ts_getticks() -
+ e_t->markedpkt_rtt = tcp_ts_getsbintime() -
txsi->tx_ts + 1;
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
*prtt_bytes_adjust;
@@ -353,7 +353,7 @@
*/
if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
/* Make an accurate new measurement. */
- e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
+ e_t->rtt = tcp_ts_getsbintime() - txsi->tx_ts + 1;
if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
e_t->minrtt = e_t->rtt;
@@ -478,11 +478,10 @@
if (((tp->t_flags & TF_NOOPT) == 0) &&
(to->to_flags & TOF_TS)) {
- txsi->tx_ts = ntohl(to->to_tsval) -
- tp->ts_offset;
+ txsi->tx_ts = ntohl(to->to_tsval);
txsi->rx_ts = ntohl(to->to_tsecr);
} else {
- txsi->tx_ts = tcp_ts_getticks();
+ txsi->tx_ts = tcp_ts_getsbintime();
txsi->rx_ts = 0; /* No received time stamp. */
}
TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -62,6 +62,7 @@
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
@@ -341,21 +342,20 @@
tcp_hc_get(&inp->inp_inc, &metrics);
maxseg = tcp_maxseg(tp);
- if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt * SBT_1US)) {
tp->t_srtt = rtt;
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tp->t_rttbest = tp->t_srtt;
TCPSTAT_INC(tcps_usedrtt);
if (metrics.rmx_rttvar) {
- tp->t_rttvar = metrics.rmx_rttvar;
+ tp->t_rttvar = metrics.rmx_rttvar * SBT_1US;
TCPSTAT_INC(tcps_usedrttvar);
} else {
/* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ tp->t_rttvar = (tp->t_srtt >> 1);
}
TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_srtt + 4*tp->t_rttvar,
+ tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt);
}
if (metrics.rmx_ssthresh) {
/*
@@ -479,12 +479,14 @@
* the ack that opens up a 0-sized window.
* - LRO wasn't used for this segment. We make sure by checking that the
* segment size is not larger than the MSS.
+ * - the adaptive delayed-ACK timeout (tp->t_delack) is greater than 2 ms
*/
#define DELAY_ACK(tp, tlen) \
- ((!tcp_timer_active(tp, TT_DELACK) && \
+ (((!tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
(tlen <= tp->t_maxseg) && \
- (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) && \
+ tp->t_delack > 2*SBT_1MS)
static void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
@@ -517,7 +519,7 @@
CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
if (tp->ccv->flags & CCF_ACKNOW)
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tcp_timer_activate(tp, TT_DELACK, tp->t_delack);
}
}
@@ -581,6 +583,7 @@
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
+ sbintime_t t;
uint8_t iptos;
struct m_tag *fwd_tag = NULL;
#ifdef INET6
@@ -606,6 +609,7 @@
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ t = tcp_ts_getsbintime();
off0 = *offp;
m = *mp;
*mp = NULL;
@@ -1510,11 +1514,11 @@
{
int newsize = 0;
- if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
- tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
- TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
- (tp->t_srtt >> TCP_RTT_SHIFT)) {
- if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+ if ((!!V_tcp_do_autorcvbuf & !!(so->so_rcv.sb_flags & SB_AUTOSIZE) &
+ !!tp->t_srtt & !!tp->rfbuf_ts) &&
+ tcp_ts_getsbintime() - TCP_TS_TO_SBT(tp->rfbuf_ts) >
+ tp->t_srtt) {
+ if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 8) * 7) &&
so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
newsize = min(so->so_rcv.sb_hiwat +
V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
@@ -1545,7 +1549,8 @@
struct mbuf *mfree;
struct tcpopt to;
int tfo_syn;
-
+ sbintime_t t;
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1555,6 +1560,7 @@
struct tcphdr tcp_savetcp;
short ostate = 0;
#endif
+ t = tcp_ts_getsbintime();
thflags = th->th_flags;
inc = &tp->t_inpcb->inp_inc;
tp->sackhint.last_sack_ack = 0;
@@ -1622,7 +1628,7 @@
* XXX: This should be done after segment
* validation to ignore broken/spoofed segs.
*/
- tp->t_rcvtime = ticks;
+ tp->t_rcvtime = t;
/*
* Scale up the window into a 32-bit value.
@@ -1679,9 +1685,13 @@
* was established.
*/
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
- to.to_tsecr -= tp->ts_offset;
- if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ if (to.to_tsecr == tp->t_lasttsecr + MAX_TS_STEP) {
+ tp->t_lasttsecr = to.to_tsecr;
+ to.to_tsecr = tp->t_lasttsval;
+ } else if (TSTMP_GT(to.to_tsecr, TCP_SBT_TO_TS(t)))
to.to_tsecr = 0;
+ else
+ tp->t_lasttsecr = to.to_tsecr;
}
/*
* Process options only when we get SYN/ACK back. The SYN case
@@ -1704,7 +1714,7 @@
if (to.to_flags & TOF_TS) {
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = to.to_tsval;
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
}
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
@@ -1774,7 +1784,7 @@
*/
if ((to.to_flags & TOF_TS) != 0 &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
tp->ts_recent = to.to_tsval;
}
@@ -1798,7 +1808,7 @@
*/
if (tp->t_rxtshift == 1 &&
tp->t_flags & TF_PREVVALID &&
- (int)(ticks - tp->t_badrxtwin) < 0) {
+ (t - tp->t_badrxtwin) < 0) {
cc_cong_signal(tp, th, CC_RTO_ERR);
}
@@ -1812,20 +1822,27 @@
*/
if ((to.to_flags & TOF_TS) != 0 &&
to.to_tsecr) {
- uint32_t t;
+ u_int curts;
+ sbintime_t rtt;
- t = tcp_ts_getticks() - to.to_tsecr;
- if (!tp->t_rttlow || tp->t_rttlow > t)
- tp->t_rttlow = t;
- tcp_xmit_timer(tp,
- TCP_TS_TO_TICKS(t) + 1);
+ curts = (uint32_t)TCP_SBT_TO_TS(t);
+ /*
+ * Cope with the much more frequent (~256 s) timestamp wrap.
+ */
+ if (__predict_true(curts > to.to_tsecr))
+ rtt = curts - to.to_tsecr;
+ else
+ rtt = UINT_MAX - to.to_tsecr + curts;
+ rtt = TCP_TS_TO_SBT(rtt);
+ if (!tp->t_rttlow || tp->t_rttlow > rtt)
+ tp->t_rttlow = rtt;
+ tcp_xmit_timer(tp, rtt + SBT_MINTS);
} else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq)) {
if (!tp->t_rttlow ||
- tp->t_rttlow > ticks - tp->t_rtttime)
- tp->t_rttlow = ticks - tp->t_rtttime;
- tcp_xmit_timer(tp,
- ticks - tp->t_rtttime);
+ tp->t_rttlow > t - tp->t_rtttime)
+ tp->t_rttlow = t - tp->t_rtttime;
+ tcp_xmit_timer(tp, t - tp->t_rtttime);
}
acked = BYTES_THIS_ACK(tp, th);
@@ -2056,7 +2073,7 @@
*/
if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
tcp_timer_activate(tp, TT_DELACK,
- tcp_delacktime);
+ tp->t_delack);
else
tp->t_flags |= TF_ACKNOW;
@@ -2247,7 +2264,8 @@
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
/* Check to see if ts_recent is over 24 days old. */
- if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /* Check to see if ts_recent is more than one MSL old. */
+ if (t - tp->ts_recent_age > TCP_PAWS_IDLE_SBT) {
/*
* Invalidate ts_recent. If this segment updates
* ts_recent, the age will be reset later and ts_recent
@@ -2401,7 +2419,7 @@
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN|TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
tp->ts_recent = to.to_tsval;
}
@@ -2450,7 +2468,7 @@
* SYN-RECEIVED -> ESTABLISHED
* SYN-RECEIVED* -> FIN-WAIT-1
*/
- tp->t_starttime = ticks;
+ tp->t_starttime = t;
if (tp->t_flags & TF_NEEDFIN) {
tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
@@ -2787,8 +2805,8 @@
* original cwnd and ssthresh, and proceed to transmit where
* we left off.
*/
- if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
- (int)(ticks - tp->t_badrxtwin) < 0)
+ if (tp->t_rxtshift > 0 && tp->t_flags & TF_PREVVALID &&
+ (t - tp->t_badrxtwin) < 0)
cc_cong_signal(tp, th, CC_RTO_ERR);
/*
@@ -2806,16 +2824,16 @@
* huge RTT and blow up the retransmit timer.
*/
if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
- uint32_t t;
+ sbintime_t rtt;
- t = tcp_ts_getticks() - to.to_tsecr;
- if (!tp->t_rttlow || tp->t_rttlow > t)
- tp->t_rttlow = t;
- tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
+ rtt = TCP_TS_TO_SBT(((uint32_t)TCP_SBT_TO_TS(t)) - to.to_tsecr);
+ if (!tp->t_rttlow || tp->t_rttlow > rtt)
+ tp->t_rttlow = rtt;
+ tcp_xmit_timer(tp, rtt + SBT_MINTS);
} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
- if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
- tp->t_rttlow = ticks - tp->t_rtttime;
- tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ if (!tp->t_rttlow || tp->t_rttlow > t - tp->t_rtttime)
+ tp->t_rttlow = t - tp->t_rtttime;
+ tcp_xmit_timer(tp, t - tp->t_rtttime);
}
/*
@@ -3134,7 +3152,7 @@
* enter the CLOSE_WAIT state.
*/
case TCPS_SYN_RECEIVED:
- tp->t_starttime = ticks;
+ tp->t_starttime = t;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
tcp_state_change(tp, TCPS_CLOSE_WAIT);
@@ -3189,7 +3207,7 @@
if (tp->t_flags & TF_DELACK) {
tp->t_flags &= ~TF_DELACK;
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tcp_timer_activate(tp, TT_DELACK, tp->t_delack);
}
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3480,27 +3498,47 @@
* and update averages and current timeout.
*/
void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, sbintime_t rtt)
{
- int delta;
+ int64_t delta;
+ uint64_t expected_samples, shift, var_shift;
INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * Ignore implausible sub-100ns RTT samples (XXX: count these).
+ */
+ if (rtt < SBT_1NS*100)
+ return;
+
+ /* RFC 7323 Appendix G RTO Calculation Modification */
+ /* ExpectedSamples = ceiling(FlightSize / (SMSS * 2)) */
+ /* roundup(x, y) == ceiling(x / y) * y */
+ expected_samples = ((tcp_compute_pipe(tp) + ((tp->t_maxseg*2)-1)) / (tp->t_maxseg*2));
+ /*
+ * alpha' = alpha / ExpectedSamples, where
+ * alpha = 1 / (1 << TCP_RTT_SHIFT), i.e.
+ * alpha' = 1 / (1 << (TCP_RTT_SHIFT + fls(ExpectedSamples + 1)))
+ */
+ shift = max(fls(expected_samples + 1), 0) + TCP_RTT_SHIFT;
TCPSTAT_INC(tcps_rttupdated);
tp->t_rttupdated++;
if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
/*
- * srtt is stored as fixed point with 5 bits after the
- * binary point (i.e., scaled by 8). The following magic
+ * The following magic
* is equivalent to the smoothing algorithm in rfc793 with
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
- * point). Adjust rtt to origin 0.
+ * point) when FlightSize is 1. Adjust rtt to origin 0.
*/
- delta = ((rtt - 1) << TCP_DELTA_SHIFT)
- - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
- if ((tp->t_srtt += delta) <= 0)
- tp->t_srtt = 1;
+ /*
+ * original calculation:
+ * delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ * - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+ */
+ delta = ((rtt - 1) >> shift) - (tp->t_srtt >> shift);
+ tp->t_srtt = max(tp->t_srtt + delta, SBT_1US);
+
/*
* We accumulate a smoothed rtt variance (actually, a
@@ -3512,11 +3550,14 @@
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
*/
- if (delta < 0)
- delta = -delta;
- delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
- if ((tp->t_rttvar += delta) <= 0)
- tp->t_rttvar = 1;
+ /*
+ * delta has already implicitly been divided by 8, so we
+ * need to multiply it by 2 here; similarly, the shift
+ * needs to be adjusted down by one.
+ */
+ var_shift = TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT;
+ delta = ((delta < 0 ? -delta : delta) << var_shift) - (tp->t_rttvar >> (shift - var_shift));
+ tp->t_rttvar = max(tp->t_rttvar + delta, SBT_1US);
if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
} else {
@@ -3525,8 +3566,8 @@
* Set the variance to half the rtt (so our first
* retransmit happens at 3*rtt).
*/
- tp->t_srtt = rtt << TCP_RTT_SHIFT;
- tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_srtt = rtt;
+ tp->t_rttvar = rtt >> 1;
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
}
tp->t_rtttime = 0;
@@ -3543,8 +3584,7 @@
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
*/
- TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
- max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt+2), TCPTV_REXMTMAX*tick_sbt);
/*
* We received an ack for a packet that wasn't retransmitted;
@@ -3569,7 +3609,7 @@
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
* NOTE that resulting t_maxseg doesn't include space for TCP options or
* IP options, e.g. IPSEC data, since length of this data may vary, and
* thus it is calculated for every segment separately in tcp_output().
*
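The tcp_xmit_timer() rework above folds in the RFC 7323 Appendix G modification by widening the EWMA gain shift as more ACKs are expected per RTT (alpha' = alpha / ExpectedSamples). A standalone sketch of that arithmetic, not part of the patch; the flight size, MSS and RTT values are made up purely for illustration, and TCP_RTT_SHIFT is FreeBSD's existing value of 5:

    #include <stdint.h>
    #include <stdio.h>

    #define SBT_1S          ((int64_t)1 << 32)
    #define SBT_1MS         (SBT_1S / 1000)
    #define TCP_RTT_SHIFT   5

    static int
    fls64(uint64_t v)
    {
        return (v == 0 ? 0 : 64 - __builtin_clzll(v));
    }

    int
    main(void)
    {
        int64_t srtt = 40 * SBT_1MS;    /* current smoothed RTT: 40 ms */
        int64_t rtt = 60 * SBT_1MS;     /* new sample: 60 ms */
        uint64_t pipe = 64 * 1024;      /* bytes in flight (illustrative) */
        uint64_t maxseg = 1448;
        uint64_t expected_samples, shift;
        int64_t delta;

        /* ExpectedSamples = ceiling(FlightSize / (SMSS * 2)) */
        expected_samples = (pipe + 2 * maxseg - 1) / (2 * maxseg);
        /* Wider shift == smaller per-sample gain, as in tcp_xmit_timer(). */
        shift = fls64(expected_samples + 1) + TCP_RTT_SHIFT;

        /* srtt += (rtt - srtt) / 2^shift */
        delta = (rtt >> shift) - (srtt >> shift);
        srtt += delta;

        printf("expected_samples %ju shift %ju new srtt %.3f ms\n",
            (uintmax_t)expected_samples, (uintmax_t)shift,
            (double)srtt / SBT_1MS);
        return (0);
    }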
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -191,7 +191,8 @@
int
tcp_output(struct tcpcb *tp)
{
- struct socket *so = tp->t_inpcb->inp_socket;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
int32_t len;
uint32_t recwin, sendwin;
int off, flags, error = 0; /* Keep compiler happy */
@@ -213,6 +214,7 @@
struct tcpopt to;
unsigned int wanted_cookie = 0;
unsigned int dont_sendalot = 0;
+ sbintime_t t;
#if 0
int maxburst = TCP_MAXBURST;
#endif
@@ -222,9 +224,9 @@
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
+ t = tcp_ts_getsbintime();
- INP_WLOCK_ASSERT(tp->t_inpcb);
-
+ INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
@@ -247,7 +249,7 @@
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
- if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
+ if (idle && (t - tp->t_rcvtime) >= tp->t_rxtcur)
cc_after_idle(tp);
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
@@ -257,6 +259,7 @@
}
}
again:
+ t = tcp_ts_getsbintime();
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
@@ -808,7 +811,21 @@
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+
+ /*
+ * This next part is subtle and extremely critical.
+ * If we've been idle long enough with respect to the
+ * peer, we have to lie about our timestamp so that the
+ * peer doesn't see our timestamp as being "before" the
+ * last one that we sent out. The TCP standard makes no
+ * mention of how high-resolution timestamp clocks are
+ * supposed to interoperate.
+ */
+ if (SEQ_GT(tp->t_lasttsecr, TCP_SBT_TO_TS(t)))
+ to.to_tsval = (uint32_t)(tp->t_lasttsecr + MAX_TS_STEP);
+ else
+ to.to_tsval = TCP_SBT_TO_TS(t);
+ tp->t_lasttsval = to.to_tsval;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
@@ -816,7 +833,7 @@
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_ts_getticks();
+ tp->rfbuf_ts = TCP_SBT_TO_TS(t);
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -1489,7 +1506,7 @@
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
+ tp->t_rtttime = t;
tp->t_rtseq = startseq;
TCPSTAT_INC(tcps_segstimed);
}
@@ -1655,8 +1672,7 @@
void
tcp_setpersist(struct tcpcb *tp)
{
- int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
- int tt;
+ uint64_t tt, t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
tp->t_flags &= ~TF_PREVVALID;
if (tcp_timer_active(tp, TT_REXMT))
@@ -1665,7 +1681,7 @@
* Start/restart persistence timer.
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
- tcp_persmin, tcp_persmax);
+ tcp_persmin*tick_sbt, tcp_persmax*tick_sbt);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
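The timestamp-selection logic added above is easier to follow outside the diff. The sketch below restates it using the constants defined in the tcp_seq.h hunk further down (SBT_MINTS_SHIFT, MAX_TS_STEP); SEQ_GT()/TSTMP_GT() both reduce to a signed 32-bit serial comparison, which is what the local TSTMP_GT() does here. The values in main() are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    #define SBT_MINTS_SHIFT     8
    #define TCP_SBT_TO_TS(t)    ((t) >> SBT_MINTS_SHIFT)
    #define MAX_TS_STEP         (1 << 30)
    /* Signed serial-number compare, the same idea as SEQ_GT()/TSTMP_GT(). */
    #define TSTMP_GT(a, b)      ((int32_t)((a) - (b)) > 0)

    static uint32_t
    choose_tsval(int64_t now_sbt, uint32_t last_tsecr)
    {
        uint32_t clock_ts = (uint32_t)TCP_SBT_TO_TS(now_sbt);

        /*
         * If the peer's last echoed timestamp is "ahead" of our clock,
         * step forward instead of appearing to go backwards.
         */
        if (TSTMP_GT(last_tsecr, clock_ts))
            return (last_tsecr + MAX_TS_STEP);
        return (clock_ts);
    }

    int
    main(void)
    {
        int64_t now = (int64_t)1000 << SBT_MINTS_SHIFT; /* clock reads 1000 */

        printf("%u\n", choose_tsval(now, 7));       /* peer behind: use 1000 */
        printf("%u\n", choose_tsval(now, 5000));    /* peer ahead: 5000 + 2^30 */
        return (0);
    }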
Index: sys/netinet/tcp_seq.h
===================================================================
--- sys/netinet/tcp_seq.h
+++ sys/netinet/tcp_seq.h
@@ -73,18 +73,62 @@
(tp)->snd_recover = (tp)->iss
#ifdef _KERNEL
+
/*
- * Clock macros for RFC 1323 timestamps.
+ * RFC 7323
+ * Section 5.4. Timestamp Clock
+ *
+ * (b) The timestamp clock must not be "too fast".
+ *
+ * The recycling time of the timestamp clock MUST be greater than
+ * MSL seconds. Since the clock (timestamp) is 32 bits and the
+ * worst-case MSL is 255 seconds, the maximum acceptable clock
+ * frequency is one tick every 59 ns.
*/
-#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000)
-/* Timestamp wrap-around time, 24 days. */
-#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000)
+/*
+ * The minimum permissible timestamp tick is 59 ns. However, to reduce calculation
+ * overhead we use a tick of 256 sbintime units (an 8-bit shift), i.e. ~59.6 ns:
+ * - (1<<32)/(1000000000/59) == 253
+ * - (1<<32)/(1000000000/60) == 257
+ *
+ * Note that MSL should really be a function of RTT. Although 60 ns is more than
+ * sufficient resolution for the time being, a 255 s MSL on a data center network
+ * with sub-millisecond RTTs doesn't make a whole lot of sense. In the future the
+ * MSL should be determined dynamically, or at the very least be made configurable
+ * per subnet. Nonetheless, fixing the timestamp clock at a 256 s-MSL rate gives
+ * us what we need for now while otherwise remaining as RFC compliant as possible.
+ *
+ */
+
+#define SBT_MINTS_SHIFT 8
+#define MIN_TS_STEP 2
+#define TS_1S (SBT_1S >> SBT_MINTS_SHIFT)
+#define SBT_MINTS (1 << SBT_MINTS_SHIFT)
+/* minimum rtt is ~1us (60ns * 16) */
+#define SBT_MINRTT (SBT_MINTS << 4)
/*
- * tcp_ts_getticks() in ms, should be 1ms < x < 1000ms according to RFC 1323.
- * We always use 1ms granularity independent of hz.
+ * Clock macros for RFC 1323 timestamps.
*/
+#define TCP_TS_TO_SBT(_t) ((_t) << SBT_MINTS_SHIFT)
+#define TCP_SBT_TO_TS(_t) ((_t) >> SBT_MINTS_SHIFT)
+#define MAX_TS_STEP ((1<<30))
+
+/*
+ * RFC defined MSL: 255s ( 2s rounding slop)
+ */
+#define TCP_PAWS_IDLE_SBT (SBT_MINTS*SBT_1S/2)
+
+#include <sys/clock.h>
+
+
+#define tcp_ts_getsbintime() (cpu_ts_getsbintime)()
+
+#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000)
+
+/* Timestamp wrap-around time, 24 days. */
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000)
static __inline uint32_t
tcp_ts_getticks(void)
{
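To make the numbers behind SBT_MINTS_SHIFT concrete: one sbintime unit is 2^-32 s, so an 8-bit shift gives a timestamp tick of 2^8/2^32 s, about 59.6 ns, and a 32-bit timestamp therefore wraps roughly every 256 s, which is the MSL bound the comment above is aiming for. A trivial check:

    #include <stdio.h>

    #define SBT_MINTS_SHIFT 8

    int
    main(void)
    {
        /* One sbintime unit is 2^-32 s; one timestamp tick is 2^8 units. */
        double tick_ns = 1e9 * (1 << SBT_MINTS_SHIFT) / 4294967296.0;
        double wrap_s = tick_ns * 1e-9 * 4294967296.0;

        printf("tick = %.2f ns, 32-bit timestamp wraps every %.1f s\n",
            tick_ns, wrap_s);
        return (0);
    }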
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -751,6 +751,18 @@
static volatile int next_tcp_stack_id = 1;
+#if !defined(__amd64__) && !defined(__i386__)
+static sbintime_t
+cpu_ts_getsbintime_(void)
+{
+ struct bintime bt;
+
+ getbinuptime(&bt);
+ /* Convert the cached uptime to a 64-bit sbintime. */
+ return (bttosbt(bt));
+}
+#endif
+
/*
* Register a TCP function block with the name provided in the names
* array. (Note that this function does NOT automatically register
@@ -1121,6 +1133,9 @@
#ifdef TCPPCAP
tcp_pcap_init();
#endif
+#if !defined(__amd64__) && !defined(__i386__)
+ cpu_ts_getsbintime = cpu_ts_getsbintime_;
+#endif
}
#ifdef VIMAGE
@@ -1443,7 +1458,7 @@
if (incl_opts) {
/* Timestamps. */
if (tp->t_flags & TF_RCVD_TSTMP) {
- to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+ to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime());
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
@@ -1655,11 +1670,12 @@
*/
tp->t_srtt = TCPTV_SRTTBASE;
tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
- tp->t_rttmin = tcp_rexmit_min;
- tp->t_rxtcur = TCPTV_RTOBASE;
+ tp->t_rttmin = tcp_rexmit_min*tick_sbt;
+ tp->t_rxtcur = TCPTV_RTOBASE*tick_sbt;
+ tp->t_delack = tcp_delacktime*tick_sbt;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->t_rcvtime = ticks;
+ tp->t_rcvtime = tcp_ts_getsbintime();
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -1856,8 +1872,9 @@
ssthresh = 0;
metrics.rmx_ssthresh = ssthresh;
- metrics.rmx_rtt = tp->t_srtt;
- metrics.rmx_rttvar = tp->t_rttvar;
+
+ metrics.rmx_rtt = tp->t_srtt / SBT_1US;
+ metrics.rmx_rttvar = tp->t_rttvar / SBT_1US;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
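The host-cache change above stores t_srtt and t_rttvar in microseconds (dividing by SBT_1US), and the tcp_input.c hunk multiplies them back up on load. A quick round trip of that conversion, using the SBT_1S/SBT_1US definitions from sys/time.h:

    #include <stdint.h>
    #include <stdio.h>

    #define SBT_1S  ((int64_t)1 << 32)
    #define SBT_1US (SBT_1S / 1000000)

    int
    main(void)
    {
        int64_t srtt = 1234 * SBT_1US;          /* 1.234 ms as sbintime */
        uint32_t rmx_rtt = srtt / SBT_1US;      /* what the host cache stores */
        int64_t back = (int64_t)rmx_rtt * SBT_1US;

        /* The round trip loses at most a microsecond to truncation. */
        printf("srtt %jd sbt -> %u us -> %jd sbt (delta %jd)\n",
            (intmax_t)srtt, rmx_rtt, (intmax_t)back, (intmax_t)(srtt - back));
        return (0);
    }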
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -834,6 +834,7 @@
tcp_state_change(tp, TCPS_SYN_RECEIVED);
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
+ tp->t_lasttsval = sc->sc_ts;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
blk = sototcpcb(lso)->t_fb;
@@ -882,8 +883,7 @@
if (sc->sc_flags & SCF_TIMESTAMP) {
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_recent = sc->sc_tsreflect;
- tp->ts_recent_age = tcp_ts_getticks();
- tp->ts_offset = sc->sc_tsoff;
+ tp->ts_recent_age = tcp_ts_getsbintime();
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (sc->sc_flags & SCF_SIGNATURE)
@@ -1488,7 +1488,7 @@
*/
if (to->to_flags & TOF_TS) {
sc->sc_tsreflect = to->to_tsval;
- sc->sc_ts = tcp_ts_getticks();
+ sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime());
sc->sc_flags |= SCF_TIMESTAMP;
}
if (to->to_flags & TOF_SCALE) {
@@ -2025,8 +2025,7 @@
/* Randomize the timestamp. */
if (sc->sc_flags & SCF_TIMESTAMP) {
- sc->sc_ts = arc4random();
- sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks();
+ sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime());
}
TCPSTAT_INC(tcps_sc_sendcookie);
@@ -2116,7 +2115,7 @@
sc->sc_flags |= SCF_TIMESTAMP;
sc->sc_tsreflect = to->to_tsval;
sc->sc_ts = to->to_tsecr;
- sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks();
+ sc->sc_tsoff = to->to_tsecr - TCP_SBT_TO_TS(tcp_ts_getsbintime());
}
if (to->to_flags & TOF_SIGNATURE)
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c
+++ sys/netinet/tcp_timer.c
@@ -71,6 +71,7 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@@ -389,7 +390,7 @@
tcp_inpinfo_lock_del(inp, tp);
goto out;
} else {
- if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
+ if (tcp_ts_getsbintime() - tp->t_rcvtime <= TP_MAXIDLE(tp) * tick_sbt) {
callout_reset(&tp->t_timers->tt_2msl,
TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
} else {
@@ -475,7 +476,7 @@
if ((tcp_always_keepalive ||
inp->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
- if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
+ if (tcp_ts_getsbintime() - tp->t_rcvtime >= (TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) * tick_sbt)
goto dropit;
/*
* Send a packet designed to force a response
@@ -538,6 +539,7 @@
{
struct tcpcb *tp = xtp;
struct inpcb *inp;
+ sbintime_t dt;
CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
@@ -573,9 +575,10 @@
* (no responses to probes) reaches the maximum
* backoff that we would use if retransmitting.
*/
+ dt = tcp_ts_getsbintime() - tp->t_rcvtime;
if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
- (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
- ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ (dt >= tcp_maxpersistidle*tick_sbt ||
+ dt >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
TCPSTAT_INC(tcps_persistdrop);
if (tcp_inpinfo_lock_add(inp)) {
tcp_inpinfo_lock_del(inp, tp);
@@ -693,18 +696,19 @@
tp->t_flags |= TF_WASCRECOVERY;
else
tp->t_flags &= ~TF_WASCRECOVERY;
- tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ tp->t_badrxtwin = tcp_ts_getsbintime() + tp->t_rxtcur;
tp->t_flags |= TF_PREVVALID;
} else
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
+ rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift] * tick_sbt;
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ /* 1 < delack < tcp_delacktime - and should scale down with RTO/2 */
TCPT_RANGESET(tp->t_rxtcur, rexmt,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt);
/*
* We enter the path for PLMTUD if connection is established or, if
@@ -863,13 +867,13 @@
}
void
-tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, sbintime_t delta)
{
struct callout *t_callout;
timeout_t *f_callout;
struct inpcb *inp = tp->t_inpcb;
int cpu = inp_to_cpuid(inp);
-
+ sbintime_t f_precision;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
@@ -882,22 +886,27 @@
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
f_callout = tcp_timer_delack;
+ f_precision = SBT_1MS;
break;
case TT_REXMT:
t_callout = &tp->t_timers->tt_rexmt;
f_callout = tcp_timer_rexmt;
+ f_precision = SBT_1US;
break;
case TT_PERSIST:
t_callout = &tp->t_timers->tt_persist;
f_callout = tcp_timer_persist;
+ f_precision = SBT_1S;
break;
case TT_KEEP:
t_callout = &tp->t_timers->tt_keep;
f_callout = tcp_timer_keep;
+ f_precision = SBT_1S;
break;
case TT_2MSL:
t_callout = &tp->t_timers->tt_2msl;
f_callout = tcp_timer_2msl;
+ f_precision = SBT_1S;
break;
default:
if (tp->t_fb->tfb_tcp_timer_activate) {
@@ -909,7 +918,7 @@
if (delta == 0) {
callout_stop(t_callout);
} else {
- callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+ callout_reset_sbt_on(t_callout, delta, f_precision, f_callout, tp, cpu, 0);
}
}
Index: sys/netinet/tcp_timewait.c
===================================================================
--- sys/netinet/tcp_timewait.c
+++ sys/netinet/tcp_timewait.c
@@ -301,10 +301,8 @@
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
tw->t_recent = tp->ts_recent;
- tw->ts_offset = tp->ts_offset;
} else {
tw->t_recent = 0;
- tw->ts_offset = 0;
}
tw->snd_nxt = tp->snd_nxt;
@@ -574,7 +572,7 @@
*/
if (tw->t_recent && flags == TH_ACK) {
to.to_flags |= TOF_TS;
- to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
+ to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime());
to.to_tsecr = tw->t_recent;
}
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1380,9 +1380,9 @@
ti->tcpi_options |= TCPI_OPT_ECN;
ti->tcpi_rto = tp->t_rxtcur * tick;
- ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
- ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
- ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
+ ti->tcpi_last_data_recv = (long)((tcp_ts_getsbintime() - tp->t_rcvtime)/tick_sbt) * tick;
+ ti->tcpi_rtt = ((u_int64_t)(tp->t_srtt/tick_sbt) * tick);
+ ti->tcpi_rttvar = ((u_int64_t)(tp->t_rttvar/tick_sbt) * tick);
ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
ti->tcpi_snd_cwnd = tp->snd_cwnd;
@@ -2175,7 +2175,7 @@
int timeout;
timeout = (tcp_fast_finwait2_recycle) ?
- tcp_finwait2_timeout : TP_MAXIDLE(tp);
+ tcp_finwait2_timeout * tick_sbt : TP_MAXIDLE(tp) * tick_sbt;
tcp_timer_activate(tp, TT_2MSL, timeout);
}
}
@@ -2426,20 +2426,20 @@
"0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
- db_printf("t_rcvtime: %u t_startime: %u\n",
+ db_printf("t_rcvtime: %zu t_startime: %zu\n",
tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
- db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
+ db_printf("t_rttime: %zu t_rtsq: 0x%08x\n",
tp->t_rtttime, tp->t_rtseq);
db_print_indent(indent);
- db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
+ db_printf("t_rxtcur: %zu t_maxseg: %u t_srtt: %zu\n",
tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
db_print_indent(indent);
- db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "
- "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
+ db_printf("t_rttvar: %zu t_rxtshift: %d t_rttmin: %zu "
+ "t_rttbest: %zu\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
tp->t_rttbest);
db_print_indent(indent);
@@ -2456,16 +2456,16 @@
tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
db_print_indent(indent);
- db_printf("ts_recent: %u ts_recent_age: %u\n",
+ db_printf("ts_recent: %u ts_recent_age: %zu\n",
tp->ts_recent, tp->ts_recent_age);
db_print_indent(indent);
- db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
- "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
+ db_printf("last_ack_sent: 0x%08x snd_cwnd_prev: "
+ "%u\n", tp->last_ack_sent, tp->snd_cwnd_prev);
db_print_indent(indent);
db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x "
- "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
+ "t_badrxtwin: %zu\n", tp->snd_ssthresh_prev,
tp->snd_recover_prev, tp->t_badrxtwin);
db_print_indent(indent);
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -116,62 +116,65 @@
tcp_seq rcv_adv; /* advertised window */
uint32_t rcv_wnd; /* receive window */
u_int t_flags2; /* More tcpcb flags storage */
- int t_srtt; /* smoothed round-trip time */
- int t_rttvar; /* variance in round-trip time */
+ uint64_t t_srtt; /* smoothed round-trip time */
+ uint64_t t_rttvar; /* variance in round-trip time */
u_int32_t ts_recent; /* timestamp echo data */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char snd_limited; /* segments limited transmitted */
u_char request_r_scale; /* pending window scaling */
- tcp_seq last_ack_sent;
- u_int t_rcvtime; /* inactivity time */
/* Cache line 3 */
+ sbintime_t t_rcvtime; /* inactivity time */
+ tcp_seq last_ack_sent;
tcp_seq rcv_up; /* receive urgent pointer */
- int t_segqlen; /* segment reassembly queue length */
+
struct tsegqe_head t_segq; /* segment reassembly queue */
+
struct mbuf *t_in_pkt;
struct mbuf *t_tail_pkt;
+
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
- struct vnet *t_vnet; /* back pointer to parent vnet */
uint32_t snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
- tcp_seq snd_wl1; /* window update seg seq number */
+ int t_segqlen; /* segment reassembly queue length */
/* Cache line 4 */
+ struct vnet *t_vnet; /* back pointer to parent vnet */
+ tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq irs; /* initial receive sequence number */
tcp_seq iss; /* initial send sequence number */
u_int t_acktime;
- u_int ts_recent_age; /* when last updated */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
+
+ sbintime_t ts_recent_age; /* when last updated */
+ sbintime_t t_rxtcur; /* current retransmit value (ticks) */
+ sbintime_t t_rtttime; /* RTT measurement start time */
+ uint32_t t_lasttsecr;
+ uint32_t t_lasttsval;
+ /* Cache line 5 */
+ tcp_seq t_rtseq; /* sequence number being timed */
uint16_t cl4_spare; /* Spare to adjust CL 4 */
char t_oobflags; /* have some */
char t_iobc; /* input character */
- int t_rxtcur; /* current retransmit value (ticks) */
-
+ sbintime_t t_starttime; /* time connection was established */
int t_rxtshift; /* log(2) of rexmt exp. backoff */
- u_int t_rtttime; /* RTT measurement start time */
-
- tcp_seq t_rtseq; /* sequence number being timed */
- u_int t_starttime; /* time connection was established */
-
u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */
- u_int t_rttmin; /* minimum rtt allowed */
-
- u_int t_rttbest; /* best rtt we've seen */
-
+ sbintime_t t_rttmin; /* minimum rtt allowed */
+ sbintime_t t_rttbest; /* best rtt we've seen */
+ sbintime_t t_delack; /* delayed ack timer */
int t_softerror; /* possible error not yet reported */
uint32_t max_sndwnd; /* largest window peer has offered */
- /* Cache line 5 */
+
uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */
uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */
tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
- int t_sndzerowin; /* zero-window updates sent */
u_long t_rttupdated; /* number of times rtt sampled */
+ sbintime_t t_badrxtwin; /* window for retransmit recovery */
+ int t_sndzerowin; /* zero-window updates sent */
int snd_numholes; /* number of holes seen by sender */
- u_int t_badrxtwin; /* window for retransmit recovery */
TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
/* SACK scoreboard (sorted) */
tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/
@@ -217,6 +220,7 @@
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
+ u_char opt[TCP_MAXOLEN];
};
/*
@@ -432,8 +436,7 @@
short tw_so_options; /* copy of so_options */
struct ucred *tw_cred; /* user credentials */
u_int32_t t_recent;
- u_int32_t ts_offset; /* our timestamp offset */
- u_int t_starttime;
+ sbintime_t t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
void *tw_pspare; /* TCP_SIGNATURE */
@@ -475,9 +478,7 @@
* which results in inappropriately large RTO values for very
* fast networks.
*/
-#define TCP_REXMTVAL(tp) \
- max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
- + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
+#define TCP_REXMTVAL(tp) max((tp)->t_rttmin, (tp)->t_srtt + ((tp)->t_rttvar << 2))
/*
* TCP statistics.
@@ -833,7 +834,7 @@
struct tcpcb *, int, int);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
-void tcp_xmit_timer(struct tcpcb *, int);
+void tcp_xmit_timer(struct tcpcb *, sbintime_t);
void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
uint16_t nsegs, uint16_t type);
@@ -892,7 +893,7 @@
struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
-void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+void tcp_timer_activate(struct tcpcb *, uint32_t, sbintime_t);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
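With t_srtt and t_rttvar now held directly in sbintime units, the rewritten TCP_REXMTVAL() is simply RTO = SRTT + 4*RTTVAR with no fixed-point descaling. A standalone sketch of that computation plus a TCPT_RANGESET()-style clamp; the min/max bounds used here are illustrative, not the kernel's defaults:

    #include <stdint.h>
    #include <stdio.h>

    #define SBT_1S  ((int64_t)1 << 32)
    #define SBT_1MS (SBT_1S / 1000)

    static int64_t
    rexmtval(int64_t srtt, int64_t rttvar, int64_t rttmin, int64_t rttmax)
    {
        int64_t rto = srtt + (rttvar << 2);     /* RTO = SRTT + 4*RTTVAR */

        if (rto < rttmin)                       /* TCPT_RANGESET()-style clamp */
            rto = rttmin;
        else if (rto > rttmax)
            rto = rttmax;
        return (rto);
    }

    int
    main(void)
    {
        int64_t srtt = 40 * SBT_1MS, rttvar = 10 * SBT_1MS;

        printf("RTO = %.1f ms\n",
            (double)rexmtval(srtt, rttvar, 30 * SBT_1MS, 64 * SBT_1S) / SBT_1MS);
        return (0);
    }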
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h
+++ sys/sys/callout.h
@@ -53,10 +53,13 @@
#define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */
#define C_DIRECT_EXEC 0x0001 /* direct execution of callout */
-#define C_PRELBITS 7
+
+#define C_PMS 7
+#define C_PRELBITS 11
#define C_PRELRANGE ((1 << C_PRELBITS) - 1)
#define C_PREL(x) (((x) + 1) << 1)
#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1)
+#define C_DEFAULT C_PREL(C_PMS)
#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */
#define C_ABSOLUTE 0x0200 /* event time is absolute. */
#define C_PRECALC 0x0400 /* event time is pre-calculated. */
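C_PREL() packs the requested callout precision, expressed as a power-of-two fraction of the timeout, into the low flag bits, and C_PRELGET() recovers the exponent; raising C_PRELBITS to 11 widens the range of exponents that can be encoded. A tiny round-trip check using the constants from this hunk (the "timeout >> p" reading of the exponent is my understanding of how callout_reset_sbt_on() consumes it):

    #include <stdio.h>

    #define C_PRELBITS  11
    #define C_PRELRANGE ((1 << C_PRELBITS) - 1)
    #define C_PREL(x)   (((x) + 1) << 1)
    #define C_PRELGET(x)    (int)((((x) >> 1) & C_PRELRANGE) - 1)

    int
    main(void)
    {
        int p;

        /* A precision of C_PREL(p) asks for roughly timeout >> p of slop. */
        for (p = 0; p <= 9; p += 3)
            printf("C_PREL(%d) = 0x%x -> %d\n", p, C_PREL(p),
                C_PRELGET(C_PREL(p)));
        return (0);
    }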
Index: sys/sys/clock.h
===================================================================
--- sys/sys/clock.h
+++ sys/sys/clock.h
@@ -203,6 +203,8 @@
void clock_dbgprint_err(device_t dev, int rw, int err);
void clock_dbgprint_ts(device_t dev, int rw, const struct timespec *ts);
+extern sbintime_t (*cpu_ts_getsbintime)(void);
+
#endif /* _KERNEL */
#endif /* !_SYS_CLOCK_H_ */
Index: sys/x86/x86/tsc.c
===================================================================
--- sys/x86/x86/tsc.c
+++ sys/x86/x86/tsc.c
@@ -54,11 +54,24 @@
#include "cpufreq_if.h"
uint64_t tsc_freq;
+uint64_t tsc_sbt;
+int64_t max_tsc_jitter;
int tsc_is_invariant;
int tsc_perf_stat;
+static int tsc_ts_recalibrate;
+
static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
+sbintime_t cpu_ts_getsbintime_rdtsc(void);
+sbintime_t cpu_ts_getsbintime_rdtscp(void);
+sbintime_t (*cpu_ts_getsbintime)(void);
+static void cpu_ts_calibrate_all(void);
+
+
+SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_ts_always_calibrate, CTLFLAG_RW,
+ &tsc_ts_recalibrate, 0, "always recalibrate the timestamp clock from sbinuptime()");
+
SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
&tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
@@ -612,6 +625,19 @@
tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
tc_init(&tsc_timecounter);
}
+
+ /* XXX yes this needs to be revisited */
+#if defined(__amd64__)
+ cpu_ts_getsbintime = cpu_ts_getsbintime_rdtscp;
+#elif defined(__i386__)
+ cpu_ts_getsbintime = cpu_ts_getsbintime_rdtsc;
+#endif
+ cpu_ts_calibrate_all();
+
+ /* tsc ticks per 10us */
+ max_tsc_jitter = tsc_freq/(1000000/10);
+
+ printf("tsc_freq: %lu max_tsc_jitter: %lu\n", tsc_freq, max_tsc_jitter);
}
SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
@@ -779,3 +805,64 @@
return (1);
}
#endif
+
+static DPCPU_DEFINE(int64_t, pcputsc); /* Per-CPU version of tsc at time of last calibration */
+static volatile sbintime_t sbt0;
+
+
+static void
+cpu_ts_calibrate_all(void)
+{
+ u_int _i;
+ int64_t *tsc;
+
+ CPU_FOREACH(_i) {
+ tsc = DPCPU_ID_PTR(_i, pcputsc);
+ *tsc = rdtsc();
+ }
+ sbt0 = sbinuptime();
+}
+
+
+#define CPU_TS_CALIBRATE(op) \
+static void \
+cpu_ts_calibrate_ ## op(void) \
+{\
+ int64_t *tsc, sbt; \
+\
+ tsc = DPCPU_PTR(pcputsc);\
+ *tsc = op();\
+\
+ sbt = sbinuptime(); \
+ while (sbt > sbt0) \
+ atomic_cmpset_64((volatile uint64_t *)&sbt0, sbt0, sbt); \
+}
+
+#define CPU_TS_GETSBINTIME(op) \
+sbintime_t \
+cpu_ts_getsbintime_ ## op(void) \
+{\
+ int64_t tsc, curtsc, tsc_delta; \
+ \
+ critical_enter(); \
+ tsc = DPCPU_GET(pcputsc); \
+ curtsc = op(); \
+ \
+ tsc_delta = curtsc - tsc; \
+ if (tsc_ts_recalibrate || \
+ __predict_false(tsc_delta < 0 || tsc_delta > max_tsc_jitter)) { \
+ cpu_ts_calibrate_ ## op();\
+ critical_exit();\
+ return (sbt0);\
+ }\
+ critical_exit();\
+\
+ return (sbt0);\
+}
+
+
+CPU_TS_CALIBRATE(rdtsc)
+CPU_TS_CALIBRATE(rdtscp)
+
+CPU_TS_GETSBINTIME(rdtsc)
+CPU_TS_GETSBINTIME(rdtscp)
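The calibrate/getsbintime macros above implement a rate-limited timestamp clock: each CPU remembers the TSC value captured at its last calibration, sbt0 holds the sbinuptime() taken at that point, and a reader only pays for a fresh sbinuptime() once its TSC has advanced by more than max_tsc_jitter (about 10 us) since the last calibration, so the value returned is never more than roughly 10 us stale. A single-threaded userland sketch of the same idea, with clock_gettime(CLOCK_MONOTONIC) standing in for sbinuptime(); the per-CPU storage, the atomics and the assumed 2.4 GHz TSC frequency are simplified away or made up:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static inline uint64_t
    rdtsc(void)
    {
        uint32_t lo, hi;

        __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));
        return (lo | ((uint64_t)hi << 32));
    }

    static int64_t
    now_sbt(void)
    {
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (((int64_t)ts.tv_sec << 32) +
            ((int64_t)ts.tv_nsec << 32) / 1000000000);
    }

    static uint64_t calib_tsc;      /* stands in for the per-CPU pcputsc */
    static int64_t sbt0;            /* time of the last calibration */
    static int64_t max_tsc_jitter;  /* TSC ticks per 10 us */

    static int64_t
    ts_getsbintime(void)
    {
        int64_t delta = (int64_t)(rdtsc() - calib_tsc);

        if (delta < 0 || delta > max_tsc_jitter) {  /* recalibrate */
            calib_tsc = rdtsc();
            sbt0 = now_sbt();
        }
        return (sbt0);  /* at most ~10 us stale */
    }

    int
    main(void)
    {
        uint64_t tsc_freq = 2400000000ULL;  /* assumed, not measured */
        int i;

        max_tsc_jitter = tsc_freq / (1000000 / 10);
        calib_tsc = rdtsc();
        sbt0 = now_sbt();
        for (i = 0; i < 3; i++)
            printf("sbt = %jd\n", (intmax_t)ts_getsbintime());
        return (0);
    }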
