Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F142350684
D15337.id.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
40 KB
Referenced Files
None
Subscribers
None
D15337.id.diff
View Options
Index: sys/amd64/include/cpufunc.h
===================================================================
--- sys/amd64/include/cpufunc.h
+++ sys/amd64/include/cpufunc.h
@@ -386,6 +386,16 @@
return (low | ((uint64_t)high << 32));
}
+static __inline uint64_t
+rdtscp(void)	/* Execute the RDTSCP instruction and return its two result halves combined into one 64-bit value. */
+{
+ uint64_t low, high;	/* outputs bound to the a ("=a") and d ("=d") registers */
+ uint32_t aux;	/* output bound to the c register ("=c"); listed so the compiler knows it is written, never read afterwards */
+
+ __asm __volatile("rdtscp" : "=a" (low), "=d" (high), "=c" (aux) : : );
+ return (low | (high << 32));	/* d-register value becomes bits 63..32; assumes the a-register value has no bits above 31 set - TODO confirm against the instruction reference */
+}
+
static __inline uint32_t
rdtsc32(void)
{
Index: sys/dev/cxgbe/tom/t4_cpl_io.c
===================================================================
--- sys/dev/cxgbe/tom/t4_cpl_io.c
+++ sys/dev/cxgbe/tom/t4_cpl_io.c
@@ -1808,7 +1808,7 @@
if (tp->snd_una != snd_una) {
tp->snd_una = snd_una;
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = tcp_ts_getsbintime();
}
}
Index: sys/netinet/khelp/h_ertt.c
===================================================================
--- sys/netinet/khelp/h_ertt.c
+++ sys/netinet/khelp/h_ertt.c
@@ -153,12 +153,12 @@
*prtt_bytes_adjust += *pmeasurenext_len;
} else {
if (mflag & FORCED_MEASUREMENT) {
- e_t->markedpkt_rtt = tcp_ts_getticks() -
+ e_t->markedpkt_rtt = tcp_ts_getsbintime() -
*pmeasurenext + 1;
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt +
*pmeasurenext_len - *prtt_bytes_adjust;
} else {
- e_t->markedpkt_rtt = tcp_ts_getticks() -
+ e_t->markedpkt_rtt = tcp_ts_getsbintime() -
txsi->tx_ts + 1;
e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt -
*prtt_bytes_adjust;
@@ -353,7 +353,7 @@
*/
if (!e_t->dlyack_rx || multiack || new_sacked_bytes) {
/* Make an accurate new measurement. */
- e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1;
+ e_t->rtt = tcp_ts_getsbintime() - txsi->tx_ts + 1;
if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0)
e_t->minrtt = e_t->rtt;
@@ -478,11 +478,10 @@
if (((tp->t_flags & TF_NOOPT) == 0) &&
(to->to_flags & TOF_TS)) {
- txsi->tx_ts = ntohl(to->to_tsval) -
- tp->ts_offset;
+ txsi->tx_ts = ntohl(to->to_tsval);
txsi->rx_ts = ntohl(to->to_tsecr);
} else {
- txsi->tx_ts = tcp_ts_getticks();
+ txsi->tx_ts = tcp_ts_getsbintime();
txsi->rx_ts = 0; /* No received time stamp. */
}
TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk);
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -62,6 +62,7 @@
#ifdef TCP_HHOOK
#include <sys/hhook.h>
#endif
+#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
@@ -341,21 +342,20 @@
tcp_hc_get(&inp->inp_inc, &metrics);
maxseg = tcp_maxseg(tp);
- if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt * SBT_1US)) {
tp->t_srtt = rtt;
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tp->t_rttbest = tp->t_srtt;
TCPSTAT_INC(tcps_usedrtt);
if (metrics.rmx_rttvar) {
- tp->t_rttvar = metrics.rmx_rttvar;
+ tp->t_rttvar = metrics.rmx_rttvar * SBT_1US;
TCPSTAT_INC(tcps_usedrttvar);
} else {
/* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ tp->t_rttvar = (tp->t_srtt >> 1);
}
TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_srtt + 4*tp->t_rttvar,
+ tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt);
}
if (metrics.rmx_ssthresh) {
/*
@@ -479,12 +479,14 @@
* the ack that opens up a 0-sized window.
* - LRO wasn't used for this segment. We make sure by checking that the
* segment size is not larger than the MSS.
+ * - the calculated delay is greater than 2ms
*/
#define DELAY_ACK(tp, tlen) \
- ((!tcp_timer_active(tp, TT_DELACK) && \
+ (((!tcp_timer_active(tp, TT_DELACK) && \
(tp->t_flags & TF_RXWIN0SENT) == 0) && \
(tlen <= tp->t_maxseg) && \
- (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) && \
+ tp->t_delack > 2*SBT_1MS)
static void inline
cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
@@ -517,7 +519,7 @@
CC_ALGO(tp)->ecnpkt_handler(tp->ccv);
if (tp->ccv->flags & CCF_ACKNOW)
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tcp_timer_activate(tp, TT_DELACK, tp->t_delack);
}
}
@@ -581,6 +583,7 @@
int drop_hdrlen;
int thflags;
int rstreason = 0; /* For badport_bandlim accounting purposes */
+ sbintime_t t;
uint8_t iptos;
struct m_tag *fwd_tag = NULL;
#ifdef INET6
@@ -606,6 +609,7 @@
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ t = tcp_ts_getsbintime();
off0 = *offp;
m = *mp;
*mp = NULL;
@@ -1510,11 +1514,11 @@
{
int newsize = 0;
- if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
- tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
- TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
- (tp->t_srtt >> TCP_RTT_SHIFT)) {
- if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
+ if ((V_tcp_do_autorcvbuf & !!(so->so_rcv.sb_flags & SB_AUTOSIZE) &
+ !!tp->t_srtt & !!tp->rfbuf_ts) &&
+ tcp_ts_getsbintime() - TCP_TS_TO_SBT(tp->rfbuf_ts) >
+ tp->t_srtt) {
+ if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 8) * 7) &&
so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
newsize = min(so->so_rcv.sb_hiwat +
V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
@@ -1545,7 +1549,8 @@
struct mbuf *mfree;
struct tcpopt to;
int tfo_syn;
-
+ sbintime_t t;
+
#ifdef TCPDEBUG
/*
* The size of tcp_saveipgen must be the size of the max ip header,
@@ -1555,6 +1560,7 @@
struct tcphdr tcp_savetcp;
short ostate = 0;
#endif
+ t = tcp_ts_getsbintime();
thflags = th->th_flags;
inc = &tp->t_inpcb->inp_inc;
tp->sackhint.last_sack_ack = 0;
@@ -1622,7 +1628,7 @@
* XXX: This should be done after segment
* validation to ignore broken/spoofed segs.
*/
- tp->t_rcvtime = ticks;
+ tp->t_rcvtime = t;
/*
* Scale up the window into a 32-bit value.
@@ -1679,9 +1685,13 @@
* was established.
*/
if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) {
- to.to_tsecr -= tp->ts_offset;
- if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks()))
+ if (to.to_tsecr == tp->t_lasttsecr + MAX_TS_STEP) {
+ tp->t_lasttsecr = to.to_tsecr;
+ to.to_tsecr = tp->t_lasttsval;
+ } else if (TSTMP_GT(to.to_tsecr, TCP_SBT_TO_TS(t)))
to.to_tsecr = 0;
+ else
+ tp->t_lasttsecr = to.to_tsecr;
}
/*
* Process options only when we get SYN/ACK back. The SYN case
@@ -1704,7 +1714,7 @@
if (to.to_flags & TOF_TS) {
tp->t_flags |= TF_RCVD_TSTMP;
tp->ts_recent = to.to_tsval;
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
}
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
@@ -1774,7 +1784,7 @@
*/
if ((to.to_flags & TOF_TS) != 0 &&
SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
tp->ts_recent = to.to_tsval;
}
@@ -1798,7 +1808,7 @@
*/
if (tp->t_rxtshift == 1 &&
tp->t_flags & TF_PREVVALID &&
- (int)(ticks - tp->t_badrxtwin) < 0) {
+ (t - tp->t_badrxtwin) < 0) {
cc_cong_signal(tp, th, CC_RTO_ERR);
}
@@ -1812,20 +1822,27 @@
*/
if ((to.to_flags & TOF_TS) != 0 &&
to.to_tsecr) {
- uint32_t t;
+ u_int curts;
+ sbintime_t rtt;
- t = tcp_ts_getticks() - to.to_tsecr;
- if (!tp->t_rttlow || tp->t_rttlow > t)
- tp->t_rttlow = t;
- tcp_xmit_timer(tp,
- TCP_TS_TO_TICKS(t) + 1);
+ curts = (uint32_t)TCP_SBT_TO_TS(t);
+ /*
+ * cope with frequent wrap
+ */
+ if (__predict_true(curts > to.to_tsecr))
+ rtt = curts - to.to_tsecr;
+ else
+ rtt = UINT_MAX - to.to_tsecr + curts;
+ rtt = TCP_TS_TO_SBT(rtt);
+ if (!tp->t_rttlow || tp->t_rttlow > rtt)
+ tp->t_rttlow = rtt;
+ tcp_xmit_timer(tp, rtt + SBT_MINTS);
} else if (tp->t_rtttime &&
SEQ_GT(th->th_ack, tp->t_rtseq)) {
if (!tp->t_rttlow ||
- tp->t_rttlow > ticks - tp->t_rtttime)
- tp->t_rttlow = ticks - tp->t_rtttime;
- tcp_xmit_timer(tp,
- ticks - tp->t_rtttime);
+ tp->t_rttlow > t - tp->t_rtttime)
+ tp->t_rttlow = t - tp->t_rtttime;
+ tcp_xmit_timer(tp, t - tp->t_rtttime);
}
acked = BYTES_THIS_ACK(tp, th);
@@ -2056,7 +2073,7 @@
*/
if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack)
tcp_timer_activate(tp, TT_DELACK,
- tcp_delacktime);
+ tp->t_delack);
else
tp->t_flags |= TF_ACKNOW;
@@ -2247,7 +2264,8 @@
TSTMP_LT(to.to_tsval, tp->ts_recent)) {
/* Check to see if ts_recent is over 24 days old. */
- if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /* Check to see if ts_recent is over MSL OLD */
+ if (t - tp->ts_recent_age > TCP_PAWS_IDLE_SBT) {
/*
* Invalidate ts_recent. If this segment updates
* ts_recent, the age will be reset later and ts_recent
@@ -2401,7 +2419,7 @@
SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
((thflags & (TH_SYN|TH_FIN)) != 0))) {
- tp->ts_recent_age = tcp_ts_getticks();
+ tp->ts_recent_age = t;
tp->ts_recent = to.to_tsval;
}
@@ -2450,7 +2468,7 @@
* SYN-RECEIVED -> ESTABLISHED
* SYN-RECEIVED* -> FIN-WAIT-1
*/
- tp->t_starttime = ticks;
+ tp->t_starttime = t;
if (tp->t_flags & TF_NEEDFIN) {
tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
@@ -2787,8 +2805,8 @@
* original cwnd and ssthresh, and proceed to transmit where
* we left off.
*/
- if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID &&
- (int)(ticks - tp->t_badrxtwin) < 0)
+ if (tp->t_rxtshift > 0 && tp->t_flags & TF_PREVVALID &&
+ (t - tp->t_badrxtwin) < 0)
cc_cong_signal(tp, th, CC_RTO_ERR);
/*
@@ -2806,16 +2824,16 @@
* huge RTT and blow up the retransmit timer.
*/
if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) {
- uint32_t t;
+ sbintime_t rtt;
- t = tcp_ts_getticks() - to.to_tsecr;
- if (!tp->t_rttlow || tp->t_rttlow > t)
- tp->t_rttlow = t;
- tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1);
+ rtt = TCP_TS_TO_SBT(((uint32_t)TCP_SBT_TO_TS(t)) - to.to_tsecr);
+ if (!tp->t_rttlow || tp->t_rttlow > rtt)
+ tp->t_rttlow = rtt;
+ tcp_xmit_timer(tp, rtt + SBT_MINTS);
} else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) {
- if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime)
- tp->t_rttlow = ticks - tp->t_rtttime;
- tcp_xmit_timer(tp, ticks - tp->t_rtttime);
+ if (!tp->t_rttlow || tp->t_rttlow > t - tp->t_rtttime)
+ tp->t_rttlow = t - tp->t_rtttime;
+ tcp_xmit_timer(tp, t - tp->t_rtttime);
}
/*
@@ -3134,7 +3152,7 @@
* enter the CLOSE_WAIT state.
*/
case TCPS_SYN_RECEIVED:
- tp->t_starttime = ticks;
+ tp->t_starttime = t;
/* FALLTHROUGH */
case TCPS_ESTABLISHED:
tcp_state_change(tp, TCPS_CLOSE_WAIT);
@@ -3189,7 +3207,7 @@
if (tp->t_flags & TF_DELACK) {
tp->t_flags &= ~TF_DELACK;
- tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+ tcp_timer_activate(tp, TT_DELACK, tp->t_delack);
}
INP_WUNLOCK(tp->t_inpcb);
return;
@@ -3480,27 +3498,47 @@
* and update averages and current timeout.
*/
void
-tcp_xmit_timer(struct tcpcb *tp, int rtt)
+tcp_xmit_timer(struct tcpcb *tp, sbintime_t rtt)
{
- int delta;
+ int64_t delta;
+ uint64_t expected_samples, shift, var_shift;
INP_WLOCK_ASSERT(tp->t_inpcb);
+ /*
+ * track this
+ */
+ if (rtt < SBT_1NS*100)
+ return;
+
+ /* RFC 7323 Appendix G RTO Calculation Modification */
+ /* ExpectedSamples = ceiling(FlightSize / (SMSS * 2)) */
+ /* roundup(x, y) == ceiling(x / y) * y */
+ expected_samples = ((tcp_compute_pipe(tp) + ((tp->t_maxseg*2)-1)) / (tp->t_maxseg*2));
+ /*
+ * alpha' = alpha / ExpectedSamples =>
+ * alpha = 1 / 1 >> TCP_RTT_SHIFT
+ * alpha' = 1 / 1 >> (TCP_RTT_SHIFT + shift)
+ **/
+ shift = max(fls(expected_samples + 1), 0) + TCP_RTT_SHIFT;
TCPSTAT_INC(tcps_rttupdated);
tp->t_rttupdated++;
if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
/*
- * srtt is stored as fixed point with 5 bits after the
- * binary point (i.e., scaled by 8). The following magic
+ * The following magic
* is equivalent to the smoothing algorithm in rfc793 with
* an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
- * point). Adjust rtt to origin 0.
+ * point) when FlightSize is 1. Adjust rtt to origin 0.
*/
- delta = ((rtt - 1) << TCP_DELTA_SHIFT)
- - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
- if ((tp->t_srtt += delta) <= 0)
- tp->t_srtt = 1;
+ /*
+ * original calculation:
+ * delta = ((rtt - 1) << TCP_DELTA_SHIFT)
+ * - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
+ */
+ delta = ((rtt - 1) >> shift) - (tp->t_srtt >> shift);
+ tp->t_srtt = max(tp->t_srtt + delta, SBT_1US);
+
/*
* We accumulate a smoothed rtt variance (actually, a
@@ -3512,11 +3550,14 @@
* (rttvar = rttvar*3/4 + |delta| / 4). This replaces
* rfc793's wired-in beta.
*/
- if (delta < 0)
- delta = -delta;
- delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
- if ((tp->t_rttvar += delta) <= 0)
- tp->t_rttvar = 1;
+ /*
+ * delta has already implicitly been divided by 8
+ * so we need to multiply by 2 - similarly shift
+ * needs to be adjusted down by one
+ */
+ var_shift = TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT;
+ delta = (abs(delta) << var_shift) - (tp->t_rttvar >> (shift-var_shift));
+ tp->t_rttvar = max(tp->t_rttvar + delta, SBT_1US);
if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar)
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
} else {
@@ -3525,8 +3566,8 @@
* Set the variance to half the rtt (so our first
* retransmit happens at 3*rtt).
*/
- tp->t_srtt = rtt << TCP_RTT_SHIFT;
- tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
+ tp->t_srtt = rtt;
+ tp->t_rttvar = rtt >> 1;
tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
}
tp->t_rtttime = 0;
@@ -3543,8 +3584,7 @@
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
*/
- TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
- max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX);
+ TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt+2), TCPTV_REXMTMAX*tick_sbt);
/*
* We received an ack for a packet that wasn't retransmitted;
@@ -3569,7 +3609,7 @@
* While looking at the routing entry, we also initialize other path-dependent
* parameters from pre-set or cached values in the routing entry.
*
- * NOTE that resulting t_maxseg doesn't include space for TCP options or
+ * NOTE that resulting t_maxseg doesn't include space for TCP options or
* IP options, e.g. IPSEC data, since length of this data may vary, and
* thus it is calculated for every segment separately in tcp_output().
*
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -191,7 +191,8 @@
int
tcp_output(struct tcpcb *tp)
{
- struct socket *so = tp->t_inpcb->inp_socket;
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
int32_t len;
uint32_t recwin, sendwin;
int off, flags, error = 0; /* Keep compiler happy */
@@ -213,6 +214,7 @@
struct tcpopt to;
unsigned int wanted_cookie = 0;
unsigned int dont_sendalot = 0;
+ sbintime_t t;
#if 0
int maxburst = TCP_MAXBURST;
#endif
@@ -222,9 +224,9 @@
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
+ t = tcp_ts_getsbintime();
- INP_WLOCK_ASSERT(tp->t_inpcb);
-
+ INP_WLOCK_ASSERT(inp);
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
@@ -247,7 +249,7 @@
* to send, then transmit; otherwise, investigate further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
- if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur)
+ if (idle && (t - tp->t_rcvtime) >= tp->t_rxtcur)
cc_after_idle(tp);
tp->t_flags &= ~TF_LASTIDLE;
if (idle) {
@@ -257,6 +259,7 @@
}
}
again:
+ t = tcp_ts_getsbintime();
/*
* If we've recently taken a timeout, snd_max will be greater than
* snd_nxt. There may be SACK information that allows us to avoid
@@ -808,7 +811,21 @@
/* Timestamps. */
if ((tp->t_flags & TF_RCVD_TSTMP) ||
((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) {
- to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+
+ /*
+ * This next part is subtle and extremely critical.
+ * If we've been idle long enough with respect to
+ * the peer we have to lie about our timestamp so
+ * that the peer doesn't see our timestamp as being
+ * "before" the last one that we sent out. The TCP
+ * standard gives no mention to high resolution
+ * timestamp interoperability.
+ */
+ if (SEQ_GT(tp->t_lasttsecr, TCP_SBT_TO_TS(t)))
+ to.to_tsval = (uint32_t)(tp->t_lasttsecr + MAX_TS_STEP);
+ else
+ to.to_tsval = TCP_SBT_TO_TS(t);
+ tp->t_lasttsval = to.to_tsval;
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
@@ -816,7 +833,7 @@
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
- tp->rfbuf_ts = tcp_ts_getticks();
+ tp->rfbuf_ts = TCP_SBT_TO_TS(t);
/* Selective ACK's. */
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -1489,7 +1506,7 @@
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
+ tp->t_rtttime = t;
tp->t_rtseq = startseq;
TCPSTAT_INC(tcps_segstimed);
}
@@ -1655,8 +1672,7 @@
void
tcp_setpersist(struct tcpcb *tp)
{
- int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
- int tt;
+ uint64_t tt, t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1;
tp->t_flags &= ~TF_PREVVALID;
if (tcp_timer_active(tp, TT_REXMT))
@@ -1665,7 +1681,7 @@
* Start/restart persistence timer.
*/
TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift],
- tcp_persmin, tcp_persmax);
+ tcp_persmin*tick_sbt, tcp_persmax*tick_sbt);
tcp_timer_activate(tp, TT_PERSIST, tt);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
Index: sys/netinet/tcp_seq.h
===================================================================
--- sys/netinet/tcp_seq.h
+++ sys/netinet/tcp_seq.h
@@ -73,18 +73,62 @@
(tp)->snd_recover = (tp)->iss
#ifdef _KERNEL
+
/*
- * Clock macros for RFC 1323 timestamps.
+ * RFC 7323
+ * Section 5.4. Timestamp Clock
+ *
+ * (b) The timestamp clock must not be "too fast".
+ *
+ * The recycling time of the timestamp clock MUST be greater than
+ * MSL seconds. Since the clock (timestamp) is 32 bits and the
+ * worst-case MSL is 255 seconds, the maximum acceptable clock
+ * frequency is one tick every 59 ns.
*/
-#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000)
-/* Timestamp wrap-around time, 24 days. */
-#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000)
+/*
+ * The minimum permissible timestamp is 59ns. However, to reduce calculation
+ * overhead we use 256 - (8 bit shift).
+ * - (1<<32)/(1000000000/59) == 253
+ * - (1<<32)/(1000000000/60) == 257
+ *
+ *
+ * Note that MSL should be a function of RTT. Although 60ns is more than sufficient resolution for
+ * the time being, a 255s MSL on a data center network with a sub-millisecond RTT doesn't make a
+ * whole lot of sense. In the future the MSL should be determined dynamically or, at the very least,
+ * be configurable per subnet. Nonetheless, fixing the timestamp clock at a rate corresponding to a
+ * 256s MSL gives us what we need for now while otherwise remaining as RFC compliant as possible.
+ *
+ */
+
+#define SBT_MINTS_SHIFT 8	/* shift count used to convert between sbintime units and timestamp ticks */
+#define MIN_TS_STEP 2	/* NOTE(review): no user of this constant is visible in this diff - confirm it is needed */
+#define TS_1S (SBT_1S >> SBT_MINTS_SHIFT)	/* SBT_1S scaled down by the timestamp shift, i.e. one second in timestamp ticks */
+#define SBT_MINTS (1 << SBT_MINTS_SHIFT)	/* one timestamp tick expressed in sbintime units (2^SBT_MINTS_SHIFT) */
+/* minimum rtt is ~1us (60ns * 16) */
+#define SBT_MINRTT (SBT_MINTS << 4)	/* 16 timestamp ticks, in sbintime units */
/*
- * tcp_ts_getticks() in ms, should be 1ms < x < 1000ms according to RFC 1323.
- * We always use 1ms granularity independent of hz.
+ * Clock macros for RFC 1323 timestamps.
*/
+/*
+ * Convert between the 32-bit TCP timestamp clock and sbintime_t by shifting
+ * SBT_MINTS_SHIFT bits.
+ *
+ * TCP_TS_TO_SBT() widens its argument to sbintime_t before shifting: callers
+ * pass uint32_t timestamps and timestamp differences, and shifting those left
+ * in 32-bit arithmetic would silently discard the top SBT_MINTS_SHIFT bits
+ * before the result is stored in an sbintime_t.
+ *
+ * TCP_SBT_TO_TS() yields a value as wide as its argument; callers that need a
+ * 32-bit timestamp truncate it by assignment or an explicit cast.
+ */
+#define TCP_TS_TO_SBT(_t) ((sbintime_t)(_t) << SBT_MINTS_SHIFT)
+#define TCP_SBT_TO_TS(_t) ((_t) >> SBT_MINTS_SHIFT)
+/* Step added to the last echoed timestamp when our own clock reads behind it. */
+#define MAX_TS_STEP ((1<<30))
+
+/*
+ * RFC defined MSL: 255s ( 2s rounding slop)
+ */
+#define TCP_PAWS_IDLE_SBT (SBT_MINTS*SBT_1S/2)
+
+#include <sys/clock.h>
+
+
+#define tcp_ts_getsbintime() (cpu_ts_getsbintime)()
+
+#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000)
+
+/* Timestamp wrap-around time, 24 days. */
+#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000)
static __inline uint32_t
tcp_ts_getticks(void)
{
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -751,6 +751,18 @@
static volatile int next_tcp_stack_id = 1;
+#if !defined(__amd64__) && !defined(__i386__)
+/*
+ * Fallback time source for the TCP timestamp code, compiled only when the
+ * target is neither amd64 nor i386.
+ *
+ * Returns the uptime reported by getbinuptime() converted to sbintime_t.
+ */
+static sbintime_t
+cpu_ts_getsbintime_(void)
+{
+ struct bintime bt;
+
+ getbinuptime(&bt);
+ /*
+  * Convert the whole bintime.  The earlier version stored into a variable
+  * that was never declared and used only bt.frac, which dropped the
+  * seconds and did not match the sbintime_t scale that callers compare
+  * against tick_sbt-based intervals.
+  */
+ return (bttosbt(bt));
+}
+#endif
+
/*
* Register a TCP function block with the name provided in the names
* array. (Note that this function does NOT automatically register
@@ -1121,6 +1133,9 @@
#ifdef TCPPCAP
tcp_pcap_init();
#endif
+#if !defined(__amd64__) && !defined(__i386__)
+ cpu_tcp_ts_getsbintime = cpu_tcp_ts_getsbintime_;
+#endif
}
#ifdef VIMAGE
@@ -1443,7 +1458,7 @@
if (incl_opts) {
/* Timestamps. */
if (tp->t_flags & TF_RCVD_TSTMP) {
- to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
+ to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime());
to.to_tsecr = tp->ts_recent;
to.to_flags |= TOF_TS;
}
@@ -1655,11 +1670,12 @@
*/
tp->t_srtt = TCPTV_SRTTBASE;
tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
- tp->t_rttmin = tcp_rexmit_min;
- tp->t_rxtcur = TCPTV_RTOBASE;
+ tp->t_rttmin = tcp_rexmit_min*tick_sbt;
+ tp->t_rxtcur = TCPTV_RTOBASE*tick_sbt;
+ tp->t_delack = tcp_delacktime*tick_sbt;
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
- tp->t_rcvtime = ticks;
+ tp->t_rcvtime = tcp_ts_getsbintime();
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -1856,8 +1872,9 @@
ssthresh = 0;
metrics.rmx_ssthresh = ssthresh;
- metrics.rmx_rtt = tp->t_srtt;
- metrics.rmx_rttvar = tp->t_rttvar;
+
+ metrics.rmx_rtt = tp->t_srtt / SBT_1US;
+ metrics.rmx_rttvar = tp->t_rttvar / SBT_1US;
metrics.rmx_cwnd = tp->snd_cwnd;
metrics.rmx_sendpipe = 0;
metrics.rmx_recvpipe = 0;
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -834,6 +834,7 @@
tcp_state_change(tp, TCPS_SYN_RECEIVED);
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
+ tp->t_lasttsval = sc->sc_ts;
tcp_rcvseqinit(tp);
tcp_sendseqinit(tp);
blk = sototcpcb(lso)->t_fb;
@@ -882,8 +883,7 @@
if (sc->sc_flags & SCF_TIMESTAMP) {
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_recent = sc->sc_tsreflect;
- tp->ts_recent_age = tcp_ts_getticks();
- tp->ts_offset = sc->sc_tsoff;
+ tp->ts_recent_age = tcp_ts_getsbintime();
}
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (sc->sc_flags & SCF_SIGNATURE)
@@ -1488,7 +1488,7 @@
*/
if (to->to_flags & TOF_TS) {
sc->sc_tsreflect = to->to_tsval;
- sc->sc_ts = tcp_ts_getticks();
+ sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime());
sc->sc_flags |= SCF_TIMESTAMP;
}
if (to->to_flags & TOF_SCALE) {
@@ -2025,8 +2025,7 @@
/* Randomize the timestamp. */
if (sc->sc_flags & SCF_TIMESTAMP) {
- sc->sc_ts = arc4random();
- sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks();
+ sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime());
}
TCPSTAT_INC(tcps_sc_sendcookie);
@@ -2116,7 +2115,7 @@
sc->sc_flags |= SCF_TIMESTAMP;
sc->sc_tsreflect = to->to_tsval;
sc->sc_ts = to->to_tsecr;
- sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks();
+ sc->sc_tsoff = to->to_tsecr - TCP_SBT_TO_TS(tcp_ts_getsbintime());
}
if (to->to_flags & TOF_SIGNATURE)
Index: sys/netinet/tcp_timer.c
===================================================================
--- sys/netinet/tcp_timer.c
+++ sys/netinet/tcp_timer.c
@@ -71,6 +71,7 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
+#include <netinet/tcp_seq.h>
#include <netinet/cc/cc.h>
#ifdef INET6
#include <netinet6/tcp6_var.h>
@@ -389,7 +390,7 @@
tcp_inpinfo_lock_del(inp, tp);
goto out;
} else {
- if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
+ if (tcp_ts_getsbintime() - tp->t_rcvtime <= TP_MAXIDLE(tp)) {
callout_reset(&tp->t_timers->tt_2msl,
TP_KEEPINTVL(tp), tcp_timer_2msl, tp);
} else {
@@ -475,7 +476,7 @@
if ((tcp_always_keepalive ||
inp->inp_socket->so_options & SO_KEEPALIVE) &&
tp->t_state <= TCPS_CLOSING) {
- if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
+ if (tcp_ts_getsbintime() - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp))
goto dropit;
/*
* Send a packet designed to force a response
@@ -538,6 +539,7 @@
{
struct tcpcb *tp = xtp;
struct inpcb *inp;
+ sbintime_t dt;
CURVNET_SET(tp->t_vnet);
#ifdef TCPDEBUG
int ostate;
@@ -573,9 +575,10 @@
* (no responses to probes) reaches the maximum
* backoff that we would use if retransmitting.
*/
+ dt = tcp_ts_getsbintime() - tp->t_rcvtime;
if (tp->t_rxtshift == TCP_MAXRXTSHIFT &&
- (ticks - tp->t_rcvtime >= tcp_maxpersistidle ||
- ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
+ (dt >= tcp_maxpersistidle*tick_sbt ||
+ dt >= TCP_REXMTVAL(tp) * tcp_totbackoff)) {
TCPSTAT_INC(tcps_persistdrop);
if (tcp_inpinfo_lock_add(inp)) {
tcp_inpinfo_lock_del(inp, tp);
@@ -693,18 +696,19 @@
tp->t_flags |= TF_WASCRECOVERY;
else
tp->t_flags &= ~TF_WASCRECOVERY;
- tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
+ tp->t_badrxtwin = tcp_ts_getsbintime() + tp->t_rxtcur;
tp->t_flags |= TF_PREVVALID;
} else
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift];
+ rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift] * tick_sbt;
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ /* 1 < delack < tcp_delacktime - and should scale down with RTO/2 */
TCPT_RANGESET(tp->t_rxtcur, rexmt,
- tp->t_rttmin, TCPTV_REXMTMAX);
+ tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt);
/*
* We enter the path for PLMTUD if connection is established or, if
@@ -863,13 +867,13 @@
}
void
-tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, sbintime_t delta)
{
struct callout *t_callout;
timeout_t *f_callout;
struct inpcb *inp = tp->t_inpcb;
int cpu = inp_to_cpuid(inp);
-
+ sbintime_t f_precision;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE)
return;
@@ -882,22 +886,27 @@
case TT_DELACK:
t_callout = &tp->t_timers->tt_delack;
f_callout = tcp_timer_delack;
+ f_precision = SBT_1MS;
break;
case TT_REXMT:
t_callout = &tp->t_timers->tt_rexmt;
f_callout = tcp_timer_rexmt;
+ f_precision = SBT_1US;
break;
case TT_PERSIST:
t_callout = &tp->t_timers->tt_persist;
f_callout = tcp_timer_persist;
+ f_precision = SBT_1S;
break;
case TT_KEEP:
t_callout = &tp->t_timers->tt_keep;
f_callout = tcp_timer_keep;
+ f_precision = SBT_1S;
break;
case TT_2MSL:
t_callout = &tp->t_timers->tt_2msl;
f_callout = tcp_timer_2msl;
+ f_precision = SBT_1S;
break;
default:
if (tp->t_fb->tfb_tcp_timer_activate) {
@@ -909,7 +918,7 @@
if (delta == 0) {
callout_stop(t_callout);
} else {
- callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+ callout_reset_sbt_on(t_callout, delta, f_precision, f_callout, tp, cpu, 0);
}
}
Index: sys/netinet/tcp_timewait.c
===================================================================
--- sys/netinet/tcp_timewait.c
+++ sys/netinet/tcp_timewait.c
@@ -301,10 +301,8 @@
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
tw->t_recent = tp->ts_recent;
- tw->ts_offset = tp->ts_offset;
} else {
tw->t_recent = 0;
- tw->ts_offset = 0;
}
tw->snd_nxt = tp->snd_nxt;
@@ -574,7 +572,7 @@
*/
if (tw->t_recent && flags == TH_ACK) {
to.to_flags |= TOF_TS;
- to.to_tsval = tcp_ts_getticks() + tw->ts_offset;
+ to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime());
to.to_tsecr = tw->t_recent;
}
optlen = tcp_addoptions(&to, (u_char *)(th + 1));
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1380,9 +1380,9 @@
ti->tcpi_options |= TCPI_OPT_ECN;
ti->tcpi_rto = tp->t_rxtcur * tick;
- ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
- ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
- ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
+ ti->tcpi_last_data_recv = (long)((tcp_ts_getsbintime() - tp->t_rcvtime)/tick_sbt) * tick;
+ ti->tcpi_rtt = ((u_int64_t)(tp->t_srtt/tick_sbt) * tick);
+ ti->tcpi_rttvar = ((u_int64_t)(tp->t_rttvar/tick_sbt) * tick);
ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
ti->tcpi_snd_cwnd = tp->snd_cwnd;
@@ -2175,7 +2175,7 @@
int timeout;
timeout = (tcp_fast_finwait2_recycle) ?
- tcp_finwait2_timeout : TP_MAXIDLE(tp);
+ tcp_finwait2_timeout*tick_sbt : TP_MAXIDLE(tp);
tcp_timer_activate(tp, TT_2MSL, timeout);
}
}
@@ -2426,20 +2426,20 @@
"0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
db_print_indent(indent);
- db_printf("t_rcvtime: %u t_startime: %u\n",
+ db_printf("t_rcvtime: %zu t_startime: %zu\n",
tp->t_rcvtime, tp->t_starttime);
db_print_indent(indent);
- db_printf("t_rttime: %u t_rtsq: 0x%08x\n",
+ db_printf("t_rttime: %zu t_rtsq: 0x%08x\n",
tp->t_rtttime, tp->t_rtseq);
db_print_indent(indent);
- db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n",
+ db_printf("t_rxtcur: %zu t_maxseg: %u t_srtt: %zu\n",
tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
db_print_indent(indent);
- db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u "
- "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
+ db_printf("t_rttvar: %zu t_rxtshift: %d t_rttmin: %zu "
+ "t_rttbest: %zu\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
tp->t_rttbest);
db_print_indent(indent);
@@ -2456,16 +2456,16 @@
tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
db_print_indent(indent);
- db_printf("ts_recent: %u ts_recent_age: %u\n",
+ db_printf("ts_recent: %u ts_recent_age: %zu\n",
tp->ts_recent, tp->ts_recent_age);
db_print_indent(indent);
- db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: "
- "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
+ db_printf("last_ack_sent: 0x%08x snd_cwnd_prev: "
+ "%u\n", tp->last_ack_sent, tp->snd_cwnd_prev);
db_print_indent(indent);
db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x "
- "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
+ "t_badrxtwin: %zu\n", tp->snd_ssthresh_prev,
tp->snd_recover_prev, tp->t_badrxtwin);
db_print_indent(indent);
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -116,62 +116,65 @@
tcp_seq rcv_adv; /* advertised window */
uint32_t rcv_wnd; /* receive window */
u_int t_flags2; /* More tcpcb flags storage */
- int t_srtt; /* smoothed round-trip time */
- int t_rttvar; /* variance in round-trip time */
+ uint64_t t_srtt; /* smoothed round-trip time */
+ uint64_t t_rttvar; /* variance in round-trip time */
u_int32_t ts_recent; /* timestamp echo data */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char snd_limited; /* segments limited transmitted */
u_char request_r_scale; /* pending window scaling */
- tcp_seq last_ack_sent;
- u_int t_rcvtime; /* inactivity time */
/* Cache line 3 */
+ sbintime_t t_rcvtime; /* inactivity time */
+ tcp_seq last_ack_sent;
tcp_seq rcv_up; /* receive urgent pointer */
- int t_segqlen; /* segment reassembly queue length */
+
struct tsegqe_head t_segq; /* segment reassembly queue */
+
struct mbuf *t_in_pkt;
struct mbuf *t_tail_pkt;
+
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
- struct vnet *t_vnet; /* back pointer to parent vnet */
uint32_t snd_ssthresh; /* snd_cwnd size threshold for
* for slow start exponential to
* linear switch
*/
- tcp_seq snd_wl1; /* window update seg seq number */
+ int t_segqlen; /* segment reassembly queue length */
/* Cache line 4 */
+ struct vnet *t_vnet; /* back pointer to parent vnet */
+ tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq irs; /* initial receive sequence number */
tcp_seq iss; /* initial send sequence number */
u_int t_acktime;
- u_int ts_recent_age; /* when last updated */
tcp_seq snd_recover; /* for use in NewReno Fast Recovery */
+
+ sbintime_t ts_recent_age; /* when last updated */
+ sbintime_t t_rxtcur; /* current retransmit value (ticks) */
+ sbintime_t t_rtttime; /* RTT measurement start time */
+ uint32_t t_lasttsecr;
+ uint32_t t_lasttsval;
+ /* Cache line 5 */
+ tcp_seq t_rtseq; /* sequence number being timed */
uint16_t cl4_spare; /* Spare to adjust CL 4 */
char t_oobflags; /* have some */
char t_iobc; /* input character */
- int t_rxtcur; /* current retransmit value (ticks) */
-
+ sbintime_t t_starttime; /* time connection was established */
int t_rxtshift; /* log(2) of rexmt exp. backoff */
- u_int t_rtttime; /* RTT measurement start time */
-
- tcp_seq t_rtseq; /* sequence number being timed */
- u_int t_starttime; /* time connection was established */
-
u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */
- u_int t_rttmin; /* minimum rtt allowed */
-
- u_int t_rttbest; /* best rtt we've seen */
-
+ sbintime_t t_rttmin; /* minimum rtt allowed */
+ sbintime_t t_rttbest; /* best rtt we've seen */
+ sbintime_t t_delack; /* delayed ack timer */
int t_softerror; /* possible error not yet reported */
uint32_t max_sndwnd; /* largest window peer has offered */
- /* Cache line 5 */
+
uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */
uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */
tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */
- int t_sndzerowin; /* zero-window updates sent */
u_long t_rttupdated; /* number of times rtt sampled */
+ sbintime_t t_badrxtwin; /* window for retransmit recovery */
+ int t_sndzerowin; /* zero-window updates sent */
int snd_numholes; /* number of holes seen by sender */
- u_int t_badrxtwin; /* window for retransmit recovery */
TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
/* SACK scoreboard (sorted) */
tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/
@@ -217,6 +220,7 @@
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
+ u_char opt[TCP_MAXOLEN];
};
/*
@@ -432,8 +436,7 @@
short tw_so_options; /* copy of so_options */
struct ucred *tw_cred; /* user credentials */
u_int32_t t_recent;
- u_int32_t ts_offset; /* our timestamp offset */
- u_int t_starttime;
+ sbintime_t t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
void *tw_pspare; /* TCP_SIGNATURE */
@@ -475,9 +478,7 @@
* which results in inappropriately large RTO values for very
* fast networks.
*/
-#define TCP_REXMTVAL(tp) \
- max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \
- + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
+/*
+ * Retransmit value: the larger of t_rttmin and t_srtt + 4 * t_rttvar.
+ * The operands are 64-bit, so use the type-generic MAX() rather than the
+ * kernel's max(), whose u_int parameters would truncate them to 32 bits.
+ * (tp) is evaluated more than once; pass an expression without side effects.
+ */
+#define TCP_REXMTVAL(tp) \
+ MAX((uint64_t)(tp)->t_rttmin, (tp)->t_srtt + ((tp)->t_rttvar << 2))
/*
* TCP statistics.
@@ -833,7 +834,7 @@
struct tcpcb *, int, int);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
-void tcp_xmit_timer(struct tcpcb *, int);
+void tcp_xmit_timer(struct tcpcb *, sbintime_t);
void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
void cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
uint16_t nsegs, uint16_t type);
@@ -892,7 +893,7 @@
struct tcptemp *
tcpip_maketemplate(struct inpcb *);
void tcpip_fillheaders(struct inpcb *, void *, void *);
-void tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+void tcp_timer_activate(struct tcpcb *, uint32_t, sbintime_t);
int tcp_timer_active(struct tcpcb *, uint32_t);
void tcp_timer_stop(struct tcpcb *, uint32_t);
void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h
+++ sys/sys/callout.h
@@ -53,10 +53,13 @@
#define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */
#define C_DIRECT_EXEC 0x0001 /* direct execution of callout */
-#define C_PRELBITS 7
+
+#define C_PMS 7
+/*
+ * NOTE(review): C_PREL()/C_PRELGET() place the precision field at bit 1, so
+ * an 11-bit field spans bits 1-11 and appears to overlap the C_HARDCLOCK
+ * (0x0100), C_ABSOLUTE (0x0200) and C_PRECALC (0x0400) flag values defined
+ * below -- confirm that the flag layout still works with this width.
+ */
+#define C_PRELBITS 11
#define C_PRELRANGE ((1 << C_PRELBITS) - 1)
#define C_PREL(x) (((x) + 1) << 1)
#define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1)
+#define C_DEFAULT C_PREL(C_PMS) /* C_PMS encoded in the precision field */
#define C_HARDCLOCK 0x0100 /* align to hardclock() calls */
#define C_ABSOLUTE 0x0200 /* event time is absolute. */
#define C_PRECALC 0x0400 /* event time is pre-calculated. */
Index: sys/sys/clock.h
===================================================================
--- sys/sys/clock.h
+++ sys/sys/clock.h
@@ -203,6 +203,8 @@
void clock_dbgprint_err(device_t dev, int rw, int err);
void clock_dbgprint_ts(device_t dev, int rw, const struct timespec *ts);
+extern sbintime_t (*cpu_ts_getsbintime)(void);
+
#endif /* _KERNEL */
#endif /* !_SYS_CLOCK_H_ */
Index: sys/x86/x86/tsc.c
===================================================================
--- sys/x86/x86/tsc.c
+++ sys/x86/x86/tsc.c
@@ -54,11 +54,24 @@
#include "cpufreq_if.h"
uint64_t tsc_freq;
+uint64_t tsc_sbt;
+int64_t max_tsc_jitter; /* largest per-CPU TSC delta tolerated before recalibrating */
int tsc_is_invariant;
int tsc_perf_stat;
+static int tsc_ts_recalibrate; /* nonzero: recalibrate on every cpu_ts_getsbintime_*() call */
+
static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag;
+sbintime_t cpu_ts_getsbintime_rdtsc(void);
+sbintime_t cpu_ts_getsbintime_rdtscp(void);
+sbintime_t (*cpu_ts_getsbintime)(void);
+static void cpu_ts_calibrate_all(void);
+
+
+SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_ts_always_calibrate, CTLFLAG_RW,
+ &tsc_ts_recalibrate, 0, "always use sbintime for timestamp clock");
+
SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN,
&tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant");
@@ -612,6 +625,19 @@
tsc_timecounter.tc_priv = (void *)(intptr_t)shift;
tc_init(&tsc_timecounter);
}
+
+ /* XXX yes this needs to be revisited */
+#if defined(__amd64__)
+ cpu_ts_getsbintime = cpu_ts_getsbintime_rdtscp;
+#elif defined(__i386__)
+ cpu_ts_getsbintime = cpu_ts_getsbintime_rdtsc;
+#endif
+ cpu_ts_calibrate_all();
+
+ /* tsc ticks per 10us */
+ max_tsc_jitter = tsc_freq/(1000000/10);
+
+ /*
+  * uint64_t/int64_t are not "long" on every platform (e.g. i386), so
+  * print them through the widest integer types with %ju/%jd.
+  */
+ printf("tsc_freq: %ju max_tsc_jitter: %jd\n", (uintmax_t)tsc_freq,
+     (intmax_t)max_tsc_jitter);
}
SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL);
@@ -779,3 +805,64 @@
return (1);
}
#endif
+
+static DPCPU_DEFINE(int64_t, pcputsc); /* Per-CPU version of tsc at time of last calibration */
+static volatile sbintime_t sbt0; /* sbinuptime() value published by the calibration routines; returned by cpu_ts_getsbintime_*() */
+
+
+/*
+ * Seed the timestamp state: store a fresh rdtsc() reading, taken by the
+ * calling thread, into every CPU's pcputsc slot, then set sbt0 to the
+ * current sbinuptime().
+ */
+static void
+cpu_ts_calibrate_all(void)
+{
+ u_int cpu;
+
+ CPU_FOREACH(cpu)
+     *DPCPU_ID_PTR(cpu, pcputsc) = rdtsc();
+
+ sbt0 = sbinuptime();
+}
+
+
+/*
+ * Generate cpu_ts_calibrate_<op>(): record the current CPU's TSC (read via
+ * op()) in its pcputsc slot and advance the shared sbt0 to sbinuptime(),
+ * never moving sbt0 backwards.  The generated getsbintime wrapper calls this
+ * inside a critical section, which keeps the DPCPU_PTR() slot stable.
+ *
+ * sbt0 is snapshotted once per attempt and that snapshot is used as the
+ * compare-and-set expectation, so a larger value published concurrently by
+ * another CPU cannot be overwritten with an older one.  The 64-bit atomic
+ * matches the width of sbintime_t on both amd64 and i386.
+ */
+#define CPU_TS_CALIBRATE(op) \
+static void \
+cpu_ts_calibrate_ ## op(void) \
+{\
+ sbintime_t now, prev; \
+\
+ *DPCPU_PTR(pcputsc) = op(); \
+\
+ now = sbinuptime(); \
+ do { \
+     prev = sbt0; \
+ } while (now > prev && \
+     atomic_cmpset_64((volatile uint64_t *)&sbt0, (uint64_t)prev, \
+     (uint64_t)now) == 0); \
+}
+
+/*
+ * Generate cpu_ts_getsbintime_<op>(): return the shared timestamp sbt0.
+ * Before returning, recalibrate this CPU when tsc_ts_recalibrate is set or
+ * when the TSC (read via op()) has gone backwards or advanced by more than
+ * max_tsc_jitter since this CPU's last calibration.
+ */
+#define CPU_TS_GETSBINTIME(op) \
+sbintime_t \
+cpu_ts_getsbintime_ ## op(void) \
+{\
+ int64_t last_tsc, elapsed; \
+\
+ critical_enter(); \
+ last_tsc = DPCPU_GET(pcputsc); \
+ elapsed = (int64_t)op() - last_tsc; \
+ if (tsc_ts_recalibrate || \
+     __predict_false(elapsed < 0 || elapsed > max_tsc_jitter)) \
+     cpu_ts_calibrate_ ## op(); \
+ critical_exit(); \
+\
+ return (sbt0); \
+}
+
+
+/*
+ * Instantiate the helpers for the rdtsc() and rdtscp() readers.  The
+ * calibrate functions come first because each getsbintime function calls
+ * its matching cpu_ts_calibrate_<op>().
+ */
+CPU_TS_CALIBRATE(rdtsc)
+CPU_TS_CALIBRATE(rdtscp)
+
+CPU_TS_GETSBINTIME(rdtsc)
+CPU_TS_GETSBINTIME(rdtscp)
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Mon, Jan 19, 10:14 PM (12 h, 43 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27755433
Default Alt Text
D15337.id.diff (40 KB)
Attached To
Mode
D15337: Add support for higher resolution timestamps
Attached
Detach File
Event Timeline
Log In to Comment