Index: sys/amd64/include/cpufunc.h =================================================================== --- sys/amd64/include/cpufunc.h +++ sys/amd64/include/cpufunc.h @@ -386,6 +386,16 @@ return (low | ((uint64_t)high << 32)); } +static __inline uint64_t +rdtscp(void) +{ + uint64_t low, high; + uint32_t aux; + + __asm __volatile("rdtscp" : "=a" (low), "=d" (high), "=c" (aux) : : ); + return (low | (high << 32)); +} + static __inline uint32_t rdtsc32(void) { Index: sys/dev/cxgbe/tom/t4_cpl_io.c =================================================================== --- sys/dev/cxgbe/tom/t4_cpl_io.c +++ sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1808,7 +1808,7 @@ if (tp->snd_una != snd_una) { tp->snd_una = snd_una; - tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent_age = tcp_ts_getsbintime(); } } Index: sys/netinet/khelp/h_ertt.c =================================================================== --- sys/netinet/khelp/h_ertt.c +++ sys/netinet/khelp/h_ertt.c @@ -153,12 +153,12 @@ *prtt_bytes_adjust += *pmeasurenext_len; } else { if (mflag & FORCED_MEASUREMENT) { - e_t->markedpkt_rtt = tcp_ts_getticks() - + e_t->markedpkt_rtt = tcp_ts_getsbintime() - *pmeasurenext + 1; e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt + *pmeasurenext_len - *prtt_bytes_adjust; } else { - e_t->markedpkt_rtt = tcp_ts_getticks() - + e_t->markedpkt_rtt = tcp_ts_getsbintime() - txsi->tx_ts + 1; e_t->bytes_tx_in_marked_rtt = e_t->bytes_tx_in_rtt - *prtt_bytes_adjust; @@ -353,7 +353,7 @@ */ if (!e_t->dlyack_rx || multiack || new_sacked_bytes) { /* Make an accurate new measurement. 
*/ - e_t->rtt = tcp_ts_getticks() - txsi->tx_ts + 1; + e_t->rtt = tcp_ts_getsbintime() - txsi->tx_ts + 1; if (e_t->rtt < e_t->minrtt || e_t->minrtt == 0) e_t->minrtt = e_t->rtt; @@ -478,11 +478,10 @@ if (((tp->t_flags & TF_NOOPT) == 0) && (to->to_flags & TOF_TS)) { - txsi->tx_ts = ntohl(to->to_tsval) - - tp->ts_offset; + txsi->tx_ts = ntohl(to->to_tsval); txsi->rx_ts = ntohl(to->to_tsecr); } else { - txsi->tx_ts = tcp_ts_getticks(); + txsi->tx_ts = tcp_ts_getsbintime(); txsi->rx_ts = 0; /* No received time stamp. */ } TAILQ_INSERT_TAIL(&e_t->txsegi_q, txsi, txsegi_lnk); Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -62,6 +62,7 @@ #ifdef TCP_HHOOK #include #endif +#include #include #include #include /* for proc0 declaration */ @@ -341,21 +342,20 @@ tcp_hc_get(&inp->inp_inc, &metrics); maxseg = tcp_maxseg(tp); - if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt * SBT_1US)) { tp->t_srtt = rtt; - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + tp->t_rttbest = tp->t_srtt; TCPSTAT_INC(tcps_usedrtt); if (metrics.rmx_rttvar) { - tp->t_rttvar = metrics.rmx_rttvar; + tp->t_rttvar = metrics.rmx_rttvar * SBT_1US; TCPSTAT_INC(tcps_usedrttvar); } else { /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + tp->t_rttvar = (tp->t_srtt >> 1); } TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_srtt + 4*tp->t_rttvar, + tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt); } if (metrics.rmx_ssthresh) { /* @@ -479,12 +479,14 @@ * the ack that opens up a 0-sized window. * - LRO wasn't used for this segment. We make sure by checking that the * segment size is not larger than the MSS. 
+ * - the calculated delay is greater than 2ms */ #define DELAY_ACK(tp, tlen) \ - ((!tcp_timer_active(tp, TT_DELACK) && \ + (((!tcp_timer_active(tp, TT_DELACK) && \ (tp->t_flags & TF_RXWIN0SENT) == 0) && \ (tlen <= tp->t_maxseg) && \ - (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) + (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) && \ + tp->t_delack > 2*SBT_1MS) static void inline cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) @@ -517,7 +519,7 @@ CC_ALGO(tp)->ecnpkt_handler(tp->ccv); if (tp->ccv->flags & CCF_ACKNOW) - tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + tcp_timer_activate(tp, TT_DELACK, tp->t_delack); } } @@ -581,6 +583,7 @@ int drop_hdrlen; int thflags; int rstreason = 0; /* For badport_bandlim accounting purposes */ + sbintime_t t; uint8_t iptos; struct m_tag *fwd_tag = NULL; #ifdef INET6 @@ -606,6 +609,7 @@ isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif + t = tcp_ts_getsbintime(); off0 = *offp; m = *mp; *mp = NULL; @@ -1510,11 +1514,11 @@ { int newsize = 0; - if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) && - tp->t_srtt != 0 && tp->rfbuf_ts != 0 && - TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) > - (tp->t_srtt >> TCP_RTT_SHIFT)) { - if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) && + if ((V_tcp_do_autorcvbuf & !!(so->so_rcv.sb_flags & SB_AUTOSIZE) & + !!tp->t_srtt & !!tp->rfbuf_ts) && + tcp_ts_getsbintime() - TCP_TS_TO_SBT(tp->rfbuf_ts) > + tp->t_srtt) { + if (tp->rfbuf_cnt > ((so->so_rcv.sb_hiwat / 8) * 7) && so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) { newsize = min(so->so_rcv.sb_hiwat + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max); @@ -1545,7 +1549,8 @@ struct mbuf *mfree; struct tcpopt to; int tfo_syn; - + sbintime_t t; + #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1555,6 +1560,7 @@ struct tcphdr tcp_savetcp; short ostate = 0; #endif + t = tcp_ts_getsbintime(); thflags = th->th_flags; inc = &tp->t_inpcb->inp_inc; 
tp->sackhint.last_sack_ack = 0; @@ -1622,7 +1628,7 @@ * XXX: This should be done after segment * validation to ignore broken/spoofed segs. */ - tp->t_rcvtime = ticks; + tp->t_rcvtime = t; /* * Scale up the window into a 32-bit value. @@ -1679,9 +1685,13 @@ * was established. */ if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { - to.to_tsecr -= tp->ts_offset; - if (TSTMP_GT(to.to_tsecr, tcp_ts_getticks())) + if (to.to_tsecr == tp->t_lasttsecr + MAX_TS_STEP) { + tp->t_lasttsecr = to.to_tsecr; + to.to_tsecr = tp->t_lasttsval; + } else if (TSTMP_GT(to.to_tsecr, TCP_SBT_TO_TS(t))) to.to_tsecr = 0; + else + tp->t_lasttsecr = to.to_tsecr; } /* * Process options only when we get SYN/ACK back. The SYN case @@ -1704,7 +1714,7 @@ if (to.to_flags & TOF_TS) { tp->t_flags |= TF_RCVD_TSTMP; tp->ts_recent = to.to_tsval; - tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent_age = t; } if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); @@ -1774,7 +1784,7 @@ */ if ((to.to_flags & TOF_TS) != 0 && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { - tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent_age = t; tp->ts_recent = to.to_tsval; } @@ -1798,7 +1808,7 @@ */ if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && - (int)(ticks - tp->t_badrxtwin) < 0) { + (t - tp->t_badrxtwin) < 0) { cc_cong_signal(tp, th, CC_RTO_ERR); } @@ -1812,20 +1822,27 @@ */ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { - uint32_t t; + u_int curts; + sbintime_t rtt; - t = tcp_ts_getticks() - to.to_tsecr; - if (!tp->t_rttlow || tp->t_rttlow > t) - tp->t_rttlow = t; - tcp_xmit_timer(tp, - TCP_TS_TO_TICKS(t) + 1); + curts = (uint32_t)TCP_SBT_TO_TS(t); + /* + * cope with frequent wrap + */ + if (__predict_true(curts > to.to_tsecr)) + rtt = curts - to.to_tsecr; + else + rtt = UINT_MAX - to.to_tsecr + curts; + rtt = TCP_TS_TO_SBT(rtt); + if (!tp->t_rttlow || tp->t_rttlow > rtt) + tp->t_rttlow = rtt; + tcp_xmit_timer(tp, rtt + SBT_MINTS); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { if 
(!tp->t_rttlow || - tp->t_rttlow > ticks - tp->t_rtttime) - tp->t_rttlow = ticks - tp->t_rtttime; - tcp_xmit_timer(tp, - ticks - tp->t_rtttime); + tp->t_rttlow > t - tp->t_rtttime) + tp->t_rttlow = t - tp->t_rtttime; + tcp_xmit_timer(tp, t - tp->t_rtttime); } acked = BYTES_THIS_ACK(tp, th); @@ -2056,7 +2073,7 @@ */ if (DELAY_ACK(tp, tlen) && tlen != 0 && !tfo_partial_ack) tcp_timer_activate(tp, TT_DELACK, - tcp_delacktime); + tp->t_delack); else tp->t_flags |= TF_ACKNOW; @@ -2247,7 +2264,8 @@ TSTMP_LT(to.to_tsval, tp->ts_recent)) { /* Check to see if ts_recent is over 24 days old. */ - if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* Check to see if ts_recent is over MSL OLD */ + if (t - tp->ts_recent_age > TCP_PAWS_IDLE_SBT) { /* * Invalidate ts_recent. If this segment updates * ts_recent, the age will be reset later and ts_recent @@ -2401,7 +2419,7 @@ SEQ_LEQ(th->th_seq, tp->last_ack_sent) && SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + ((thflags & (TH_SYN|TH_FIN)) != 0))) { - tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent_age = t; tp->ts_recent = to.to_tsval; } @@ -2450,7 +2468,7 @@ * SYN-RECEIVED -> ESTABLISHED * SYN-RECEIVED* -> FIN-WAIT-1 */ - tp->t_starttime = ticks; + tp->t_starttime = t; if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; @@ -2787,8 +2805,8 @@ * original cwnd and ssthresh, and proceed to transmit where * we left off. */ - if (tp->t_rxtshift == 1 && tp->t_flags & TF_PREVVALID && - (int)(ticks - tp->t_badrxtwin) < 0) + if (tp->t_rxtshift > 0 && tp->t_flags & TF_PREVVALID && + (t - tp->t_badrxtwin) < 0) cc_cong_signal(tp, th, CC_RTO_ERR); /* @@ -2806,16 +2824,16 @@ * huge RTT and blow up the retransmit timer. 
*/ if ((to.to_flags & TOF_TS) != 0 && to.to_tsecr) { - uint32_t t; + sbintime_t rtt; - t = tcp_ts_getticks() - to.to_tsecr; - if (!tp->t_rttlow || tp->t_rttlow > t) - tp->t_rttlow = t; - tcp_xmit_timer(tp, TCP_TS_TO_TICKS(t) + 1); + rtt = TCP_TS_TO_SBT(((uint32_t)TCP_SBT_TO_TS(t)) - to.to_tsecr); + if (!tp->t_rttlow || tp->t_rttlow > rtt) + tp->t_rttlow = rtt; + tcp_xmit_timer(tp, rtt + SBT_MINTS); } else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq)) { - if (!tp->t_rttlow || tp->t_rttlow > ticks - tp->t_rtttime) - tp->t_rttlow = ticks - tp->t_rtttime; - tcp_xmit_timer(tp, ticks - tp->t_rtttime); + if (!tp->t_rttlow || tp->t_rttlow > t - tp->t_rtttime) + tp->t_rttlow = t - tp->t_rtttime; + tcp_xmit_timer(tp, t - tp->t_rtttime); } /* @@ -3134,7 +3152,7 @@ * enter the CLOSE_WAIT state. */ case TCPS_SYN_RECEIVED: - tp->t_starttime = ticks; + tp->t_starttime = t; /* FALLTHROUGH */ case TCPS_ESTABLISHED: tcp_state_change(tp, TCPS_CLOSE_WAIT); @@ -3189,7 +3207,7 @@ if (tp->t_flags & TF_DELACK) { tp->t_flags &= ~TF_DELACK; - tcp_timer_activate(tp, TT_DELACK, tcp_delacktime); + tcp_timer_activate(tp, TT_DELACK, tp->t_delack); } INP_WUNLOCK(tp->t_inpcb); return; @@ -3480,27 +3498,47 @@ * and update averages and current timeout. 
*/ void -tcp_xmit_timer(struct tcpcb *tp, int rtt) +tcp_xmit_timer(struct tcpcb *tp, sbintime_t rtt) { - int delta; + int64_t delta; + uint64_t expected_samples, shift, var_shift; INP_WLOCK_ASSERT(tp->t_inpcb); + /* + * track this + */ + if (rtt < SBT_1NS*100) + return; + + /* RFC 7323 Appendix G RTO Calculation Modification */ + /* ExpectedSamples = ceiling(FlightSize / (SMSS * 2)) */ + /* roundup(x, y) == ceiling(x / y) * y */ + expected_samples = ((tcp_compute_pipe(tp) + ((tp->t_maxseg*2)-1)) / (tp->t_maxseg*2)); + /* + * alpha' = alpha / ExpectedSamples => + * alpha = 1 / 1 >> TCP_RTT_SHIFT + * alpha' = 1 / 1 >> (TCP_RTT_SHIFT + shift) + **/ + shift = max(fls(expected_samples + 1), 0) + TCP_RTT_SHIFT; TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* - * srtt is stored as fixed point with 5 bits after the - * binary point (i.e., scaled by 8). The following magic + * The following magic * is equivalent to the smoothing algorithm in rfc793 with * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed - * point). Adjust rtt to origin 0. + * point) when FlightSize is 1. Adjust rtt to origin 0. */ - delta = ((rtt - 1) << TCP_DELTA_SHIFT) - - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); - if ((tp->t_srtt += delta) <= 0) - tp->t_srtt = 1; + /* + * original calculation: + * delta = ((rtt - 1) << TCP_DELTA_SHIFT) + * - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); + */ + delta = ((rtt - 1) >> shift) - (tp->t_srtt >> shift); + tp->t_srtt = max(tp->t_srtt + delta, SBT_1US); + /* * We accumulate a smoothed rtt variance (actually, a @@ -3512,11 +3550,14 @@ * (rttvar = rttvar*3/4 + |delta| / 4). This replaces * rfc793's wired-in beta. 
*/ - if (delta < 0) - delta = -delta; - delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); - if ((tp->t_rttvar += delta) <= 0) - tp->t_rttvar = 1; + /* + * delta has already implicitly been divided by 8 + * so we need to multiply by 2 - similarly shift + * needs to be adjusted down by one + */ + var_shift = TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT; + delta = (abs(delta) << var_shift) - (tp->t_rttvar >> (shift-var_shift)); + tp->t_rttvar = max(tp->t_rttvar + delta, SBT_1US); if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } else { @@ -3525,8 +3566,8 @@ * Set the variance to half the rtt (so our first * retransmit happens at 3*rtt). */ - tp->t_srtt = rtt << TCP_RTT_SHIFT; - tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + tp->t_srtt = rtt; + tp->t_rttvar = rtt >> 1; tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } tp->t_rtttime = 0; @@ -3543,8 +3584,7 @@ * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). */ - TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX); + TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), max(tp->t_rttmin, rtt+2), TCPTV_REXMTMAX*tick_sbt); /* * We received an ack for a packet that wasn't retransmitted; @@ -3569,7 +3609,7 @@ * While looking at the routing entry, we also initialize other path-dependent * parameters from pre-set or cached values in the routing entry. * - * NOTE that resulting t_maxseg doesn't include space for TCP options or + * NOTE that resulting t_maxseg doesn't include space for TCP options or * IP options, e.g. IPSEC data, since length of this data may vary, and * thus it is calculated for every segment separately in tcp_output(). 
* Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -191,7 +191,8 @@ int tcp_output(struct tcpcb *tp) { - struct socket *so = tp->t_inpcb->inp_socket; + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; int32_t len; uint32_t recwin, sendwin; int off, flags, error = 0; /* Keep compiler happy */ @@ -213,6 +214,7 @@ struct tcpopt to; unsigned int wanted_cookie = 0; unsigned int dont_sendalot = 0; + sbintime_t t; #if 0 int maxburst = TCP_MAXBURST; #endif @@ -222,9 +224,9 @@ isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif + t = tcp_ts_getsbintime(); - INP_WLOCK_ASSERT(tp->t_inpcb); - + INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); @@ -247,7 +249,7 @@ * to send, then transmit; otherwise, investigate further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); - if (idle && ticks - tp->t_rcvtime >= tp->t_rxtcur) + if (idle && (t - tp->t_rcvtime) >= tp->t_rxtcur) cc_after_idle(tp); tp->t_flags &= ~TF_LASTIDLE; if (idle) { @@ -257,6 +259,7 @@ } } again: + t = tcp_ts_getsbintime(); /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid @@ -808,7 +811,21 @@ /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = tcp_ts_getticks() + tp->ts_offset; + + /* + * This next part is subtle and extremely critical. + * If we've been idle long enough with respect to + * the peer we have to lie about our timestamp so + * that the peer doesn't see our timestamp as being + * "before" the last one that we sent out. The TCP + * standard gives no mention to high resolution + * timestamp interoperability. 
+ */ + if (SEQ_GT(tp->t_lasttsecr, TCP_SBT_TO_TS(t))) + to.to_tsval = (uint32_t)(tp->t_lasttsecr + MAX_TS_STEP); + else + to.to_tsval = TCP_SBT_TO_TS(t); + tp->t_lasttsval = to.to_tsval; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } @@ -816,7 +833,7 @@ /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE)) - tp->rfbuf_ts = tcp_ts_getticks(); + tp->rfbuf_ts = TCP_SBT_TO_TS(t); /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { @@ -1489,7 +1506,7 @@ * not currently timing anything. */ if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; + tp->t_rtttime = t; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); } @@ -1655,8 +1672,7 @@ void tcp_setpersist(struct tcpcb *tp) { - int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; - int tt; + uint64_t tt, t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) @@ -1665,7 +1681,7 @@ * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], - tcp_persmin, tcp_persmax); + tcp_persmin*tick_sbt, tcp_persmax*tick_sbt); tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; Index: sys/netinet/tcp_seq.h =================================================================== --- sys/netinet/tcp_seq.h +++ sys/netinet/tcp_seq.h @@ -73,18 +73,62 @@ (tp)->snd_recover = (tp)->iss #ifdef _KERNEL + /* - * Clock macros for RFC 1323 timestamps. + * RFC 7323 + * Section 5.4. Timestamp Clock + * + * (b) The timestamp clock must not be "too fast". + * + * The recycling time of the timestamp clock MUST be greater than + * MSL seconds. Since the clock (timestamp) is 32 bits and the + * worst-case MSL is 255 seconds, the maximum acceptable clock + * frequency is one tick every 59 ns. */ -#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000) -/* Timestamp wrap-around time, 24 days. 
*/ -#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000) +/* + * The minimum permissible timestamp is 59ns. However, to reduce calculation + * overhead we use 256 - (8 bit shift). + * - (1<<32)/(1000000000/59) == 253 + * - (1<<32)/(1000000000/60) == 257 + * + * + * Note that MSL should be a function of RTT. Although 60ns is more than sufficient resolution for + * the time being, a 255s MSL on a data center network with a sub-millisecond RTT doesn't make a whole + * lot of sense. In the future the MSL should be determined dynamically or at the very least con- + * figurable per subnet. Nonetheless, fixing the timestamp clock at a rate corresponding to a 256s + * MSL gives us what we need for now while otherwise remaining as RFC compliant as possible. + * + */ + +#define SBT_MINTS_SHIFT 8 +#define MIN_TS_STEP 2 +#define TS_1S (SBT_1S >> SBT_MINTS_SHIFT) +#define SBT_MINTS (1 << SBT_MINTS_SHIFT) +/* minimum rtt is ~1us (60ns * 16) */ +#define SBT_MINRTT (SBT_MINTS << 4) /* - * tcp_ts_getticks() in ms, should be 1ms < x < 1000ms according to RFC 1323. - * We always use 1ms granularity independent of hz. + * Clock macros for RFC 1323 timestamps. */ +#define TCP_TS_TO_SBT(_t) ((_t) << SBT_MINTS_SHIFT) +#define TCP_SBT_TO_TS(_t) ((_t) >> SBT_MINTS_SHIFT) +#define MAX_TS_STEP ((1<<30)) + +/* + * RFC defined MSL: 255s ( 2s rounding slop) + */ +#define TCP_PAWS_IDLE_SBT (SBT_MINTS*SBT_1S/2) + +#include + + +#define tcp_ts_getsbintime() (cpu_ts_getsbintime)() + +#define TCP_TS_TO_TICKS(_t) ((_t) * hz / 1000) + +/* Timestamp wrap-around time, 24 days. 
*/ +#define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * 1000) static __inline uint32_t tcp_ts_getticks(void) { Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -751,6 +751,18 @@ static volatile int next_tcp_stack_id = 1; +#if !defined(__amd64__) && !defined(__i386__) +static sbintime_t +cpu_ts_getsbintime_(void) +{ + struct bintime bt; + + getbinuptime(&bt); + sbt = bt.frac >> SBT_MINTS_SHIFT; + return (sbt); +} +#endif + /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register @@ -1121,6 +1133,9 @@ #ifdef TCPPCAP tcp_pcap_init(); #endif +#if !defined(__amd64__) && !defined(__i386__) + cpu_tcp_ts_getsbintime = cpu_tcp_ts_getsbintime_; +#endif } #ifdef VIMAGE @@ -1443,7 +1458,7 @@ if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { - to.to_tsval = tcp_ts_getticks() + tp->ts_offset; + to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime()); to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } @@ -1655,11 +1670,12 @@ */ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; - tp->t_rttmin = tcp_rexmit_min; - tp->t_rxtcur = TCPTV_RTOBASE; + tp->t_rttmin = tcp_rexmit_min*tick_sbt; + tp->t_rxtcur = TCPTV_RTOBASE*tick_sbt; + tp->t_delack = tcp_delacktime*tick_sbt; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; - tp->t_rcvtime = ticks; + tp->t_rcvtime = tcp_ts_getsbintime(); /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -1856,8 +1872,9 @@ ssthresh = 0; metrics.rmx_ssthresh = ssthresh; - metrics.rmx_rtt = tp->t_srtt; - metrics.rmx_rttvar = tp->t_rttvar; + + metrics.rmx_rtt = tp->t_srtt / SBT_1US; + metrics.rmx_rttvar = tp->t_rttvar / SBT_1US; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; 
metrics.rmx_recvpipe = 0; Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -834,6 +834,7 @@ tcp_state_change(tp, TCPS_SYN_RECEIVED); tp->iss = sc->sc_iss; tp->irs = sc->sc_irs; + tp->t_lasttsval = sc->sc_ts; tcp_rcvseqinit(tp); tcp_sendseqinit(tp); blk = sototcpcb(lso)->t_fb; @@ -882,8 +883,7 @@ if (sc->sc_flags & SCF_TIMESTAMP) { tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP; tp->ts_recent = sc->sc_tsreflect; - tp->ts_recent_age = tcp_ts_getticks(); - tp->ts_offset = sc->sc_tsoff; + tp->ts_recent_age = tcp_ts_getsbintime(); } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (sc->sc_flags & SCF_SIGNATURE) @@ -1488,7 +1488,7 @@ */ if (to->to_flags & TOF_TS) { sc->sc_tsreflect = to->to_tsval; - sc->sc_ts = tcp_ts_getticks(); + sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime()); sc->sc_flags |= SCF_TIMESTAMP; } if (to->to_flags & TOF_SCALE) { @@ -2025,8 +2025,7 @@ /* Randomize the timestamp. 
*/ if (sc->sc_flags & SCF_TIMESTAMP) { - sc->sc_ts = arc4random(); - sc->sc_tsoff = sc->sc_ts - tcp_ts_getticks(); + sc->sc_ts = TCP_SBT_TO_TS(tcp_ts_getsbintime()); } TCPSTAT_INC(tcps_sc_sendcookie); @@ -2116,7 +2115,7 @@ sc->sc_flags |= SCF_TIMESTAMP; sc->sc_tsreflect = to->to_tsval; sc->sc_ts = to->to_tsecr; - sc->sc_tsoff = to->to_tsecr - tcp_ts_getticks(); + sc->sc_tsoff = to->to_tsecr - TCP_SBT_TO_TS(tcp_ts_getsbintime()); } if (to->to_flags & TOF_SIGNATURE) Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #ifdef INET6 #include @@ -389,7 +390,7 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } else { - if (ticks - tp->t_rcvtime <= TP_MAXIDLE(tp)) { + if (tcp_ts_getsbintime() - tp->t_rcvtime <= TP_MAXIDLE(tp)) { callout_reset(&tp->t_timers->tt_2msl, TP_KEEPINTVL(tp), tcp_timer_2msl, tp); } else { @@ -475,7 +476,7 @@ if ((tcp_always_keepalive || inp->inp_socket->so_options & SO_KEEPALIVE) && tp->t_state <= TCPS_CLOSING) { - if (ticks - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) + if (tcp_ts_getsbintime() - tp->t_rcvtime >= TP_KEEPIDLE(tp) + TP_MAXIDLE(tp)) goto dropit; /* * Send a packet designed to force a response @@ -538,6 +539,7 @@ { struct tcpcb *tp = xtp; struct inpcb *inp; + sbintime_t dt; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -573,9 +575,10 @@ * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. 
*/ + dt = tcp_ts_getsbintime() - tp->t_rcvtime; if (tp->t_rxtshift == TCP_MAXRXTSHIFT && - (ticks - tp->t_rcvtime >= tcp_maxpersistidle || - ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + (dt >= tcp_maxpersistidle*tick_sbt || + dt >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); @@ -693,18 +696,19 @@ tp->t_flags |= TF_WASCRECOVERY; else tp->t_flags &= ~TF_WASCRECOVERY; - tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + tp->t_badrxtwin = tcp_ts_getsbintime() + tp->t_rxtcur; tp->t_flags |= TF_PREVVALID; } else tp->t_flags &= ~TF_PREVVALID; TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift]; + rexmt = TCPTV_RTOBASE * tcp_syn_backoff[tp->t_rxtshift] * tick_sbt; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + /* 1 < delack < tcp_delacktime - and should scale down with RTO/2 */ TCPT_RANGESET(tp->t_rxtcur, rexmt, - tp->t_rttmin, TCPTV_REXMTMAX); + tp->t_rttmin, TCPTV_REXMTMAX*tick_sbt); /* * We enter the path for PLMTUD if connection is established or, if @@ -863,13 +867,13 @@ } void -tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta) +tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, sbintime_t delta) { struct callout *t_callout; timeout_t *f_callout; struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); - + sbintime_t f_precision; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; @@ -882,22 +886,27 @@ case TT_DELACK: t_callout = &tp->t_timers->tt_delack; f_callout = tcp_timer_delack; + f_precision = SBT_1MS; break; case TT_REXMT: t_callout = &tp->t_timers->tt_rexmt; f_callout = tcp_timer_rexmt; + f_precision = SBT_1US; break; case TT_PERSIST: t_callout = &tp->t_timers->tt_persist; f_callout = tcp_timer_persist; + f_precision = SBT_1S; break; case TT_KEEP: t_callout = 
&tp->t_timers->tt_keep; f_callout = tcp_timer_keep; + f_precision = SBT_1S; break; case TT_2MSL: t_callout = &tp->t_timers->tt_2msl; f_callout = tcp_timer_2msl; + f_precision = SBT_1S; break; default: if (tp->t_fb->tfb_tcp_timer_activate) { @@ -909,7 +918,7 @@ if (delta == 0) { callout_stop(t_callout); } else { - callout_reset_on(t_callout, delta, f_callout, tp, cpu); + callout_reset_sbt_on(t_callout, delta, f_precision, f_callout, tp, cpu, 0); } } Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -301,10 +301,8 @@ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { tw->t_recent = tp->ts_recent; - tw->ts_offset = tp->ts_offset; } else { tw->t_recent = 0; - tw->ts_offset = 0; } tw->snd_nxt = tp->snd_nxt; @@ -574,7 +572,7 @@ */ if (tw->t_recent && flags == TH_ACK) { to.to_flags |= TOF_TS; - to.to_tsval = tcp_ts_getticks() + tw->ts_offset; + to.to_tsval = TCP_SBT_TO_TS(tcp_ts_getsbintime()); to.to_tsecr = tw->t_recent; } optlen = tcp_addoptions(&to, (u_char *)(th + 1)); Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -1380,9 +1380,9 @@ ti->tcpi_options |= TCPI_OPT_ECN; ti->tcpi_rto = tp->t_rxtcur * tick; - ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; - ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; - ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; + ti->tcpi_last_data_recv = (long)((tcp_ts_getsbintime() - tp->t_rcvtime)/tick_sbt) * tick; + ti->tcpi_rtt = ((u_int64_t)(tp->t_srtt/tick_sbt) * tick); + ti->tcpi_rttvar = ((u_int64_t)(tp->t_rttvar/tick_sbt) * tick); ti->tcpi_snd_ssthresh = tp->snd_ssthresh; ti->tcpi_snd_cwnd = tp->snd_cwnd; @@ -2175,7 +2175,7 @@ int timeout; timeout = (tcp_fast_finwait2_recycle) ? 
- tcp_finwait2_timeout : TP_MAXIDLE(tp); + tcp_finwait2_timeout*tick_sbt : TP_MAXIDLE(tp); tcp_timer_activate(tp, TT_2MSL, timeout); } } @@ -2426,20 +2426,20 @@ "0x%08x\n", tp->snd_ssthresh, tp->snd_recover); db_print_indent(indent); - db_printf("t_rcvtime: %u t_startime: %u\n", + db_printf("t_rcvtime: %zu t_startime: %zu\n", tp->t_rcvtime, tp->t_starttime); db_print_indent(indent); - db_printf("t_rttime: %u t_rtsq: 0x%08x\n", + db_printf("t_rttime: %zu t_rtsq: 0x%08x\n", tp->t_rtttime, tp->t_rtseq); db_print_indent(indent); - db_printf("t_rxtcur: %d t_maxseg: %u t_srtt: %d\n", + db_printf("t_rxtcur: %zu t_maxseg: %u t_srtt: %zu\n", tp->t_rxtcur, tp->t_maxseg, tp->t_srtt); db_print_indent(indent); - db_printf("t_rttvar: %d t_rxtshift: %d t_rttmin: %u " - "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, + db_printf("t_rttvar: %zu t_rxtshift: %d t_rttmin: %zu " + "t_rttbest: %zu\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin, tp->t_rttbest); db_print_indent(indent); @@ -2456,16 +2456,16 @@ tp->snd_scale, tp->rcv_scale, tp->request_r_scale); db_print_indent(indent); - db_printf("ts_recent: %u ts_recent_age: %u\n", + db_printf("ts_recent: %u ts_recent_age: %zu\n", tp->ts_recent, tp->ts_recent_age); db_print_indent(indent); - db_printf("ts_offset: %u last_ack_sent: 0x%08x snd_cwnd_prev: " - "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev); + db_printf("last_ack_sent: 0x%08x snd_cwnd_prev: " + "%u\n", tp->last_ack_sent, tp->snd_cwnd_prev); db_print_indent(indent); db_printf("snd_ssthresh_prev: %u snd_recover_prev: 0x%08x " - "t_badrxtwin: %u\n", tp->snd_ssthresh_prev, + "t_badrxtwin: %zu\n", tp->snd_ssthresh_prev, tp->snd_recover_prev, tp->t_badrxtwin); db_print_indent(indent); Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -116,62 +116,65 @@ tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* 
More tcpcb flags storage */ - int t_srtt; /* smoothed round-trip time */ - int t_rttvar; /* variance in round-trip time */ + uint64_t t_srtt; /* smoothed round-trip time */ + uint64_t t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ - tcp_seq last_ack_sent; - u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ + sbintime_t t_rcvtime; /* inactivity time */ + tcp_seq last_ack_sent; tcp_seq rcv_up; /* receive urgent pointer */ - int t_segqlen; /* segment reassembly queue length */ + struct tsegqe_head t_segq; /* segment reassembly queue */ + struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; + struct tcp_timer *t_timers; /* All the TCP timers in one struct */ - struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ - tcp_seq snd_wl1; /* window update seg seq number */ + int t_segqlen; /* segment reassembly queue length */ /* Cache line 4 */ + struct vnet *t_vnet; /* back pointer to parent vnet */ + tcp_seq snd_wl1; /* window update seg seq number */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; - u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ + + sbintime_t ts_recent_age; /* when last updated */ + sbintime_t t_rxtcur; /* current retransmit value (ticks) */ + sbintime_t t_rtttime; /* RTT measurement start time */ + uint32_t t_lasttsecr; + uint32_t t_lasttsval; + /* Cache line 5 */ + tcp_seq t_rtseq; /* sequence number being timed */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* 
input character */ - int t_rxtcur; /* current retransmit value (ticks) */ - + sbintime_t t_starttime; /* time connection was established */ int t_rxtshift; /* log(2) of rexmt exp. backoff */ - u_int t_rtttime; /* RTT measurement start time */ - - tcp_seq t_rtseq; /* sequence number being timed */ - u_int t_starttime; /* time connection was established */ - u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ - u_int t_rttmin; /* minimum rtt allowed */ - - u_int t_rttbest; /* best rtt we've seen */ - + sbintime_t t_rttmin; /* minimum rtt allowed */ + sbintime_t t_rttbest; /* best rtt we'v seen */ + sbintime_t t_delack; /* delayed ack timer */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ - /* Cache line 5 */ + uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ - int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ + sbintime_t t_badrxtwin; /* window for retransmit recovery */ + int t_sndzerowin; /* zero-window updates sent */ int snd_numholes; /* number of holes seen by sender */ - u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ @@ -217,6 +220,7 @@ struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; + u_char opt[TCP_MAXOLEN]; }; /* @@ -432,8 +436,7 @@ short tw_so_options; /* copy of so_options */ struct ucred *tw_cred; /* user credentials */ u_int32_t t_recent; - u_int32_t ts_offset; /* our timestamp offset */ - u_int t_starttime; + sbintime_t t_starttime; int tw_time; TAILQ_ENTRY(tcptw) tw_2msl; void *tw_pspare; /* TCP_SIGNATURE */ @@ -475,9 +478,7 @@ * which results in inappropriately large RTO values for very * fast 
networks. */ -#define TCP_REXMTVAL(tp) \ - max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ - + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) +#define TCP_REXMTVAL(tp) qmax((tp)->t_rttmin, (tp)->t_srtt + ((tp)->t_rttvar << 2)) /* 64-bit compare: kernel max() takes u_int and would truncate sbintime values */ /* * TCP statistics. @@ -833,7 +834,7 @@ struct tcpcb *, int, int); void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); -void tcp_xmit_timer(struct tcpcb *, int); +void tcp_xmit_timer(struct tcpcb *, sbintime_t); void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *); void cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type); @@ -892,7 +893,7 @@ struct tcptemp * tcpip_maketemplate(struct inpcb *); void tcpip_fillheaders(struct inpcb *, void *, void *); -void tcp_timer_activate(struct tcpcb *, uint32_t, u_int); +void tcp_timer_activate(struct tcpcb *, uint32_t, sbintime_t); int tcp_timer_active(struct tcpcb *, uint32_t); void tcp_timer_stop(struct tcpcb *, uint32_t); void tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int); Index: sys/sys/callout.h =================================================================== --- sys/sys/callout.h +++ sys/sys/callout.h @@ -53,10 +53,14 @@ #define CALLOUT_DIRECT 0x0100 /* allow exec from hw int context */ #define C_DIRECT_EXEC 0x0001 /* direct execution of callout */ + +#define C_PMS 7 +/* C_PREL() occupies bits 1..C_PRELBITS of the flags word; 7 keeps it clear of C_HARDCLOCK (0x0100). */ #define C_PRELBITS 7 #define C_PRELRANGE ((1 << C_PRELBITS) - 1) #define C_PREL(x) (((x) + 1) << 1) #define C_PRELGET(x) (int)((((x) >> 1) & C_PRELRANGE) - 1) +#define C_DEFAULT C_PREL(C_PMS) #define C_HARDCLOCK 0x0100 /* align to hardclock() calls */ #define C_ABSOLUTE 0x0200 /* event time is absolute. */ #define C_PRECALC 0x0400 /* event time is pre-calculated. 
*/ Index: sys/sys/clock.h =================================================================== --- sys/sys/clock.h +++ sys/sys/clock.h @@ -203,6 +203,8 @@ void clock_dbgprint_err(device_t dev, int rw, int err); void clock_dbgprint_ts(device_t dev, int rw, const struct timespec *ts); +extern sbintime_t (*cpu_ts_getsbintime)(void); + #endif /* _KERNEL */ #endif /* !_SYS_CLOCK_H_ */ Index: sys/x86/x86/tsc.c =================================================================== --- sys/x86/x86/tsc.c +++ sys/x86/x86/tsc.c @@ -54,11 +54,24 @@ #include "cpufreq_if.h" uint64_t tsc_freq; +uint64_t tsc_sbt; +int64_t max_tsc_jitter; /* TSC ticks in 10us; set in init_TSC_tc() */ int tsc_is_invariant; int tsc_perf_stat; +static int tsc_ts_recalibrate; /* nonzero: recalibrate on every cpu_ts_getsbintime() call */ + static eventhandler_tag tsc_levels_tag, tsc_pre_tag, tsc_post_tag; +sbintime_t cpu_ts_getsbintime_rdtsc(void); +sbintime_t cpu_ts_getsbintime_rdtscp(void); +sbintime_t (*cpu_ts_getsbintime)(void); +static void cpu_ts_calibrate_all(void); + + +SYSCTL_INT(_kern_timecounter, OID_AUTO, tsc_ts_always_calibrate, CTLFLAG_RW, + &tsc_ts_recalibrate, 0, "always use sbintime for timestamp clock"); + SYSCTL_INT(_kern_timecounter, OID_AUTO, invariant_tsc, CTLFLAG_RDTUN, &tsc_is_invariant, 0, "Indicates whether the TSC is P-state invariant"); @@ -612,6 +625,19 @@ tsc_timecounter.tc_priv = (void *)(intptr_t)shift; tc_init(&tsc_timecounter); } + + /* XXX revisit: rdtscp is selected on amd64 without checking that the CPU supports it */ +#if defined(__amd64__) + cpu_ts_getsbintime = cpu_ts_getsbintime_rdtscp; +#elif defined(__i386__) + cpu_ts_getsbintime = cpu_ts_getsbintime_rdtsc; +#endif + cpu_ts_calibrate_all(); + + /* tsc ticks per 10us */ + max_tsc_jitter = tsc_freq/(1000000/10); + + printf("tsc_freq: %ju max_tsc_jitter: %jd\n", (uintmax_t)tsc_freq, (intmax_t)max_tsc_jitter); } SYSINIT(tsc_tc, SI_SUB_SMP, SI_ORDER_ANY, init_TSC_tc, NULL); @@ -779,3 +805,64 @@ return (1); } #endif + +static DPCPU_DEFINE(int64_t, pcputsc); /* Per-CPU version of tsc at time of last calibration */ +static volatile sbintime_t sbt0; + + +static void +cpu_ts_calibrate_all(void) +{ 
+ u_int _i; + int64_t *tsc; + + CPU_FOREACH(_i) { + tsc = DPCPU_ID_PTR(_i, pcputsc); + *tsc = rdtsc(); /* read on the calling CPU and stored into CPU _i's slot */ + } + sbt0 = sbinuptime(); +} + + +#define CPU_TS_CALIBRATE(op) \ +static void \ +cpu_ts_calibrate_ ## op(void) \ +{\ + int64_t *tsc, sbt; \ +\ + tsc = DPCPU_PTR(pcputsc);\ + *tsc = op();\ +\ + sbt = sbinuptime(); \ + while (sbt > sbt0) /* only ever move sbt0 forward */ \ + atomic_cmpset_64((volatile uint64_t *)&sbt0, sbt0, sbt); /* 64-bit CAS: u_long is 32 bits on i386 */ \ +} + +#define CPU_TS_GETSBINTIME(op) \ +sbintime_t \ +cpu_ts_getsbintime_ ## op(void) \ +{\ + int64_t tsc, curtsc, tsc_delta; \ + \ + critical_enter(); \ + tsc = DPCPU_GET(pcputsc); \ + curtsc = op(); \ + \ + tsc_delta = curtsc - tsc; \ + if (tsc_ts_recalibrate || \ + __predict_false(tsc_delta < 0 || tsc_delta > max_tsc_jitter)) { \ + cpu_ts_calibrate_ ## op();\ + critical_exit();\ + return (sbt0);\ + }\ + critical_exit();\ +\ + return (sbt0); /* cached uptime from the most recent calibration */ \ +} + + +CPU_TS_CALIBRATE(rdtsc) +CPU_TS_CALIBRATE(rdtscp) + +CPU_TS_GETSBINTIME(rdtsc) +CPU_TS_GETSBINTIME(rdtscp)