Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -859,6 +859,15 @@ struct clip_entry; +#define CNT_CAL_INFO 3 +struct clock_sync { + uint64_t hw_cur; + uint64_t hw_prev; + uint64_t rt_cur; + uint64_t rt_prev; + uint32_t gen; +}; + struct adapter { SLIST_ENTRY(adapter) link; device_t dev; @@ -978,6 +987,11 @@ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; + struct callout cal_callout; + struct clock_sync cal_info[CNT_CAL_INFO]; + int cal_current; + int cal_count; + uint32_t cal_gen; /* * Driver code that can run when the adapter is suspended must use this Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -320,6 +320,18 @@ SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); +static int t4_clocksync_fast = 1; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csfast, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_fast, 0, + "During initial clock sync how fast do we update in seconds"); + +static int t4_clocksync_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csnormal, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_normal, 0, + "During normal clock sync how fast do we update in seconds"); + +static int t4_fast_2_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, cscount, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_fast_2_normal, 0, + "How many clock syncs do we need to do to transition to slow"); + #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, @@ -1109,6 +1121,79 @@ return (-1); } +static inline uint64_t +t4_get_ns_timestamp(struct timespec *ts) +{ + return (((uint64_t)ts->tv_sec * 1000000000) + ts->tv_nsec); +} + +static void +t4_calibration(void *arg) +{ + struct adapter *sc; + struct 
timespec ts; + struct clock_sync *cur, *nex; + int next_up; + + sc = (struct adapter *)arg; + + cur = &sc->cal_info[sc->cal_current]; + next_up = (sc->cal_current + 1) % CNT_CAL_INFO; + nex = &sc->cal_info[next_up]; + if (__predict_false(sc->cal_count == 0)) { + /* First time in, just get the values in */ + cur->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + nanouptime(&ts); + cur->rt_cur = t4_get_ns_timestamp(&ts); + sc->cal_count++; + goto done; + } + nex->hw_prev = cur->hw_cur; + nex->rt_prev = cur->rt_cur; + KASSERT(hw_off_limits(sc) == 0, ("hw_off_limits at t4_calibration")); + nex->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + nanouptime(&ts); + nex->rt_cur = t4_get_ns_timestamp(&ts); + if ((nex->hw_cur - nex->hw_prev) == 0) { + /* The clock is not advancing? */ + sc->cal_count = 0; + atomic_store_rel_int(&cur->gen, 0); + goto done; + } + atomic_store_rel_int(&cur->gen, 0); + sc->cal_current = next_up; + sc->cal_gen++; + atomic_store_rel_int(&nex->gen, sc->cal_gen); + if (sc->cal_count < t4_fast_2_normal) + sc->cal_count++; +done: + callout_reset_sbt_curcpu(&sc->cal_callout, + ((sc->cal_count < t4_fast_2_normal) ? + t4_clocksync_fast : t4_clocksync_normal) * SBT_1S, 0, + t4_calibration, sc, C_DIRECT_EXEC); +} + + + +static void +t4_calibration_start(struct adapter *sc) +{ + /* + * Here if we have not done a calibration + * then do so otherwise start the appropriate + * timer. 
+ */ + int i; + + for (i = 0; i < CNT_CAL_INFO; i++) { + sc->cal_info[i].gen = 0; + } + sc->cal_current = 0; + sc->cal_count = 0; + sc->cal_gen = 0; + t4_calibration(sc); +} + static int t4_attach(device_t dev) { @@ -1177,6 +1262,8 @@ callout_init(&sc->ktls_tick, 1); + callout_init(&sc->cal_callout, 1); + refcount_init(&sc->vxlan_refcount, 0); TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc); @@ -1567,6 +1654,7 @@ "failed to attach all child ports: %d\n", rc); goto done; } + t4_calibration_start(sc); device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", @@ -1742,7 +1830,8 @@ free(pi, M_CXGBE); } } - + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); device_delete_children(dev); sysctl_ctx_free(&sc->ctx); adapter_full_uninit(sc); @@ -1920,7 +2009,6 @@ /* No more DMA or interrupts. */ stop_adapter(sc); - /* Quiesce all activity. */ for_each_port(sc, i) { pi = sc->port[i]; @@ -1993,6 +2081,10 @@ quiesce_iq_fl(sc, &sc->sge.fwq, NULL); } + /* Stop calibration */ + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); + /* Mark the adapter totally off limits. */ mtx_lock(&sc->reg_lock); atomic_set_int(&sc->error_flags, HW_OFF_LIMITS); @@ -2359,6 +2451,10 @@ } } } + + /* Reset all calibration */ + t4_calibration_start(sc); + done: if (rc == 0) { sc->incarnation++; Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -1503,15 +1503,73 @@ } #endif +#define CGBE_SHIFT_SCALE 10 + static inline uint64_t -last_flit_to_ns(struct adapter *sc, uint64_t lf) +t4_tstmp_to_ns(struct adapter *sc, uint64_t lf) { - uint64_t n = be64toh(lf) & 0xfffffffffffffff; /* 60b, not 64b. */ + struct clock_sync *cur, dcur; + uint64_t tstmp_sec, tstmp_nsec; + uint64_t hw_clocks; + uint64_t rt_cur_to_prev, res_s, res_n, res_s_modulo, res; + uint64_t hw_clk_div, cclk; + uint64_t hw_tstmp = lf & 0xfffffffffffffffULL; /* 60b, not 64b. 
*/ + uint32_t gen; - if (n > UINT64_MAX / 1000000) - return (n / sc->params.vpd.cclk * 1000000); - else - return (n * 1000000 / sc->params.vpd.cclk); + do { + cur = &sc->cal_info[sc->cal_current]; + gen = atomic_load_acq_int(&cur->gen); + if (gen == 0) + return (0); + dcur = *cur; + atomic_thread_fence_acq(); + } while (gen != dcur.gen); + + /* + * Our goal here is to have a result that is: + * + * ( (cur_time - prev_time) ) + * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time + * ( (hw_cur - hw_prev) ) + * + * With the constraints that we cannot use float and we + * don't want to overflow the uint64_t numbers we are using. + * + * The plan is to take the clocking value of the hw timestamps + * and split them into seconds and nanosecond equivalent portions. + * Then we operate on the two portions separately making sure to + * bring back the carry over from the seconds when we divide. + * + * First up let's get the two divided into separate entities + * i.e. the seconds. We use the clock frequency for this. + * Note that vpd.cclk is in khz, we need it in raw hz so + * convert to hz. 
+ */ + cclk = sc->params.vpd.cclk * 1000; + hw_clocks = hw_tstmp - dcur.hw_prev; + tstmp_sec = hw_clocks / cclk; + tstmp_nsec = hw_clocks % cclk; + /* Now work with them separately */ + rt_cur_to_prev = (dcur.rt_cur - dcur.rt_prev); + res_s = tstmp_sec * rt_cur_to_prev; + res_n = tstmp_nsec * rt_cur_to_prev; + /* Now lets get our divider */ + hw_clk_div = dcur.hw_cur - dcur.hw_prev; + /* Make sure to save the remainder from the seconds divide */ + res_s_modulo = res_s % hw_clk_div; + res_s /= hw_clk_div; + /* scale the remainder to where it should be */ + res_s_modulo *= cclk; + /* Now add in the remainder */ + res_n += res_s_modulo; + /* Now do the divide */ + res_n /= hw_clk_div; + res_s *= cclk; + /* Recombine the two */ + res = res_s + res_n; + /* And now add in the base time to get to the real timestamp */ + res += dcur.rt_prev; + return (res); } static inline void @@ -2060,17 +2118,13 @@ if (rxq->iq.flags & IQ_RX_TIMESTAMP) { /* - * Fill up rcv_tstmp but do not set M_TSTMP. - * rcv_tstmp is not in the format that the - * kernel expects and we don't want to mislead - * it. For now this is only for custom code - * that knows how to interpret cxgbe's stamp. + * Fill up rcv_tstmp but do not set M_TSTMP as + * long as we get a non-zero back from t4_tstmp_to_ns(). */ - m0->m_pkthdr.rcv_tstmp = - last_flit_to_ns(sc, d->rsp.u.last_flit); -#ifdef notyet - m0->m_flags |= M_TSTMP; -#endif + m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc, + be64toh(d->rsp.u.last_flit)); + if (m0->m_pkthdr.rcv_tstmp != 0) + m0->m_flags |= M_TSTMP; } #ifdef NUMA