Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -859,6 +859,15 @@ struct clip_entry; +#define CNT_CAL_INFO 3 +struct clock_sync { + uint64_t hw_cur; + uint64_t hw_prev; + uint64_t rt_cur; + uint64_t rt_prev; + uint32_t gen; +}; + struct adapter { SLIST_ENTRY(adapter) link; device_t dev; @@ -978,6 +987,11 @@ struct mtx sfl_lock; /* same cache-line as sc_lock? but that's ok */ TAILQ_HEAD(, sge_fl) sfl; struct callout sfl_callout; + struct callout cal_callout; + struct clock_sync cal_info[CNT_CAL_INFO]; + int cal_current; + int cal_count; + uint32_t cal_gen; /* * Driver code that can run when the adapter is suspended must use this Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -320,6 +320,18 @@ SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldtxq, CTLFLAG_RDTUN, &t4_nofldtxq, 0, "Number of offload TX queues per port"); +static int t4_clocksync_fast = 1; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csfast, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_fast, 0, + "During initial clock sync how fast do we update in seconds"); + +static int t4_clocksync_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, csnormal, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_clocksync_normal, 0, + "During normal clock sync how fast do we update in seconds"); + +static int t4_fast_2_normal = 30; +SYSCTL_INT(_hw_cxgbe, OID_AUTO, cscount, CTLFLAG_RW | CTLFLAG_MPSAFE, &t4_fast_2_normal, 0, + "How many clock syncs do we need to do to transition to slow"); + #define NOFLDRXQ 2 static int t4_nofldrxq = -NOFLDRXQ; SYSCTL_INT(_hw_cxgbe, OID_AUTO, nofldrxq, CTLFLAG_RDTUN, &t4_nofldrxq, 0, @@ -1109,6 +1121,79 @@ return (-1); } +static inline uint64_t +t4_get_ns_timestamp(struct timespec *ts) +{ + return (((uint64_t)ts->tv_sec * 1000000000) + ts->tv_nsec); +} + +static void +t4_calibration(void *arg) +{ + struct adapter *sc; + struct 
timespec ts; + struct clock_sync *cur, *nex; + int next_up; + + sc = (struct adapter *)arg; + + cur = &sc->cal_info[sc->cal_current]; + next_up = (sc->cal_current + 1) % CNT_CAL_INFO; + nex = &sc->cal_info[next_up]; + if (__predict_false(sc->cal_count == 0)) { + /* First time in, just get the values in */ + cur->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + nanouptime(&ts); + cur->rt_cur = t4_get_ns_timestamp(&ts); + sc->cal_count++; + goto done; + } + nex->hw_prev = cur->hw_cur; + nex->rt_prev = cur->rt_cur; + KASSERT(hw_off_limits(sc) == 0, ("hw_off_limits at t4_calibration")); + nex->hw_cur = t4_read_reg64(sc, A_SGE_TIMESTAMP_LO); + nanouptime(&ts); + nex->rt_cur = t4_get_ns_timestamp(&ts); + if ((nex->hw_cur - nex->hw_prev) == 0) { + /* The clock is not advancing? */ + sc->cal_count = 0; + atomic_store_rel_int(&cur->gen, 0); + goto done; + } + atomic_store_rel_int(&cur->gen, 0); + sc->cal_current = next_up; + sc->cal_gen++; + atomic_store_rel_int(&nex->gen, sc->cal_gen); + if (sc->cal_count < t4_fast_2_normal) + sc->cal_count++; +done: + callout_reset_sbt_curcpu(&sc->cal_callout, + ((sc->cal_count < t4_fast_2_normal) ? + t4_clocksync_fast : t4_clocksync_normal) * SBT_1S, 0, + t4_calibration, sc, C_DIRECT_EXEC); +} + + + +static void +t4_calibration_start(struct adapter *sc) +{ + /* + * Here if we have not done a calibration + * then do so otherwise start the appropriate + * timer. 
+ */ + int i; + + for (i = 0; i < CNT_CAL_INFO; i++) { + sc->cal_info[i].gen = 0; + } + sc->cal_current = 0; + sc->cal_count = 0; + sc->cal_gen = 0; + t4_calibration(sc); +} + static int t4_attach(device_t dev) { @@ -1177,6 +1262,8 @@ callout_init(&sc->ktls_tick, 1); + callout_init(&sc->cal_callout, 1); + refcount_init(&sc->vxlan_refcount, 0); TASK_INIT(&sc->reset_task, 0, reset_adapter_task, sc); @@ -1567,6 +1654,7 @@ "failed to attach all child ports: %d\n", rc); goto done; } + t4_calibration_start(sc); device_printf(dev, "PCIe gen%d x%d, %d ports, %d %s interrupt%s, %d eq, %d iq\n", @@ -1742,7 +1830,8 @@ free(pi, M_CXGBE); } } - + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); device_delete_children(dev); sysctl_ctx_free(&sc->ctx); adapter_full_uninit(sc); @@ -1920,7 +2009,6 @@ /* No more DMA or interrupts. */ stop_adapter(sc); - /* Quiesce all activity. */ for_each_port(sc, i) { pi = sc->port[i]; @@ -1993,6 +2081,10 @@ quiesce_iq_fl(sc, &sc->sge.fwq, NULL); } + /* Stop calibration */ + callout_stop(&sc->cal_callout); + callout_drain(&sc->cal_callout); + /* Mark the adapter totally off limits. */ mtx_lock(&sc->reg_lock); atomic_set_int(&sc->error_flags, HW_OFF_LIMITS); @@ -2359,6 +2451,10 @@ } } } + + /* Reset all calibration */ + t4_calibration_start(sc); + done: if (rc == 0) { sc->incarnation++; Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -1503,15 +1503,73 @@ } #endif +#define CGBE_SHIFT_SCALE 10 + static inline uint64_t -last_flit_to_ns(struct adapter *sc, uint64_t lf) +t4_tstmp_to_ns(struct adapter *sc, uint64_t lf) { - uint64_t n = be64toh(lf) & 0xfffffffffffffff; /* 60b, not 64b. */ + struct clock_sync *cur, dcur; + uint64_t tstmp_sec, tstmp_nsec; + uint64_t hw_clocks; + uint64_t rt_cur_to_prev, res_s, res_n, res_s_modulo, res; + uint64_t hw_clk_div, cclk; + uint64_t hw_tstmp = lf & 0xfffffffffffffffULL; /* 60b, not 64b. 
*/ + uint32_t gen; - if (n > UINT64_MAX / 1000000) - return (n / sc->params.vpd.cclk * 1000000); - else - return (n * 1000000 / sc->params.vpd.cclk); + do { + cur = &sc->cal_info[sc->cal_current]; + gen = atomic_load_acq_int(&cur->gen); + if (gen == 0) + return (0); + dcur = *cur; + atomic_thread_fence_acq(); + } while (gen != dcur.gen); + + /* + * Our goal here is to have a result that is: + * + * ( (cur_time - prev_time) ) + * ((hw_tstmp - hw_prev) * ----------------------------- ) + prev_time + * ( (hw_cur - hw_prev) ) + * + * With the constraints that we cannot use float and we + * don't want to overflow the uint64_t numbers we are using. + * + * The plan is to take the clocking value of the hw timestamps + * and split them into seconds and nanosecond equivalent portions. + * Then we operate on the two portions separately making sure to + * bring back the carry over from the seconds when we divide. + * + * First up let's get the two divided into separate entities + * i.e. the seconds. We use the clock frequency for this. + * Note that vpd.cclk is in khz, we need it in raw hz so + * convert to hz. 
+ */ + cclk = sc->params.vpd.cclk * 1000; + hw_clocks = hw_tstmp - dcur.hw_prev; + tstmp_sec = hw_clocks / cclk; + tstmp_nsec = hw_clocks % cclk; + /* Now work with them separately */ + rt_cur_to_prev = (dcur.rt_cur - dcur.rt_prev); + res_s = tstmp_sec * rt_cur_to_prev; + res_n = tstmp_nsec * rt_cur_to_prev; + /* Now lets get our divider */ + hw_clk_div = dcur.hw_cur - dcur.hw_prev; + /* Make sure to save the remainder from the seconds divide */ + res_s_modulo = res_s % hw_clk_div; + res_s /= hw_clk_div; + /* scale the remainder to where it should be */ + res_s_modulo *= cclk; + /* Now add in the remainder */ + res_n += res_s_modulo; + /* Now do the divide */ + res_n /= hw_clk_div; + res_s *= cclk; + /* Recombine the two */ + res = res_s + res_n; + /* And now add in the base time to get to the real timestamp */ + res += dcur.rt_prev; + return (res); } static inline void @@ -2060,17 +2118,13 @@ if (rxq->iq.flags & IQ_RX_TIMESTAMP) { /* - * Fill up rcv_tstmp but do not set M_TSTMP. - * rcv_tstmp is not in the format that the - * kernel expects and we don't want to mislead - * it. For now this is only for custom code - * that knows how to interpret cxgbe's stamp. + * Fill up rcv_tstmp but do not set M_TSTMP as + * long as we get a non-zero back from t4_tstmp_to_ns(). */ - m0->m_pkthdr.rcv_tstmp = - last_flit_to_ns(sc, d->rsp.u.last_flit); -#ifdef notyet - m0->m_flags |= M_TSTMP; -#endif + m0->m_pkthdr.rcv_tstmp = t4_tstmp_to_ns(sc, + be64toh(d->rsp.u.last_flit)); + if (m0->m_pkthdr.rcv_tstmp != 0) + m0->m_flags |= M_TSTMP; } #ifdef NUMA