Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1143,7 +1143,7 @@
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
 
 /*
  * Print the status of the interface. If an address family was
@@ -1456,6 +1456,8 @@
 	DEF_CMD("-wol_magic",	-IFCAP_WOL_MAGIC,	setifcap),
 	DEF_CMD("txrtlmt",	IFCAP_TXRTLMT,	setifcap),
 	DEF_CMD("-txrtlmt",	-IFCAP_TXRTLMT,	setifcap),
+	DEF_CMD("hwrxtstmp",	IFCAP_HWRXTSTMP,	setifcap),
+	DEF_CMD("-hwrxtstmp",	-IFCAP_HWRXTSTMP,	setifcap),
 	DEF_CMD("normal",	-IFF_LINK0,	setifflags),
 	DEF_CMD("compress",	IFF_LINK0,	setifflags),
 	DEF_CMD("noicmp",	IFF_LINK1,	setifflags),
Index: sys/dev/mlx5/device.h
===================================================================
--- sys/dev/mlx5/device.h
+++ sys/dev/mlx5/device.h
@@ -573,6 +573,8 @@
 	u8		op_own;
 };
 
+#define	MLX5_CQE_TSTMP_PTP	(1ULL << 63)	/* flag bit in the CQE timestamp */
+
 static inline bool get_cqe_lro_timestamp_valid(struct mlx5_cqe64 *cqe)
 {
 	return (cqe->lro_tcppsh_abort_dupack >> 7) & 1;
Index: sys/dev/mlx5/mlx5_en/en.h
===================================================================
--- sys/dev/mlx5/mlx5_en/en.h
+++ sys/dev/mlx5/mlx5_en/en.h
@@ -633,6 +633,14 @@
 	void   *main;
 };
 
+struct mlx5e_clbr_point {
+	uint64_t base_curr;	/* uptime, in ns, at the latest sample */
+	uint64_t base_prev;	/* uptime, in ns, at the previous sample */
+	uint64_t clbr_hw_prev;	/* HW clock at the previous sample */
+	uint64_t clbr_hw_curr;	/* HW clock at the latest sample */
+	u_int clbr_gen;		/* generation; 0 while not published */
+};
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	int	order_base_2_num_channels;
@@ -687,6 +695,12 @@
 	int	media_active_last;
 
 	struct callout watchdog;
+
+	struct callout tstmp_clbr;	/* runs the clock calibration */
+	int	clbr_done;	/* calibrations completed; 0 if stopped */
+	int	clbr_curr;	/* index of the published clbr_points[] entry */
+	struct mlx5e_clbr_point clbr_points[2];
+	u_int	clbr_gen;	/* last generation number handed out */
 };
 
 #define	MLX5E_NET_IP_ALIGN 2
Index: 
sys/dev/mlx5/mlx5_en/mlx5_en_main.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -637,6 +637,115 @@
 	mtx_unlock(&priv->async_events_mtx);
 }
 
+static void mlx5e_calibration_callout(void *arg);
+
+/*
+ * While fewer than mlx5e_calibration_duration calibrations have
+ * completed, the next one is scheduled mlx5e_fast_calibration
+ * seconds ahead; afterwards mlx5e_normal_calibration seconds ahead.
+ */
+static int mlx5e_calibration_duration = 20;
+static int mlx5e_fast_calibration = 1;
+static int mlx5e_normal_calibration = 30;
+
+/*
+ * Run the first calibration synchronously, or re-arm the calibration
+ * callout when at least one calibration has already completed.
+ */
+static void
+mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
+{
+
+	if (priv->clbr_done == 0)
+		mlx5e_calibration_callout(priv);
+	else
+		callout_reset(&priv->tstmp_clbr, (priv->clbr_done <
+		    mlx5e_calibration_duration ? mlx5e_fast_calibration :
+		    mlx5e_normal_calibration) * hz, mlx5e_calibration_callout,
+		    priv);
+}
+
+/*
+ * Convert a timespec to nanoseconds; despite the name, the result is
+ * not in microseconds.
+ */
+static uint64_t
+mlx5e_timespec2usec(const struct timespec *ts)
+{
+
+	return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec);
+}
+
+/*
+ * Read the 64-bit HW internal timer.  The high word is read before
+ * and after the low word, and the sequence is retried until both
+ * reads of the high word agree.
+ */
+static uint64_t
+mlx5e_hw_clock(struct mlx5e_priv *priv)
+{
+	uint32_t hw_h, hw_h1, hw_l;
+
+	do {
+		hw_h = ioread32be(&priv->mdev->iseg->internal_timer_h);
+		hw_l = ioread32be(&priv->mdev->iseg->internal_timer_l);
+		hw_h1 = ioread32be(&priv->mdev->iseg->internal_timer_h);
+	} while (hw_h1 != hw_h);
+	return (((uint64_t)hw_h << 32) | hw_l);
+}
+
+/*
+ * Sample the HW clock together with the system uptime into the unused
+ * calibration point, publish that point and schedule the next run.
+ */
+static void
+mlx5e_calibration_callout(void *arg)
+{
+	struct mlx5e_priv *priv;
+	struct mlx5e_clbr_point *next, *curr;
+	struct timespec ts;
+	int clbr_curr_next;
+
+	priv = arg;
+	curr = &priv->clbr_points[priv->clbr_curr];
+	clbr_curr_next = priv->clbr_curr + 1;
+	if (clbr_curr_next >= nitems(priv->clbr_points))
+		clbr_curr_next = 0;
+	next = &priv->clbr_points[clbr_curr_next];
+
+	next->base_prev = curr->base_curr;
+	next->clbr_hw_prev = curr->clbr_hw_curr;
+
+	next->clbr_hw_curr = mlx5e_hw_clock(priv);
+	/*
+	 * The clock must have advanced since the previous sample, which
+	 * is curr->clbr_hw_curr; otherwise the HW interval of the new
+	 * point, used as a divisor by the RX path, would be zero.
+	 */
+	if (next->clbr_hw_curr == curr->clbr_hw_curr) {
+		if_printf(priv->ifp, "hw failed tstmp frozen %jx, disabling\n",
+		    (uintmax_t)next->clbr_hw_curr);
+		priv->clbr_done = 0;
+		return;
+	}
+
+	nanouptime(&ts);
+	next->base_curr = mlx5e_timespec2usec(&ts);
+
+	/*
+	 * Retire the old point by zeroing its generation, switch the
+	 * index, then mark the new point valid with a fresh generation.
+	 */
+	curr->clbr_gen = 0;
+	atomic_thread_fence_rel();
+	priv->clbr_curr = clbr_curr_next;
+	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
+
+	if (priv->clbr_done < mlx5e_calibration_duration)
+		priv->clbr_done++;
+	mlx5e_reset_calibration_callout(priv);
+}
+
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
@@ -2689,6 +2798,16 @@
 				mlx5e_open_locked(ifp);
 			}
 		}
+		if (mask & IFCAP_HWRXTSTMP) {
+			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
+			if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
+				if (priv->clbr_done == 0)
+					mlx5e_reset_calibration_callout(priv);
+			} else {
+				callout_drain(&priv->tstmp_clbr);
+				priv->clbr_done = 0;
+			}
+		}
 out:
 		PRIV_UNLOCK(priv);
 		break;
@@ -3036,7 +3155,7 @@
 	ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
-	ifp->if_capabilities |= IFCAP_HWSTATS;
+	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
 
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
@@ -3185,6 +3304,9 @@
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
+	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
+	mlx5e_reset_calibration_callout(priv);
+
 	return (priv);
 
 err_dealloc_transport_domain:
@@ -3229,6 +3351,8 @@
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
+	callout_drain(&priv->tstmp_clbr);
+
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);
 	if (priv->vlan_detach != NULL)
Index: sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
===================================================================
--- sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -179,13 +179,44 @@
 	/* TODO: handle tcp checksum */
 }
 
+/* Number of low-order bits dropped from the interpolation operands. */
+#define	PREC	10
+
+/*
+ * Convert a HW clock value to system uptime in nanoseconds, using
+ * the linear mapping defined by the two samples of the published
+ * calibration point.  The point is read without locks; the
+ * computation is redone if its generation is zero or has changed.
+ */
+static uint64_t
+mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp)
+{
+	struct mlx5e_clbr_point *cp;
+	uint64_t a1, a2, res;
+	u_int gen;
+
+	do {
+		cp = &priv->clbr_points[priv->clbr_curr];
+		gen = atomic_load_acq_int(&cp->clbr_gen);
+		a1 = (hw_tstmp - cp->clbr_hw_prev) >> PREC;
+		a2 = (cp->base_curr - cp->base_prev) >> PREC;
+		res = (a1 * a2) << PREC;
+		res /= (cp->clbr_hw_curr - cp->clbr_hw_prev) >> PREC;
+		res += cp->base_prev;
+		atomic_thread_fence_acq();
+	} while (gen == 0 || gen != cp->clbr_gen);
+	return (res);
+}
+
 static inline void
 mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
     struct mlx5e_rq *rq, struct mbuf *mb,
     u32 cqe_bcnt)
 {
 	struct ifnet *ifp = rq->ifp;
+	struct mlx5e_channel *c;
 	int	lro_num_seg;	/* HW LRO session aggregated packets counter */
+	uint64_t tstmp;
 
 	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
@@ -250,6 +281,23 @@
 		mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info);
 		mb->m_flags |= M_VLANTAG;
 	}
+
+	c = container_of(rq, struct mlx5e_channel, rq);
+	if (c->priv->clbr_done >= 2) {
+		tstmp = be64_to_cpu(cqe->timestamp);
+		if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) {
+			/*
+			 * Timestamp was taken on the packet entrance,
+			 * instead of the cqe generation.  The marker bit
+			 * lives in the raw CQE value and must be cleared
+			 * before the value is converted.
+			 */
+			tstmp &= ~MLX5_CQE_TSTMP_PTP;
+			mb->m_flags |= M_TSTMP_HPREC;
+		}
+		mb->m_pkthdr.rcv_tstmp = mlx5e_mbuf_tstmp(c->priv, tstmp);
+		mb->m_flags |= M_TSTMP;
+	}
 }
 
 static inline void
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -240,6 +240,7 @@
 #define	IFCAP_TXCSUM_IPV6	0x400000 /* can offload checksum on IPv6 TX */
 #define	IFCAP_HWSTATS		0x800000 /* manages counters internally */
 #define	IFCAP_TXRTLMT		0x1000000 /* hardware supports TX rate limiting */
+#define	IFCAP_HWRXTSTMP		0x2000000 /* hardware rx timestamping */
 
 #define IFCAP_HWCSUM_IPV6	(IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
 
Index: sys/netinet/ip_input.c
===================================================================
--- sys/netinet/ip_input.c
+++ sys/netinet/ip_input.c
@@ -1143,40 +1143,98 @@
 ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
     struct mbuf *m)
 {
+	bool stamped;
 
+	stamped = false;
 	if ((inp->inp_socket->so_options & SO_BINTIME) ||
 	    CHECK_SO_CT(inp->inp_socket, SO_TS_BINTIME)) {
-		struct bintime bt;
-
-		bintime(&bt);
+		struct bintime boottimebin, bt;
+		struct timespec ts1;
+
+		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+		    M_TSTMP)) {
+			/* rcv_tstmp is uptime based; add the boot time. */
+			mbuf_tstmp2timespec(m, &ts1);
+			timespec2bintime(&ts1, &bt);
+			getboottimebin(&boottimebin);
+			bintime_add(&bt, &boottimebin);
+		} else {
+			bintime(&bt);
+		}
 		*mp = sbcreatecontrol((caddr_t)&bt, sizeof(bt),
 		    SCM_BINTIME, SOL_SOCKET);
-		if (*mp)
+		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
+			stamped = true;
+		}
 	}
 	if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME_MICRO)) {
+		struct bintime boottimebin, bt1;
+		struct timespec ts1;
 		struct timeval tv;
 
-		microtime(&tv);
+		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+		    M_TSTMP)) {
+			mbuf_tstmp2timespec(m, &ts1);
+			timespec2bintime(&ts1, &bt1);
+			getboottimebin(&boottimebin);
+			bintime_add(&bt1, &boottimebin);
+			bintime2timeval(&bt1, &tv);
+		} else {
+			microtime(&tv);
+		}
 		*mp = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
 		    SCM_TIMESTAMP, SOL_SOCKET);
-		if (*mp)
+		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
+			stamped = true;
+		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_REALTIME)) {
-		struct timespec ts;
-
-		nanotime(&ts);
+		struct bintime boottimebin;
+		struct timespec ts, ts1;
+
+		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+		    M_TSTMP)) {
+			mbuf_tstmp2timespec(m, &ts);
+			getboottimebin(&boottimebin);
+			bintime2timespec(&boottimebin, &ts1);
+			timespecadd(&ts, &ts1);
+		} else {
+			nanotime(&ts);
+		}
 		*mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts),
 		    SCM_REALTIME, SOL_SOCKET);
-		if (*mp)
+		if (*mp != NULL) {
 			mp = &(*mp)->m_next;
+			stamped = true;
+		}
 	} else if (CHECK_SO_CT(inp->inp_socket, SO_TS_MONOTONIC)) {
 		struct timespec ts;
 
-		nanouptime(&ts);
+		if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+		    M_TSTMP))
+			mbuf_tstmp2timespec(m, &ts);
+		else
+			nanouptime(&ts);
 		*mp = sbcreatecontrol((caddr_t)&ts, sizeof(ts),
 		    SCM_MONOTONIC, SOL_SOCKET);
-		if (*mp)
+		if (*mp != NULL) {
+			mp = &(*mp)->m_next;
+			stamped = true;
+		}
+	}
+	/* Report that the timestamp above was taken by the hardware. */
+	if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+	    M_TSTMP)) {
+		struct sock_timestamp_info sti;
+
+		bzero(&sti, sizeof(sti));
+		sti.st_info_flags = ST_INFO_HW;
+		if ((m->m_flags & M_TSTMP_HPREC) != 0)
+			sti.st_info_flags |= ST_INFO_HW_HPREC;
+		*mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti), SCM_TIME_INFO,
+		    SOL_SOCKET);
+		if (*mp != NULL)
 			mp = &(*mp)->m_next;
 	}
 	if (inp->inp_flags & INP_RECVDSTADDR) {
Index: sys/netinet6/ip6_input.c
===================================================================
--- sys/netinet6/ip6_input.c
+++ sys/netinet6/ip6_input.c
@@ -1221,43 +1221,99 @@
 			struct bintime bt;
 			struct timespec ts;
 		} t;
+		struct bintime boottimebin, bt1;
+		struct timespec ts1;
+		bool stamped;
 
+		stamped = false;
 		switch (inp->inp_socket->so_ts_clock) {
 		case SO_TS_REALTIME_MICRO:
-			microtime(&t.tv);
+			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+			    M_TSTMP)) {
+				/* rcv_tstmp is uptime based; add boot time. */
+				mbuf_tstmp2timespec(m, &ts1);
+				timespec2bintime(&ts1, &bt1);
+				getboottimebin(&boottimebin);
+				bintime_add(&bt1, &boottimebin);
+				bintime2timeval(&bt1, &t.tv);
+			} else {
+				microtime(&t.tv);
+			}
 			*mp = sbcreatecontrol((caddr_t) &t.tv, sizeof(t.tv),
 			    SCM_TIMESTAMP, SOL_SOCKET);
-			if (*mp)
+			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
+				stamped = true;
+			}
 			break;
 
 		case SO_TS_BINTIME:
-			bintime(&t.bt);
+			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+			    M_TSTMP)) {
+				mbuf_tstmp2timespec(m, &ts1);
+				timespec2bintime(&ts1, &t.bt);
+				getboottimebin(&boottimebin);
+				bintime_add(&t.bt, &boottimebin);
+			} else {
+				bintime(&t.bt);
+			}
 			*mp = sbcreatecontrol((caddr_t)&t.bt, sizeof(t.bt),
 			    SCM_BINTIME, SOL_SOCKET);
-			if (*mp)
+			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
+				stamped = true;
+			}
 			break;
 
 		case SO_TS_REALTIME:
-			nanotime(&t.ts);
+			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+			    M_TSTMP)) {
+				mbuf_tstmp2timespec(m, &t.ts);
+				getboottimebin(&boottimebin);
+				bintime2timespec(&boottimebin, &ts1);
+				timespecadd(&t.ts, &ts1);
+			} else {
+				nanotime(&t.ts);
+			}
 			*mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts),
 			    SCM_REALTIME, SOL_SOCKET);
-			if (*mp)
+			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
+				stamped = true;
+			}
 			break;
 
 		case SO_TS_MONOTONIC:
-			nanouptime(&t.ts);
+			if ((m->m_flags & (M_PKTHDR | M_TSTMP)) == (M_PKTHDR |
+			    M_TSTMP))
+				mbuf_tstmp2timespec(m, &t.ts);
+			else
+				nanouptime(&t.ts);
 			*mp = sbcreatecontrol((caddr_t)&t.ts, sizeof(t.ts),
 			    SCM_MONOTONIC, SOL_SOCKET);
-			if (*mp)
+			if (*mp != NULL) {
 				mp = &(*mp)->m_next;
+				stamped = true;
+			}
 			break;
 
 		default:
 			panic("unknown (corrupted) so_ts_clock");
 		}
+		/* Report that the timestamp was taken by the hardware. */
+		if (stamped && (m->m_flags & (M_PKTHDR | M_TSTMP)) ==
+		    (M_PKTHDR | M_TSTMP)) {
+			struct sock_timestamp_info sti;
+
+			bzero(&sti, sizeof(sti));
+			sti.st_info_flags = ST_INFO_HW;
+			if ((m->m_flags & M_TSTMP_HPREC) != 0)
+				sti.st_info_flags |= ST_INFO_HW_HPREC;
+			*mp = sbcreatecontrol((caddr_t)&sti, sizeof(sti),
+			    SCM_TIME_INFO, SOL_SOCKET);
+			if (*mp != NULL)
+				mp = &(*mp)->m_next;
+		}
 	}
 #endif
 
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -154,14 +154,20 @@
 
 	/* Layer crossing persistent information. */
 	uint32_t	 flowid;	/* packet's 4-tuple system */
-	uint64_t	 csum_flags;	/* checksum and offload features */
+	uint32_t	 csum_flags;	/* checksum and offload features */
 	uint16_t	 fibnum;	/* this packet should use this fib */
 	uint8_t		 cosqos;	/* class/quality of service */
 	uint8_t		 rsstype;	/* hash type */
-	uint8_t		 l2hlen;	/* layer 2 header length */
-	uint8_t		 l3hlen;	/* layer 3 header length */
-	uint8_t		 l4hlen;	/* layer 4 header length */
-	uint8_t		 l5hlen;	/* layer 5 header length */
+	union {
+		uint64_t	rcv_tstmp;	/* timestamp in ns */
+		struct {
+			uint8_t		 l2hlen;	/* layer 2 hdr len */
+			uint8_t		 l3hlen;	/* layer 3 hdr len */
+			uint8_t		 l4hlen;	/* layer 4 hdr len */
+			uint8_t		 l5hlen;	/* layer 5 hdr len */
+			uint32_t	 spare;	/* pads the struct to 8 bytes */
+		};
+	};
 	union {
 		uint8_t  eight[8];
 		uint16_t sixteen[4];
@@ -293,6 +299,10 @@
 #define	M_VLANTAG	0x00000080 /* ether_vtag is valid */
 #define	M_UNUSED_8	0x00000100 /* --available-- */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
+#define	M_TSTMP		0x00000400 /* rcv_tstmp field is valid */
+#define	M_TSTMP_HPREC	0x00000800 /* rcv_tstmp is high-prec, typically
+				      hw-stamped on entrance (useful for
+				      IEEE 1588 and 802.1AS) */
 
 #define	M_PROTO1	0x00001000 /* protocol-specific */
 #define	M_PROTO2	0x00002000 /* protocol-specific */
@@ -320,15 +330,15 @@
  * Flags preserved when copying m_pkthdr.
  */
 #define M_COPYFLAGS \
-    (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG| \
-     M_PROTOFLAGS)
+    (M_PKTHDR|M_EOR|M_RDONLY|M_BCAST|M_MCAST|M_PROMISC|M_VLANTAG|M_TSTMP| \
+     M_TSTMP_HPREC|M_PROTOFLAGS)
 
 /*
  * Mbuf flag description for use with printf(9) %b identifier.
  */
 #define M_FLAG_BITS \
     "\20\1M_EXT\2M_PKTHDR\3M_EOR\4M_RDONLY\5M_BCAST\6M_MCAST" \
-    "\7M_PROMISC\10M_VLANTAG"
+    "\7M_PROMISC\10M_VLANTAG\13M_TSTMP\14M_TSTMP_HPREC"
 #define M_FLAG_PROTOBITS \
     "\15M_PROTO1\16M_PROTO2\17M_PROTO3\20M_PROTO4\21M_PROTO5" \
     "\22M_PROTO6\23M_PROTO7\24M_PROTO8\25M_PROTO9\26M_PROTO10" \
@@ -1348,5 +1358,17 @@
 	mq_src->mq_len = 0;
 }
 
+#ifdef _SYS_TIMESPEC_H_
+static inline void
+mbuf_tstmp2timespec(struct mbuf *m, struct timespec *ts)
+{
+	/* Split rcv_tstmp, a nanosecond count, into seconds and nanoseconds. */
+	MPASS((m->m_flags & M_PKTHDR) != 0);
+	MPASS((m->m_flags & M_TSTMP) != 0);
+	ts->tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+	ts->tv_nsec = m->m_pkthdr.rcv_tstmp % 1000000000;
+}
+#endif /* _SYS_TIMESPEC_H_ */
+
 #endif /* _KERNEL */
 #endif /* !_SYS_MBUF_H_ */
Index: sys/sys/socket.h
===================================================================
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -563,6 +563,17 @@
 #define	SCM_BINTIME	0x04		/* timestamp (struct bintime) */
 #define	SCM_REALTIME	0x05		/* timestamp (struct timespec) */
 #define	SCM_MONOTONIC	0x06		/* timestamp (struct timespec) */
+#define	SCM_TIME_INFO	0x07		/* timestamp info */
+
+struct sock_timestamp_info {		/* payload of SCM_TIME_INFO */
+	__uint32_t	st_info_flags;	/* ST_INFO_* bits */
+	__uint32_t	st_info_pad0;
+	__uint64_t	st_info_rsv[7];
+};
+
+#define	ST_INFO_HW		0x0001		/* SCM_TIMESTAMP was hw */
+#define	ST_INFO_HW_HPREC	0x0002		/* SCM_TIMESTAMP was hw-assisted
+						   on entrance */
 #endif
 
 #if __BSD_VISIBLE