Index: share/man/man4/tcp.4 =================================================================== --- share/man/man4/tcp.4 +++ share/man/man4/tcp.4 @@ -291,6 +291,10 @@ .Pp If an SADB entry cannot be found for the destination, the system does not send any outgoing segments and drops any inbound segments. .Pp Each dropped segment is taken into account in the TCP protocol statistics. +.It Dv TCP_STATS +Manage collection of connection level statistics using the +.Xr stats 3 +framework. .El @@ -606,6 +610,17 @@ .It Va insecure_syn Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments. Default is false. +.It Va perconn_stats_enable +Controls the default collection of statistics for all connections using the +.Xr stats 3 +framework. +0 disables, 1 enables, 2 enables random sampling across log id connection +groups with all connections in a group receiving the same setting. +.It Va perconn_stats_sample_rates +A CSV list of template_spec=percent key-value pairs which controls the per +template sampling rates when +.Xr stats 3 +sampling is enabled. 
.El .Sh ERRORS A socket operation may fail with one of the following errors returned: @@ -645,6 +660,7 @@ .Sh SEE ALSO .Xr getsockopt 2 , .Xr socket 2 , +.Xr stats 3 , .Xr sysctl 3 , .Xr blackhole 4 , .Xr inet 4 , Index: sys/conf/files =================================================================== --- sys/conf/files +++ sys/conf/files @@ -4271,6 +4271,7 @@ netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap netinet/tcp_reass.c optional inet | inet6 netinet/tcp_sack.c optional inet | inet6 +netinet/tcp_stats.c optional stats inet | stats inet6 netinet/tcp_subr.c optional inet | inet6 netinet/tcp_syncache.c optional inet | inet6 netinet/tcp_timer.c optional inet | inet6 Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -168,6 +168,7 @@ #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ +#define TCP_STATS 33 /* configure/retrieve stats blob */ #define TCP_LOG 34 /* configure event logging for connection */ #define TCP_LOGBUF 35 /* retrieve event log for connection */ #define TCP_LOGID 36 /* configure log ID to correlate connections */ @@ -250,7 +251,20 @@ #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 +#define TCPI_OPT_TFO 0x20 +/* + * Storage for details exchanged during the handshake, regardless of whether + * they are in fact successfully negotiated or not. This is particularly useful + * for servers to gain insight into the options their clients try to use. + */ +struct tcpsyninfo { + uint8_t to_synopts[13]; /* option kinds seen on SYN, in order */ + uint8_t to_nrsynopts; /* # entries used in to_synopts[] */ + uint8_t to_npsynopts; /* # opts on SYN (may exceed # recorded) */ + uint8_t th_flags; /* flags present on SYN */ +}; + /* Maximum length of log ID. 
*/ #define TCP_LOG_ID_LEN 64 @@ -315,9 +329,10 @@ u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */ u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ - + struct tcpsyninfo tcpi_rxsyninfo; /* Peer's SYN info */ + /* Padding to grow without breaking ABI. */ - u_int32_t __tcpi_pad[26]; /* Padding. */ + u_int32_t __tcpi_pad[22]; /* Padding. */ }; /* @@ -337,4 +352,17 @@ uint32_t pcbcnt; }; +/* + * TCP specific variables of interest for tp->t_stats stats(9) accounting. + */ +#define VOI_TCP_TXPB 0 /* Transmit payload bytes */ +#define VOI_TCP_RETXPB 1 /* Retransmit payload bytes */ +#define VOI_TCP_FRWIN 2 /* Foreign receive window */ +#define VOI_TCP_LCWIN 3 /* Local congestion window */ +#define VOI_TCP_RTT 4 /* Round trip time */ +#define VOI_TCP_CSIG 5 /* Congestion signal */ +#define VOI_TCP_GPUT 6 /* Goodput */ +#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */ +#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */ +#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */ #endif /* !_NETINET_TCP_H_ */ Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -66,6 +66,7 @@ #include #include /* for proc0 declaration */ #include +#include #include #include #include @@ -73,6 +74,8 @@ #include #include #include +#include +#include /* Must come after qmath.h and tree.h */ #include /* before tcp_seq.h, for tcp_random18() */ @@ -293,6 +296,10 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs, uint16_t type) { +#ifdef STATS + int32_t gput; +#endif + INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; @@ -303,6 +310,32 @@ tp->ccv->flags &= ~CCF_CWND_LIMITED; if (type == CC_ACK) { +#ifdef STATS + stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, + ((int32_t)tp->snd_cwnd) - tp->snd_wnd); + if (!IN_RECOVERY(tp->t_flags)) + 
stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN, + tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs)); + if ((tp->t_flags & TF_GPUTINPROG) && + SEQ_GEQ(th->th_ack, tp->gput_ack)) { + gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) / + max(1, tcp_ts_getticks() - tp->gput_ts); + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, + gput); + /* + * XXXLAS: This is a temporary hack, and should be + * chained off VOI_TCP_GPUT when stats(9) grows an API + * to deal with chained VOIs. + */ + if (tp->t_stats_gput_prev > 0) + stats_voi_update_abs_s32(tp->t_stats, + VOI_TCP_GPUT_ND, + ((gput - tp->t_stats_gput_prev) * 100) / + tp->t_stats_gput_prev); + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_stats_gput_prev = gput; + } +#endif /* STATS */ if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, nsegs * V_tcp_abc_l_var * tcp_maxseg(tp)); @@ -321,6 +354,9 @@ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->ack_received(tp->ccv, type); } +#ifdef STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd); +#endif } void @@ -386,6 +422,10 @@ INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); +#endif + switch(type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { @@ -1571,6 +1611,9 @@ * For the SYN_SENT state the scale is zero. */ tiwin = th->th_win << tp->snd_scale; +#ifdef STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); +#endif /* * TCP ECN processing. 
@@ -3289,9 +3332,15 @@ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { + struct tcpsyninfo *syninfo; int opt, optlen; to->to_flags = 0; + if (flags & TO_SYN) { + syninfo = &to->to_syninfo; + syninfo->to_npsynopts = syninfo->to_nrsynopts = + syninfo->th_flags = 0; + } for (; cnt > 0; cnt -= optlen, cp += optlen) { opt = cp[0]; if (opt == TCPOPT_EOL) @@ -3305,6 +3354,12 @@ if (optlen < 2 || optlen > cnt) break; } + if (flags & TO_SYN) { + syninfo->to_npsynopts++; + if (syninfo->to_nrsynopts < sizeof(syninfo->to_synopts)) + syninfo->to_synopts[syninfo->to_nrsynopts++] = + cp[0]; + } switch (opt) { case TCPOPT_MAXSEG: if (optlen != TCPOLEN_MAXSEG) @@ -3436,6 +3491,9 @@ TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); +#endif if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) { /* * srtt is stored as fixed point with 5 bits after the Index: sys/netinet/tcp_log_buf.c =================================================================== --- sys/netinet/tcp_log_buf.c +++ sys/netinet/tcp_log_buf.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include /* Must come after qmath.h and tree.h */ #include #include @@ -475,7 +477,7 @@ INP_WLOCK_ASSERT(tp->t_inpcb); -#ifdef NETFLIX +#ifdef STATS if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -50,10 +50,13 @@ #include #include #include +#include #include #include #include #include +#include +#include /* Must come after qmath.h and tree.h */ #include #include @@ -968,15 +971,31 @@ struct sockbuf *msb; u_int moff; - if ((tp->t_flags & TF_FORCEDATA) && len == 1) + if ((tp->t_flags & TF_FORCEDATA) 
&& len == 1) { TCPSTAT_INC(tcps_sndprobe); - else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { +#ifdef STATS + if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + stats_voi_update_abs_u32(tp->t_stats, + VOI_TCP_RETXPB, len); + else + stats_voi_update_abs_u64(tp->t_stats, + VOI_TCP_TXPB, len); +#endif /* STATS */ + } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, + len); +#endif /* STATS */ } else { TCPSTAT_INC(tcps_sndpack); TCPSTAT_ADD(tcps_sndbyte, len); +#ifdef STATS + stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, + len); +#endif /* STATS */ } #ifdef INET6 if (MHLEN < hdrlen + max_linkhdr) @@ -1448,6 +1467,13 @@ tp->t_rtttime = ticks; tp->t_rtseq = startseq; TCPSTAT_INC(tcps_segstimed); + } + if (!(tp->t_flags & TF_GPUTINPROG) && len) { + tp->t_flags |= TF_GPUTINPROG; + tp->gput_seq = startseq; + tp->gput_ack = startseq + + ulmin(sbavail(&so->so_snd) - off, sendwin); + tp->gput_ts = tcp_ts_getticks(); } } Index: sys/netinet/tcp_reass.c =================================================================== --- sys/netinet/tcp_reass.c +++ sys/netinet/tcp_reass.c @@ -47,11 +47,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include /* Must come after qmath.h and tree.h */ #include Index: sys/netinet/tcp_stats.c =================================================================== --- /dev/null +++ sys/netinet/tcp_stats.c @@ -0,0 +1,269 @@ +/*- + * Copyright (c) 2016-2018 Netflix, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Author: Lawrence Stewart + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#endif +#include + +#include + +#include +#include +#include +#include + +#include + +VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1; + +#ifdef _KERNEL + +VNET_DEFINE(int, tcp_perconn_stats_enable) = 2; +VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates); +VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0; +#define V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates) +#define V_tcp_stats_nrates VNET(tcp_stats_nrates) + +static struct rmlock tcp_stats_tpl_sampling_lock; +static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action, + struct stats_tpl_sample_rate **rates, int *nrates, void *ctx); + +SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable, + CTLFLAG_VNET | CTLFLAG_RW, 
&VNET_NAME(tcp_perconn_stats_enable), 0, + "Enable per-connection TCP stats gathering; 1 enables for all connections, " + "2 enables random sampling across log id connection groups"); +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates, + CTLTYPE_STRING | CTLFLAG_RW, tcp_stats_tpl_sr_cb, + sizeof(struct rm_priotracker), stats_tpl_sample_rates, "A", + "TCP stats per template random sampling rates, in CSV tpl_spec=percent " + "key-value pairs (see stats(9) for template spec details)"); +#endif /* _KERNEL */ +/* Build the TCP_DEFAULT stats template; returns non-zero on failure. */ +int +#ifndef _KERNEL +/* Ensure all templates are also added to the userland template list. */ +__attribute__ ((constructor)) +#endif +tcp_stats_init(void) +{ + int err, lasterr; + + err = lasterr = 0; + + V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0); + if (V_tcp_perconn_stats_dflt_tpl < 0) + return (-V_tcp_perconn_stats_dflt_tpl); + /* Every VOI add is attempted; lasterr keeps the latest failure. */ + struct voistatspec vss_sum[] = { + STATS_VSS_SUM(), + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64, + NVSS(vss_sum), vss_sum, 0); + lasterr = err ? err : lasterr; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32, + NVSS(vss_sum), vss_sum, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_max[] = { + STATS_VSS_MAX(), + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG, + NVSS(vss_max), vss_max, 0); + lasterr = err ? err : lasterr; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG, + NVSS(vss_max), vss_max, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_rtt[] = { + STATS_VSS_MAX(), + STATS_VSS_MIN(), + STATS_VSS_TDGSTCLUST32(20, 4), + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32, + NVSS(vss_rtt), vss_rtt, 0); + lasterr = err ? 
err : lasterr; + + struct voistatspec vss_congsig[] = { + STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO), + DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0) + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32, + NVSS(vss_congsig), vss_congsig, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_gput[] = { + STATS_VSS_MAX(), + STATS_VSS_TDGSTCLUST32(20, 4), + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32, + NVSS(vss_gput), vss_gput, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_gput_nd[] = { + STATS_VSS_TDGSTCLUST32(10, 4), + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32, + NVSS(vss_gput_nd), vss_gput_nd, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_windiff[] = { + STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF) + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32, + NVSS(vss_windiff), vss_windiff, 0); + lasterr = err ? err : lasterr; + + struct voistatspec vss_acklen[] = { + STATS_VSS_MAX(), + STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF) + }; + err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl, + VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32, + NVSS(vss_acklen), vss_acklen, 0); + lasterr = err ? 
err : lasterr; + + return (lasterr); +} + +#ifdef _KERNEL +int +tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, + size_t seed_len) +{ + struct rm_priotracker tracker; + int tpl; + + tpl = -1; + /* tpl stays -1 (not sampled) unless a sample rate list is set. */ + if (V_tcp_stats_nrates > 0) { + rm_rlock(&tcp_stats_tpl_sampling_lock, &tracker); + tpl = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates, + V_tcp_stats_nrates, seed_bytes, seed_len); + rm_runlock(&tcp_stats_tpl_sampling_lock, &tracker); + /* Template chosen: swap any existing blob for a fresh one. */ + if (tpl >= 0) { + INP_WLOCK_ASSERT(tp->t_inpcb); + if (tp->t_stats != NULL) + stats_blob_destroy(tp->t_stats); + tp->t_stats = stats_blob_alloc(tpl, 0); + if (tp->t_stats == NULL) + tpl = -ENOMEM; + } + } + + return (tpl); +} + +/* + * Callback function for stats_tpl_sample_rates() to interact with the TCP + * subsystem's stats template sample rates list. + */ +static int +tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action, + struct stats_tpl_sample_rate **rates, int *nrates, void *ctx) +{ + struct stats_tpl_sample_rate *old_rates; + int old_nrates; + + if (ctx == NULL) + return (ENOMEM); + + switch (action) { + case TPL_SR_RLOCKED_GET: + /* + * Return with rlock held i.e. this call must be paired with a + * "action == TPL_SR_RUNLOCK" call. 
+ */ + rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED); + rm_rlock(&tcp_stats_tpl_sampling_lock, + (struct rm_priotracker *)ctx); + /* FALLTHROUGH */ + case TPL_SR_UNLOCKED_GET: + if (rates != NULL) + *rates = V_tcp_perconn_stats_sample_rates; + if (nrates != NULL) + *nrates = V_tcp_stats_nrates; + break; + case TPL_SR_RUNLOCK: + rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED); + rm_runlock(&tcp_stats_tpl_sampling_lock, + (struct rm_priotracker *)ctx); + break; + case TPL_SR_PUT: + KASSERT(rates != NULL && nrates != NULL, + ("%s: PUT without new rates", __func__)); + rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED); + if (rates == NULL || nrates == NULL) + return (EINVAL); + rm_wlock(&tcp_stats_tpl_sampling_lock); + old_rates = V_tcp_perconn_stats_sample_rates; + old_nrates = V_tcp_stats_nrates; + V_tcp_perconn_stats_sample_rates = *rates; + V_tcp_stats_nrates = *nrates; + rm_wunlock(&tcp_stats_tpl_sampling_lock); + *rates = old_rates; + *nrates = old_nrates; + break; + default: + return (EINVAL); + break; + } + + return (0); +} + +RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock, + "tcp_stats_tpl_sampling_lock"); +#endif /* _KERNEL */ Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -50,6 +50,9 @@ #ifdef TCP_HHOOK #include #endif +#include +#include +#include /* Must come after qmath.h and tree.h */ #include #include #include @@ -992,6 +995,13 @@ &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif + +#ifdef STATS + if (tcp_stats_init()) + printf("%s: WARNING: unable to initialise TCP stats\n", + __func__); +#endif + hashsize = TCBHASHSIZE; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { @@ -1674,6 +1684,11 @@ if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } +#ifdef STATS + if 
(V_tcp_perconn_stats_enable == 1) + tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); +#endif + return (tp); /* XXX */ } @@ -1892,7 +1907,9 @@ #ifdef TCP_HHOOK khelp_destroy_osd(tp->osd); #endif - +#ifdef STATS + stats_blob_destroy(tp->t_stats); +#endif CC_ALGO(tp) = NULL; inp->inp_ppcb = NULL; if (tp->t_timers->tt_draincnt == 0) { Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #ifdef INET6 @@ -63,6 +64,8 @@ #include #include #include +#include +#include /* Must come after qmath.h and tree.h */ #ifdef DDB #include @@ -106,6 +109,13 @@ #endif #include +#include +#include +#include +#include +#include +#include + /* * TCP protocol interface to socket abstraction. */ @@ -1520,6 +1530,9 @@ if (tp->t_flags & TF_ECN_PERMIT) ti->tcpi_options |= TCPI_OPT_ECN; + if (tp->t_flags & TF_FASTOPEN) + ti->tcpi_options |= TCPI_OPT_TFO; + ti->tcpi_rto = tp->t_rxtcur * tick; ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; @@ -1541,6 +1554,8 @@ ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; ti->tcpi_rcv_ooopack = tp->t_rcvoopack; ti->tcpi_snd_zerowin = tp->t_sndzerowin; + + memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; @@ -1758,6 +1773,9 @@ struct tcp_info ti; struct cc_algo *algo; char *pbuf, buf[TCP_LOG_ID_LEN]; +#ifdef STATS + struct statsblob *sbp; +#endif size_t len; /* @@ -1875,6 +1893,36 @@ error = EINVAL; break; + case TCP_STATS: +#ifdef STATS + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + return (error); + + if (optval > 0) + sbp = stats_blob_alloc( + V_tcp_perconn_stats_dflt_tpl, 0); + else + sbp = NULL; + + INP_WLOCK_RECHECK(inp); + 
if ((tp->t_stats != NULL && sbp == NULL) || + (tp->t_stats == NULL && sbp != NULL)) { + struct statsblob *t = tp->t_stats; + tp->t_stats = sbp; + sbp = t; + } + INP_WUNLOCK(inp); + + stats_blob_destroy(sbp); +#else /* !STATS */ + INP_WUNLOCK(inp); + return (EOPNOTSUPP); +#endif /* !STATS */ + break; + case TCP_CONGESTION: INP_WUNLOCK(inp); error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1); @@ -2138,6 +2186,55 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, &ti, sizeof ti); break; + case TCP_STATS: + { +#ifdef STATS + int nheld; + TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0; + + error = 0; + socklen_t outsbsz = sopt->sopt_valsize; + if (tp->t_stats == NULL) + error = ENOENT; + else if (outsbsz >= tp->t_stats->cursz) + outsbsz = tp->t_stats->cursz; + else if (outsbsz >= sizeof(struct statsblob)) + outsbsz = sizeof(struct statsblob); + else + error = EINVAL; + INP_WUNLOCK(inp); + if (error) + break; + + sbp = sopt->sopt_val; + nheld = atop(round_page(((vm_offset_t)sbp) + + (vm_size_t)outsbsz) - trunc_page(sbp)); + vm_page_t ma[nheld]; + if (vm_fault_quick_hold_pages( + &curproc->p_vmspace->vm_map, (vm_offset_t)sbp, + outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma, + nheld) < 0) { + error = EFAULT; + break; + } + + if ((error = copyin_nofault(&(sbp->flags), &sbflags, + SIZEOF_MEMBER(struct statsblob, flags)))) + goto unhold; + + INP_WLOCK_RECHECK(inp); + error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats, + sbflags | SB_CLONE_USRDSTNOFAULT); + INP_WUNLOCK(inp); + sopt->sopt_valsize = outsbsz; +unhold: + vm_page_unhold_pages(ma, nheld); +#else /* !STATS */ + INP_WUNLOCK(inp); + error = EOPNOTSUPP; +#endif /* !STATS */ + break; + } case TCP_CONGESTION: len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX); INP_WUNLOCK(inp); Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -209,7 +209,14 @@ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const 
char *t_output_caller; /* Function that called tcp_output */ + struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ + uint32_t gput_ts; /* Time goodput measurement started */ + tcp_seq gput_seq; /* Outbound measurement seq */ + tcp_seq gput_ack; /* Inbound measurement ack */ + struct tcpsyninfo t_rxsyninfo; /* Peer's SYN details */ + int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ + uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { @@ -320,6 +327,7 @@ #define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x001000 /* don't push */ #define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */ +#define TF_GPUTINPROG 0x008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */ #define TF_LASTIDLE 0x040000 /* connection was previously idle */ @@ -399,7 +407,7 @@ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ - u_int32_t to_spare; /* UTO */ + struct tcpsyninfo to_syninfo; /* SYN specific opts data */ }; /* @@ -738,6 +746,11 @@ #define TCPCTL_DROP 15 /* drop tcp connection */ #define TCPCTL_STATES 16 /* connection counts by TCP state */ +/* These stats(9) related bits need to be visible to userland code. 
*/ +int tcp_stats_init(void); +#define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable) +#define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl) + #ifdef _KERNEL #ifdef SYSCTL_DECL SYSCTL_DECL(_net_inet_tcp); @@ -774,6 +787,8 @@ VNET_DECLARE(int, tcp_insecure_syn); VNET_DECLARE(int, tcp_minmss); VNET_DECLARE(int, tcp_mssdflt); +VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl); +VNET_DECLARE(int, tcp_perconn_stats_enable); VNET_DECLARE(int, tcp_recvspace); VNET_DECLARE(int, tcp_sack_globalholes); VNET_DECLARE(int, tcp_sack_globalmaxholes); @@ -942,6 +957,8 @@ int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); +int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, + size_t seed_len); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb);