Page MenuHomeFreeBSD

D20655.id58671.diff
No OneTemporary

D20655.id58671.diff

Index: share/man/man4/tcp.4
===================================================================
--- share/man/man4/tcp.4
+++ share/man/man4/tcp.4
@@ -291,6 +291,10 @@
.Pp
If an SADB entry cannot be found for the destination,
the system does not send any outgoing segments and drops any inbound segments.
.Pp
Each dropped segment is taken into account in the TCP protocol statistics.
+.It Dv TCP_STATS
+Manage collection of connection level statistics using the
+.Xr stats 3
+framework.
.El
@@ -606,6 +610,17 @@
.It Va insecure_syn
Use criteria defined in RFC793 instead of RFC5961 for accepting SYN segments.
Default is false.
+.It Va perconn_stats_enable
+Controls the default collection of statistics for all connections using the
+.Xr stats 3
+framework.
+0 disables, 1 enables, 2 enables random sampling across log id connection
+groups with all connections in a group receiving the same setting.
+.It Va perconn_stats_sample_rates
+A CSV list of template_spec=percent key-value pairs which controls the per
+template sampling rates when
+.Xr stats 3
+sampling is enabled.
.El
.Sh ERRORS
A socket operation may fail with one of the following errors returned:
@@ -645,6 +660,7 @@
.Sh SEE ALSO
.Xr getsockopt 2 ,
.Xr socket 2 ,
+.Xr stats 3 ,
.Xr sysctl 3 ,
.Xr blackhole 4 ,
.Xr inet 4 ,
Index: sys/conf/files
===================================================================
--- sys/conf/files
+++ sys/conf/files
@@ -4271,6 +4271,7 @@
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
+netinet/tcp_stats.c optional stats inet | stats inet6
netinet/tcp_subr.c optional inet | inet6
netinet/tcp_syncache.c optional inet | inet6
netinet/tcp_timer.c optional inet | inet6
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -168,6 +168,7 @@
#define TCP_NOOPT 8 /* don't use TCP options */
#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
#define TCP_INFO 32 /* retrieve tcp_info structure */
+#define TCP_STATS 33 /* retrieve stats blob structure */
#define TCP_LOG 34 /* configure event logging for connection */
#define TCP_LOGBUF 35 /* retrieve event log for connection */
#define TCP_LOGID 36 /* configure log ID to correlate connections */
@@ -250,7 +251,20 @@
#define TCPI_OPT_WSCALE 0x04
#define TCPI_OPT_ECN 0x08
#define TCPI_OPT_TOE 0x10
+#define TCPI_OPT_TFO 0x20
+/*
+ * Storage for details exchanged during the handshake, regardless of whether
+ * they are in fact successfully negotiated or not. This is particularly useful
+ * for servers to gain insight into the options their clients try to use.
+ */
+struct tcpsyninfo {
+ uint8_t to_synopts[13]; /* option kinds seen on SYN, in parse order */
+ uint8_t to_nrsynopts; /* # entries filled in to_synopts[] */
+ uint8_t to_npsynopts; /* # opts parsed on SYN, incl. unrecorded */
+ uint8_t th_flags; /* flags present on SYN */
+};
+
/* Maximum length of log ID. */
#define TCP_LOG_ID_LEN 64
@@ -315,9 +329,10 @@
u_int32_t tcpi_snd_rexmitpack; /* Retransmitted packets */
u_int32_t tcpi_rcv_ooopack; /* Out-of-order packets */
u_int32_t tcpi_snd_zerowin; /* Zero-sized windows sent */
-
+ struct tcpsyninfo tcpi_rxsyninfo; /* Peers' SYN info */
+
/* Padding to grow without breaking ABI. */
- u_int32_t __tcpi_pad[26]; /* Padding. */
+ u_int32_t __tcpi_pad[22]; /* Padding. */
};
/*
@@ -337,4 +352,17 @@
uint32_t pcbcnt;
};
+/*
+ * TCP specific variables of interest for tp->t_stats stats(9) accounting.
+ */
+#define VOI_TCP_TXPB 0 /* Transmit payload bytes */
+#define VOI_TCP_RETXPB 1 /* Retransmit payload bytes */
+#define VOI_TCP_FRWIN 2 /* Foreign receive window */
+#define VOI_TCP_LCWIN 3 /* Local congestion window */
+#define VOI_TCP_RTT 4 /* Round trip time */
+#define VOI_TCP_CSIG 5 /* Congestion signal */
+#define VOI_TCP_GPUT 6 /* Goodput */
+#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */
+#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
+#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
#endif /* !_NETINET_TCP_H_ */
Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -66,6 +66,7 @@
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
#include <sys/protosw.h>
+#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
@@ -73,6 +74,8 @@
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
@@ -293,6 +296,10 @@
cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
uint16_t type)
{
+#ifdef STATS
+ int32_t gput;
+#endif
+
INP_WLOCK_ASSERT(tp->t_inpcb);
tp->ccv->nsegs = nsegs;
@@ -303,6 +310,32 @@
tp->ccv->flags &= ~CCF_CWND_LIMITED;
if (type == CC_ACK) {
+#ifdef STATS
+ stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
+ ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
+ if (!IN_RECOVERY(tp->t_flags))
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
+ tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs));
+ if ((tp->t_flags & TF_GPUTINPROG) &&
+ SEQ_GEQ(th->th_ack, tp->gput_ack)) {
+ gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) /
+ max(1, tcp_ts_getticks() - tp->gput_ts);
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+ gput);
+ /*
+ * XXXLAS: This is a temporary hack, and should be
+ * chained off VOI_TCP_GPUT when stats(9) grows an API
+ * to deal with chained VOIs.
+ */
+ if (tp->t_stats_gput_prev > 0)
+ stats_voi_update_abs_s32(tp->t_stats,
+ VOI_TCP_GPUT_ND,
+ ((gput - tp->t_stats_gput_prev) * 100) /
+ tp->t_stats_gput_prev);
+ tp->t_flags &= ~TF_GPUTINPROG;
+ tp->t_stats_gput_prev = gput;
+ }
+#endif /* STATS */
if (tp->snd_cwnd > tp->snd_ssthresh) {
tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
nsegs * V_tcp_abc_l_var * tcp_maxseg(tp));
@@ -321,6 +354,9 @@
tp->ccv->curack = th->th_ack;
CC_ALGO(tp)->ack_received(tp->ccv, type);
}
+#ifdef STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
+#endif
}
void
@@ -386,6 +422,10 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
+#endif
+
switch(type) {
case CC_NDUPACK:
if (!IN_FASTRECOVERY(tp->t_flags)) {
@@ -1571,6 +1611,9 @@
* For the SYN_SENT state the scale is zero.
*/
tiwin = th->th_win << tp->snd_scale;
+#ifdef STATS
+ stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
/*
* TCP ECN processing.
@@ -3289,9 +3332,15 @@
void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
+ struct tcpsyninfo *syninfo;
int opt, optlen;
to->to_flags = 0;
+ if (flags & TO_SYN) {
+ syninfo = &to->to_syninfo;
+ syninfo->to_npsynopts = syninfo->to_nrsynopts =
+ syninfo->th_flags = 0;
+ }
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == TCPOPT_EOL)
@@ -3305,6 +3354,12 @@
if (optlen < 2 || optlen > cnt)
break;
}
+ if (flags & TO_SYN) {
+ syninfo->to_npsynopts++;
+ if (syninfo->to_nrsynopts < sizeof(syninfo->to_synopts))
+ syninfo->to_synopts[syninfo->to_nrsynopts++] =
+ cp[0];
+ }
switch (opt) {
case TCPOPT_MAXSEG:
if (optlen != TCPOLEN_MAXSEG)
@@ -3436,6 +3491,9 @@
TCPSTAT_INC(tcps_rttupdated);
tp->t_rttupdated++;
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt));
+#endif
if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
/*
* srtt is stored as fixed point with 5 bits after the
Index: sys/netinet/tcp_log_buf.c
===================================================================
--- sys/netinet/tcp_log_buf.c
+++ sys/netinet/tcp_log_buf.c
@@ -34,6 +34,7 @@
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
+#include <sys/qmath.h>
#include <sys/queue.h>
#include <sys/refcount.h>
#include <sys/rwlock.h>
@@ -41,6 +42,7 @@
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <sys/counter.h>
#include <dev/tcp_log/tcp_log_dev.h>
@@ -475,7 +477,7 @@
INP_WLOCK_ASSERT(tp->t_inpcb);
-#ifdef NETFLIX
+#ifdef STATS
if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
(void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
#endif
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -50,10 +50,13 @@
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/protosw.h>
+#include <sys/qmath.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <net/if.h>
#include <net/route.h>
@@ -968,15 +971,31 @@
struct sockbuf *msb;
u_int moff;
- if ((tp->t_flags & TF_FORCEDATA) && len == 1)
+ if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
TCPSTAT_INC(tcps_sndprobe);
- else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+#ifdef STATS
+ if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+ stats_voi_update_abs_u32(tp->t_stats,
+ VOI_TCP_RETXPB, len);
+ else
+ stats_voi_update_abs_u64(tp->t_stats,
+ VOI_TCP_TXPB, len);
+#endif /* STATS */
+ } else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
tp->t_sndrexmitpack++;
TCPSTAT_INC(tcps_sndrexmitpack);
TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+ len);
+#endif /* STATS */
} else {
TCPSTAT_INC(tcps_sndpack);
TCPSTAT_ADD(tcps_sndbyte, len);
+#ifdef STATS
+ stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+ len);
+#endif /* STATS */
}
#ifdef INET6
if (MHLEN < hdrlen + max_linkhdr)
@@ -1448,6 +1467,13 @@
tp->t_rtttime = ticks;
tp->t_rtseq = startseq;
TCPSTAT_INC(tcps_segstimed);
+ }
+ if (!(tp->t_flags & TF_GPUTINPROG) && len) {
+ tp->t_flags |= TF_GPUTINPROG;
+ tp->gput_seq = startseq;
+ tp->gput_ack = startseq +
+ ulmin(sbavail(&so->so_snd) - off, sendwin);
+ tp->gput_ts = tcp_ts_getticks();
}
}
Index: sys/netinet/tcp_reass.c
===================================================================
--- sys/netinet/tcp_reass.c
+++ sys/netinet/tcp_reass.c
@@ -47,11 +47,14 @@
#include <sys/eventhandler.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
+#include <sys/qmath.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <vm/uma.h>
Index: sys/netinet/tcp_stats.c
===================================================================
--- /dev/null
+++ sys/netinet/tcp_stats.c
@@ -0,0 +1,269 @@
+/*-
+ * Copyright (c) 2016-2018 Netflix, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Author: Lawrence Stewart <lstewart@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/qmath.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/tree.h>
+#ifdef _KERNEL
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/systm.h>
+#endif
+#include <sys/stats.h>
+
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc.h>
+
+VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1;
+
+#ifdef _KERNEL
+
+VNET_DEFINE(int, tcp_perconn_stats_enable) = 2;
+VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates);
+VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0;
+#define V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates)
+#define V_tcp_stats_nrates VNET(tcp_stats_nrates)
+
+static struct rmlock tcp_stats_tpl_sampling_lock;
+static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+ struct stats_tpl_sample_rate **rates, int *nrates, void *ctx);
+
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_perconn_stats_enable), 0,
+ "Enable per-connection TCP stats gathering; 1 enables for all connections, "
+ "2 enables random sampling across log id connection groups");
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates,
+ CTLTYPE_STRING | CTLFLAG_RW, tcp_stats_tpl_sr_cb,
+ sizeof(struct rm_priotracker), stats_tpl_sample_rates, "A",
+ "TCP stats per template random sampling rates, in CSV tpl_spec=percent "
+ "key-value pairs (see stats(9) for template spec details)");
+#endif /* _KERNEL */
+
+/*
+ * Register the "TCP_DEFAULT" stats template and attach each TCP VOI's stat
+ * specs to it.  Returns 0 on success, the negated stats_tpl_alloc() result if
+ * the template cannot be allocated, else the value returned by the last failed
+ * stats_tpl_add_voistats() call (codes are kept whole, never OR-ed together).
+ */
+int
+#ifndef _KERNEL
+/* Ensure all templates are also added to the userland template list. */
+__attribute__ ((constructor))
+#endif
+tcp_stats_init(void)
+{
+	int err, lasterr;
+
+	lasterr = 0;
+
+	V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0);
+	if (V_tcp_perconn_stats_dflt_tpl < 0)
+		return (-V_tcp_perconn_stats_dflt_tpl);
+
+	struct voistatspec vss_sum[] = { STATS_VSS_SUM() };
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64,
+	    NVSS(vss_sum), vss_sum, 0);
+	lasterr = err ? err : lasterr;
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32,
+	    NVSS(vss_sum), vss_sum, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_max[] = { STATS_VSS_MAX() };
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG,
+	    NVSS(vss_max), vss_max, 0);
+	lasterr = err ? err : lasterr;
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG,
+	    NVSS(vss_max), vss_max, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_rtt[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_MIN(),
+		STATS_VSS_TDGSTCLUST32(20, 4),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32,
+	    NVSS(vss_rtt), vss_rtt, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_congsig[] = {
+		STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO),
+		    DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32,
+	    NVSS(vss_congsig), vss_congsig, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_gput[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_TDGSTCLUST32(20, 4),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32,
+	    NVSS(vss_gput), vss_gput, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_gput_nd[] = { STATS_VSS_TDGSTCLUST32(10, 4) };
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32,
+	    NVSS(vss_gput_nd), vss_gput_nd, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_windiff[] = {
+		STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32,
+	    NVSS(vss_windiff), vss_windiff, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_acklen[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32,
+	    NVSS(vss_acklen), vss_acklen, 0);
+	lasterr = err ? err : lasterr;
+
+	return (lasterr);
+}
+
+#ifdef _KERNEL
+/*
+ * Sample the per-template rates with the given seed; on a hit, replace
+ * tp->t_stats with a blob of the chosen template (tpl id, or <0 on failure).
+ */
+int
+tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+    size_t seed_len)
+{
+	struct rm_priotracker rmpt;
+	int chosen;
+
+	if (V_tcp_stats_nrates <= 0)
+		return (-1);
+
+	rm_rlock(&tcp_stats_tpl_sampling_lock, &rmpt);
+	chosen = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates,
+	    V_tcp_stats_nrates, seed_bytes, seed_len);
+	rm_runlock(&tcp_stats_tpl_sampling_lock, &rmpt);
+	if (chosen < 0)
+		return (chosen);
+
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	if (tp->t_stats != NULL)
+		stats_blob_destroy(tp->t_stats);
+	tp->t_stats = stats_blob_alloc(chosen, 0);
+	return (tp->t_stats != NULL ? chosen : -ENOMEM);
+}
+
+/*
+ * Callback function for stats_tpl_sample_rates() to interact with the TCP
+ * subsystem's stats template sample rates list.
+ */
+int
+tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx)
+{
+	struct stats_tpl_sample_rate *prev_rates;
+	struct rm_priotracker *rmpt;
+	int prev_nrates;
+
+	if (ctx == NULL)
+		return (ENOMEM);
+	rmpt = ctx;
+
+	switch (action) {
+	case TPL_SR_RLOCKED_GET:
+		/*
+		 * Returns with the read lock held, so the caller must follow
+		 * up with a TPL_SR_RUNLOCK action.
+		 */
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+		rm_rlock(&tcp_stats_tpl_sampling_lock, rmpt);
+		/* FALLTHROUGH */
+	case TPL_SR_UNLOCKED_GET:
+		/* Hand back whichever of the list and its length was asked for. */
+		if (rates != NULL)
+			*rates = V_tcp_perconn_stats_sample_rates;
+		if (nrates != NULL)
+			*nrates = V_tcp_stats_nrates;
+		return (0);
+	case TPL_SR_RUNLOCK:
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED);
+		rm_runlock(&tcp_stats_tpl_sampling_lock, rmpt);
+		return (0);
+	case TPL_SR_PUT:
+		KASSERT(rates != NULL && nrates != NULL,
+		    ("%s: PUT without new rates", __func__));
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+		if (rates == NULL || nrates == NULL)
+			return (EINVAL);
+		/* Swap in the new list under the write lock ... */
+		rm_wlock(&tcp_stats_tpl_sampling_lock);
+		prev_rates = V_tcp_perconn_stats_sample_rates;
+		prev_nrates = V_tcp_stats_nrates;
+		V_tcp_perconn_stats_sample_rates = *rates;
+		V_tcp_stats_nrates = *nrates;
+		rm_wunlock(&tcp_stats_tpl_sampling_lock);
+		/* ... and hand the displaced list back to the caller. */
+		*rates = prev_rates;
+		*nrates = prev_nrates;
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock,
+ "tcp_stats_tpl_sampling_lock");
+#endif /* _KERNEL */
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -50,6 +50,9 @@
#ifdef TCP_HHOOK
#include <sys/khelp.h>
#endif
+#include <sys/qmath.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#include <sys/sysctl.h>
#include <sys/jail.h>
#include <sys/malloc.h>
@@ -992,6 +995,13 @@
&V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
printf("%s: WARNING: unable to register helper hook\n", __func__);
#endif
+
+#ifdef STATS
+ if (tcp_stats_init())
+ printf("%s: WARNING: unable to initialise TCP stats\n",
+ __func__);
+#endif
+
hashsize = TCBHASHSIZE;
TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
if (hashsize == 0) {
@@ -1674,6 +1684,11 @@
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
+#ifdef STATS
+ if (V_tcp_perconn_stats_enable == 1)
+ tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
+#endif
+
return (tp); /* XXX */
}
@@ -1892,7 +1907,9 @@
#ifdef TCP_HHOOK
khelp_destroy_osd(tp->osd);
#endif
-
+#ifdef STATS
+ stats_blob_destroy(tp->t_stats);
+#endif
CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
if (tp->t_timers->tt_draincnt == 0) {
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -52,6 +52,7 @@
#include <sys/malloc.h>
#include <sys/refcount.h>
#include <sys/kernel.h>
+#include <sys/qmath.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#ifdef INET6
@@ -63,6 +64,8 @@
#include <sys/proc.h>
#include <sys/jail.h>
#include <sys/syslog.h>
+#include <sys/tree.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#ifdef DDB
#include <ddb/ddb.h>
@@ -106,6 +109,13 @@
#endif
#include <netipsec/ipsec_support.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
/*
* TCP protocol interface to socket abstraction.
*/
@@ -1520,6 +1530,9 @@
if (tp->t_flags & TF_ECN_PERMIT)
ti->tcpi_options |= TCPI_OPT_ECN;
+ if (tp->t_flags & TF_FASTOPEN)
+ ti->tcpi_options |= TCPI_OPT_TFO;
+
ti->tcpi_rto = tp->t_rxtcur * tick;
ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
@@ -1541,6 +1554,8 @@
ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+
+ memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo));
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
ti->tcpi_options |= TCPI_OPT_TOE;
@@ -1758,6 +1773,9 @@
struct tcp_info ti;
struct cc_algo *algo;
char *pbuf, buf[TCP_LOG_ID_LEN];
+#ifdef STATS
+ struct statsblob *sbp;
+#endif
size_t len;
/*
@@ -1875,6 +1893,36 @@
error = EINVAL;
break;
+ case TCP_STATS:
+#ifdef STATS
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ if (optval > 0)
+ sbp = stats_blob_alloc(
+ V_tcp_perconn_stats_dflt_tpl, 0);
+ else
+ sbp = NULL;
+
+ INP_WLOCK_RECHECK(inp);
+ if ((tp->t_stats != NULL && sbp == NULL) ||
+ (tp->t_stats == NULL && sbp != NULL)) {
+ struct statsblob *t = tp->t_stats;
+ tp->t_stats = sbp;
+ sbp = t;
+ }
+ INP_WUNLOCK(inp);
+
+ stats_blob_destroy(sbp);
+#else /* !STATS */
+ INP_WUNLOCK(inp);
+ return (EOPNOTSUPP);
+#endif /* !STATS */
+ break;
+
case TCP_CONGESTION:
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
@@ -2138,6 +2186,55 @@
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &ti, sizeof ti);
break;
+ case TCP_STATS:
+ {
+#ifdef STATS
+ int nheld;
+ TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
+
+ error = 0;
+ socklen_t outsbsz = sopt->sopt_valsize;
+ if (tp->t_stats == NULL)
+ error = ENOENT;
+ else if (outsbsz >= tp->t_stats->cursz)
+ outsbsz = tp->t_stats->cursz;
+ else if (outsbsz >= sizeof(struct statsblob))
+ outsbsz = sizeof(struct statsblob);
+ else
+ error = EINVAL;
+ INP_WUNLOCK(inp);
+ if (error)
+ break;
+
+ sbp = sopt->sopt_val;
+ nheld = atop(round_page(((vm_offset_t)sbp) +
+ (vm_size_t)outsbsz) - trunc_page(sbp));
+ vm_page_t ma[nheld];
+ if (vm_fault_quick_hold_pages(
+ &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
+ outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
+ nheld) < 0) {
+ error = EFAULT;
+ break;
+ }
+
+ if ((error = copyin_nofault(&(sbp->flags), &sbflags,
+ SIZEOF_MEMBER(struct statsblob, flags))))
+ goto unhold;
+
+ INP_WLOCK_RECHECK(inp);
+ error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
+ sbflags | SB_CLONE_USRDSTNOFAULT);
+ INP_WUNLOCK(inp);
+ sopt->sopt_valsize = outsbsz;
+unhold:
+ vm_page_unhold_pages(ma, nheld);
+#else /* !STATS */
+ INP_WUNLOCK(inp);
+ error = EOPNOTSUPP;
+#endif /* !STATS */
+ break;
+ }
case TCP_CONGESTION:
len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
INP_WUNLOCK(inp);
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -209,7 +209,14 @@
struct tcp_log_id_node *t_lin;
struct tcp_log_id_bucket *t_lib;
const char *t_output_caller; /* Function that called tcp_output */
+ struct statsblob *t_stats; /* Per-connection stats */
uint32_t t_logsn; /* Log "serial number" */
+ uint32_t gput_ts; /* Time goodput measurement started */
+ tcp_seq gput_seq; /* Outbound measurement seq */
+ tcp_seq gput_ack; /* Inbound measurement ack */
+ struct tcpsyninfo t_rxsyninfo; /* Peer's SYN details */
+ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */
+
uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */
union {
@@ -320,6 +327,7 @@
#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */
#define TF_NOPUSH 0x001000 /* don't push */
#define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */
+#define TF_GPUTINPROG 0x008000 /* Goodput measurement in progress */
#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */
#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */
#define TF_LASTIDLE 0x040000 /* connection was previously idle */
@@ -399,7 +407,7 @@
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
u_int8_t to_tfo_len; /* TFO cookie length */
- u_int32_t to_spare; /* UTO */
+ struct tcpsyninfo to_syninfo; /* SYN specific opts data */
};
/*
@@ -738,6 +746,11 @@
#define TCPCTL_DROP 15 /* drop tcp connection */
#define TCPCTL_STATES 16 /* connection counts by TCP state */
+/* These stats(9) related bits need to be visible to userland code. */
+int tcp_stats_init(void);
+#define V_tcp_perconn_stats_enable VNET(tcp_perconn_stats_enable)
+#define V_tcp_perconn_stats_dflt_tpl VNET(tcp_perconn_stats_dflt_tpl)
+
#ifdef _KERNEL
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_tcp);
@@ -774,6 +787,8 @@
VNET_DECLARE(int, tcp_insecure_syn);
VNET_DECLARE(int, tcp_minmss);
VNET_DECLARE(int, tcp_mssdflt);
+VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
+VNET_DECLARE(int, tcp_perconn_stats_enable);
VNET_DECLARE(int, tcp_recvspace);
VNET_DECLARE(int, tcp_sack_globalholes);
VNET_DECLARE(int, tcp_sack_globalmaxholes);
@@ -942,6 +957,8 @@
int tcp_compute_pipe(struct tcpcb *);
uint32_t tcp_compute_initwnd(uint32_t);
void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
+int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+ size_t seed_len);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
int32_t seglimit, int32_t segsize, struct sockbuf *sb);

File Metadata

Mime Type
text/plain
Expires
Thu, Mar 13, 3:18 AM (14 h, 52 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
17129343
Default Alt Text
D20655.id58671.diff (27 KB)

Event Timeline