D39210.diff

Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -227,6 +227,7 @@
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS opt_inet.h
+TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
Index: sys/kern/kern_sendfile.c
===================================================================
--- sys/kern/kern_sendfile.c
+++ sys/kern/kern_sendfile.c
@@ -57,6 +57,9 @@
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -1188,6 +1191,12 @@
NULL, NULL, td);
sendfile_iodone(sfio, NULL, 0, error);
}
+#ifdef TCP_REQUEST_TRK
+ if (so->so_proto->pr_protocol == IPPROTO_TCP) {
+ /* log the sendfile call to the TCP log, if enabled */
+ tcp_log_sendfile(so, offset, nbytes, flags);
+ }
+#endif
CURVNET_RESTORE();
m = NULL;
Index: sys/modules/tcp/rack/Makefile
===================================================================
--- sys/modules/tcp/rack/Makefile
+++ sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -217,15 +217,15 @@
/* Options for Rack and BBR */
#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
-#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
+#define TCP_RACK_PROP 1051 /* Not used */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
-#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
+#define TCP_RACK_PROP_RATE 1056 /* Not used */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
-#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
+#define TCP_RACK_EARLY_RECOV 1059 /* Not used */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
@@ -309,12 +309,22 @@
#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
-#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
+#define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */
#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */
#define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */
+#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */
+#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */
+#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */
+#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
+#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
+#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
+#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
+#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
+
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -447,6 +457,53 @@
#define TLS_SET_RECORD_TYPE 1
#define TLS_GET_RECORD 2
+/*
+ * TCP log user opaque
+ */
+struct http_req {
+ uint64_t timestamp;
+ uint64_t start;
+ uint64_t end;
+ uint32_t flags;
+};
+
+union tcp_log_userdata {
+ struct http_req http_req;
+};
+
+struct tcp_log_user {
+ uint32_t type;
+ uint32_t subtype;
+ union tcp_log_userdata data;
+};
+
+/* user types, i.e. apps */
+#define TCP_LOG_USER_HTTPD 1
+
+/* user subtypes */
+#define TCP_LOG_HTTPD_TS 1 /* client timestamp */
+#define TCP_LOG_HTTPD_TS_REQ 2 /* client timestamp and request info */
+
+/* HTTPD REQ flags */
+#define TCP_LOG_HTTPD_RANGE_START 0x0001
+#define TCP_LOG_HTTPD_RANGE_END 0x0002
+
+/* Flags for hybrid pacing */
+#define TCP_HYBRID_PACING_CU 0x0001 /* Enable catch-up mode */
+#define TCP_HYBRID_PACING_DTL 0x0002 /* Enable Detailed logging */
+#define TCP_HYBRID_PACING_CSPR 0x0004 /* A client suggested rate is present */
+#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
+#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */
+#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Client wants us to set the mss overriding gp est in CU */
+#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
+
+struct tcp_hybrid_req {
+ struct http_req req;
+ uint64_t cspr;
+ uint32_t hint_maxseg;
+ uint32_t hybrid_flags;
+};
+
/*
* TCP specific variables of interest for tp->t_stats stats(9) accounting.
*/
@@ -460,6 +517,7 @@
#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */
#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
+#define VOI_TCP_PATHRTT 10 /* The path RTT based on ACK arrival */
#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */
#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */
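
Illustrative userspace sketch for the new hybrid-pacing and request-tracking definitions above (not part of the diff). It assumes the TCP_HYBRID_PACING option takes a struct tcp_hybrid_req as its option value; the in-kernel consumer of the option is not shown in this change.

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: ask for hybrid pacing on one closed byte range. */
    static int
    request_hybrid_pacing(int fd, uint64_t ts, uint64_t start, uint64_t end,
        uint64_t client_rate)
    {
            struct tcp_hybrid_req hyb;

            memset(&hyb, 0, sizeof(hyb));
            hyb.req.timestamp = ts;                 /* opaque client timestamp */
            hyb.req.start = start;                  /* sendfile offset of the range */
            hyb.req.end = end;
            hyb.req.flags = TCP_LOG_HTTPD_RANGE_START | TCP_LOG_HTTPD_RANGE_END;
            hyb.cspr = client_rate;                 /* client suggested pace rate */
            hyb.hybrid_flags = TCP_HYBRID_PACING_ENABLE | TCP_HYBRID_PACING_CSPR;
            return (setsockopt(fd, IPPROTO_TCP, TCP_HYBRID_PACING,
                &hyb, sizeof(hyb)));
    }
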
Index: sys/netinet/tcp_hpts.h
===================================================================
--- sys/netinet/tcp_hpts.h
+++ sys/netinet/tcp_hpts.h
@@ -187,6 +187,15 @@
}
#ifdef _KERNEL
+
+extern int32_t tcp_min_hptsi_time;
+
+__inline int32_t
+get_hpts_min_sleep_time()
+{
+ return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
+}
+
static __inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
Index: sys/netinet/tcp_log_buf.c
===================================================================
--- sys/netinet/tcp_log_buf.c
+++ sys/netinet/tcp_log_buf.c
@@ -58,6 +58,7 @@
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_seq.h>
#include <netinet/tcp_hpts.h>
/* Default expiry time */
@@ -2844,6 +2845,10 @@
{
struct inpcb *inp;
struct tcpcb *tp;
+#ifdef TCP_REQUEST_TRK
+ struct http_sendfile_track *ent;
+ int i, fnd;
+#endif
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
@@ -2873,6 +2878,90 @@
&tptosocket(tp)->so_snd,
TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
}
+#ifdef TCP_REQUEST_TRK
+ if (tp->t_http_req == 0) {
+ /* No http requests to track */
+ goto done;
+ }
+ fnd = 0;
+ if (tp->t_http_closed == 0) {
+ /* No closed end req to track */
+ goto skip_closed_req;
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ /* Lets see if this one can be found */
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /* This pass does not consider open requests */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
+ /* Don't look at what we have completed */
+ continue;
+ }
+ /* If we reach here it's an allocated closed end request */
+ if ((ent->start == offset) ||
+ ((offset > ent->start) && (offset < ent->end))){
+ /* It's within this request?? */
+ fnd = 1;
+ }
+ if (fnd) {
+ /*
+ * It is at or past the end, its complete.
+ */
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ /*
+ * When an entry completes we can take (snd_una + sb_cc) and know where
+ * the end of the range really is. Note that this works since two
+ * requests must be sequential and sendfile now is complete for *this* request.
+ * we must use sb_ccc since the data may still be in-flight in TLS.
+ *
+ * We always cautiously move the end_seq only if our calculations
+ * show it happened (just in case sf has the call to here at the wrong
+ * place). When we go COMP we will stop coming here and hopefully be
+ * left with the correct end_seq.
+ */
+ if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
+ ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
+ if ((offset + nbytes) >= ent->end) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_COMP;
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes);
+ } else {
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes);
+ }
+ /* We assume that sendfile never sends overlapping requests */
+ goto done;
+ }
+ }
+skip_closed_req:
+ if (!fnd) {
+ /* Ok now lets look for open requests */
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0)
+ continue;
+ /* If we reach here it's an allocated open request */
+ if (ent->start == offset) {
+ /* It begins this request */
+ ent->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ } else if (offset > ent->start) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ }
+ }
+ }
+#endif
done:
INP_WUNLOCK(inp);
}
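
A worked view of the sequence bound used in the closed-range pass above (illustrative note, not part of the diff):

    /*
     * Requests are sent in order, so when the sendfile call that finishes
     * a closed range returns, all of its bytes are already accounted for
     * in the send buffer, giving
     *
     *      end_seq <= snd_una + so_snd.sb_ccc
     *
     * e.g. snd_una = 1000 and sb_ccc = 64000 bounds end_seq at 65000.
     * SEQ_GT() only lets the recorded end move forward, so later calls
     * can refine the estimate but never pull it backwards.
     */
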
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -500,7 +500,7 @@
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
int32_t line);
static void
-bbr_stop_all_timers(struct tcpcb *tp);
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr);
static void
bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
@@ -1970,7 +1970,7 @@
static void
bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2669,7 +2669,7 @@
uint32_t newbw, uint32_t obw, uint32_t diff,
uint32_t tim)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (/*bbr_verbose_logging && */tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2697,7 +2697,7 @@
static inline void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (bbr_verbose_logging && tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
@@ -6281,6 +6281,9 @@
else
apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
}
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rtt));
+#endif
if (bbr->rc_ack_was_delayed)
rtt += bbr->r_ctl.rc_ack_hdwr_delay;
@@ -9850,16 +9853,13 @@
}
static void
-bbr_stop_all_timers(struct tcpcb *tp)
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr)
{
- struct tcp_bbr *bbr;
-
/*
* Assure no timers are running.
*/
if (tcp_timer_active(tp, TT_PERSIST)) {
/* We enter in persists, set the flag appropriately */
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
bbr->rc_in_persist = 1;
}
}
@@ -9927,14 +9927,14 @@
* which indicates the error (usually no memory).
*/
static int
-bbr_init(struct tcpcb *tp)
+bbr_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_bbr *bbr = NULL;
uint32_t cts;
- tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -9943,10 +9943,16 @@
*/
return (ENOMEM);
}
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr = (struct tcp_bbr *)*ptr;
bbr->rtt_valid = 0;
inp->inp_flags2 |= INP_CANNOT_DO_ECN;
inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ /* Take off any undesired flags */
+ inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ inp->inp_flags2 &= ~INP_MBUF_ACKCMP;
+ inp->inp_flags2 &= ~INP_MBUF_L_ACKS;
+
TAILQ_INIT(&bbr->r_ctl.rc_map);
TAILQ_INIT(&bbr->r_ctl.rc_free);
TAILQ_INIT(&bbr->r_ctl.rc_tmap);
@@ -10074,8 +10080,8 @@
rsm = bbr_alloc(bbr);
if (rsm == NULL) {
- uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
- tp->t_fb_ptr = NULL;
+ uma_zfree(bbr_pcb_zone, *ptr);
+ *ptr = NULL;
return (ENOMEM);
}
rsm->r_rtt_not_allowed = 1;
@@ -10128,7 +10134,17 @@
* the TCB on the hptsi wheel if a timer is needed with appropriate
* flags.
*/
- bbr_stop_all_timers(tp);
+ bbr_stop_all_timers(tp, bbr);
+ /*
+ * Validate the timers are not in usec, if they are convert.
+ * BBR should in theory move to USEC and get rid of a
+ * lot of the TICKS_2 calls.. but for now we stay
+ * with tick timers.
+ */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
return (0);
}
@@ -10172,7 +10188,6 @@
bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
if (tp->t_fb_ptr) {
- struct inpcb *inp = tptoinpcb(tp);
uint32_t calc;
struct tcp_bbr *bbr;
struct bbr_sendmap *rsm;
@@ -10182,10 +10197,6 @@
tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
bbr_log_flowend(bbr);
bbr->rc_tp = NULL;
- /* Backout any flags2 we applied */
- inp->inp_flags2 &= ~INP_CANNOT_DO_ECN;
- inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
if (bbr->bbr_hdrw_pacing)
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
@@ -11853,7 +11864,6 @@
int32_t isipv6;
#endif
uint8_t app_limited = BBR_JR_SENT_DATA;
- uint8_t filled_all = 0;
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
/* We take a cache hit here */
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
@@ -13162,7 +13172,7 @@
if_hw_tsomaxsegsize, msb,
((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
- , &filled_all
+ , NULL, NULL
#endif
);
if (len <= maxseg) {
@@ -13474,7 +13484,7 @@
#endif
/* Log to the black box */
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -13483,13 +13493,10 @@
log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
log.u_bbr.flex3 = maxseg;
log.u_bbr.flex4 = delay_calc;
- /* Encode filled_all into the upper flex5 bit */
log.u_bbr.flex5 = bbr->rc_past_init_win;
log.u_bbr.flex5 <<= 1;
log.u_bbr.flex5 |= bbr->rc_no_pacing;
log.u_bbr.flex5 <<= 29;
- if (filled_all)
- log.u_bbr.flex5 |= 0x80000000;
log.u_bbr.flex5 |= tp->t_maxseg;
log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
@@ -14073,6 +14080,56 @@
return (0);
}
+static void
+bbr_switch_failed(struct tcpcb *tp)
+{
+ /*
+ * If a switch fails we only need to
+ * make sure mbuf_queuing is still in place.
+ * We also need to make sure we are still in
+ * ticks granularity (though we should probably
+ * change bbr to go to USECs).
+ *
+ * For timers we need to see if we are still in the
+ * pacer (if our flags are up) if so we are good, if
+ * not we need to get back into the pacer.
+ */
+ struct inpcb *inp = tptoinpcb(tp);
+ struct timeval tv;
+ uint32_t cts;
+ uint32_t toval;
+ struct tcp_bbr *bbr;
+ struct hpts_diag diag;
+
+ inp->inp_flags2 |= INP_CANNOT_DO_ECN;
+ inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if (inp->inp_in_hpts) {
+ return;
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = tcp_get_usecs(&tv);
+ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ if (TSTMP_GT(bbr->rc_pacer_started, cts)) {
+ toval = bbr->rc_pacer_started - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
+ toval = bbr->r_ctl.rc_timer_exp - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else
+ toval = HPTS_TICKS_PER_SLOT;
+ (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ __LINE__, &diag);
+ bbr_log_hpts_diag(bbr, cts, &diag);
+}
+
struct tcp_function_block __tcp_bbr = {
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = bbr_output,
@@ -14087,6 +14144,7 @@
.tfb_tcp_handoff_ok = bbr_handoff_ok,
.tfb_tcp_mtu_chg = bbr_mtu_chg,
.tfb_pru_options = bbr_pru_options,
+ .tfb_switch_failed = bbr_switch_failed,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
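
The bbr_init() change above follows the new tfb_tcp_fb_init() calling convention: the stack hands its private state back through a void ** and the caller installs it into tp->t_fb_ptr only on success. A minimal sketch of that contract (illustrative; "mystack" and mystack_pcb_zone are hypothetical, not part of this diff):

    struct mystack_pcb {
            struct tcpcb *ms_tp;
            /* ... per-connection state for the stack ... */
    };

    static int
    mystack_init(struct tcpcb *tp, void **ptr)
    {
            struct mystack_pcb *msp;

            *ptr = uma_zalloc(mystack_pcb_zone, M_NOWAIT | M_ZERO);
            if (*ptr == NULL)
                    return (ENOMEM);        /* the old stack stays installed */
            msp = (struct mystack_pcb *)*ptr;
            msp->ms_tp = tp;
            /* Declare which LRO/mbuf-queuing features this stack supports. */
            tcp_lro_features_off(tptoinpcb(tp));
            tptoinpcb(tp)->inp_flags2 |= INP_SUPPORTS_MBUFQ;
            /* Pick the timer granularity this stack works in. */
            tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
            return (0);     /* caller stores *ptr into tp->t_fb_ptr */
    }

Because nothing touches tp->t_fb_ptr on failure, a connection whose switch fails simply stays on the stack it was already using.
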
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -458,7 +458,7 @@
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
-static int32_t rack_init(struct tcpcb *tp);
+static int32_t rack_init(struct tcpcb *tp, void **ptr);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
@@ -12344,7 +12344,7 @@
}
static int
-rack_init(struct tcpcb *tp)
+rack_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_rack *rack = NULL;
@@ -12354,8 +12354,8 @@
uint32_t iwin, snt, us_cts;
int err;
- tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -12364,9 +12364,9 @@
*/
return (ENOMEM);
}
- memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
+ memset(*ptr, 0, sizeof(struct tcp_rack));
- rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack = (struct tcp_rack *)*ptr;
RB_INIT(&rack->r_ctl.rc_mtree);
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -109,6 +109,7 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_fastopen.h>
@@ -152,6 +153,11 @@
CTLFLAG_RW,
&tcp_force_detection, 0,
"Do we force detection even if the INP has it off?");
+int32_t tcp_sad_limit = 10000;
+SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+ CTLFLAG_RW,
+ &tcp_sad_limit, 10000,
+ "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");
int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
CTLFLAG_RW,
@@ -363,7 +369,7 @@
VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
#define V_ts_offset_secret VNET(ts_offset_secret)
-static int tcp_default_fb_init(struct tcpcb *tp);
+static int tcp_default_fb_init(struct tcpcb *tp, void **ptr);
static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -519,18 +525,11 @@
tcp_switch_back_to_default(struct tcpcb *tp)
{
struct tcp_function_block *tfb;
+ void *ptr = NULL;
KASSERT(tp->t_fb != &tcp_def_funcblk,
("%s: called by the built-in default stack", __func__));
- /*
- * Release the old stack. This function will either find a new one
- * or panic.
- */
- if (tp->t_fb->tfb_tcp_fb_fini != NULL)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
-
/*
* Now, we'll find a new function block to use.
* Start by trying the current user-selected
@@ -551,14 +550,20 @@
/* Try to use that stack. */
if (tfb != NULL) {
/* Initialize the new stack. If it succeeds, we are done. */
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init == NULL ||
- (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ if (tfb->tfb_tcp_fb_init == NULL ||
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
return;
-
+ }
/*
* Initialization failed. Release the reference count on
- * the stack.
+ * the looked up default stack.
*/
refcount_release(&tfb->tfb_refcnt);
}
@@ -578,12 +583,18 @@
panic("Default stack rejects a new session?");
}
}
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init != NULL &&
- (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if (tfb->tfb_tcp_fb_init != NULL &&
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr)) {
/* The default stack cannot fail */
panic("Default stack initialization failed");
}
+ /* Now release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* And set in the pointers to the new */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
}
static bool
@@ -1040,16 +1051,37 @@
* it is required to always succeed since it is the stack of last resort!
*/
static int
-tcp_default_fb_init(struct tcpcb *tp)
+tcp_default_fb_init(struct tcpcb *tp, void **ptr)
{
struct socket *so = tptosocket(tp);
+ int rexmt;
INP_WLOCK_ASSERT(tptoinpcb(tp));
+ /* We don't use the pointer */
+ *ptr = NULL;
KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
("%s: connection %p in unexpected state %d", __func__, tp,
tp->t_state));
+ /* Make sure we get no interesting mbuf queuing behavior */
+ /* All mbuf queue/ack compress flags should be off */
+ tcp_lro_features_off(tptoinpcb(tp));
+
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ /* Validate the timers are not in usec, if they are convert */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ if (tp->t_rxtshift == 0)
+ tp->t_rxtcur = rexmt;
+ else
+ TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+
/*
* Nothing to do for ESTABLISHED or LISTEN states. And, we don't
* know what to do for unexpected states (which includes TIME_WAIT).
@@ -2240,6 +2272,8 @@
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
+ /* We always start with ticks granularity */
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -2265,7 +2299,7 @@
#endif
tp->t_pacing_rate = -1;
if (tp->t_fb->tfb_tcp_fb_init) {
- if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
refcount_release(&tp->t_fb->tfb_refcnt);
return (NULL);
}
@@ -4019,3 +4053,524 @@
}
}
#endif
+
+void
+tcp_change_time_units(struct tcpcb *tp, int granularity)
+{
+ if (tp->t_tmr_granularity == granularity) {
+ /* We are there */
+ return;
+ }
+ if (granularity == TCP_TMR_GRANULARITY_USEC) {
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS),
+ ("Granularity is not TICKS its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = tp->t_srtt >> TCP_RTT_SHIFT;
+ frac = tp->t_srtt & 0x1f;
+ tp->t_srtt = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
+ frac = tp->t_rttvar & 0x1f;
+ tp->t_rttvar = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC;
+ } else if (granularity == TCP_TMR_GRANULARITY_TICKS) {
+ /* Convert back to ticks, with */
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC),
+ ("Granularity is not USEC its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_srtt);
+ frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
+ tp->t_srtt = val << TCP_RTT_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_rttvar);
+ frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
+ tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
+ }
+#ifdef INVARIANTS
+ else {
+ panic("Unknown granularity:%d tp:%p",
+ granularity, tp);
+ }
+#endif
+}
+
+void
+tcp_handle_orphaned_packets(struct tcpcb *tp)
+{
+ struct mbuf *save, *m, *prev;
+ /*
+ * Called when a stack switch is occurring from the fini()
+ * of the old stack. We assume the init() of the new stack
+ * has already been run and has set the inp_flags2 to
+ * what it supports. This function will then deal with any
+ * differences, i.e. clean up packets that may be queued that
+ * the new stack does not support.
+ */
+
+ if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS)
+ return;
+ if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0) {
+ /*
+ * It is unsafe to process the packets since a
+ * reset may be lurking in them (its rare but it
+ * can occur). If we were to find a RST, then we
+ * would end up dropping the connection and the
+ * INP lock, so when we return the caller (tcp_usrreq)
+ * will blow up when it trys to unlock the inp.
+ * This new stack does not do any fancy LRO features
+ * so all we can do is toss the packets.
+ */
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ while (m) {
+ save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = save;
+ }
+ } else {
+ /*
+ * Here we have a stack that does mbuf queuing but
+ * does not support compressed ack's. We must
+ * walk all the mbufs and discard any compressed acks.
+ */
+ m = tp->t_in_pkt;
+ prev = NULL;
+ while (m) {
+ if (m->m_flags & M_ACKCMP) {
+ /* We must toss this packet */
+ if (tp->t_tail_pkt == m)
+ tp->t_tail_pkt = prev;
+ if (prev)
+ prev->m_nextpkt = m->m_nextpkt;
+ else
+ tp->t_in_pkt = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ /* move forward */
+ if (prev)
+ m = prev->m_nextpkt;
+ else
+ m = tp->t_in_pkt;
+ } else {
+ /* this one is ok */
+ prev = m;
+ m = m->m_nextpkt;
+ }
+ }
+ }
+}
+
+#ifdef TCP_REQUEST_TRK
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes)
+{
+#ifdef KERN_TLS
+ struct ktls_session *tls;
+ uint32_t rec_oh, records;
+
+ tls = so->so_snd.sb_tls_info;
+ if (tls == NULL)
+ return (0);
+
+ rec_oh = tls->params.tls_hlen + tls->params.tls_tlen;
+ records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len);
+ return (records * rec_oh);
+#else
+ return (0);
+#endif
+}
+
+extern uint32_t tcp_stale_entry_time;
+uint32_t tcp_stale_entry_time = 250000;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW,
+ &tcp_stale_entry_time, 250000, "Time that a http entry without a sendfile ages out");
+
+void
+tcp_http_log_req_info(struct tcpcb *tp, struct http_sendfile_track *http,
+ uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes)
+{
+ if (tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+#ifdef TCPHPTS
+ log.u_bbr.inhpts = tcp_in_hpts(tptoinpcb(tp));
+#endif
+ log.u_bbr.flex8 = val;
+ log.u_bbr.rttProp = http->timestamp;
+ log.u_bbr.delRate = http->start;
+ log.u_bbr.cur_del_rate = http->end;
+ log.u_bbr.flex1 = http->start_seq;
+ log.u_bbr.flex2 = http->end_seq;
+ log.u_bbr.flex3 = http->flags;
+ log.u_bbr.flex4 = ((http->localtime >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex5 = (http->localtime & 0x00000000ffffffff);
+ log.u_bbr.flex7 = slot;
+ log.u_bbr.bw_inuse = offset;
+ /* nbytes = flex6 | epoch */
+ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff);
+ log.u_bbr.epoch = (nbytes & 0x00000000ffffffff);
+ /* cspr = lt_epoch | pkts_out */
+ log.u_bbr.lt_epoch = ((http->cspr >> 32) & 0x00000000ffffffff);
+ log.u_bbr.pkts_out |= (http->cspr & 0x00000000ffffffff);
+ log.u_bbr.applimited = tp->t_http_closed;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_open;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_req;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ TCP_LOG_HTTP_T, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+void
+tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent)
+{
+ if (tp->t_http_req > 0)
+ tp->t_http_req--;
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ if (tp->t_http_open > 0)
+ tp->t_http_open--;
+ } else {
+ if (tp->t_http_closed > 0)
+ tp->t_http_closed--;
+ }
+ ent->flags = TCP_HTTP_TRACK_FLG_EMPTY;
+}
+
+static void
+tcp_http_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest)
+{
+ struct http_sendfile_track *ent;
+ uint64_t time_delta, oldest_delta;
+ int i, oldest, oldest_set = 0, cnt_rm = 0;
+
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags != TCP_HTTP_TRACK_FLG_USED) {
+ /*
+ * We only care about closed end ranges
+ * that are allocated and have no sendfile
+ * ever touching them. They would be in
+ * state USED.
+ */
+ continue;
+ }
+ if (ts >= ent->localtime)
+ time_delta = ts - ent->localtime;
+ else
+ time_delta = 0;
+ if (time_delta &&
+ ((oldest_delta < time_delta) || (oldest_set == 0))) {
+ oldest_set = 1;
+ oldest = i;
+ oldest_delta = time_delta;
+ }
+ if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) {
+ /*
+ * No sendfile in our time limit,
+ * time to purge it.
+ */
+ cnt_rm++;
+ tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE,
+ time_delta, 0);
+ tcp_http_free_a_slot(tp, ent);
+ }
+ }
+ if ((cnt_rm == 0) && rm_oldest && oldest_set) {
+ ent = &tp->t_http_info[oldest];
+ tcp_http_log_req_info(tp, ent, oldest, TCP_HTTP_REQ_LOG_STALE,
+ oldest_delta, 1);
+ tcp_http_free_a_slot(tp, ent);
+ }
+}
+
+int
+tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point)
+{
+ int i, ret=0;
+ struct http_sendfile_track *ent;
+
+ /* Clean up any old closed end requests that are now completed */
+ if (tp->t_http_req == 0)
+ return(0);
+ if (tp->t_http_closed == 0)
+ return(0);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ /* Skip empty ones */
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ /* Skip open ones */
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ continue;
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ /* We are past it -- free it */
+ tcp_http_log_req_info(tp, ent,
+ i, TCP_HTTP_REQ_LOG_FREED, 0, 0);
+ tcp_http_free_a_slot(tp, ent);
+ ret++;
+ }
+ }
+ return (ret);
+}
+
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point)
+{
+ if (tp->t_http_req == 0)
+ return(-1);
+ if (tp->t_http_closed == 0)
+ return(-1);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ return(-1);
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ return (1);
+ }
+ return (0);
+}
+
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip)
+{
+ /*
+ * Given an ack point (th_ack) walk through our entries and
+ * return the first one found that th_ack goes past the
+ * end_seq.
+ */
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) {
+ if (SEQ_GEQ(th_ack, ent->end_seq)) {
+ *ip = i;
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq)
+{
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_SEARCH,
+ (uint64_t)seq, 0);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /*
+ * An open end request only needs to
+ * match the beginning seq or be
+ * all we have (once we keep going on
+ * a open end request we may have a seq
+ * wrap).
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) ||
+ (tp->t_http_closed == 0))
+ return (ent);
+ } else {
+ /*
+ * For this one we need to
+ * be a bit more careful if its
+ * completed at least.
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) &&
+ (SEQ_LT(seq, ent->end_seq))) {
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+/* Should this be in its own file tcp_http.c ? */
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups)
+{
+ struct http_sendfile_track *fil;
+ int i, allocated;
+
+ /* In case the stack does not check for completions do so now */
+ tcp_http_check_for_comp(tp, tp->snd_una);
+ /* Check for stale entries */
+ if (tp->t_http_req)
+ tcp_http_check_for_stale_entries(tp, ts,
+ (tp->t_http_req >= MAX_TCP_HTTP_REQ));
+ /* Check to see if this is a duplicate of one not started */
+ if (tp->t_http_req) {
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags != TCP_HTTP_TRACK_FLG_USED)
+ continue;
+ if ((fil->timestamp == req->timestamp) &&
+ (fil->start == req->start) &&
+ ((fil->flags & TCP_HTTP_TRACK_FLG_OPEN) ||
+ (fil->end == req->end))) {
+ /*
+ * We already have this request
+ * and it has not been started with sendfile.
+ * This probably means the user was returned
+ * a 4xx of some sort and its going to age
+ * out, lets not duplicate it.
+ */
+ return(fil);
+ }
+ }
+ }
+ /* Ok if there is no room at the inn we are in trouble */
+ if (tp->t_http_req >= MAX_TCP_HTTP_REQ) {
+ tcp_trace_point(tp, TCP_TP_HTTP_LOG_FAIL);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ tcp_http_log_req_info(tp, &tp->t_http_info[i],
+ i, TCP_HTTP_REQ_LOG_ALLOCFAIL, 0, 0);
+ }
+ return (NULL);
+ }
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ allocated = 1;
+ fil->flags = TCP_HTTP_TRACK_FLG_USED;
+ fil->timestamp = req->timestamp;
+ fil->localtime = ts;
+ fil->start = req->start;
+ if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
+ fil->end = req->end;
+ } else {
+ fil->end = 0;
+ fil->flags |= TCP_HTTP_TRACK_FLG_OPEN;
+ }
+ /*
+ * We can set the min boundaries to the TCP Sequence space,
+ * but it might be found to be further up when sendfile
+ * actually runs on this range (if it ever does).
+ */
+ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
+ fil->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ if (tptosocket(tp)->so_snd.sb_tls_info) {
+ /*
+ * This session is doing TLS. Take a swag guess
+ * at the overhead.
+ */
+ fil->end_seq += tcp_estimate_tls_overhead(
+ tptosocket(tp), (fil->end - fil->start));
+ }
+ tp->t_http_req++;
+ if (fil->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ tp->t_http_open++;
+ else
+ tp->t_http_closed++;
+ tcp_http_log_req_info(tp, fil, i,
+ TCP_HTTP_REQ_LOG_NEW, 0, 0);
+ break;
+ } else
+ fil = NULL;
+ }
+ return (fil);
+}
+
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts)
+{
+ (void)tcp_http_alloc_req_full(tp, &user->http_req, ts, 1);
+}
+#endif
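
A worked example of the tick/usec conversion performed by tcp_change_time_units() above (illustrative, assuming hz = 1000 so one tick is 1 ms):

    /*
     * t_srtt is kept as ticks scaled by TCP_RTT_SCALE (32).  Take
     * t_srtt = 165 in tick units:
     *
     *      val  = 165 >> TCP_RTT_SHIFT = 5 ticks -> TICKS_2_USEC(5) = 5000 us
     *      frac = 165 & 0x1f           = 5       -> 5 * 1000 / 32   = 156 us
     *      t_srtt (usec)               = 5156 us  (165/32 ticks = 5.156 ms)
     *
     * Converting back runs the same arithmetic in reverse; the round trip
     * can only lose the truncation error of the integer divisions.
     */
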
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -932,22 +932,27 @@
* pickup one on the new entry.
*/
struct tcp_function_block *rblk;
+ void *ptr = NULL;
rblk = find_and_ref_tcp_fb(blk);
KASSERT(rblk != NULL,
("cannot find blk %p out of syncache?", blk));
- if (tp->t_fb->tfb_tcp_fb_fini)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
- tp->t_fb = rblk;
- /*
- * XXXrrs this is quite dangerous, it is possible
- * for the new function to fail to init. We also
- * are not asking if the handoff_is_ok though at
- * the very start thats probalbly ok.
- */
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
+
+ if (rblk->tfb_tcp_fb_init == NULL ||
+ (*rblk->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = rblk;
+ tp->t_fb_ptr = ptr;
+ } else {
+ /*
+ * Initialization failed. Release the reference count on
+ * the looked up default stack.
+ */
+ refcount_release(&rblk->tfb_refcnt);
}
}
tp->snd_wl1 = sc->sc_irs;
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1659,6 +1659,7 @@
*/
struct tcp_function_set fsn;
struct tcp_function_block *blk;
+ void *ptr = NULL;
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
@@ -1666,10 +1667,6 @@
return (error);
INP_WLOCK(inp);
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
tp = intotcpcb(inp);
blk = find_and_ref_tcp_functions(&fsn);
@@ -1710,41 +1707,57 @@
return (ENOENT);
}
/*
- * Release the old refcnt, the
- * lookup acquired a ref on the
- * new one already.
+ * Ensure the new stack takes ownership with a
+ * clean slate on peak rate threshold.
*/
- if (tp->t_fb->tfb_tcp_fb_fini) {
- struct epoch_tracker et;
- /*
- * Tell the stack to cleanup with 0 i.e.
- * the tcb is not going away.
- */
- NET_EPOCH_ENTER(et);
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- NET_EPOCH_EXIT(et);
- }
+ tp->t_peakrate_thr = 0;
#ifdef TCPHPTS
/* Assure that we are not on any hpts */
tcp_hpts_remove(tptoinpcb(tp));
#endif
if (blk->tfb_tcp_fb_init) {
- error = (*blk->tfb_tcp_fb_init)(tp);
+ error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
if (error) {
+ /*
+ * Release the ref count the lookup
+ * acquired.
+ */
refcount_release(&blk->tfb_refcnt);
- if (tp->t_fb->tfb_tcp_fb_init) {
- if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
- /* Fall back failed, drop the connection */
- INP_WUNLOCK(inp);
- soabort(so);
- return (error);
- }
+ /*
+ * Now there is a chance that the
+ * init() function mucked with some
+ * things before it failed, such as
+ * hpts or inp_flags2 or timer granularity.
+ * It should not have, but let's give the old
+ * stack a chance to reset to a known good state.
+ */
+ if (tp->t_fb->tfb_switch_failed) {
+ (*tp->t_fb->tfb_switch_failed)(tp);
}
- goto err_out;
+ goto err_out;
}
}
+ if (tp->t_fb->tfb_tcp_fb_fini) {
+ struct epoch_tracker et;
+ /*
+ * Tell the stack to cleanup with 0 i.e.
+ * the tcb is not going away.
+ */
+ NET_EPOCH_ENTER(et);
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ NET_EPOCH_EXIT(et);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquired a ref on the
+ * new one already.
+ */
refcount_release(&tp->t_fb->tfb_refcnt);
+ /*
+ * Set in the new stack.
+ */
tp->t_fb = blk;
+ tp->t_fb_ptr = ptr;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
@@ -1754,6 +1767,7 @@
err_out:
INP_WUNLOCK(inp);
return (error);
+
}
/* Pass in the INP locked, callee must unlock it. */
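
The reordering above makes the TCP_FUNCTION_BLK socket option initialize the new stack before the old one is torn down. A condensed view of the resulting flow (illustrative pseudo-sequence, error paths trimmed):

    void *ptr = NULL;

    blk = find_and_ref_tcp_functions(&fsn);         /* ref on the new stack */
    if (blk->tfb_tcp_fb_init != NULL &&
        (error = (*blk->tfb_tcp_fb_init)(tp, &ptr)) != 0) {
            refcount_release(&blk->tfb_refcnt);
            if (tp->t_fb->tfb_switch_failed != NULL)
                    (*tp->t_fb->tfb_switch_failed)(tp);     /* old stack repairs itself */
            goto err_out;
    }
    /* Only now is the old stack released. */
    if (tp->t_fb->tfb_tcp_fb_fini != NULL)
            (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);            /* 0: tcb is not going away */
    refcount_release(&tp->t_fb->tfb_refcnt);
    tp->t_fb = blk;
    tp->t_fb_ptr = ptr;

A failed init therefore leaves the connection untouched on its original stack instead of half converted.
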
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -61,6 +61,15 @@
#define TCP_EI_STATUS_2MSL 0xb
#define TCP_EI_STATUS_MAX_VALUE 0xb
+#define TCP_HTTP_REQ_LOG_NEW 0x01
+#define TCP_HTTP_REQ_LOG_COMPLETE 0x02
+#define TCP_HTTP_REQ_LOG_FREED 0x03
+#define TCP_HTTP_REQ_LOG_ALLOCFAIL 0x04
+#define TCP_HTTP_REQ_LOG_MOREYET 0x05
+#define TCP_HTTP_REQ_LOG_FORCEFREE 0x06
+#define TCP_HTTP_REQ_LOG_STALE 0x07
+#define TCP_HTTP_REQ_LOG_SEARCH 0x08
+
/************************************************/
/* Status bits we track to assure no duplicates,
* the bits here are not used by the code but
@@ -126,6 +135,154 @@
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
+#define TCP_HTTP_TRACK_FLG_EMPTY 0x00 /* Available */
+#define TCP_HTTP_TRACK_FLG_USED 0x01 /* In use */
+#define TCP_HTTP_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
+#define TCP_HTTP_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
+#define TCP_HTTP_TRACK_FLG_COMP 0x08 /* Sendfile has placed the last bits (range req only) */
+#define TCP_HTTP_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define MAX_TCP_HTTP_REQ 5 /* Max we will have at once */
+
+#ifdef TCP_REQUEST_TRK
+struct http_sendfile_track {
+ uint64_t timestamp; /* User sent timestamp */
+ uint64_t start; /* Start of sendfile offset */
+ uint64_t end; /* End if not open-range req */
+ uint64_t localtime; /* Time we actually got the req */
+ uint64_t deadline; /* If in CU mode, deadline to delivery */
+ uint64_t first_send; /* Time of first send in the range */
+ uint64_t cspr; /* Client suggested pace rate */
+ uint64_t sent_at_fs; /* What was t_sndbytes when we began sending */
+ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes when we began sending */
+ tcp_seq start_seq; /* First TCP Seq assigned */
+ tcp_seq end_seq; /* If range req last seq */
+ uint32_t flags; /* Type of request open etc */
+ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
+ uint32_t hint_maxseg; /* Client hinted maxseg */
+ uint32_t hybrid_flags; /* Hybrid flags on this request */
+};
+
+#endif
+
+/*
+ * Change-query responses: for a stack switch we create a structure
+ * that lets the new stack query the old one for its state, if
+ * the old stack supports it.
+ *
+ * There are three queries currently defined.
+ * - sendmap
+ * - timers
+ * - rack_times
+ *
+ * For the sendmap query the caller fills in the
+ * req and the req_param as the first seq (usually
+ * snd_una). When the response comes back indicating
+ * that there was data (return value 1), then the caller
+ * can build a sendmap entry based on the range and the
+ * times. The next query would then be done at the
+ * newly created sendmap_end. Repeated until sendmap_end == snd_max.
+ *
+ * Flags in sendmap_flags are defined below as well.
+ *
+ * For timers the standard PACE_TMR_XXXX flags are returned indicating
+ * a pacing timer (possibly) and one other timer. If pacing timer then
+ * the expiration timeout time in microseconds is in timer_pacing_to.
+ * And the value used with whatever timer (if a flag is set) is in
+ * timer_rxt. If no timers are running a 0 is returned and of
+ * course no flags are set in timer_hpts_flags.
+ *
+ * The rack_times are a misc collection of information that
+ * the old stack might possibly fill in. Of course it's possible
+ * that an old stack may not have a piece of information. If so
+ * then setting that value to zero is advised. Setting any
+ * timestamp passed should only place a zero in it when it
+ * is unfilled. This may mean that a time is off by a micro-second
+ * but this is ok in the grand scheme of things.
+ *
+ * When switching stacks it is desirable to get as much information
+ * from the old stack to the new stack as possible. Though not always
+ * will the stack be compatible in the types of information. The
+ * init() function needs to take care when it begins changing
+ * things such as inp_flags2 and the timer units to position these
+ * changes at a point where it is unlikely they will fail after
+ * making such changes. A stack optionally can have an "undo"
+ * function (tfb_switch_failed) that is called to restore its state
+ * if a switch away from it fails part way through.
+ *
+ * To transfer information to the old stack from the new in
+ * respect to LRO and the inp_flags2, the new stack should set
+ * the inp_flags2 to what it supports. The old stack in its
+ * fini() function should call the tcp_handle_orphaned_packets()
+ * to clean up any packets. Note that a new stack should attempt
+ */
+
+/* Query types */
+#define TCP_QUERY_SENDMAP 1
+#define TCP_QUERY_TIMERS_UP 2
+#define TCP_QUERY_RACK_TIMES 3
+
+/* Flags returned in sendmap_flags */
+#define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */
+#define SNDMAP_OVERMAX 0x000008/* We have more retran's than we can fit */
+#define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */
+#define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */
+#define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */
+#define SNDMAP_HAS_SYN 0x000800/* SYN is on this guy */
+#define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */
+#define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\
+ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH)
+#define SNDMAP_NRTX 3
+
+struct tcp_query_resp {
+ int req;
+ uint32_t req_param;
+ union {
+ struct {
+ tcp_seq sendmap_start;
+ tcp_seq sendmap_end;
+ int sendmap_send_cnt;
+ uint64_t sendmap_time[SNDMAP_NRTX];
+ uint64_t sendmap_ack_arrival;
+ int sendmap_flags;
+ uint32_t sendmap_r_rtr_bytes;
+ /* If FAS is available if not 0 */
+ uint32_t sendmap_fas;
+ uint8_t sendmap_dupacks;
+ };
+ struct {
+ uint32_t timer_hpts_flags;
+ uint32_t timer_pacing_to;
+ uint32_t timer_timer_exp;
+ };
+ struct {
+ /* Timestamps and rtt's */
+ uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */
+ uint32_t rack_num_dsacks; /* Num of dsacks seen */
+ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */
+ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */
+ uint32_t rack_rtt; /* Last rtt used by rack */
+ uint32_t rack_tmit_time; /* The time the rtt seg was tmited */
+ uint32_t rack_time_went_idle; /* If in persist the time we went idle */
+ /* Prr data */
+ uint32_t rack_sacked;
+ uint32_t rack_holes_rxt;
+ uint32_t rack_prr_delivered;
+ uint32_t rack_prr_recovery_fs;
+ uint32_t rack_prr_out;
+ uint32_t rack_prr_sndcnt;
+ /* TLP data */
+ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */
+ /* Various bits */
+ uint8_t rack_tlp_out; /* Is a TLP outstanding */
+ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */
+ uint8_t rack_in_persist; /* Is the old stack in persists? */
+ uint8_t rack_wanted_output; /* Did the previous stack have a want output set */
+ };
+ };
+};
+
+#define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */
+#define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */
+
typedef enum {
TT_REXMT = 0,
TT_PERSIST,
@@ -276,6 +433,11 @@
#ifdef TCP_ACCOUNTING
uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS];
+#endif
+#ifdef TCP_REQUEST_TRK
+ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */
+ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */
+ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */
#endif
uint32_t t_logsn; /* Log "serial number" */
uint32_t gput_ts; /* Time goodput measurement started */
@@ -290,6 +452,7 @@
uint32_t t_dsack_bytes; /* dsack bytes received */
uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */
uint32_t t_dsack_pack; /* dsack packets we have eceived */
+ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */
uint8_t t_rttupdated; /* number of times rtt sampled */
/* TCP Fast Open */
uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */
@@ -311,6 +474,13 @@
struct osd t_osd; /* storage for Khelp module data */
#endif
uint8_t _t_logpoint; /* Used when a BB log points is enabled */
+#ifdef TCP_REQUEST_TRK
+ /* Response tracking addons. */
+ uint8_t t_http_req; /* Request count */
+ uint8_t t_http_open; /* Number of open range requests */
+ uint8_t t_http_closed; /* Number of closed range requests */
+ struct http_sendfile_track t_http_info[MAX_TCP_HTTP_REQ];
+#endif
};
#endif /* _KERNEL || _WANT_TCPCB */
@@ -346,7 +516,7 @@
#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */
-/*
+/**
* If defining the optional tcp_timers, in the
* tfb_tcp_timer_stop call you must use the
* callout_async_drain() function with the
@@ -356,6 +526,7 @@
* does not know your callbacks you must provide a
* stop_all function that loops through and calls
* tcp_timer_stop() with each of your defined timers.
+ *
* Adding a tfb_tcp_handoff_ok function allows the socket
* option to change stacks to query you even if the
* connection is in a later stage. You return 0 to
@@ -363,16 +534,67 @@
* non-zero (an error number) to say no you can't.
* If the function is undefined you can only change
* in the early states (before connect or listen).
+ *
+ * tfb_tcp_fb_init is used to allow the new stack to
+ * setup its control block. Among the things it must
+ * do is:
+ * a) Make sure that the inp_flags2 is setup correctly
+ * for LRO. There are two flags that the previous
+ * stack may have set INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. If the new stack does not
+ * support these it *should* clear the flags.
+ * b) Make sure that the timers are in the proper
+ * granularity that the stack wants. The stack
+ * should check the t_tmr_granularity field. Currently
+ * there are two values that it may hold
+ * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
+ * Use the function tcp_change_time_units(tp, granularity)
+ * to move the timers to the correct format for your stack.
+ *
+ * The new stack may also optionally query the tfb_chg_query
+ * function if the old stack has one. The new stack may ask
+ * for one of three entries and can also state to the old
+ * stack its support for the INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. This is important since if there are
+ * queued ack's without that statement the old stack will
+ * be forced to discard the queued acks. The requests that
+ * can be made for information by the new stacks are:
+ *
+ * Note also that the tfb_tcp_fb_init() when called can
+ * determine if a query is needed by looking at the
+ * value passed in the ptr. The ptr is designed to be
+ * set in with any allocated memory, but the condition
+ * (ptr == &tp->t_fb_ptr) will be true if this is not a
+ * stack switch but the initial setup of a tcb (which
+ * means no query would be needed). If, however, the
+ * value is not &tp->t_fb_ptr, then the caller is in the
+ * middle of a stack switch and is the new stack. A query
+ * would be appropriate (if the new stack supports the
+ * query mechanism).
+ *
+ * TCP_QUERY_SENDMAP - Query of outstanding data.
+ * TCP_QUERY_TIMERS_UP - Query about running timers.
+ * TCP_SUPPORTED_LRO - Declaration in req_param of
+ * the inp_flags2 supported by
+ * the new stack.
+ * TCP_QUERY_RACK_TIMES - Enquire about various timestamps
+ * and states the old stack may be in.
+ *
* tfb_tcp_fb_fini is changed to add a flag to tell
* the old stack if the tcb is being destroyed or
* not. A one in the flag means the TCB is being
* destroyed, a zero indicates its transitioning to
- * another stack (via socket option).
+ * another stack (via socket option). The
+ * tfb_tcp_fb_fini() function itself should not change timers
+ * or inp_flags2 (the tfb_tcp_fb_init() must do that). However
+ * if the old stack supports the LRO mbuf queuing, and the new
+ * stack does not communicate via chg messages that it too does,
+ * it must assume it does not and free any queued mbufs.
+ *
*/
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
- int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
@@ -387,15 +609,18 @@
int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt);
/* Optional memory allocation/free routine */
- int (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *, void **);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
- void (*tfb_tcp_mtu_chg)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *tp);
int (*tfb_pru_options)(struct tcpcb *, int);
void (*tfb_hwtls_change)(struct tcpcb *, int);
+ int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *);
+ void (*tfb_switch_failed)(struct tcpcb *);
+ bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
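
A sketch of how a new stack's init() might use the query interface declared above to import the old stack's send map during a switch (illustrative; convert_to_my_sendmap() is a hypothetical helper):

    struct tcp_query_resp qr;
    tcp_seq at = tp->snd_una;

    if (tp->t_fb->tfb_chg_query != NULL) {
            while (SEQ_LT(at, tp->snd_max)) {
                    memset(&qr, 0, sizeof(qr));
                    qr.req = TCP_QUERY_SENDMAP;
                    qr.req_param = at;              /* first sequence of interest */
                    if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
                            break;                  /* old stack has nothing more */
                    convert_to_my_sendmap(&qr);     /* hypothetical: build local rsm */
                    at = qr.sendmap_end;            /* continue at the next range */
            }
    }
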
@@ -445,6 +670,16 @@
return (rv);
}
+static inline void
+tcp_lro_features_off(struct inpcb *inp)
+{
+ inp->inp_flags2 &= ~(INP_SUPPORTS_MBUFQ|
+ INP_MBUF_QUEUE_READY|
+ INP_DONT_SACK_QUEUE|
+ INP_MBUF_ACKCMP|
+ INP_MBUF_L_ACKS);
+}
+
/*
* tcp_output_unlock()
* Always returns unlocked, handles drop request from advanced stacks.
@@ -1169,6 +1404,7 @@
#ifdef NETFLIX_EXP_DETECTION
/* Various SACK attack thresholds */
extern int32_t tcp_force_detection;
+extern int32_t tcp_sad_limit;
extern int32_t tcp_sack_to_ack_thresh;
extern int32_t tcp_sack_to_move_thresh;
extern int32_t tcp_restoral_thresh;
@@ -1176,6 +1412,7 @@
extern int32_t tcp_sad_pacing_interval;
extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
+extern int32_t tcp_attack_on_turns_on_logging;
#endif
extern uint32_t tcp_ack_war_time_window;
extern uint32_t tcp_ack_war_cnt;
@@ -1246,6 +1483,8 @@
size_t seed_len);
int tcp_can_enable_pacing(void);
void tcp_decrement_paced_conn(void);
+void tcp_change_time_units(struct tcpcb *, int);
+void tcp_handle_orphaned_packets(struct tcpcb *);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
@@ -1253,6 +1492,31 @@
int tcp_stats_init(void);
void tcp_log_end_status(struct tcpcb *tp, uint8_t status);
+#ifdef TCP_REQUEST_TRK
+void tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent);
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip);
+int tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point);
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point);
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq);
+void
+tcp_http_log_req_info(struct tcpcb *tp,
+ struct http_sendfile_track *http, uint16_t slot,
+ uint8_t val, uint64_t offset, uint64_t nbytes);
+
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes);
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user,
+ uint64_t ts);
+
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups);
+
+
+#endif
#ifdef TCP_ACCOUNTING
int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss);
#endif
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -1235,6 +1235,16 @@
#define M_LEADINGSPACE(m) \
(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
+/*
+ * So M_TRAILINGROOM() is for when you want to know how much space
+ * would be there if it was writable. This can be used to
+ * detect changes in mbufs by knowing the value at one point
+ * and then being able to compare it later to the current M_TRAILINGROOM().
+ * The TRAILINGSPACE() macro is not suitable for this since an mbuf
+ * at one point might not be writable and then later it becomes writable
+ * even though the space at the back of it has not changed.
+ */
+#define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len))
/*
* Compute the amount of space available after the end of data in an mbuf.
*
@@ -1245,9 +1255,7 @@
* for mbufs with external storage. We now allow mbuf-embedded data to be
* read-only as well.
*/
-#define M_TRAILINGSPACE(m) \
- (M_WRITABLE(m) ? \
- ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
+#define M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0)
/*
* Arrange to prepend space of size plen to mbuf m. If a new mbuf must be
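
Finally, an illustrative use of the new M_TRAILINGROOM() macro (sketch, not from the diff): snapshot the room behind an mbuf's data even while the mbuf is read-only, and compare later to see whether its tail layout changed, something M_TRAILINGSPACE() cannot express because it collapses to 0 for unwritable mbufs.

    /* Hypothetical helper built on M_TRAILINGROOM(). */
    static inline int
    mbuf_tail_moved(struct mbuf *m, int saved_room)
    {
            /* Differs only if m_data/m_len changed behind our back. */
            return (M_TRAILINGROOM(m) != saved_room);
    }

    int room = M_TRAILINGROOM(m);   /* valid even when !M_WRITABLE(m) */
    /* ... mbuf handed to other code ... */
    if (mbuf_tail_moved(m, room))
            printf("mbuf %p tail layout changed\n", m);
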
