D39210.diff

Index: sys/conf/options
===================================================================
--- sys/conf/options
+++ sys/conf/options
@@ -227,6 +227,7 @@
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
TCPHPTS opt_inet.h
+TCP_REQUEST_TRK opt_global.h
TCP_ACCOUNTING opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
Index: sys/kern/kern_sendfile.c
===================================================================
--- sys/kern/kern_sendfile.c
+++ sys/kern/kern_sendfile.c
@@ -57,6 +57,9 @@
#include <net/vnet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_log_buf.h>
#include <security/audit/audit.h>
#include <security/mac/mac_framework.h>
@@ -1188,6 +1191,12 @@
NULL, NULL, td);
sendfile_iodone(sfio, NULL, 0, error);
}
+#ifdef TCP_REQUEST_TRK
+ if (so->so_proto->pr_protocol == IPPROTO_TCP) {
+ /* log the sendfile call to the TCP log, if enabled */
+ tcp_log_sendfile(so, offset, nbytes, flags);
+ }
+#endif
CURVNET_RESTORE();
m = NULL;
Index: sys/modules/tcp/rack/Makefile
===================================================================
--- sys/modules/tcp/rack/Makefile
+++ sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c rack_bbr_common.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_kern_tls.h
Index: sys/netinet/tcp.h
===================================================================
--- sys/netinet/tcp.h
+++ sys/netinet/tcp.h
@@ -217,15 +217,15 @@
/* Options for Rack and BBR */
#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
-#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
+#define TCP_RACK_PROP 1051 /* Not used */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */
#define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */
#define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */
-#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */
+#define TCP_RACK_PROP_RATE 1056 /* Not used */
#define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */
#define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */
-#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */
+#define TCP_RACK_EARLY_RECOV 1059 /* Not used */
#define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */
#define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */
#define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */
@@ -309,12 +309,22 @@
#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */
#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */
#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */
-#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */
+#define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */
#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */
#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */
#define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */
#define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */
#define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */
+#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */
+#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */
+#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */
+#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */
+#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */
+#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */
+#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */
+#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */
+#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */
+
/* Start of reserved space for third-party user-settable options. */
#define TCP_VENDOR SO_VENDOR
@@ -447,6 +457,53 @@
#define TLS_SET_RECORD_TYPE 1
#define TLS_GET_RECORD 2
+/*
+ * TCP log user opaque
+ */
+struct http_req {
+ uint64_t timestamp;
+ uint64_t start;
+ uint64_t end;
+ uint32_t flags;
+};
+
+union tcp_log_userdata {
+ struct http_req http_req;
+};
+
+struct tcp_log_user {
+ uint32_t type;
+ uint32_t subtype;
+ union tcp_log_userdata data;
+};
+
+/* user types, i.e. apps */
+#define TCP_LOG_USER_HTTPD 1
+
+/* user subtypes */
+#define TCP_LOG_HTTPD_TS 1 /* client timestamp */
+#define TCP_LOG_HTTPD_TS_REQ 2 /* client timestamp and request info */
+
+/* HTTPD REQ flags */
+#define TCP_LOG_HTTPD_RANGE_START 0x0001
+#define TCP_LOG_HTTPD_RANGE_END 0x0002
+
+/* Flags for hybrid pacing */
+#define TCP_HYBRID_PACING_CU 0x0001 /* Enable catch-up mode */
+#define TCP_HYBRID_PACING_DTL 0x0002 /* Enable Detailed logging */
+#define TCP_HYBRID_PACING_CSPR 0x0004 /* A client suggested rate is present */
+#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */
+#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */
+#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Client wants us to set the mss overriding gp est in CU */
+#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */
+
+struct tcp_hybrid_req {
+ struct http_req req;
+ uint64_t cspr;
+ uint32_t hint_maxseg;
+ uint32_t hybrid_flags;
+};
+
/*
* TCP specific variables of interest for tp->t_stats stats(9) accounting.
*/
@@ -460,6 +517,7 @@
#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */
#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
+#define VOI_TCP_PATHRTT 10 /* The path RTT based on ACK arrival */
#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */
#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */
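
Illustrative userspace sketch for the new hybrid-pacing and request-tracking definitions above (not part of the diff). It assumes the TCP_HYBRID_PACING option takes a struct tcp_hybrid_req as its option value; the in-kernel consumer of the option is not shown in this change.

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <netinet/tcp.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical helper: ask for hybrid pacing on one closed byte range. */
    static int
    request_hybrid_pacing(int fd, uint64_t ts, uint64_t start, uint64_t end,
        uint64_t client_rate)
    {
            struct tcp_hybrid_req hyb;

            memset(&hyb, 0, sizeof(hyb));
            hyb.req.timestamp = ts;                 /* opaque client timestamp */
            hyb.req.start = start;                  /* sendfile offset of the range */
            hyb.req.end = end;
            hyb.req.flags = TCP_LOG_HTTPD_RANGE_START | TCP_LOG_HTTPD_RANGE_END;
            hyb.cspr = client_rate;                 /* client suggested pace rate */
            hyb.hybrid_flags = TCP_HYBRID_PACING_ENABLE | TCP_HYBRID_PACING_CSPR;
            return (setsockopt(fd, IPPROTO_TCP, TCP_HYBRID_PACING,
                &hyb, sizeof(hyb)));
    }
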
Index: sys/netinet/tcp_hpts.h
===================================================================
--- sys/netinet/tcp_hpts.h
+++ sys/netinet/tcp_hpts.h
@@ -187,6 +187,15 @@
}
#ifdef _KERNEL
+
+extern int32_t tcp_min_hptsi_time;
+
+__inline int32_t
+get_hpts_min_sleep_time()
+{
+ return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT);
+}
+
static __inline uint32_t
tcp_gethptstick(struct timeval *sv)
{
Index: sys/netinet/tcp_log_buf.c
===================================================================
--- sys/netinet/tcp_log_buf.c
+++ sys/netinet/tcp_log_buf.c
@@ -58,6 +58,7 @@
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_log_buf.h>
+#include <netinet/tcp_seq.h>
#include <netinet/tcp_hpts.h>
/* Default expiry time */
@@ -2844,6 +2845,10 @@
{
struct inpcb *inp;
struct tcpcb *tp;
+#ifdef TCP_REQUEST_TRK
+ struct http_sendfile_track *ent;
+ int i, fnd;
+#endif
inp = sotoinpcb(so);
KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL"));
@@ -2873,6 +2878,90 @@
&tptosocket(tp)->so_snd,
TCP_LOG_SENDFILE, 0, 0, &log, false, &tv);
}
+#ifdef TCP_REQUEST_TRK
+ if (tp->t_http_req == 0) {
+ /* No http requests to track */
+ goto done;
+ }
+ fnd = 0;
+ if (tp->t_http_closed == 0) {
+ /* No closed end req to track */
+ goto skip_closed_req;
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ /* Lets see if this one can be found */
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /* This pass does not consider open requests */
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) {
+ /* Don't look at what we have completed */
+ continue;
+ }
+ /* If we reach here it's an allocated closed end request */
+ if ((ent->start == offset) ||
+ ((offset > ent->start) && (offset < ent->end))){
+ /* It's within this request?? */
+ fnd = 1;
+ }
+ if (fnd) {
+ /*
+ * It is at or past the end, its complete.
+ */
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ /*
+ * When an entry completes we can take (snd_una + sb_cc) and know where
+ * the end of the range really is. Note that this works since two
+ * requests must be sequential and sendfile now is complete for *this* request.
+ * we must use sb_ccc since the data may still be in-flight in TLS.
+ *
+ * We always cautiously move the end_seq only if our calculations
+ * show it happened (just in case sf has the call to here at the wrong
+ * place). When we go COMP we will stop coming here and hopefully be
+ * left with the correct end_seq.
+ */
+ if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq))
+ ent->end_seq = tp->snd_una + so->so_snd.sb_ccc;
+ if ((offset + nbytes) >= ent->end) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_COMP;
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes);
+ } else {
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes);
+ }
+ /* We assume that sendfile never sends overlapping requests */
+ goto done;
+ }
+ }
+skip_closed_req:
+ if (!fnd) {
+ /* Ok now lets look for open requests */
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ /* Not used */
+ continue;
+ }
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0)
+ continue;
+ /* If we reach here it's an allocated open request */
+ if (ent->start == offset) {
+ /* It begins this request */
+ ent->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ } else if (offset > ent->start) {
+ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV;
+ break;
+ }
+ }
+ }
+#endif
done:
INP_WUNLOCK(inp);
}
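
A worked view of the sequence bound used in the closed-range pass above (illustrative note, not part of the diff):

    /*
     * Requests are sent in order, so when the sendfile call that finishes
     * a closed range returns, all of its bytes are already accounted for
     * in the send buffer, giving
     *
     *      end_seq <= snd_una + so_snd.sb_ccc
     *
     * e.g. snd_una = 1000 and sb_ccc = 64000 bounds end_seq at 65000.
     * SEQ_GT() only lets the recorded end move forward, so later calls
     * can refine the estimate but never pull it backwards.
     */
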
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -500,7 +500,7 @@
bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts,
int32_t line);
static void
-bbr_stop_all_timers(struct tcpcb *tp);
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr);
static void
bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts);
static void
@@ -1970,7 +1970,7 @@
static void
bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2669,7 +2669,7 @@
uint32_t newbw, uint32_t obw, uint32_t diff,
uint32_t tim)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (/*bbr_verbose_logging && */tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -2697,7 +2697,7 @@
static inline void
bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line)
{
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (bbr_verbose_logging && tcp_bblogging_on(bbr->rc_tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime);
@@ -6281,6 +6281,9 @@
else
apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts);
}
+#ifdef STATS
+ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rtt));
+#endif
if (bbr->rc_ack_was_delayed)
rtt += bbr->r_ctl.rc_ack_hdwr_delay;
@@ -9850,16 +9853,13 @@
}
static void
-bbr_stop_all_timers(struct tcpcb *tp)
+bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr)
{
- struct tcp_bbr *bbr;
-
/*
* Assure no timers are running.
*/
if (tcp_timer_active(tp, TT_PERSIST)) {
/* We enter in persists, set the flag appropriately */
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
bbr->rc_in_persist = 1;
}
}
@@ -9927,14 +9927,14 @@
* which indicates the error (usually no memory).
*/
static int
-bbr_init(struct tcpcb *tp)
+bbr_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_bbr *bbr = NULL;
uint32_t cts;
- tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO));
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -9943,10 +9943,16 @@
*/
return (ENOMEM);
}
- bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ bbr = (struct tcp_bbr *)*ptr;
bbr->rtt_valid = 0;
inp->inp_flags2 |= INP_CANNOT_DO_ECN;
inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ /* Take off any undesired flags */
+ inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
+ inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
+ inp->inp_flags2 &= ~INP_MBUF_ACKCMP;
+ inp->inp_flags2 &= ~INP_MBUF_L_ACKS;
+
TAILQ_INIT(&bbr->r_ctl.rc_map);
TAILQ_INIT(&bbr->r_ctl.rc_free);
TAILQ_INIT(&bbr->r_ctl.rc_tmap);
@@ -10074,8 +10080,8 @@
rsm = bbr_alloc(bbr);
if (rsm == NULL) {
- uma_zfree(bbr_pcb_zone, tp->t_fb_ptr);
- tp->t_fb_ptr = NULL;
+ uma_zfree(bbr_pcb_zone, *ptr);
+ *ptr = NULL;
return (ENOMEM);
}
rsm->r_rtt_not_allowed = 1;
@@ -10128,7 +10134,17 @@
* the TCB on the hptsi wheel if a timer is needed with appropriate
* flags.
*/
- bbr_stop_all_timers(tp);
+ bbr_stop_all_timers(tp, bbr);
+ /*
+ * Validate the timers are not in usec, if they are convert.
+ * BBR should in theory move to USEC and get rid of a
+ * lot of the TICKS_2 calls.. but for now we stay
+ * with tick timers.
+ */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0);
return (0);
}
@@ -10172,7 +10188,6 @@
bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged)
{
if (tp->t_fb_ptr) {
- struct inpcb *inp = tptoinpcb(tp);
uint32_t calc;
struct tcp_bbr *bbr;
struct bbr_sendmap *rsm;
@@ -10182,10 +10197,6 @@
tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp);
bbr_log_flowend(bbr);
bbr->rc_tp = NULL;
- /* Backout any flags2 we applied */
- inp->inp_flags2 &= ~INP_CANNOT_DO_ECN;
- inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
- inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY;
if (bbr->bbr_hdrw_pacing)
counter_u64_add(bbr_flows_whdwr_pacing, -1);
else
@@ -11853,7 +11864,6 @@
int32_t isipv6;
#endif
uint8_t app_limited = BBR_JR_SENT_DATA;
- uint8_t filled_all = 0;
bbr = (struct tcp_bbr *)tp->t_fb_ptr;
/* We take a cache hit here */
memcpy(&bbr->rc_tv, tv, sizeof(struct timeval));
@@ -13162,7 +13172,7 @@
if_hw_tsomaxsegsize, msb,
((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
- , &filled_all
+ , NULL, NULL
#endif
);
if (len <= maxseg) {
@@ -13474,7 +13484,7 @@
#endif
/* Log to the black box */
- if (tcp_bblogging_on(bbr->rc_tp)) {
+ if (tcp_bblogging_on(tp)) {
union tcp_log_stackspecific log;
bbr_fill_in_logging_data(bbr, &log.u_bbr, cts);
@@ -13483,13 +13493,10 @@
log.u_bbr.flex2 = (bbr->r_recovery_bw << 3);
log.u_bbr.flex3 = maxseg;
log.u_bbr.flex4 = delay_calc;
- /* Encode filled_all into the upper flex5 bit */
log.u_bbr.flex5 = bbr->rc_past_init_win;
log.u_bbr.flex5 <<= 1;
log.u_bbr.flex5 |= bbr->rc_no_pacing;
log.u_bbr.flex5 <<= 29;
- if (filled_all)
- log.u_bbr.flex5 |= 0x80000000;
log.u_bbr.flex5 |= tp->t_maxseg;
log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs;
log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr);
@@ -14073,6 +14080,56 @@
return (0);
}
+static void
+bbr_switch_failed(struct tcpcb *tp)
+{
+ /*
+ * If a switch fails we only need to
+ * make sure mbuf_queuing is still in place.
+ * We also need to make sure we are still in
+ * ticks granularity (though we should probably
+ * change bbr to go to USECs).
+ *
+ * For timers we need to see if we are still in the
+ * pacer (if our flags are up) if so we are good, if
+ * not we need to get back into the pacer.
+ */
+ struct inpcb *inp = tptoinpcb(tp);
+ struct timeval tv;
+ uint32_t cts;
+ uint32_t toval;
+ struct tcp_bbr *bbr;
+ struct hpts_diag diag;
+
+ inp->inp_flags2 |= INP_CANNOT_DO_ECN;
+ inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if (inp->inp_in_hpts) {
+ return;
+ }
+ bbr = (struct tcp_bbr *)tp->t_fb_ptr;
+ cts = tcp_get_usecs(&tv);
+ if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
+ if (TSTMP_GT(bbr->rc_pacer_started, cts)) {
+ toval = bbr->rc_pacer_started - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
+ if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) {
+ toval = bbr->r_ctl.rc_timer_exp - cts;
+ } else {
+ /* one slot please */
+ toval = HPTS_TICKS_PER_SLOT;
+ }
+ } else
+ toval = HPTS_TICKS_PER_SLOT;
+ (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval),
+ __LINE__, &diag);
+ bbr_log_hpts_diag(bbr, cts, &diag);
+}
+
struct tcp_function_block __tcp_bbr = {
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = bbr_output,
@@ -14087,6 +14144,7 @@
.tfb_tcp_handoff_ok = bbr_handoff_ok,
.tfb_tcp_mtu_chg = bbr_mtu_chg,
.tfb_pru_options = bbr_pru_options,
+ .tfb_switch_failed = bbr_switch_failed,
.tfb_flags = TCP_FUNC_OUTPUT_CANDROP,
};
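
The bbr_init() change above follows the new tfb_tcp_fb_init() calling convention: the stack hands its private state back through a void ** and the caller installs it into tp->t_fb_ptr only on success. A minimal sketch of that contract (illustrative; "mystack" and mystack_pcb_zone are hypothetical, not part of this diff):

    struct mystack_pcb {
            struct tcpcb *ms_tp;
            /* ... per-connection state for the stack ... */
    };

    static int
    mystack_init(struct tcpcb *tp, void **ptr)
    {
            struct mystack_pcb *msp;

            *ptr = uma_zalloc(mystack_pcb_zone, M_NOWAIT | M_ZERO);
            if (*ptr == NULL)
                    return (ENOMEM);        /* the old stack stays installed */
            msp = (struct mystack_pcb *)*ptr;
            msp->ms_tp = tp;
            /* Declare which LRO/mbuf-queuing features this stack supports. */
            tcp_lro_features_off(tptoinpcb(tp));
            tptoinpcb(tp)->inp_flags2 |= INP_SUPPORTS_MBUFQ;
            /* Pick the timer granularity this stack works in. */
            tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
            return (0);     /* caller stores *ptr into tp->t_fb_ptr */
    }

Because nothing touches tp->t_fb_ptr on failure, a connection whose switch fails simply stays on the stack it was already using.
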
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -458,7 +458,7 @@
static uint32_t
rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss);
static int32_t rack_handoff_ok(struct tcpcb *tp);
-static int32_t rack_init(struct tcpcb *tp);
+static int32_t rack_init(struct tcpcb *tp, void **ptr);
static void rack_init_sysctls(void);
static void
rack_log_ack(struct tcpcb *tp, struct tcpopt *to,
@@ -12344,7 +12344,7 @@
}
static int
-rack_init(struct tcpcb *tp)
+rack_init(struct tcpcb *tp, void **ptr)
{
struct inpcb *inp = tptoinpcb(tp);
struct tcp_rack *rack = NULL;
@@ -12354,8 +12354,8 @@
uint32_t iwin, snt, us_cts;
int err;
- tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
- if (tp->t_fb_ptr == NULL) {
+ *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT);
+ if (*ptr == NULL) {
/*
* We need to allocate memory but cant. The INP and INP_INFO
* locks and they are recursive (happens during setup. So a
@@ -12364,9 +12364,9 @@
*/
return (ENOMEM);
}
- memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack));
+ memset(*ptr, 0, sizeof(struct tcp_rack));
- rack = (struct tcp_rack *)tp->t_fb_ptr;
+ rack = (struct tcp_rack *)*ptr;
RB_INIT(&rack->r_ctl.rc_mtree);
TAILQ_INIT(&rack->r_ctl.rc_free);
TAILQ_INIT(&rack->r_ctl.rc_tmap);
Index: sys/netinet/tcp_subr.c
===================================================================
--- sys/netinet/tcp_subr.c
+++ sys/netinet/tcp_subr.c
@@ -109,6 +109,7 @@
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_syncache.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_lro.h>
#include <netinet/cc/cc.h>
#include <netinet/tcpip.h>
#include <netinet/tcp_fastopen.h>
@@ -152,6 +153,11 @@
CTLFLAG_RW,
&tcp_force_detection, 0,
"Do we force detection even if the INP has it off?");
+int32_t tcp_sad_limit = 10000;
+SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit,
+ CTLFLAG_RW,
+ &tcp_sad_limit, 10000,
+ "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?");
int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */
SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
CTLFLAG_RW,
@@ -363,7 +369,7 @@
VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
#define V_ts_offset_secret VNET(ts_offset_secret)
-static int tcp_default_fb_init(struct tcpcb *tp);
+static int tcp_default_fb_init(struct tcpcb *tp, void **ptr);
static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
@@ -519,18 +525,11 @@
tcp_switch_back_to_default(struct tcpcb *tp)
{
struct tcp_function_block *tfb;
+ void *ptr = NULL;
KASSERT(tp->t_fb != &tcp_def_funcblk,
("%s: called by the built-in default stack", __func__));
- /*
- * Release the old stack. This function will either find a new one
- * or panic.
- */
- if (tp->t_fb->tfb_tcp_fb_fini != NULL)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
-
/*
* Now, we'll find a new function block to use.
* Start by trying the current user-selected
@@ -551,14 +550,20 @@
/* Try to use that stack. */
if (tfb != NULL) {
/* Initialize the new stack. If it succeeds, we are done. */
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init == NULL ||
- (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ if (tfb->tfb_tcp_fb_init == NULL ||
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
return;
-
+ }
/*
* Initialization failed. Release the reference count on
- * the stack.
+ * the looked up default stack.
*/
refcount_release(&tfb->tfb_refcnt);
}
@@ -578,12 +583,18 @@
panic("Default stack rejects a new session?");
}
}
- tp->t_fb = tfb;
- if (tp->t_fb->tfb_tcp_fb_init != NULL &&
- (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if (tfb->tfb_tcp_fb_init != NULL &&
+ (*tfb->tfb_tcp_fb_init)(tp, &ptr)) {
/* The default stack cannot fail */
panic("Default stack initialization failed");
}
+ /* Now release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* And set in the pointers to the new */
+ tp->t_fb = tfb;
+ tp->t_fb_ptr = ptr;
}
static bool
@@ -1040,16 +1051,37 @@
* it is required to always succeed since it is the stack of last resort!
*/
static int
-tcp_default_fb_init(struct tcpcb *tp)
+tcp_default_fb_init(struct tcpcb *tp, void **ptr)
{
struct socket *so = tptosocket(tp);
+ int rexmt;
INP_WLOCK_ASSERT(tptoinpcb(tp));
+ /* We don't use the pointer */
+ *ptr = NULL;
KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
("%s: connection %p in unexpected state %d", __func__, tp,
tp->t_state));
+ /* Make sure we get no interesting mbuf queuing behavior */
+ /* All mbuf queue/ack compress flags should be off */
+ tcp_lro_features_off(tptoinpcb(tp));
+
+ /* Cancel the GP measurement in progress */
+ tp->t_flags &= ~TF_GPUTINPROG;
+ /* Validate the timers are not in usec, if they are convert */
+ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
+ if ((tp->t_state == TCPS_SYN_SENT) ||
+ (tp->t_state == TCPS_SYN_RECEIVED))
+ rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ if (tp->t_rxtshift == 0)
+ tp->t_rxtcur = rexmt;
+ else
+ TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX);
+
/*
* Nothing to do for ESTABLISHED or LISTEN states. And, we don't
* know what to do for unexpected states (which includes TIME_WAIT).
@@ -2240,6 +2272,8 @@
tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
tp->t_rcvtime = ticks;
+ /* We always start with ticks granularity */
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
/*
* IPv4 TTL initialization is necessary for an IPv6 socket as well,
* because the socket may be bound to an IPv6 wildcard address,
@@ -2265,7 +2299,7 @@
#endif
tp->t_pacing_rate = -1;
if (tp->t_fb->tfb_tcp_fb_init) {
- if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
refcount_release(&tp->t_fb->tfb_refcnt);
return (NULL);
}
@@ -4019,3 +4053,524 @@
}
}
#endif
+
+void
+tcp_change_time_units(struct tcpcb *tp, int granularity)
+{
+ if (tp->t_tmr_granularity == granularity) {
+ /* We are there */
+ return;
+ }
+ if (granularity == TCP_TMR_GRANULARITY_USEC) {
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS),
+ ("Granularity is not TICKS its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow);
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = tp->t_srtt >> TCP_RTT_SHIFT;
+ frac = tp->t_srtt & 0x1f;
+ tp->t_srtt = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = tp->t_rttvar >> TCP_RTTVAR_SHIFT;
+ frac = tp->t_rttvar & 0x1f;
+ tp->t_rttvar = TICKS_2_USEC(val);
+ /*
+ * frac is the fractional part of the srtt (if any)
+ * but it's in ticks and every bit represents
+ * 1/32nd of a hz.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE));
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC;
+ } else if (granularity == TCP_TMR_GRANULARITY_TICKS) {
+ /* Convert back to ticks, with */
+ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC),
+ ("Granularity is not USEC its %u in tp:%p",
+ tp->t_tmr_granularity, tp));
+ if (tp->t_srtt > 1) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_srtt);
+ frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz);
+ tp->t_srtt = val << TCP_RTT_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_srtt += frac;
+ }
+ }
+ if (tp->t_rttvar) {
+ uint32_t val, frac;
+
+ val = USEC_2_TICKS(tp->t_rttvar);
+ frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
+ tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
+ /*
+ * frac here is the fractional part left
+ * over from converting to hz and shifting.
+ * We need to convert this to the 5 bit
+ * remainder.
+ */
+ if (frac) {
+ if (hz == 1000) {
+ frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC);
+ } else {
+ frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC);
+ }
+ tp->t_rttvar += frac;
+ }
+ }
+ tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
+ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
+ }
+#ifdef INVARIANTS
+ else {
+ panic("Unknown granularity:%d tp:%p",
+ granularity, tp);
+ }
+#endif
+}
+
+void
+tcp_handle_orphaned_packets(struct tcpcb *tp)
+{
+ struct mbuf *save, *m, *prev;
+ /*
+ * Called when a stack switch is occurring from the fini()
+ * of the old stack. We assume the init() of the new stack
+ * has already been run and has set the inp_flags2 to
+ * what it supports. This function will then deal with any
+ * differences, i.e. clean up packets that may be queued that
+ * the new stack does not support.
+ */
+
+ if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS)
+ return;
+ if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0) {
+ /*
+ * It is unsafe to process the packets since a
+ * reset may be lurking in them (its rare but it
+ * can occur). If we were to find a RST, then we
+ * would end up dropping the connection and the
+ * INP lock, so when we return the caller (tcp_usrreq)
+ * will blow up when it trys to unlock the inp.
+ * This new stack does not do any fancy LRO features
+ * so all we can do is toss the packets.
+ */
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ while (m) {
+ save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = save;
+ }
+ } else {
+ /*
+ * Here we have a stack that does mbuf queuing but
+ * does not support compressed ack's. We must
+ * walk all the mbufs and discard any compressed acks.
+ */
+ m = tp->t_in_pkt;
+ prev = NULL;
+ while (m) {
+ if (m->m_flags & M_ACKCMP) {
+ /* We must toss this packet */
+ if (tp->t_tail_pkt == m)
+ tp->t_tail_pkt = prev;
+ if (prev)
+ prev->m_nextpkt = m->m_nextpkt;
+ else
+ tp->t_in_pkt = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ /* move forward */
+ if (prev)
+ m = prev->m_nextpkt;
+ else
+ m = tp->t_in_pkt;
+ } else {
+ /* this one is ok */
+ prev = m;
+ m = m->m_nextpkt;
+ }
+ }
+ }
+}
+
+#ifdef TCP_REQUEST_TRK
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes)
+{
+#ifdef KERN_TLS
+ struct ktls_session *tls;
+ uint32_t rec_oh, records;
+
+ tls = so->so_snd.sb_tls_info;
+ if (tls == NULL)
+ return (0);
+
+ rec_oh = tls->params.tls_hlen + tls->params.tls_tlen;
+ records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len);
+ return (records * rec_oh);
+#else
+ return (0);
+#endif
+}
+
+extern uint32_t tcp_stale_entry_time;
+uint32_t tcp_stale_entry_time = 250000;
+SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW,
+ &tcp_stale_entry_time, 250000, "Time that a http entry without a sendfile ages out");
+
+void
+tcp_http_log_req_info(struct tcpcb *tp, struct http_sendfile_track *http,
+ uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes)
+{
+ if (tcp_bblogging_on(tp)) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+#ifdef TCPHPTS
+ log.u_bbr.inhpts = tcp_in_hpts(tptoinpcb(tp));
+#endif
+ log.u_bbr.flex8 = val;
+ log.u_bbr.rttProp = http->timestamp;
+ log.u_bbr.delRate = http->start;
+ log.u_bbr.cur_del_rate = http->end;
+ log.u_bbr.flex1 = http->start_seq;
+ log.u_bbr.flex2 = http->end_seq;
+ log.u_bbr.flex3 = http->flags;
+ log.u_bbr.flex4 = ((http->localtime >> 32) & 0x00000000ffffffff);
+ log.u_bbr.flex5 = (http->localtime & 0x00000000ffffffff);
+ log.u_bbr.flex7 = slot;
+ log.u_bbr.bw_inuse = offset;
+ /* nbytes = flex6 | epoch */
+ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff);
+ log.u_bbr.epoch = (nbytes & 0x00000000ffffffff);
+ /* cspr = lt_epoch | pkts_out */
+ log.u_bbr.lt_epoch = ((http->cspr >> 32) & 0x00000000ffffffff);
+ log.u_bbr.pkts_out |= (http->cspr & 0x00000000ffffffff);
+ log.u_bbr.applimited = tp->t_http_closed;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_open;
+ log.u_bbr.applimited <<= 8;
+ log.u_bbr.applimited |= tp->t_http_req;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ TCP_LOG_EVENTP(tp, NULL,
+ &tptosocket(tp)->so_rcv,
+ &tptosocket(tp)->so_snd,
+ TCP_LOG_HTTP_T, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+void
+tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent)
+{
+ if (tp->t_http_req > 0)
+ tp->t_http_req--;
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ if (tp->t_http_open > 0)
+ tp->t_http_open--;
+ } else {
+ if (tp->t_http_closed > 0)
+ tp->t_http_closed--;
+ }
+ ent->flags = TCP_HTTP_TRACK_FLG_EMPTY;
+}
+
+static void
+tcp_http_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest)
+{
+ struct http_sendfile_track *ent;
+ uint64_t time_delta, oldest_delta;
+ int i, oldest, oldest_set = 0, cnt_rm = 0;
+
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags != TCP_HTTP_TRACK_FLG_USED) {
+ /*
+ * We only care about closed end ranges
+ * that are allocated and have no sendfile
+ * ever touching them. They would be in
+ * state USED.
+ */
+ continue;
+ }
+ if (ts >= ent->localtime)
+ time_delta = ts - ent->localtime;
+ else
+ time_delta = 0;
+ if (time_delta &&
+ ((oldest_delta < time_delta) || (oldest_set == 0))) {
+ oldest_set = 1;
+ oldest = i;
+ oldest_delta = time_delta;
+ }
+ if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) {
+ /*
+ * No sendfile in our time limit,
+ * time to purge it.
+ */
+ cnt_rm++;
+ tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE,
+ time_delta, 0);
+ tcp_http_free_a_slot(tp, ent);
+ }
+ }
+ if ((cnt_rm == 0) && rm_oldest && oldest_set) {
+ ent = &tp->t_http_info[oldest];
+ tcp_http_log_req_info(tp, ent, oldest, TCP_HTTP_REQ_LOG_STALE,
+ oldest_delta, 1);
+ tcp_http_free_a_slot(tp, ent);
+ }
+}
+
+int
+tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point)
+{
+ int i, ret=0;
+ struct http_sendfile_track *ent;
+
+ /* Clean up any old closed end requests that are now completed */
+ if (tp->t_http_req == 0)
+ return(0);
+ if (tp->t_http_closed == 0)
+ return(0);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ /* Skip empty ones */
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ /* Skip open ones */
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ continue;
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ /* We are past it -- free it */
+ tcp_http_log_req_info(tp, ent,
+ i, TCP_HTTP_REQ_LOG_FREED, 0, 0);
+ tcp_http_free_a_slot(tp, ent);
+ ret++;
+ }
+ }
+ return (ret);
+}
+
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point)
+{
+ if (tp->t_http_req == 0)
+ return(-1);
+ if (tp->t_http_closed == 0)
+ return(-1);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ return(-1);
+ if (SEQ_GEQ(ack_point, ent->end_seq)) {
+ return (1);
+ }
+ return (0);
+}
+
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip)
+{
+ /*
+ * Given an ack point (th_ack) walk through our entries and
+ * return the first one found that th_ack goes past the
+ * end_seq.
+ */
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY)
+ continue;
+ if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) {
+ if (SEQ_GEQ(th_ack, ent->end_seq)) {
+ *ip = i;
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq)
+{
+ struct http_sendfile_track *ent;
+ int i;
+
+ if (tp->t_http_req == 0) {
+ /* none open */
+ return (NULL);
+ }
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ ent = &tp->t_http_info[i];
+ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_SEARCH,
+ (uint64_t)seq, 0);
+ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ continue;
+ }
+ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) {
+ /*
+ * An open end request only needs to
+ * match the beginning seq or be
+ * all we have (once we keep going on
+ * a open end request we may have a seq
+ * wrap).
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) ||
+ (tp->t_http_closed == 0))
+ return (ent);
+ } else {
+ /*
+ * For this one we need to
+ * be a bit more careful if its
+ * completed at least.
+ */
+ if ((SEQ_GEQ(seq, ent->start_seq)) &&
+ (SEQ_LT(seq, ent->end_seq))) {
+ return (ent);
+ }
+ }
+ }
+ return (NULL);
+}
+
+/* Should this be in its own file tcp_http.c ? */
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups)
+{
+ struct http_sendfile_track *fil;
+ int i, allocated;
+
+ /* In case the stack does not check for completions do so now */
+ tcp_http_check_for_comp(tp, tp->snd_una);
+ /* Check for stale entries */
+ if (tp->t_http_req)
+ tcp_http_check_for_stale_entries(tp, ts,
+ (tp->t_http_req >= MAX_TCP_HTTP_REQ));
+ /* Check to see if this is a duplicate of one not started */
+ if (tp->t_http_req) {
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags != TCP_HTTP_TRACK_FLG_USED)
+ continue;
+ if ((fil->timestamp == req->timestamp) &&
+ (fil->start == req->start) &&
+ ((fil->flags & TCP_HTTP_TRACK_FLG_OPEN) ||
+ (fil->end == req->end))) {
+ /*
+ * We already have this request
+ * and it has not been started with sendfile.
+ * This probably means the user was returned
+ * a 4xx of some sort and its going to age
+ * out, lets not duplicate it.
+ */
+ return(fil);
+ }
+ }
+ }
+ /* Ok if there is no room at the inn we are in trouble */
+ if (tp->t_http_req >= MAX_TCP_HTTP_REQ) {
+ tcp_trace_point(tp, TCP_TP_HTTP_LOG_FAIL);
+ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ tcp_http_log_req_info(tp, &tp->t_http_info[i],
+ i, TCP_HTTP_REQ_LOG_ALLOCFAIL, 0, 0);
+ }
+ return (NULL);
+ }
+ for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) {
+ fil = &tp->t_http_info[i];
+ if (fil->flags == TCP_HTTP_TRACK_FLG_EMPTY) {
+ allocated = 1;
+ fil->flags = TCP_HTTP_TRACK_FLG_USED;
+ fil->timestamp = req->timestamp;
+ fil->localtime = ts;
+ fil->start = req->start;
+ if (req->flags & TCP_LOG_HTTPD_RANGE_END) {
+ fil->end = req->end;
+ } else {
+ fil->end = 0;
+ fil->flags |= TCP_HTTP_TRACK_FLG_OPEN;
+ }
+ /*
+ * We can set the min boundaries to the TCP Sequence space,
+ * but it might be found to be further up when sendfile
+ * actually runs on this range (if it ever does).
+ */
+ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc;
+ fil->start_seq = tp->snd_una +
+ tptosocket(tp)->so_snd.sb_ccc;
+ fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start)));
+ if (tptosocket(tp)->so_snd.sb_tls_info) {
+ /*
+ * This session is doing TLS. Take a swag guess
+ * at the overhead.
+ */
+ fil->end_seq += tcp_estimate_tls_overhead(
+ tptosocket(tp), (fil->end - fil->start));
+ }
+ tp->t_http_req++;
+ if (fil->flags & TCP_HTTP_TRACK_FLG_OPEN)
+ tp->t_http_open++;
+ else
+ tp->t_http_closed++;
+ tcp_http_log_req_info(tp, fil, i,
+ TCP_HTTP_REQ_LOG_NEW, 0, 0);
+ break;
+ } else
+ fil = NULL;
+ }
+ return (fil);
+}
+
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts)
+{
+ (void)tcp_http_alloc_req_full(tp, &user->http_req, ts, 1);
+}
+#endif
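
A worked example of the tick/usec conversion performed by tcp_change_time_units() above (illustrative, assuming hz = 1000 so one tick is 1 ms):

    /*
     * t_srtt is kept as ticks scaled by TCP_RTT_SCALE (32).  Take
     * t_srtt = 165 in tick units:
     *
     *      val  = 165 >> TCP_RTT_SHIFT = 5 ticks -> TICKS_2_USEC(5) = 5000 us
     *      frac = 165 & 0x1f           = 5       -> 5 * 1000 / 32   = 156 us
     *      t_srtt (usec)               = 5156 us  (165/32 ticks = 5.156 ms)
     *
     * Converting back runs the same arithmetic in reverse; the round trip
     * can only lose the truncation error of the integer divisions.
     */
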
Index: sys/netinet/tcp_syncache.c
===================================================================
--- sys/netinet/tcp_syncache.c
+++ sys/netinet/tcp_syncache.c
@@ -932,22 +932,27 @@
* pickup one on the new entry.
*/
struct tcp_function_block *rblk;
+ void *ptr = NULL;
rblk = find_and_ref_tcp_fb(blk);
KASSERT(rblk != NULL,
("cannot find blk %p out of syncache?", blk));
- if (tp->t_fb->tfb_tcp_fb_fini)
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- refcount_release(&tp->t_fb->tfb_refcnt);
- tp->t_fb = rblk;
- /*
- * XXXrrs this is quite dangerous, it is possible
- * for the new function to fail to init. We also
- * are not asking if the handoff_is_ok though at
- * the very start thats probalbly ok.
- */
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
+
+ if (rblk->tfb_tcp_fb_init == NULL ||
+ (*rblk->tfb_tcp_fb_init)(tp, &ptr) == 0) {
+ /* Release the old stack */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+ /* Now set in all the pointers */
+ tp->t_fb = rblk;
+ tp->t_fb_ptr = ptr;
+ } else {
+ /*
+ * Initialization failed. Release the reference count on
+ * the looked up default stack.
+ */
+ refcount_release(&rblk->tfb_refcnt);
}
}
tp->snd_wl1 = sc->sc_irs;
Index: sys/netinet/tcp_usrreq.c
===================================================================
--- sys/netinet/tcp_usrreq.c
+++ sys/netinet/tcp_usrreq.c
@@ -1659,6 +1659,7 @@
*/
struct tcp_function_set fsn;
struct tcp_function_block *blk;
+ void *ptr = NULL;
INP_WUNLOCK(inp);
error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
@@ -1666,10 +1667,6 @@
return (error);
INP_WLOCK(inp);
- if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- return (ECONNRESET);
- }
tp = intotcpcb(inp);
blk = find_and_ref_tcp_functions(&fsn);
@@ -1710,41 +1707,57 @@
return (ENOENT);
}
/*
- * Release the old refcnt, the
- * lookup acquired a ref on the
- * new one already.
+ * Ensure the new stack takes ownership with a
+ * clean slate on peak rate threshold.
*/
- if (tp->t_fb->tfb_tcp_fb_fini) {
- struct epoch_tracker et;
- /*
- * Tell the stack to cleanup with 0 i.e.
- * the tcb is not going away.
- */
- NET_EPOCH_ENTER(et);
- (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
- NET_EPOCH_EXIT(et);
- }
+ tp->t_peakrate_thr = 0;
#ifdef TCPHPTS
/* Assure that we are not on any hpts */
tcp_hpts_remove(tptoinpcb(tp));
#endif
if (blk->tfb_tcp_fb_init) {
- error = (*blk->tfb_tcp_fb_init)(tp);
+ error = (*blk->tfb_tcp_fb_init)(tp, &ptr);
if (error) {
+ /*
+ * Release the ref count the lookup
+ * acquired.
+ */
refcount_release(&blk->tfb_refcnt);
- if (tp->t_fb->tfb_tcp_fb_init) {
- if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
- /* Fall back failed, drop the connection */
- INP_WUNLOCK(inp);
- soabort(so);
- return (error);
- }
+ /*
+ * Now there is a chance that the
+ * init() function mucked with some
+ * things before it failed, such as
+ * hpts or inp_flags2 or timer granularity.
+ * It should not have, but let's give the old
+ * stack a chance to reset to a known good state.
+ */
+ if (tp->t_fb->tfb_switch_failed) {
+ (*tp->t_fb->tfb_switch_failed)(tp);
}
- goto err_out;
+ goto err_out;
}
}
+ if (tp->t_fb->tfb_tcp_fb_fini) {
+ struct epoch_tracker et;
+ /*
+ * Tell the stack to cleanup with 0 i.e.
+ * the tcb is not going away.
+ */
+ NET_EPOCH_ENTER(et);
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ NET_EPOCH_EXIT(et);
+ }
+ /*
+ * Release the old refcnt, the
+ * lookup acquired a ref on the
+ * new one already.
+ */
refcount_release(&tp->t_fb->tfb_refcnt);
+ /*
+ * Set in the new stack.
+ */
tp->t_fb = blk;
+ tp->t_fb_ptr = ptr;
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
@@ -1754,6 +1767,7 @@
err_out:
INP_WUNLOCK(inp);
return (error);
+
}
/* Pass in the INP locked, callee must unlock it. */
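
The reordering above makes the TCP_FUNCTION_BLK socket option initialize the new stack before the old one is torn down. A condensed view of the resulting flow (illustrative pseudo-sequence, error paths trimmed):

    void *ptr = NULL;

    blk = find_and_ref_tcp_functions(&fsn);         /* ref on the new stack */
    if (blk->tfb_tcp_fb_init != NULL &&
        (error = (*blk->tfb_tcp_fb_init)(tp, &ptr)) != 0) {
            refcount_release(&blk->tfb_refcnt);
            if (tp->t_fb->tfb_switch_failed != NULL)
                    (*tp->t_fb->tfb_switch_failed)(tp);     /* old stack repairs itself */
            goto err_out;
    }
    /* Only now is the old stack released. */
    if (tp->t_fb->tfb_tcp_fb_fini != NULL)
            (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);            /* 0: tcb is not going away */
    refcount_release(&tp->t_fb->tfb_refcnt);
    tp->t_fb = blk;
    tp->t_fb_ptr = ptr;

A failed init therefore leaves the connection untouched on its original stack instead of half converted.
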
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -61,6 +61,15 @@
#define TCP_EI_STATUS_2MSL 0xb
#define TCP_EI_STATUS_MAX_VALUE 0xb
+#define TCP_HTTP_REQ_LOG_NEW 0x01
+#define TCP_HTTP_REQ_LOG_COMPLETE 0x02
+#define TCP_HTTP_REQ_LOG_FREED 0x03
+#define TCP_HTTP_REQ_LOG_ALLOCFAIL 0x04
+#define TCP_HTTP_REQ_LOG_MOREYET 0x05
+#define TCP_HTTP_REQ_LOG_FORCEFREE 0x06
+#define TCP_HTTP_REQ_LOG_STALE 0x07
+#define TCP_HTTP_REQ_LOG_SEARCH 0x08
+
/************************************************/
/* Status bits we track to assure no duplicates,
* the bits here are not used by the code but
@@ -126,6 +135,154 @@
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
+#define TCP_HTTP_TRACK_FLG_EMPTY 0x00 /* Available */
+#define TCP_HTTP_TRACK_FLG_USED 0x01 /* In use */
+#define TCP_HTTP_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */
+#define TCP_HTTP_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */
+#define TCP_HTTP_TRACK_FLG_COMP 0x08 /* Sendfile has placed the last bits (range req only) */
+#define TCP_HTTP_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */
+#define MAX_TCP_HTTP_REQ 5 /* Max we will have at once */
+
+#ifdef TCP_REQUEST_TRK
+struct http_sendfile_track {
+ uint64_t timestamp; /* User sent timestamp */
+ uint64_t start; /* Start of sendfile offset */
+ uint64_t end; /* End if not open-range req */
+ uint64_t localtime; /* Time we actually got the req */
+ uint64_t deadline; /* If in CU mode, deadline to delivery */
+ uint64_t first_send; /* Time of first send in the range */
+ uint64_t cspr; /* Client suggested pace rate */
+ uint64_t sent_at_fs; /* What was t_sndbytes when we began sending */
+ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes when we began sending */
+ tcp_seq start_seq; /* First TCP Seq assigned */
+ tcp_seq end_seq; /* If range req last seq */
+ uint32_t flags; /* Type of request open etc */
+ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */
+ uint32_t hint_maxseg; /* Client hinted maxseg */
+ uint32_t hybrid_flags; /* Hybrid flags on this request */
+};
+
+#endif
+
+/*
+ * Change-query responses: for a stack switch we create a structure
+ * that lets the new stack query the old one for its state, if
+ * the old stack supports it.
+ *
+ * There are three queries currently defined.
+ * - sendmap
+ * - timers
+ * - rack_times
+ *
+ * For the sendmap query the caller fills in the
+ * req and the req_param as the first seq (usually
+ * snd_una). When the response comes back indicating
+ * that there was data (return value 1), then the caller
+ * can build a sendmap entry based on the range and the
+ * times. The next query would then be done at the
+ * newly created sendmap_end. Repeated until sendmap_end == snd_max.
+ *
+ * Flags in sendmap_flags are defined below as well.
+ *
+ * For timers the standard PACE_TMR_XXXX flags are returned indicating
+ * a pacing timer (possibly) and one other timer. If pacing timer then
+ * the expiration timeout time in microseconds is in timer_pacing_to.
+ * And the value used with whatever timer (if a flag is set) is in
+ * timer_rxt. If no timers are running a 0 is returned and of
+ * course no flags are set in timer_hpts_flags.
+ *
+ * The rack_times are a misc collection of information that
+ * the old stack might possibly fill in. Of course it's possible
+ * that an old stack may not have a piece of information. If so
+ * then setting that value to zero is advised. Setting any
+ * timestamp passed should only place a zero in it when it
+ * is unfilled. This may mean that a time is off by a micro-second
+ * but this is ok in the grand scheme of things.
+ *
+ * When switching stacks it is desirable to get as much information
+ * from the old stack to the new stack as possible. Though not always
+ * will the stack be compatible in the types of information. The
+ * init() function needs to take care when it begins changing
+ * things such as inp_flags2 and the timer units to position these
+ * changes at a point where it is unlikely they will fail after
+ * making such changes. A stack optionally can have an "undo"
+ * function (tfb_switch_failed) that is called to restore its state
+ * if a switch away from it fails part way through.
+ *
+ * To transfer information to the old stack from the new in
+ * respect to LRO and the inp_flags2, the new stack should set
+ * the inp_flags2 to what it supports. The old stack in its
+ * fini() function should call the tcp_handle_orphaned_packets()
+ * to clean up any packets. Note that a new stack should attempt
+ */
+
+/* Query types */
+#define TCP_QUERY_SENDMAP 1
+#define TCP_QUERY_TIMERS_UP 2
+#define TCP_QUERY_RACK_TIMES 3
+
+/* Flags returned in sendmap_flags */
+#define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */
+#define SNDMAP_OVERMAX 0x000008/* We have more retran's than we can fit */
+#define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */
+#define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */
+#define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */
+#define SNDMAP_HAS_SYN 0x000800/* SYN is on this guy */
+#define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */
+#define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\
+ |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH)
+#define SNDMAP_NRTX 3
+
+struct tcp_query_resp {
+ int req;
+ uint32_t req_param;
+ union {
+ struct {
+ tcp_seq sendmap_start;
+ tcp_seq sendmap_end;
+ int sendmap_send_cnt;
+ uint64_t sendmap_time[SNDMAP_NRTX];
+ uint64_t sendmap_ack_arrival;
+ int sendmap_flags;
+ uint32_t sendmap_r_rtr_bytes;
+ /* If FAS is available if not 0 */
+ uint32_t sendmap_fas;
+ uint8_t sendmap_dupacks;
+ };
+ struct {
+ uint32_t timer_hpts_flags;
+ uint32_t timer_pacing_to;
+ uint32_t timer_timer_exp;
+ };
+ struct {
+ /* Timestamps and rtt's */
+ uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */
+ uint32_t rack_num_dsacks; /* Num of dsacks seen */
+ uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */
+ uint32_t rack_min_rtt; /* never 0 smallest rtt seen */
+ uint32_t rack_rtt; /* Last rtt used by rack */
+ uint32_t rack_tmit_time; /* The time the rtt seg was tmited */
+ uint32_t rack_time_went_idle; /* If in persist the time we went idle */
+ /* Prr data */
+ uint32_t rack_sacked;
+ uint32_t rack_holes_rxt;
+ uint32_t rack_prr_delivered;
+ uint32_t rack_prr_recovery_fs;
+ uint32_t rack_prr_out;
+ uint32_t rack_prr_sndcnt;
+ /* TLP data */
+ uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */
+ /* Various bits */
+ uint8_t rack_tlp_out; /* Is a TLP outstanding */
+ uint8_t rack_srtt_measured; /* The previous stack has measured srtt */
+ uint8_t rack_in_persist; /* Is the old stack in persists? */
+ uint8_t rack_wanted_output; /* Did the previous stack have a want output set */
+ };
+ };
+};
+
+#define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */
+#define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */
+
typedef enum {
TT_REXMT = 0,
TT_PERSIST,
@@ -276,6 +433,11 @@
#ifdef TCP_ACCOUNTING
uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS];
uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS];
+#endif
+#ifdef TCP_REQUEST_TRK
+ uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */
+ uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */
+ uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */
#endif
uint32_t t_logsn; /* Log "serial number" */
uint32_t gput_ts; /* Time goodput measurement started */
@@ -290,6 +452,7 @@
uint32_t t_dsack_bytes; /* dsack bytes received */
uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */
uint32_t t_dsack_pack; /* dsack packets we have eceived */
+ uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */
uint8_t t_rttupdated; /* number of times rtt sampled */
/* TCP Fast Open */
uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */
@@ -311,6 +474,13 @@
struct osd t_osd; /* storage for Khelp module data */
#endif
uint8_t _t_logpoint; /* Used when a BB log points is enabled */
+#ifdef TCP_REQUEST_TRK
+ /* Response tracking addons. */
+ uint8_t t_http_req; /* Request count */
+ uint8_t t_http_open; /* Number of open range requests */
+ uint8_t t_http_closed; /* Number of closed range requests */
+ struct http_sendfile_track t_http_info[MAX_TCP_HTTP_REQ];
+#endif
};
#endif /* _KERNEL || _WANT_TCPCB */
@@ -346,7 +516,7 @@
#define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */
#define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */
-/*
+/**
* If defining the optional tcp_timers, in the
* tfb_tcp_timer_stop call you must use the
* callout_async_drain() function with the
@@ -356,6 +526,7 @@
* does not know your callbacks you must provide a
* stop_all function that loops through and calls
* tcp_timer_stop() with each of your defined timers.
+ *
* Adding a tfb_tcp_handoff_ok function allows the socket
* option to change stacks to query you even if the
* connection is in a later stage. You return 0 to
@@ -363,16 +534,67 @@
* non-zero (an error number) to say no you can't.
* If the function is undefined you can only change
* in the early states (before connect or listen).
+ *
+ * tfb_tcp_fb_init is used to allow the new stack to
+ * setup its control block. Among the things it must
+ * do is:
+ * a) Make sure that the inp_flags2 is setup correctly
+ * for LRO. There are two flags that the previous
+ * stack may have set INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. If the new stack does not
+ * support these it *should* clear the flags.
+ * b) Make sure that the timers are in the proper
+ * granularity that the stack wants. The stack
+ * should check the t_tmr_granularity field. Currently
+ * there are two values that it may hold
+ * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC.
+ * Use the function tcp_change_time_units(tp, granularity)
+ * to move the timers to the correct format for your stack.
+ *
+ * The new stack may also optionally query the tfb_chg_query
+ * function if the old stack has one. The new stack may ask
+ * for one of three entries and can also state to the old
+ * stack its support for the INP_MBUF_ACKCMP and
+ * INP_SUPPORTS_MBUFQ. This is important since if there are
+ * queued ack's without that statement the old stack will
+ * be forced to discard the queued acks. The requests that
+ * can be made for information by the new stacks are:
+ *
+ * Note also that the tfb_tcp_fb_init() when called can
+ * determine if a query is needed by looking at the
+ * value passed in the ptr. The ptr is designed to be
+ * set in with any allocated memory, but the condition
+ * (ptr == &tp->t_fb_ptr) will be true if this is not a
+ * stack switch but the initial setup of a tcb (which
+ * means no query would be needed). If, however, the
+ * value is not &tp->t_fb_ptr, then the caller is in the
+ * middle of a stack switch and is the new stack. A query
+ * would be appropriate (if the new stack supports the
+ * query mechanism).
+ *
+ * TCP_QUERY_SENDMAP - Query of outstanding data.
+ * TCP_QUERY_TIMERS_UP - Query about running timers.
+ * TCP_SUPPORTED_LRO - Declaration in req_param of
+ * the inp_flags2 supported by
+ * the new stack.
+ * TCP_QUERY_RACK_TIMES - Enquire about various timestamps
+ * and states the old stack may be in.
+ *
* tfb_tcp_fb_fini is changed to add a flag to tell
* the old stack if the tcb is being destroyed or
* not. A one in the flag means the TCB is being
* destroyed, a zero indicates its transitioning to
- * another stack (via socket option).
+ * another stack (via socket option). The
+ * tfb_tcp_fb_fini() function itself should not change timers
+ * or inp_flags2 (the tfb_tcp_fb_init() must do that). However
+ * if the old stack supports the LRO mbuf queuing, and the new
+ * stack does not communicate via chg messages that it too does,
+ * it must assume it does not and free any queued mbufs.
+ *
*/
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
- int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
@@ -387,15 +609,18 @@
int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt);
/* Optional memory allocation/free routine */
- int (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *, void **);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
- void (*tfb_tcp_mtu_chg)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *tp);
int (*tfb_pru_options)(struct tcpcb *, int);
void (*tfb_hwtls_change)(struct tcpcb *, int);
+ int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *);
+ void (*tfb_switch_failed)(struct tcpcb *);
+ bool (*tfb_early_wake_check)(struct tcpcb *);
int (*tfb_compute_pipe)(struct tcpcb *tp);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
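
A sketch of how a new stack's init() might use the query interface declared above to import the old stack's send map during a switch (illustrative; convert_to_my_sendmap() is a hypothetical helper):

    struct tcp_query_resp qr;
    tcp_seq at = tp->snd_una;

    if (tp->t_fb->tfb_chg_query != NULL) {
            while (SEQ_LT(at, tp->snd_max)) {
                    memset(&qr, 0, sizeof(qr));
                    qr.req = TCP_QUERY_SENDMAP;
                    qr.req_param = at;              /* first sequence of interest */
                    if ((*tp->t_fb->tfb_chg_query)(tp, &qr) == 0)
                            break;                  /* old stack has nothing more */
                    convert_to_my_sendmap(&qr);     /* hypothetical: build local rsm */
                    at = qr.sendmap_end;            /* continue at the next range */
            }
    }
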
@@ -445,6 +670,16 @@
return (rv);
}
+static inline void
+tcp_lro_features_off(struct inpcb *inp)
+{
+ inp->inp_flags2 &= ~(INP_SUPPORTS_MBUFQ|
+ INP_MBUF_QUEUE_READY|
+ INP_DONT_SACK_QUEUE|
+ INP_MBUF_ACKCMP|
+ INP_MBUF_L_ACKS);
+}
+
/*
* tcp_output_unlock()
* Always returns unlocked, handles drop request from advanced stacks.
@@ -1169,6 +1404,7 @@
#ifdef NETFLIX_EXP_DETECTION
/* Various SACK attack thresholds */
extern int32_t tcp_force_detection;
+extern int32_t tcp_sad_limit;
extern int32_t tcp_sack_to_ack_thresh;
extern int32_t tcp_sack_to_move_thresh;
extern int32_t tcp_restoral_thresh;
@@ -1176,6 +1412,7 @@
extern int32_t tcp_sad_pacing_interval;
extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
+extern int32_t tcp_attack_on_turns_on_logging;
#endif
extern uint32_t tcp_ack_war_time_window;
extern uint32_t tcp_ack_war_cnt;
@@ -1246,6 +1483,8 @@
size_t seed_len);
int tcp_can_enable_pacing(void);
void tcp_decrement_paced_conn(void);
+void tcp_change_time_units(struct tcpcb *, int);
+void tcp_handle_orphaned_packets(struct tcpcb *);
struct mbuf *
tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
@@ -1253,6 +1492,31 @@
int tcp_stats_init(void);
void tcp_log_end_status(struct tcpcb *tp, uint8_t status);
+#ifdef TCP_REQUEST_TRK
+void tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent);
+struct http_sendfile_track *
+tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip);
+int tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point);
+int
+tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point);
+struct http_sendfile_track *
+tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq);
+void
+tcp_http_log_req_info(struct tcpcb *tp,
+ struct http_sendfile_track *http, uint16_t slot,
+ uint8_t val, uint64_t offset, uint64_t nbytes);
+
+uint32_t
+tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes);
+void
+tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user,
+ uint64_t ts);
+
+struct http_sendfile_track *
+tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups);
+
+
+#endif
#ifdef TCP_ACCOUNTING
int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss);
#endif
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -1235,6 +1235,16 @@
#define M_LEADINGSPACE(m) \
(M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
+/*
+ * So M_TRAILINGROOM() is for when you want to know how much space
+ * would be there if it was writable. This can be used to
+ * detect changes in mbufs by knowing the value at one point
+ * and then being able to compare it later to the current M_TRAILINGROOM().
+ * The TRAILINGSPACE() macro is not suitable for this since an mbuf
+ * at one point might not be writable and then later it becomes writable
+ * even though the space at the back of it has not changed.
+ */
+#define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len))
/*
* Compute the amount of space available after the end of data in an mbuf.
*
@@ -1245,9 +1255,7 @@
* for mbufs with external storage. We now allow mbuf-embedded data to be
* read-only as well.
*/
-#define M_TRAILINGSPACE(m) \
- (M_WRITABLE(m) ? \
- ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
+#define M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0)
/*
* Arrange to prepend space of size plen to mbuf m. If a new mbuf must be
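
Finally, an illustrative use of the new M_TRAILINGROOM() macro (sketch, not from the diff): snapshot the room behind an mbuf's data even while the mbuf is read-only, and compare later to see whether its tail layout changed, something M_TRAILINGSPACE() cannot express because it collapses to 0 for unwritable mbufs.

    /* Hypothetical helper built on M_TRAILINGROOM(). */
    static inline int
    mbuf_tail_moved(struct mbuf *m, int saved_room)
    {
            /* Differs only if m_data/m_len changed behind our back. */
            return (M_TRAILINGROOM(m) != saved_room);
    }

    int room = M_TRAILINGROOM(m);   /* valid even when !M_WRITABLE(m) */
    /* ... mbuf handed to other code ... */
    if (mbuf_tail_moved(m, room))
            printf("mbuf %p tail layout changed\n", m);
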
