Index: sys/conf/options =================================================================== --- sys/conf/options +++ sys/conf/options @@ -227,6 +227,7 @@ SYSVSHM opt_sysvipc.h SW_WATCHDOG opt_watchdog.h TCPHPTS opt_inet.h +TCP_REQUEST_TRK opt_global.h TCP_ACCOUNTING opt_inet.h TURNSTILE_PROFILING UMTX_PROFILING Index: sys/kern/kern_sendfile.c =================================================================== --- sys/kern/kern_sendfile.c +++ sys/kern/kern_sendfile.c @@ -57,6 +57,9 @@ #include #include #include +#include +#include +#include #include #include @@ -1188,6 +1191,12 @@ NULL, NULL, td); sendfile_iodone(sfio, NULL, 0, error); } +#ifdef TCP_REQUEST_TRK + if (so->so_proto->pr_protocol == IPPROTO_TCP) { + /* log the sendfile call to the TCP log, if enabled */ + tcp_log_sendfile(so, offset, nbytes, flags); + } +#endif CURVNET_RESTORE(); m = NULL; Index: sys/modules/tcp/rack/Makefile =================================================================== --- sys/modules/tcp/rack/Makefile +++ sys/modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c rack_bbr_common.c +SRCS= rack.c sack_filter.c rack_bbr_common.c #tailq_hash.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_kern_tls.h Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -217,15 +217,15 @@ /* Options for Rack and BBR */ #define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ #define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ -#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */ +#define TCP_RACK_PROP 1051 /* Not used */ #define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ #define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */ #define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */ #define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ -#define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */ +#define TCP_RACK_PROP_RATE 1056 /* Not used */ #define TCP_RACK_PRR_SENDALOT 1057 /* Allow PRR to send more than one seg */ #define TCP_RACK_MIN_TO 1058 /* Minimum time between rack t-o's in ms */ -#define TCP_RACK_EARLY_RECOV 1059 /* Should recovery happen early (bool) */ +#define TCP_RACK_EARLY_RECOV 1059 /* Not used */ #define TCP_RACK_EARLY_SEG 1060 /* If early recovery max segments */ #define TCP_RACK_REORD_THRESH 1061 /* RACK reorder threshold (shift amount) */ #define TCP_RACK_REORD_FADE 1062 /* Does reordering fade after ms time */ @@ -309,12 +309,22 @@ #define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */ #define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */ #define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */ -#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? 
*/ +#define TCP_FAST_RSM_HACK 1137 /* Not used in modern stacks */ #define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */ #define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */ #define TCP_RACK_TIMER_SLOP 1140 /* Set or get the timer slop used */ #define TCP_RACK_DSACK_OPT 1141 /* How do we setup rack timer DSACK options bit 1/2 */ #define TCP_RACK_ENABLE_HYSTART 1142 /* Do we allow hystart in the CC modules */ +#define TCP_RACK_SET_RXT_OPTIONS 1143 /* Set the bits in the retransmit options */ +#define TCP_RACK_HI_BETA 1144 /* Turn on/off high beta */ +#define TCP_RACK_SPLIT_LIMIT 1145 /* Set a split limit for split allocations */ +#define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */ +#define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */ +#define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */ +#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */ +#define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */ +#define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */ + /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -447,6 +457,53 @@ #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD 2 +/* + * TCP log user opaque + */ +struct http_req { + uint64_t timestamp; + uint64_t start; + uint64_t end; + uint32_t flags; +}; + +union tcp_log_userdata { + struct http_req http_req; +}; + +struct tcp_log_user { + uint32_t type; + uint32_t subtype; + union tcp_log_userdata data; +}; + +/* user types, i.e. apps */ +#define TCP_LOG_USER_HTTPD 1 + +/* user subtypes */ +#define TCP_LOG_HTTPD_TS 1 /* client timestamp */ +#define TCP_LOG_HTTPD_TS_REQ 2 /* client timestamp and request info */ + +/* HTTPD REQ flags */ +#define TCP_LOG_HTTPD_RANGE_START 0x0001 +#define TCP_LOG_HTTPD_RANGE_END 0x0002 + +/* Flags for hybrid pacing */ +#define TCP_HYBRID_PACING_CU 0x0001 /* Enable catch-up mode */ +#define TCP_HYBRID_PACING_DTL 0x0002 /* Enable Detailed logging */ +#define TCP_HYBRID_PACING_CSPR 0x0004 /* A client suggested rate is present */ +#define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */ +#define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */ +#define TCP_HYBRID_PACING_S_MSS 0x0020 /* Clent wants us to set the mss overriding gp est in CU */ +#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */ + +struct tcp_hybrid_req { + struct http_req req; + uint64_t cspr; + uint32_t hint_maxseg; + uint32_t hybrid_flags; +}; + /* * TCP specific variables of interest for tp->t_stats stats(9) accounting. 
*/ @@ -460,6 +517,7 @@ #define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */ #define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */ #define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */ +#define VOI_TCP_PATHRTT 10 /* The path RTT based on ACK arrival */ #define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */ #define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */ Index: sys/netinet/tcp_hpts.h =================================================================== --- sys/netinet/tcp_hpts.h +++ sys/netinet/tcp_hpts.h @@ -187,6 +187,15 @@ } #ifdef _KERNEL + +extern int32_t tcp_min_hptsi_time; + +__inline int32_t +get_hpts_min_sleep_time() +{ + return (tcp_min_hptsi_time + HPTS_TICKS_PER_SLOT); +} + static __inline uint32_t tcp_gethptstick(struct timeval *sv) { Index: sys/netinet/tcp_log_buf.c =================================================================== --- sys/netinet/tcp_log_buf.c +++ sys/netinet/tcp_log_buf.c @@ -58,6 +58,7 @@ #include #include #include +#include #include /* Default expiry time */ @@ -2844,6 +2845,10 @@ { struct inpcb *inp; struct tcpcb *tp; +#ifdef TCP_REQUEST_TRK + struct http_sendfile_track *ent; + int i, fnd; +#endif inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL")); @@ -2873,6 +2878,90 @@ &tptosocket(tp)->so_snd, TCP_LOG_SENDFILE, 0, 0, &log, false, &tv); } +#ifdef TCP_REQUEST_TRK + if (tp->t_http_req == 0) { + /* No http requests to track */ + goto done; + } + fnd = 0; + if (tp->t_http_closed == 0) { + /* No closed end req to track */ + goto skip_closed_req; + } + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + /* Lets see if this one can be found */ + ent = &tp->t_http_info[i]; + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { + /* Not used */ + continue; + } + if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { + /* This pass does not consider open requests */ + continue; + } + if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) { + /* Don't look at what we have completed */ + continue; + } + /* If we reach here its a allocated closed end request */ + if ((ent->start == offset) || + ((offset > ent->start) && (offset < ent->end))){ + /* Its within this request?? */ + fnd = 1; + } + if (fnd) { + /* + * It is at or past the end, its complete. + */ + ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; + /* + * When an entry completes we can take (snd_una + sb_cc) and know where + * the end of the range really is. Note that this works since two + * requests must be sequential and sendfile now is complete for *this* request. + * we must use sb_ccc since the data may still be in-flight in TLS. + * + * We always cautiously move the end_seq only if our calculations + * show it happened (just in case sf has the call to here at the wrong + * place). When we go COMP we will stop coming here and hopefully be + * left with the correct end_seq. 
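+ * + * For example (illustrative numbers, not taken from this change): if + * snd_una is 1000 and sb_ccc is 500 when this sendfile call lands in the + * range, the data queued for the request currently ends at sequence 1500, + * so end_seq is raised to 1500 whenever that exceeds the previous estimate.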
+ */ + if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq)) + ent->end_seq = tp->snd_una + so->so_snd.sb_ccc; + if ((offset + nbytes) >= ent->end) { + ent->flags |= TCP_HTTP_TRACK_FLG_COMP; + tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes); + } else { + tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes); + } + /* We assume that sendfile never sends overlapping requests */ + goto done; + } + } +skip_closed_req: + if (!fnd) { + /* Ok now lets look for open requests */ + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + ent = &tp->t_http_info[i]; + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { + /* Not used */ + continue; + } + if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) + continue; + /* If we reach here its an allocated open request */ + if (ent->start == offset) { + /* It begins this request */ + ent->start_seq = tp->snd_una + + tptosocket(tp)->so_snd.sb_ccc; + ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; + break; + } else if (offset > ent->start) { + ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; + break; + } + } + } +#endif done: INP_WUNLOCK(inp); } Index: sys/netinet/tcp_stacks/bbr.c =================================================================== --- sys/netinet/tcp_stacks/bbr.c +++ sys/netinet/tcp_stacks/bbr.c @@ -500,7 +500,7 @@ bbr_enter_persist(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, int32_t line); static void -bbr_stop_all_timers(struct tcpcb *tp); +bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr); static void bbr_exit_probe_rtt(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts); static void @@ -1970,7 +1970,7 @@ static void bbr_log_msgsize_fail(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t len, uint32_t maxseg, uint32_t mtu, int32_t csum_flags, int32_t tso, uint32_t cts) { - if (tcp_bblogging_on(bbr->rc_tp)) { + if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); @@ -2669,7 +2669,7 @@ uint32_t newbw, uint32_t obw, uint32_t diff, uint32_t tim) { - if (tcp_bblogging_on(bbr->rc_tp)) { + if (/*bbr_verbose_logging && */tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); @@ -2697,7 +2697,7 @@ static inline void bbr_log_progress_event(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t tick, int event, int line) { - if (tcp_bblogging_on(bbr->rc_tp)) { + if (bbr_verbose_logging && tcp_bblogging_on(bbr->rc_tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, bbr->r_ctl.rc_rcvtime); @@ -6281,6 +6281,9 @@ else apply_filter_min_small(&bbr->r_ctl.rc_rttprop, rtt, cts); } +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rtt)); +#endif if (bbr->rc_ack_was_delayed) rtt += bbr->r_ctl.rc_ack_hdwr_delay; @@ -9850,16 +9853,13 @@ } static void -bbr_stop_all_timers(struct tcpcb *tp) +bbr_stop_all_timers(struct tcpcb *tp, struct tcp_bbr *bbr) { - struct tcp_bbr *bbr; - /* * Assure no timers are running. */ if (tcp_timer_active(tp, TT_PERSIST)) { /* We enter in persists, set the flag appropriately */ - bbr = (struct tcp_bbr *)tp->t_fb_ptr; bbr->rc_in_persist = 1; } } @@ -9927,14 +9927,14 @@ * which indicates the error (usually no memory). 
*/ static int -bbr_init(struct tcpcb *tp) +bbr_init(struct tcpcb *tp, void **ptr) { struct inpcb *inp = tptoinpcb(tp); struct tcp_bbr *bbr = NULL; uint32_t cts; - tp->t_fb_ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); - if (tp->t_fb_ptr == NULL) { + *ptr = uma_zalloc(bbr_pcb_zone, (M_NOWAIT | M_ZERO)); + if (*ptr == NULL) { /* * We need to allocate memory but cant. The INP and INP_INFO * locks and they are recursive (happens during setup. So a @@ -9943,10 +9943,16 @@ */ return (ENOMEM); } - bbr = (struct tcp_bbr *)tp->t_fb_ptr; + bbr = (struct tcp_bbr *)*ptr; bbr->rtt_valid = 0; inp->inp_flags2 |= INP_CANNOT_DO_ECN; inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + /* Take off any undesired flags */ + inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + inp->inp_flags2 &= ~INP_MBUF_ACKCMP; + inp->inp_flags2 &= ~INP_MBUF_L_ACKS; + TAILQ_INIT(&bbr->r_ctl.rc_map); TAILQ_INIT(&bbr->r_ctl.rc_free); TAILQ_INIT(&bbr->r_ctl.rc_tmap); @@ -10074,8 +10080,8 @@ rsm = bbr_alloc(bbr); if (rsm == NULL) { - uma_zfree(bbr_pcb_zone, tp->t_fb_ptr); - tp->t_fb_ptr = NULL; + uma_zfree(bbr_pcb_zone, *ptr); + *ptr = NULL; return (ENOMEM); } rsm->r_rtt_not_allowed = 1; @@ -10128,7 +10134,17 @@ * the TCB on the hptsi wheel if a timer is needed with appropriate * flags. */ - bbr_stop_all_timers(tp); + bbr_stop_all_timers(tp, bbr); + /* + * Validate the timers are not in usec, if they are convert. + * BBR should in theory move to USEC and get rid of a + * lot of the TICKS_2 calls.. but for now we stay + * with tick timers. + */ + tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); bbr_start_hpts_timer(bbr, tp, cts, 5, 0, 0); return (0); } @@ -10172,7 +10188,6 @@ bbr_fini(struct tcpcb *tp, int32_t tcb_is_purged) { if (tp->t_fb_ptr) { - struct inpcb *inp = tptoinpcb(tp); uint32_t calc; struct tcp_bbr *bbr; struct bbr_sendmap *rsm; @@ -10182,10 +10197,6 @@ tcp_rel_pacing_rate(bbr->r_ctl.crte, bbr->rc_tp); bbr_log_flowend(bbr); bbr->rc_tp = NULL; - /* Backout any flags2 we applied */ - inp->inp_flags2 &= ~INP_CANNOT_DO_ECN; - inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; - inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; if (bbr->bbr_hdrw_pacing) counter_u64_add(bbr_flows_whdwr_pacing, -1); else @@ -11853,7 +11864,6 @@ int32_t isipv6; #endif uint8_t app_limited = BBR_JR_SENT_DATA; - uint8_t filled_all = 0; bbr = (struct tcp_bbr *)tp->t_fb_ptr; /* We take a cache hit here */ memcpy(&bbr->rc_tv, tv, sizeof(struct timeval)); @@ -13162,7 +13172,7 @@ if_hw_tsomaxsegsize, msb, ((rsm == NULL) ? 
hw_tls : 0) #ifdef NETFLIX_COPY_ARGS - , &filled_all + , NULL, NULL #endif ); if (len <= maxseg) { @@ -13474,7 +13484,7 @@ #endif /* Log to the black box */ - if (tcp_bblogging_on(bbr->rc_tp)) { + if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; bbr_fill_in_logging_data(bbr, &log.u_bbr, cts); @@ -13483,13 +13493,10 @@ log.u_bbr.flex2 = (bbr->r_recovery_bw << 3); log.u_bbr.flex3 = maxseg; log.u_bbr.flex4 = delay_calc; - /* Encode filled_all into the upper flex5 bit */ log.u_bbr.flex5 = bbr->rc_past_init_win; log.u_bbr.flex5 <<= 1; log.u_bbr.flex5 |= bbr->rc_no_pacing; log.u_bbr.flex5 <<= 29; - if (filled_all) - log.u_bbr.flex5 |= 0x80000000; log.u_bbr.flex5 |= tp->t_maxseg; log.u_bbr.flex6 = bbr->r_ctl.rc_pace_max_segs; log.u_bbr.flex7 = (bbr->rc_bbr_state << 8) | bbr_state_val(bbr); @@ -14073,6 +14080,56 @@ return (0); } +static void +bbr_switch_failed(struct tcpcb *tp) +{ + /* + * If a switch fails we only need to + * make sure mbuf_queuing is still in place. + * We also need to make sure we are still in + * ticks granularity (though we should probably + * change bbr to go to USECs). + * + * For timers we need to see if we are still in the + * pacer (if our flags are up) if so we are good, if + * not we need to get back into the pacer. + */ + struct inpcb *inp = tptoinpcb(tp); + struct timeval tv; + uint32_t cts; + uint32_t toval; + struct tcp_bbr *bbr; + struct hpts_diag diag; + + inp->inp_flags2 |= INP_CANNOT_DO_ECN; + inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); + if (inp->inp_in_hpts) { + return; + } + bbr = (struct tcp_bbr *)tp->t_fb_ptr; + cts = tcp_get_usecs(&tv); + if (bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + if (TSTMP_GT(bbr->rc_pacer_started, cts)) { + toval = bbr->rc_pacer_started - cts; + } else { + /* one slot please */ + toval = HPTS_TICKS_PER_SLOT; + } + } else if (bbr->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { + if (TSTMP_GT(bbr->r_ctl.rc_timer_exp, cts)) { + toval = bbr->r_ctl.rc_timer_exp - cts; + } else { + /* one slot please */ + toval = HPTS_TICKS_PER_SLOT; + } + } else + toval = HPTS_TICKS_PER_SLOT; + (void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(toval), + __LINE__, &diag); + bbr_log_hpts_diag(bbr, cts, &diag); +} + struct tcp_function_block __tcp_bbr = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = bbr_output, @@ -14087,6 +14144,7 @@ .tfb_tcp_handoff_ok = bbr_handoff_ok, .tfb_tcp_mtu_chg = bbr_mtu_chg, .tfb_pru_options = bbr_pru_options, + .tfb_switch_failed = bbr_switch_failed, .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, }; Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -458,7 +458,7 @@ static uint32_t rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss); static int32_t rack_handoff_ok(struct tcpcb *tp); -static int32_t rack_init(struct tcpcb *tp); +static int32_t rack_init(struct tcpcb *tp, void **ptr); static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, @@ -12344,7 +12344,7 @@ } static int -rack_init(struct tcpcb *tp) +rack_init(struct tcpcb *tp, void **ptr) { struct inpcb *inp = tptoinpcb(tp); struct tcp_rack *rack = NULL; @@ -12354,8 +12354,8 @@ uint32_t iwin, snt, us_cts; int err; - tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); - if (tp->t_fb_ptr == NULL) { + *ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); + if (*ptr == NULL) { /* * We need to allocate memory but cant. 
The INP and INP_INFO * locks and they are recursive (happens during setup. So a @@ -12364,9 +12364,9 @@ */ return (ENOMEM); } - memset(tp->t_fb_ptr, 0, sizeof(struct tcp_rack)); + memset(ptr, 0, sizeof(struct tcp_rack)); - rack = (struct tcp_rack *)tp->t_fb_ptr; + rack = (struct tcp_rack *)ptr; RB_INIT(&rack->r_ctl.rc_mtree); TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -109,6 +109,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,11 @@ CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); +int32_t tcp_sad_limit = 10000; +SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, + CTLFLAG_RW, + &tcp_sad_limit, 10000, + "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, @@ -363,7 +369,7 @@ VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) -static int tcp_default_fb_init(struct tcpcb *tp); +static int tcp_default_fb_init(struct tcpcb *tp, void **ptr); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); @@ -519,18 +525,11 @@ tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; + void *ptr = NULL; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); - /* - * Release the old stack. This function will either find a new one - * or panic. - */ - if (tp->t_fb->tfb_tcp_fb_fini != NULL) - (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); - refcount_release(&tp->t_fb->tfb_refcnt); - /* * Now, we'll find a new function block to use. * Start by trying the current user-selected @@ -551,14 +550,20 @@ /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ - tp->t_fb = tfb; - if (tp->t_fb->tfb_tcp_fb_init == NULL || - (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0) + if (tfb->tfb_tcp_fb_init == NULL || + (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) { + /* Release the old stack */ + if (tp->t_fb->tfb_tcp_fb_fini != NULL) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + /* Now set in all the pointers */ + tp->t_fb = tfb; + tp->t_fb_ptr = ptr; return; - + } /* * Initialization failed. Release the reference count on - * the stack. + * the looked up default stack. */ refcount_release(&tfb->tfb_refcnt); } @@ -578,12 +583,18 @@ panic("Default stack rejects a new session?"); } } - tp->t_fb = tfb; - if (tp->t_fb->tfb_tcp_fb_init != NULL && - (*tp->t_fb->tfb_tcp_fb_init)(tp)) { + if (tfb->tfb_tcp_fb_init != NULL && + (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } + /* Now release the old stack */ + if (tp->t_fb->tfb_tcp_fb_fini != NULL) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + /* And set in the pointers to the new */ + tp->t_fb = tfb; + tp->t_fb_ptr = ptr; } static bool @@ -1040,16 +1051,37 @@ * it is required to always succeed since it is the stack of last resort! 
*/ static int -tcp_default_fb_init(struct tcpcb *tp) +tcp_default_fb_init(struct tcpcb *tp, void **ptr) { struct socket *so = tptosocket(tp); + int rexmt; INP_WLOCK_ASSERT(tptoinpcb(tp)); + /* We don't use the pointer */ + *ptr = NULL; KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT, ("%s: connection %p in unexpected state %d", __func__, tp, tp->t_state)); + /* Make sure we get no interesting mbuf queuing behavior */ + /* All mbuf queue/ack compress flags should be off */ + tcp_lro_features_off(tptoinpcb(tp)); + + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; + /* Validate the timers are not in usec, if they are convert */ + tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); + if ((tp->t_state == TCPS_SYN_SENT) || + (tp->t_state == TCPS_SYN_RECEIVED)) + rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; + else + rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; + if (tp->t_rxtshift == 0) + tp->t_rxtcur = rexmt; + else + TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); + /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't * know what to do for unexpected states (which includes TIME_WAIT). @@ -2240,6 +2272,8 @@ tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; + /* We always start with ticks granularity */ + tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, @@ -2265,7 +2299,7 @@ #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { - if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) { + if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } @@ -4019,3 +4053,524 @@ } } #endif + +void +tcp_change_time_units(struct tcpcb *tp, int granularity) +{ + if (tp->t_tmr_granularity == granularity) { + /* We are there */ + return; + } + if (granularity == TCP_TMR_GRANULARITY_USEC) { + KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS), + ("Granularity is not TICKS its %u in tp:%p", + tp->t_tmr_granularity, tp)); + tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); + if (tp->t_srtt > 1) { + uint32_t val, frac; + + val = tp->t_srtt >> TCP_RTT_SHIFT; + frac = tp->t_srtt & 0x1f; + tp->t_srtt = TICKS_2_USEC(val); + /* + * frac is the fractional part of the srtt (if any) + * but its in ticks and every bit represents + * 1/32nd of a hz. + */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); + } else { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); + } + tp->t_srtt += frac; + } + } + if (tp->t_rttvar) { + uint32_t val, frac; + + val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; + frac = tp->t_rttvar & 0x1f; + tp->t_rttvar = TICKS_2_USEC(val); + /* + * frac is the fractional part of the srtt (if any) + * but its in ticks and every bit represents + * 1/32nd of a hz. 
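+ * For example, with hz=1000 each whole tick converts to 1000 usec and + * each unit of frac adds 1000/32 usec: illustrative values val = 20 and + * frac = 16 become 20 * 1000 + 16 * 1000 / 32 = 20500 usec.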
+ */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); + } else { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); + } + tp->t_rttvar += frac; + } + } + tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC; + } else if (granularity == TCP_TMR_GRANULARITY_TICKS) { + /* Convert back to ticks, with */ + KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC), + ("Granularity is not USEC its %u in tp:%p", + tp->t_tmr_granularity, tp)); + if (tp->t_srtt > 1) { + uint32_t val, frac; + + val = USEC_2_TICKS(tp->t_srtt); + frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); + tp->t_srtt = val << TCP_RTT_SHIFT; + /* + * frac is the fractional part here is left + * over from converting to hz and shifting. + * We need to convert this to the 5 bit + * remainder. + */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); + } else { + frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); + } + tp->t_srtt += frac; + } + } + if (tp->t_rttvar) { + uint32_t val, frac; + + val = USEC_2_TICKS(tp->t_rttvar); + frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); + tp->t_rttvar = val << TCP_RTTVAR_SHIFT; + /* + * frac is the fractional part here is left + * over from converting to hz and shifting. + * We need to convert this to the 5 bit + * remainder. + */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); + } else { + frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); + } + tp->t_rttvar += frac; + } + } + tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); + tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; + } +#ifdef INVARIANTS + else { + panic("Unknown granularity:%d tp:%p", + granularity, tp); + } +#endif +} + +void +tcp_handle_orphaned_packets(struct tcpcb *tp) +{ + struct mbuf *save, *m, *prev; + /* + * Called when a stack switch is occuring from the fini() + * of the old stack. We assue the init() as already been + * run of the new stack and it has set the inp_flags2 to + * what it supports. This function will then deal with any + * differences i.e. cleanup packets that maybe queued that + * the newstack does not support. + */ + + if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS) + return; + if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0) { + /* + * It is unsafe to process the packets since a + * reset may be lurking in them (its rare but it + * can occur). If we were to find a RST, then we + * would end up dropping the connection and the + * INP lock, so when we return the caller (tcp_usrreq) + * will blow up when it trys to unlock the inp. + * This new stack does not do any fancy LRO features + * so all we can do is toss the packets. + */ + m = tp->t_in_pkt; + tp->t_in_pkt = NULL; + tp->t_tail_pkt = NULL; + while (m) { + save = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + m = save; + } + } else { + /* + * Here we have a stack that does mbuf queuing but + * does not support compressed ack's. We must + * walk all the mbufs and discard any compressed acks. 
+ */ + m = tp->t_in_pkt; + prev = NULL; + while (m) { + if (m->m_flags & M_ACKCMP) { + /* We must toss this packet */ + if (tp->t_tail_pkt == m) + tp->t_tail_pkt = prev; + if (prev) + prev->m_nextpkt = m->m_nextpkt; + else + tp->t_in_pkt = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + /* move forward */ + if (prev) + m = prev->m_nextpkt; + else + m = tp->t_in_pkt; + } else { + /* this one is ok */ + prev = m; + m = m->m_nextpkt; + } + } + } +} + +#ifdef TCP_REQUEST_TRK +uint32_t +tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) +{ +#ifdef KERN_TLS + struct ktls_session *tls; + uint32_t rec_oh, records; + + tls = so->so_snd.sb_tls_info; + if (tls == NULL) + return (0); + + rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; + records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); + return (records * rec_oh); +#else + return (0); +#endif +} + +extern uint32_t tcp_stale_entry_time; +uint32_t tcp_stale_entry_time = 250000; +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, + &tcp_stale_entry_time, 250000, "Time that a http entry without a sendfile ages out"); + +void +tcp_http_log_req_info(struct tcpcb *tp, struct http_sendfile_track *http, + uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) +{ + if (tcp_bblogging_on(tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); +#ifdef TCPHPTS + log.u_bbr.inhpts = tcp_in_hpts(tptoinpcb(tp)); +#endif + log.u_bbr.flex8 = val; + log.u_bbr.rttProp = http->timestamp; + log.u_bbr.delRate = http->start; + log.u_bbr.cur_del_rate = http->end; + log.u_bbr.flex1 = http->start_seq; + log.u_bbr.flex2 = http->end_seq; + log.u_bbr.flex3 = http->flags; + log.u_bbr.flex4 = ((http->localtime >> 32) & 0x00000000ffffffff); + log.u_bbr.flex5 = (http->localtime & 0x00000000ffffffff); + log.u_bbr.flex7 = slot; + log.u_bbr.bw_inuse = offset; + /* nbytes = flex6 | epoch */ + log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); + log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); + /* cspr = lt_epoch | pkts_out */ + log.u_bbr.lt_epoch = ((http->cspr >> 32) & 0x00000000ffffffff); + log.u_bbr.pkts_out |= (http->cspr & 0x00000000ffffffff); + log.u_bbr.applimited = tp->t_http_closed; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_open; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_req; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(tp, NULL, + &tptosocket(tp)->so_rcv, + &tptosocket(tp)->so_snd, + TCP_LOG_HTTP_T, 0, + 0, &log, false, &tv); + } +} + +void +tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent) +{ + if (tp->t_http_req > 0) + tp->t_http_req--; + if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { + if (tp->t_http_open > 0) + tp->t_http_open--; + } else { + if (tp->t_http_closed > 0) + tp->t_http_closed--; + } + ent->flags = TCP_HTTP_TRACK_FLG_EMPTY; +} + +static void +tcp_http_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) +{ + struct http_sendfile_track *ent; + uint64_t time_delta, oldest_delta; + int i, oldest, oldest_set = 0, cnt_rm = 0; + + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + ent = &tp->t_http_info[i]; + if (ent->flags != TCP_HTTP_TRACK_FLG_USED) { + /* + * We only care about closed end ranges + * that are allocated and have no sendfile + * ever touching them. They would be in + * state USED. 
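+ * For example, a closed-range request that drew an error response (e.g. + * a 4xx) from the application never reaches sendfile, so it stays in + * USED until it is purged here.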
+ */ + continue; + } + if (ts >= ent->localtime) + time_delta = ts - ent->localtime; + else + time_delta = 0; + if (time_delta && + ((oldest_delta < time_delta) || (oldest_set == 0))) { + oldest_set = 1; + oldest = i; + oldest_delta = time_delta; + } + if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) { + /* + * No sendfile in a our time-limit + * time to purge it. + */ + cnt_rm++; + tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE, + time_delta, 0); + tcp_http_free_a_slot(tp, ent); + } + } + if ((cnt_rm == 0) && rm_oldest && oldest_set) { + ent = &tp->t_http_info[oldest]; + tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE, + oldest_delta, 1); + tcp_http_free_a_slot(tp, ent); + } +} + +int +tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) +{ + int i, ret=0; + struct http_sendfile_track *ent; + + /* Clean up any old closed end requests that are now completed */ + if (tp->t_http_req == 0) + return(0); + if (tp->t_http_closed == 0) + return(0); + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + ent = &tp->t_http_info[i]; + /* Skip empty ones */ + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) + continue; + /* Skip open ones */ + if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) + continue; + if (SEQ_GEQ(ack_point, ent->end_seq)) { + /* We are past it -- free it */ + tcp_http_log_req_info(tp, ent, + i, TCP_HTTP_REQ_LOG_FREED, 0, 0); + tcp_http_free_a_slot(tp, ent); + ret++; + } + } + return (ret); +} + +int +tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point) +{ + if (tp->t_http_req == 0) + return(-1); + if (tp->t_http_closed == 0) + return(-1); + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) + return(-1); + if (SEQ_GEQ(ack_point, ent->end_seq)) { + return (1); + } + return (0); +} + +struct http_sendfile_track * +tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip) +{ + /* + * Given an ack point (th_ack) walk through our entries and + * return the first one found that th_ack goes past the + * end_seq. + */ + struct http_sendfile_track *ent; + int i; + + if (tp->t_http_req == 0) { + /* none open */ + return (NULL); + } + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + ent = &tp->t_http_info[i]; + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) + continue; + if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) { + if (SEQ_GEQ(th_ack, ent->end_seq)) { + *ip = i; + return (ent); + } + } + } + return (NULL); +} + +struct http_sendfile_track * +tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) +{ + struct http_sendfile_track *ent; + int i; + + if (tp->t_http_req == 0) { + /* none open */ + return (NULL); + } + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + ent = &tp->t_http_info[i]; + tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_SEARCH, + (uint64_t)seq, 0); + if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { + continue; + } + if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { + /* + * An open end request only needs to + * match the beginning seq or be + * all we have (once we keep going on + * a open end request we may have a seq + * wrap). + */ + if ((SEQ_GEQ(seq, ent->start_seq)) || + (tp->t_http_closed == 0)) + return (ent); + } else { + /* + * For this one we need to + * be a bit more careful if its + * completed at least. + */ + if ((SEQ_GEQ(seq, ent->start_seq)) && + (SEQ_LT(seq, ent->end_seq))) { + return (ent); + } + } + } + return (NULL); +} + +/* Should this be in its own file tcp_http.c ? 
*/ +struct http_sendfile_track * +tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups) +{ + struct http_sendfile_track *fil; + int i, allocated; + + /* In case the stack does not check for completions do so now */ + tcp_http_check_for_comp(tp, tp->snd_una); + /* Check for stale entries */ + if (tp->t_http_req) + tcp_http_check_for_stale_entries(tp, ts, + (tp->t_http_req >= MAX_TCP_HTTP_REQ)); + /* Check to see if this is a duplicate of one not started */ + if (tp->t_http_req) { + for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) { + fil = &tp->t_http_info[i]; + if (fil->flags != TCP_HTTP_TRACK_FLG_USED) + continue; + if ((fil->timestamp == req->timestamp) && + (fil->start == req->start) && + ((fil->flags & TCP_HTTP_TRACK_FLG_OPEN) || + (fil->end == req->end))) { + /* + * We already have this request + * and it has not been started with sendfile. + * This probably means the user was returned + * a 4xx of some sort and its going to age + * out, lets not duplicate it. + */ + return(fil); + } + } + } + /* Ok if there is no room at the inn we are in trouble */ + if (tp->t_http_req >= MAX_TCP_HTTP_REQ) { + tcp_trace_point(tp, TCP_TP_HTTP_LOG_FAIL); + for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { + tcp_http_log_req_info(tp, &tp->t_http_info[i], + i, TCP_HTTP_REQ_LOG_ALLOCFAIL, 0, 0); + } + return (NULL); + } + for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) { + fil = &tp->t_http_info[i]; + if (fil->flags == TCP_HTTP_TRACK_FLG_EMPTY) { + allocated = 1; + fil->flags = TCP_HTTP_TRACK_FLG_USED; + fil->timestamp = req->timestamp; + fil->localtime = ts; + fil->start = req->start; + if (req->flags & TCP_LOG_HTTPD_RANGE_END) { + fil->end = req->end; + } else { + fil->end = 0; + fil->flags |= TCP_HTTP_TRACK_FLG_OPEN; + } + /* + * We can set the min boundaries to the TCP Sequence space, + * but it might be found to be further up when sendfile + * actually runs on this range (if it ever does). + */ + fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; + fil->start_seq = tp->snd_una + + tptosocket(tp)->so_snd.sb_ccc; + fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); + if (tptosocket(tp)->so_snd.sb_tls_info) { + /* + * This session is doing TLS. Take a swag guess + * at the overhead. + */ + fil->end_seq += tcp_estimate_tls_overhead( + tptosocket(tp), (fil->end - fil->start)); + } + tp->t_http_req++; + if (fil->flags & TCP_HTTP_TRACK_FLG_OPEN) + tp->t_http_open++; + else + tp->t_http_closed++; + tcp_http_log_req_info(tp, fil, i, + TCP_HTTP_REQ_LOG_NEW, 0, 0); + break; + } else + fil = NULL; + } + return (fil); +} + +void +tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) +{ + (void)tcp_http_alloc_req_full(tp, &user->http_req, ts, 1); +} +#endif Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -932,22 +932,27 @@ * pickup one on the new entry. */ struct tcp_function_block *rblk; + void *ptr = NULL; rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); - if (tp->t_fb->tfb_tcp_fb_fini) - (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); - refcount_release(&tp->t_fb->tfb_refcnt); - tp->t_fb = rblk; - /* - * XXXrrs this is quite dangerous, it is possible - * for the new function to fail to init. We also - * are not asking if the handoff_is_ok though at - * the very start thats probalbly ok. 
- */ - if (tp->t_fb->tfb_tcp_fb_init) { - (*tp->t_fb->tfb_tcp_fb_init)(tp); + + if (rblk->tfb_tcp_fb_init == NULL || + (*rblk->tfb_tcp_fb_init)(tp, &ptr) == 0) { + /* Release the old stack */ + if (tp->t_fb->tfb_tcp_fb_fini != NULL) + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + refcount_release(&tp->t_fb->tfb_refcnt); + /* Now set in all the pointers */ + tp->t_fb = rblk; + tp->t_fb_ptr = ptr; + } else { + /* + * Initialization failed. Release the reference count on + * the looked up default stack. + */ + refcount_release(&rblk->tfb_refcnt); } } tp->snd_wl1 = sc->sc_irs; Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -1659,6 +1659,7 @@ */ struct tcp_function_set fsn; struct tcp_function_block *blk; + void *ptr = NULL; INP_WUNLOCK(inp); error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn); @@ -1666,10 +1667,6 @@ return (error); INP_WLOCK(inp); - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } tp = intotcpcb(inp); blk = find_and_ref_tcp_functions(&fsn); @@ -1710,41 +1707,57 @@ return (ENOENT); } /* - * Release the old refcnt, the - * lookup acquired a ref on the - * new one already. + * Ensure the new stack takes ownership with a + * clean slate on peak rate threshold. */ - if (tp->t_fb->tfb_tcp_fb_fini) { - struct epoch_tracker et; - /* - * Tell the stack to cleanup with 0 i.e. - * the tcb is not going away. - */ - NET_EPOCH_ENTER(et); - (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); - NET_EPOCH_EXIT(et); - } + tp->t_peakrate_thr = 0; #ifdef TCPHPTS /* Assure that we are not on any hpts */ tcp_hpts_remove(tptoinpcb(tp)); #endif if (blk->tfb_tcp_fb_init) { - error = (*blk->tfb_tcp_fb_init)(tp); + error = (*blk->tfb_tcp_fb_init)(tp, &ptr); if (error) { + /* + * Release the ref count the lookup + * acquired. + */ refcount_release(&blk->tfb_refcnt); - if (tp->t_fb->tfb_tcp_fb_init) { - if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) { - /* Fall back failed, drop the connection */ - INP_WUNLOCK(inp); - soabort(so); - return (error); - } + /* + * Now there is a chance that the + * init() function mucked with some + * things before it failed, such as + * hpts or inp_flags2 or timer granularity. + * It should not of, but lets give the old + * stack a chance to reset to a known good state. + */ + if (tp->t_fb->tfb_switch_failed) { + (*tp->t_fb->tfb_switch_failed)(tp); } - goto err_out; + goto err_out; } } + if (tp->t_fb->tfb_tcp_fb_fini) { + struct epoch_tracker et; + /* + * Tell the stack to cleanup with 0 i.e. + * the tcb is not going away. + */ + NET_EPOCH_ENTER(et); + (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); + NET_EPOCH_EXIT(et); + } + /* + * Release the old refcnt, the + * lookup acquired a ref on the + * new one already. + */ refcount_release(&tp->t_fb->tfb_refcnt); + /* + * Set in the new stack. + */ tp->t_fb = blk; + tp->t_fb_ptr = ptr; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { tcp_offload_ctloutput(tp, sopt->sopt_dir, @@ -1754,6 +1767,7 @@ err_out: INP_WUNLOCK(inp); return (error); + } /* Pass in the INP locked, callee must unlock it. 
*/ Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -61,6 +61,15 @@ #define TCP_EI_STATUS_2MSL 0xb #define TCP_EI_STATUS_MAX_VALUE 0xb +#define TCP_HTTP_REQ_LOG_NEW 0x01 +#define TCP_HTTP_REQ_LOG_COMPLETE 0x02 +#define TCP_HTTP_REQ_LOG_FREED 0x03 +#define TCP_HTTP_REQ_LOG_ALLOCFAIL 0x04 +#define TCP_HTTP_REQ_LOG_MOREYET 0x05 +#define TCP_HTTP_REQ_LOG_FORCEFREE 0x06 +#define TCP_HTTP_REQ_LOG_STALE 0x07 +#define TCP_HTTP_REQ_LOG_SEARCH 0x08 + /************************************************/ /* Status bits we track to assure no duplicates, * the bits here are not used by the code but @@ -126,6 +135,154 @@ STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); +#define TCP_HTTP_TRACK_FLG_EMPTY 0x00 /* Available */ +#define TCP_HTTP_TRACK_FLG_USED 0x01 /* In use */ +#define TCP_HTTP_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */ +#define TCP_HTTP_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */ +#define TCP_HTTP_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */ +#define TCP_HTTP_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ +#define MAX_TCP_HTTP_REQ 5 /* Max we will have at once */ + +#ifdef TCP_REQUEST_TRK +struct http_sendfile_track { + uint64_t timestamp; /* User sent timestamp */ + uint64_t start; /* Start of sendfile offset */ + uint64_t end; /* End if not open-range req */ + uint64_t localtime; /* Time we actually got the req */ + uint64_t deadline; /* If in CU mode, deadline to delivery */ + uint64_t first_send; /* Time of first send in the range */ + uint64_t cspr; /* Client suggested pace rate */ + uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */ + uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */ + tcp_seq start_seq; /* First TCP Seq assigned */ + tcp_seq end_seq; /* If range req last seq */ + uint32_t flags; /* Type of request open etc */ + uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */ + uint32_t hint_maxseg; /* Client hinted maxseg */ + uint32_t hybrid_flags; /* Hybrid flags on this request */ +}; + +#endif + +/* + * Change Query responses for a stack switch we create a structure + * that allows query response from the new stack to the old, if + * supported. + * + * There are three queries currently defined. + * - sendmap + * - timers + * - rack_times + * + * For the sendmap query the caller fills in the + * req and the req_param as the first seq (usually + * snd_una). When the response comes back indicating + * that there was data (return value 1), then the caller + * can build a sendmap entry based on the range and the + * times. The next query would then be done at the + * newly created sendmap_end. Repeated until sendmap_end == snd_max. + * + * Flags in sendmap_flags are defined below as well. + * + * For timers the standard PACE_TMR_XXXX flags are returned indicating + * a pacing timer (possibly) and one other timer. If pacing timer then + * the expiration timeout time in microseconds is in timer_pacing_to. + * And the value used with whatever timer (if a flag is set) is in + * timer_rxt. If no timers are running a 0 is returned and of + * course no flags are set in timer_hpts_flags. + * + * The rack_times are a misc collection of information that + * the old stack might possibly fill in. Of course its possible + * that an old stack may not have a piece of information. If so + * then setting that value to zero is advised. 
Setting any + * timestamp passed should only place a zero in it when it + * is unfilled. This may mean that a time is off by a micro-second + * but this is ok in the grand scheme of things. + * + * When switching stacks it is desireable to get as much information + * from the old stack to the new stack as possible. Though not always + * will the stack be compatible in the types of information. The + * init() function needs to take care when it begins changing + * things such as inp_flags2 and the timer units to position these + * changes at a point where it is unlikely they will fail after + * making such changes. A stack optionally can have an "undo" + * function + * + * To transfer information to the old stack from the new in + * respect to LRO and the inp_flags2, the new stack should set + * the inp_flags2 to what it supports. The old stack in its + * fini() function should call the tcp_handle_orphaned_packets() + * to clean up any packets. Note that a new stack should attempt + */ + +/* Query types */ +#define TCP_QUERY_SENDMAP 1 +#define TCP_QUERY_TIMERS_UP 2 +#define TCP_QUERY_RACK_TIMES 3 + +/* Flags returned in sendmap_flags */ +#define SNDMAP_ACKED 0x000001/* The remote endpoint acked this */ +#define SNDMAP_OVERMAX 0x000008/* We have more retran's then we can fit */ +#define SNDMAP_SACK_PASSED 0x000010/* A sack was done above this block */ +#define SNDMAP_HAS_FIN 0x000040/* segment is sent with fin */ +#define SNDMAP_TLP 0x000080/* segment sent as tail-loss-probe */ +#define SNDMAP_HAS_SYN 0x000800/* SYN is on this guy */ +#define SNDMAP_HAD_PUSH 0x008000/* Push was sent on original send */ +#define SNDMAP_MASK (SNDMAP_ACKED|SNDMAP_OVERMAX|SNDMAP_SACK_PASSED|SNDMAP_HAS_FIN\ + |SNDMAP_TLP|SNDMAP_HAS_SYN|SNDMAP_HAD_PUSH) +#define SNDMAP_NRTX 3 + +struct tcp_query_resp { + int req; + uint32_t req_param; + union { + struct { + tcp_seq sendmap_start; + tcp_seq sendmap_end; + int sendmap_send_cnt; + uint64_t sendmap_time[SNDMAP_NRTX]; + uint64_t sendmap_ack_arrival; + int sendmap_flags; + uint32_t sendmap_r_rtr_bytes; + /* If FAS is available if not 0 */ + uint32_t sendmap_fas; + uint8_t sendmap_dupacks; + }; + struct { + uint32_t timer_hpts_flags; + uint32_t timer_pacing_to; + uint32_t timer_timer_exp; + }; + struct { + /* Timestamps and rtt's */ + uint32_t rack_reorder_ts; /* Last uscts that reordering was seen */ + uint32_t rack_num_dsacks; /* Num of dsacks seen */ + uint32_t rack_rxt_last_time; /* Last time a RXT/TLP or rack tmr went off */ + uint32_t rack_min_rtt; /* never 0 smallest rtt seen */ + uint32_t rack_rtt; /* Last rtt used by rack */ + uint32_t rack_tmit_time; /* The time the rtt seg was tmited */ + uint32_t rack_time_went_idle; /* If in persist the time we went idle */ + /* Prr data */ + uint32_t rack_sacked; + uint32_t rack_holes_rxt; + uint32_t rack_prr_delivered; + uint32_t rack_prr_recovery_fs; + uint32_t rack_prr_out; + uint32_t rack_prr_sndcnt; + /* TLP data */ + uint16_t rack_tlp_cnt_out; /* How many tlp's have been sent */ + /* Various bits */ + uint8_t rack_tlp_out; /* Is a TLP outstanding */ + uint8_t rack_srtt_measured; /* The previous stack has measured srtt */ + uint8_t rack_in_persist; /* Is the old stack in persists? 
*/ + uint8_t rack_wanted_output; /* Did the prevous stack have a want output set */ + }; + }; +}; + +#define TCP_TMR_GRANULARITY_TICKS 1 /* TCP timers are in ticks (msec if hz=1000) */ +#define TCP_TMR_GRANULARITY_USEC 2 /* TCP timers are in microseconds */ + typedef enum { TT_REXMT = 0, TT_PERSIST, @@ -276,6 +433,11 @@ #ifdef TCP_ACCOUNTING uint64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; uint64_t tcp_proc_time[TCP_NUM_CNT_COUNTERS]; +#endif +#ifdef TCP_REQUEST_TRK + uint32_t tcp_hybrid_start; /* Num of times we started hybrid pacing */ + uint32_t tcp_hybrid_stop; /* Num of times we stopped hybrid pacing */ + uint32_t tcp_hybrid_error; /* Num of times we failed to start hybrid pacing */ #endif uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ @@ -290,6 +452,7 @@ uint32_t t_dsack_bytes; /* dsack bytes received */ uint32_t t_dsack_tlp_bytes; /* dsack bytes received for TLPs sent */ uint32_t t_dsack_pack; /* dsack packets we have eceived */ + uint8_t t_tmr_granularity; /* Granularity of all timers srtt etc */ uint8_t t_rttupdated; /* number of times rtt sampled */ /* TCP Fast Open */ uint8_t t_tfo_client_cookie_len; /* TFO client cookie length */ @@ -311,6 +474,13 @@ struct osd t_osd; /* storage for Khelp module data */ #endif uint8_t _t_logpoint; /* Used when a BB log points is enabled */ +#ifdef TCP_REQUEST_TRK + /* Response tracking addons. */ + uint8_t t_http_req; /* Request count */ + uint8_t t_http_open; /* Number of open range requests */ + uint8_t t_http_closed; /* Number of closed range requests */ + struct http_sendfile_track t_http_info[MAX_TCP_HTTP_REQ]; +#endif }; #endif /* _KERNEL || _WANT_TCPCB */ @@ -346,7 +516,7 @@ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ #define TCP_FUNC_OUTPUT_CANDROP 0x02 /* tfb_tcp_output may ask tcp_drop */ -/* +/** * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the @@ -356,6 +526,7 @@ * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. + * * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to @@ -363,16 +534,67 @@ * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). + * + * tfb_tcp_fb_init is used to allow the new stack to + * setup its control block. Among the things it must + * do is: + * a) Make sure that the inp_flags2 is setup correctly + * for LRO. There are two flags that the previous + * stack may have set INP_MBUF_ACKCMP and + * INP_SUPPORTS_MBUFQ. If the new stack does not + * support these it *should* clear the flags. + * b) Make sure that the timers are in the proper + * granularity that the stack wants. The stack + * should check the t_tmr_granularity field. Currently + * there are two values that it may hold + * TCP_TMR_GRANULARITY_TICKS and TCP_TMR_GRANULARITY_USEC. + * Use the functions tcp_timer_convert(tp, granularity); + * to move the timers to the correct format for your stack. + * + * The new stack may also optionally query the tfb_chg_query + * function if the old stack has one. The new stack may ask + * for one of three entries and can also state to the old + * stack its support for the INP_MBUF_ACKCMP and + * INP_SUPPORTS_MBUFQ. 
This is important since if there are + * queued ack's without that statement the old stack will + * be forced to discard the queued acks. The requests that + * can be made for information by the new stacks are: + * + * Note also that the tfb_tcp_fb_init() when called can + * determine if a query is needed by looking at the + * value passed in the ptr. The ptr is designed to be + * set in with any allocated memory, but the address + * of the condtion (ptr == &tp->t_fb_ptr) will be + * true if this is not a stack switch but the initial + * setup of a tcb (which means no query would be needed). + * If, however, the value is not t_fb_ptr, then the caller + * is in the middle of a stack switch and is the new stack. + * A query would be appropriate (if the new stack support + * the query mechanism). + * + * TCP_QUERY_SENDMAP - Query of outstanding data. + * TCP_QUERY_TIMERS_UP - Query about running timers. + * TCP_SUPPORTED_LRO - Declaration in req_param of + * the inp_flags2 supported by + * the new stack. + * TCP_QUERY_RACK_TIMES - Enquire about various timestamps + * and states the old stack may be in. + * * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to - * another stack (via socket option). + * another stack (via socket option). The + * tfb_tcp_fb_fini() function itself should not change timers + * or inp_flags2 (the tfb_tcp_fb_init() must do that). However + * if the old stack supports the LRO mbuf queuing, and the new + * stack does not communicate via chg messages that it too does, + * it must assume it does not and free any queued mbufs. + * */ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); - int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); @@ -387,15 +609,18 @@ int, struct timeval *); int (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt); /* Optional memory allocation/free routine */ - int (*tfb_tcp_fb_init)(struct tcpcb *); + int (*tfb_tcp_fb_init)(struct tcpcb *, void **); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); - void (*tfb_tcp_mtu_chg)(struct tcpcb *); + void (*tfb_tcp_mtu_chg)(struct tcpcb *tp); int (*tfb_pru_options)(struct tcpcb *, int); void (*tfb_hwtls_change)(struct tcpcb *, int); + int (*tfb_chg_query)(struct tcpcb *, struct tcp_query_resp *); + void (*tfb_switch_failed)(struct tcpcb *); + bool (*tfb_early_wake_check)(struct tcpcb *); int (*tfb_compute_pipe)(struct tcpcb *tp); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; @@ -445,6 +670,16 @@ return (rv); } +static inline void +tcp_lro_features_off(struct inpcb *inp) +{ + inp->inp_flags2 &= ~(INP_SUPPORTS_MBUFQ| + INP_MBUF_QUEUE_READY| + INP_DONT_SACK_QUEUE| + INP_MBUF_ACKCMP| + INP_MBUF_L_ACKS); +} + /* * tcp_output_unlock() * Always returns unlocked, handles drop request from advanced stacks. 
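As a sketch of the query mechanism documented above (not part of the diff hunks; example_pcb_zone and example_restart_timer are hypothetical names, while struct tcp_query_resp, TCP_QUERY_TIMERS_UP, the init signature, and the ptr == &tp->t_fb_ptr convention come from this change), a new stack's init could detect a mid-connection switch and ask the old stack about running timers roughly like this:

static int
example_stack_init(struct tcpcb *tp, void **ptr)
{
	struct tcp_query_resp qr;

	/* Allocate the per-connection state for the (hypothetical) stack. */
	*ptr = uma_zalloc(example_pcb_zone, M_NOWAIT | M_ZERO);
	if (*ptr == NULL)
		return (ENOMEM);
	/*
	 * ptr only differs from &tp->t_fb_ptr during a stack switch, in
	 * which case the outgoing stack may still be able to answer queries.
	 */
	if (ptr != &tp->t_fb_ptr && tp->t_fb->tfb_chg_query != NULL) {
		memset(&qr, 0, sizeof(qr));
		qr.req = TCP_QUERY_TIMERS_UP;
		(void)(*tp->t_fb->tfb_chg_query)(tp, &qr);
		if (qr.timer_hpts_flags != 0) {
			/*
			 * The old stack had a pacing and/or protocol timer
			 * outstanding; re-arm the equivalent timer here.
			 */
			example_restart_timer(tp, qr.timer_hpts_flags,
			    qr.timer_pacing_to, qr.timer_timer_exp);
		}
	}
	return (0);
}

On the teardown side, the outgoing stack's fini() would call the tcp_handle_orphaned_packets() added by this patch so that any LRO-queued mbufs the new stack cannot consume are freed.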
@@ -1169,6 +1404,7 @@ #ifdef NETFLIX_EXP_DETECTION /* Various SACK attack thresholds */ extern int32_t tcp_force_detection; +extern int32_t tcp_sad_limit; extern int32_t tcp_sack_to_ack_thresh; extern int32_t tcp_sack_to_move_thresh; extern int32_t tcp_restoral_thresh; @@ -1176,6 +1412,7 @@ extern int32_t tcp_sad_pacing_interval; extern int32_t tcp_sad_low_pps; extern int32_t tcp_map_minimum; +extern int32_t tcp_attack_on_turns_on_logging; #endif extern uint32_t tcp_ack_war_time_window; extern uint32_t tcp_ack_war_cnt; @@ -1246,6 +1483,8 @@ size_t seed_len); int tcp_can_enable_pacing(void); void tcp_decrement_paced_conn(void); +void tcp_change_time_units(struct tcpcb *, int); +void tcp_handle_orphaned_packets(struct tcpcb *); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, @@ -1253,6 +1492,31 @@ int tcp_stats_init(void); void tcp_log_end_status(struct tcpcb *tp, uint8_t status); +#ifdef TCP_REQUEST_TRK +void tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent); +struct http_sendfile_track * +tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip); +int tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point); +int +tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point); +struct http_sendfile_track * +tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq); +void +tcp_http_log_req_info(struct tcpcb *tp, + struct http_sendfile_track *http, uint16_t slot, + uint8_t val, uint64_t offset, uint64_t nbytes); + +uint32_t +tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes); +void +tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, + uint64_t ts); + +struct http_sendfile_track * +tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups); + + +#endif #ifdef TCP_ACCOUNTING int tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, uint32_t tiwin, int mss); #endif Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -1235,6 +1235,16 @@ #define M_LEADINGSPACE(m) \ (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0) +/* + * So M_TRAILINGROOM() is for when you want to know how much space + * would be there if it was writable. This can be used to + * detect changes in mbufs by knowing the value at one point + * and then being able to compare it later to the current M_TRAILINGROOM(). + * The TRAILINGSPACE() macro is not suitable for this since an mbuf + * at one point might not be writable and then later it becomes writable + * even though the space at the back of it has not changed. + */ +#define M_TRAILINGROOM(m) ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) /* * Compute the amount of space available after the end of data in an mbuf. * @@ -1245,9 +1255,7 @@ * for mbufs with external storage. We now allow mbuf-embedded data to be * read-only as well. */ -#define M_TRAILINGSPACE(m) \ - (M_WRITABLE(m) ? \ - ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0) +#define M_TRAILINGSPACE(m) (M_WRITABLE(m) ? M_TRAILINGROOM(m) : 0) /* * Arrange to prepend space of size plen to mbuf m. If a new mbuf must be