D20834.id59366.diff

Index: modules/Makefile
===================================================================
--- modules/Makefile
+++ modules/Makefile
@@ -268,7 +268,6 @@
nge \
nmdm \
nullfs \
- ${_ntb} \
${_nvd} \
${_nvdimm} \
${_nvme} \
Index: modules/tcp/rack/Makefile
===================================================================
--- modules/tcp/rack/Makefile
+++ modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
Index: netinet/in_pcb.h
===================================================================
--- netinet/in_pcb.h
+++ netinet/in_pcb.h
@@ -759,7 +759,9 @@
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
-
+#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */
+#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */
+#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */
/*
* Flags passed to in_pcblookup*() functions.
*/
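
These three flags are what the LRO and hpts changes later in this diff key off of; a minimal sketch (hypothetical stack code, not part of this patch) of a transport opting in might look like:

	/* Hypothetical sketch: a stack opting in to LRO mbuf queueing. */
	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;	/* we provide tfb_do_queued_segments */
	/* While our pacing timer runs, input can simply be queued. */
	inp->inp_flags2 |= INP_MBUF_QUEUE_READY;
	/* A rack timer is up; even a SACK need not wake us early. */
	inp->inp_flags2 |= INP_DONT_SACK_QUEUE;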
Index: netinet/tcp.h
===================================================================
--- netinet/tcp.h
+++ netinet/tcp.h
@@ -201,9 +201,8 @@
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP threshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
-#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
-#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer (no longer valid) */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
@@ -211,14 +210,18 @@
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
-#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
-#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
-#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3; morphs to TCP_BBR_TSLIMITS for >= 2.3 */
+#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */
+#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */
+#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */
+#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */
+#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
-#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3; morphs to TCP_BBR_ALGORITHM for >= 2.3 */
+#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
@@ -227,11 +230,12 @@
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
-#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshold */
+#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 it's the GP increase */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
+#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
@@ -238,6 +242,15 @@
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
+#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */
+#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */
+#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */
+#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */
+#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */
+#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */
+#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */
+#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */
+#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */
/* Start of reserved space for third-party user-settable options. */
Index: netinet/tcp_hpts.h
===================================================================
--- netinet/tcp_hpts.h
+++ netinet/tcp_hpts.h
@@ -45,112 +45,80 @@
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
-#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
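
With 10 usec slots, HPTS_MS_TO_SLOTS(1) previously yielded exactly 100 slots; the new form yields 101, presumably rounding up one slot so a requested sleep cannot come out slightly under the full interval. HPTS_USEC_TO_SLOTS keeps its own round-up: HPTS_USEC_TO_SLOTS(25) = (25 + 9) / 10 = 3 slots.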
-#define DEFAULT_HPTS_LOG 3072
-/*
- * Log flags consist of
- * 7f 7f 1 1 bits
- * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
- *
- * So for example cpu 10, number 10 would with
- * input active would show up as:
- * p_flags = 0001010 0001010 1 0
- * <or>
- * p_flags = 0x142a
- */
-#define HPTS_HPTS_ACTIVE 0x01
-#define HPTS_INPUT_ACTIVE 0x02
-
-#define HPTSLOG_IMMEDIATE 1
-#define HPTSLOG_INSERT_NORMAL 2
-#define HPTSLOG_INSERT_SLEEPER 3
-#define HPTSLOG_SLEEP_AFTER 4
-#define HPTSLOG_SLEEP_BEFORE 5
-#define HPTSLOG_INSERTED 6
-#define HPTSLOG_WAKEUP_HPTS 7
-#define HPTSLOG_SETTORUN 8
-#define HPTSLOG_HPTSI 9
-#define HPTSLOG_TOLONG 10
-#define HPTSLOG_AWAKENS 11
-#define HPTSLOG_TIMESOUT 12
-#define HPTSLOG_SLEEPSET 13
-#define HPTSLOG_WAKEUP_INPUT 14
-#define HPTSLOG_RESCHEDULE 15
-#define HPTSLOG_AWAKE 16
-#define HPTSLOG_INP_DONE 17
-
-struct hpts_log {
- struct inpcb *inp;
- int32_t event;
- uint32_t cts;
- int32_t line;
- uint32_t ticknow;
- uint32_t t_paceslot;
- uint32_t t_hptsreq;
- uint32_t p_curtick;
- uint32_t p_prevtick;
- uint32_t slot_req;
- uint32_t p_on_queue_cnt;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t p_hpts_sleep_time;
- uint16_t p_flags;
- uint8_t p_onhpts;
- uint8_t p_oninput;
- uint8_t is_notempty;
-};
-
struct hpts_diag {
- uint32_t p_hpts_active;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t slot_req;
- uint32_t inp_hptsslot;
- uint32_t slot_now;
- uint32_t have_slept;
- uint32_t hpts_sleep_time;
- uint32_t yet_to_sleep;
- uint32_t need_new_to;
- int32_t co_ret;
- uint8_t p_on_min_sleep;
+ uint32_t p_hpts_active; /* bbr->flex7 x */
+ uint32_t p_nxt_slot; /* bbr->flex1 x */
+ uint32_t p_cur_slot; /* bbr->flex2 x */
+ uint32_t p_prev_slot; /* bbr->delivered */
+ uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t slot_req; /* bbr->flex3 x */
+ uint32_t inp_hptsslot; /* bbr->flex4 x */
+ uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t have_slept; /* bbr->epoch x */
+ uint32_t hpts_sleep_time; /* bbr->applimited x */
+ uint32_t yet_to_sleep; /* bbr->lt_epoch x */
+ uint32_t need_new_to; /* bbr->flex6 x */
+ uint32_t wheel_tick; /* bbr->bw_inuse x */
+ uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_cts; /* bbr->rttProp x */
+ int32_t co_ret; /* bbr->pkts_out x */
+ uint32_t p_curtick; /* upper bbr->cur_del_rate */
+ uint32_t p_lasttick; /* lower bbr->cur_del_rate */
+ uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
+/* Magic flags to tell what's cooking on the pacing wheel */
+#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */
+#define PACE_TMR_RACK 0x02 /* RACK timer running */
+#define PACE_TMR_TLP 0x04 /* TLP timer running */
+#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10 /* Persist timer running */
+#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
+#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
+#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+
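
These flags are kept by the stacks themselves (rack keeps them in a per-connection rc_hpts_flags word); a rough sketch of the intended test, with the field name assumed rather than defined here:

	/* Sketch: what is the pacer currently doing for this connection? */
	if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) {
		/* On the wheel to pace data out. */
	} else if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) {
		/* A protocol timer (delack/rack/tlp/rxt/persist/keep) is armed. */
	}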
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
- uint32_t p_hpts_active; /* Flag that says hpts is awake */
- uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
- uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
+ uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
* slots that the hpts is running on. */
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t enobuf_cnt;
- uint16_t p_log_at;
+ uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
- p_log_wrapped :1, /* boolean */
- p_on_min_sleep:1; /* boolean */
- uint8_t p_fill;
+ p_on_min_sleep:1, /* boolean */
+ p_avail:6;
+ uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
struct hptsh p_input; /* For the tcp-input runner */
/* Hptsi wheel */
struct hptsh *p_hptss;
- struct hpts_log *p_log;
- uint32_t p_logsize;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
+ uint32_t overidden_sleep; /* what was overridden by min-sleep, for logging */
+ uint32_t saved_lasttick; /* for logging */
+ uint32_t saved_curtick; /* for logging */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
uint32_t p_delayed_by; /* How much were we delayed by */
/* Cache line 0x80 */
struct sysctl_ctx_list hpts_ctx;
@@ -236,13 +204,9 @@
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
int
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
-#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+__tcp_queue_to_input(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
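
Since the mbufs are now chained onto tp->t_in_pkt by the caller (the LRO code does this), scheduling the input run shrinks to a one-argument call; a minimal sketch of the new pattern, assuming the INP is already write-locked:

	/* Sketch: packets already appended to tp->t_in_pkt by LRO. */
	INP_WLOCK_ASSERT(tp->t_inpcb);
	tcp_queue_to_input(tp->t_inpcb);	/* schedule the hpts input runner */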
Index: netinet/tcp_hpts.c
===================================================================
--- netinet/tcp_hpts.c
+++ netinet/tcp_hpts.c
@@ -37,7 +37,7 @@
* pacing packets out onto the wire. It can be used in two ways
* by a given TCP stack (and those two methods can be used simultaneously).
*
- * First, and probably the main thing its used by Rack and BBR for, it can
+ * First, and probably the main way it's used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
* itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
@@ -59,42 +59,57 @@
* to prevent output processing until the time allotted has gone by.
* Of course this is a bare-bones example and the stack will probably
* have more consideration than just the above.
- *
- * Now the tcp_hpts system will call tcp_output in one of two forms,
- * it will first check to see if the stack as defined a
- * tfb_tcp_output_wtime() function, if so that is the routine it
- * will call, if that function is not defined then it will call the
- * tfb_tcp_output() function. The only difference between these
- * two calls is that the former passes the time in to the function
- * so the function does not have to access the time (which tcp_hpts
- * already has). What these functions do is of course totally up
- * to the individual tcp stack.
- *
+ *
* Now the second function (actually two functions I guess :D)
* the tcp_hpts system provides is the ability to either abort
- * a connection (later) or process input on a connection.
- * Why would you want to do this? To keep processor locality.
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality
+ * and/or to not have to worry about untangling any recursive
+ * locks. The input function is now hooked to the new LRO
+ * system as well.
*
- * So in order to use the input redirection function the
- * stack changes its tcp_do_segment() routine to instead
- * of process the data call the function:
+ * In order to use the input redirection function the
+ * tcp stack must define an input function for
+ * tfb_do_queued_segments(). This function understands
+ * how to dequeue an array of packets that were input and
+ * knows how to call the correct processing routine.
*
- * tcp_queue_pkt_to_input()
+ * Locking here is important as well, so most likely the
+ * stack will need to define tfb_do_segment_nounlock(),
+ * splitting tfb_do_segment() into two parts: a main processing
+ * part that does not unlock the INP and returns a value of 1 or 0.
+ * It returns 0 if all is well and the lock was not released. It
+ * returns 1 if we had to destroy the TCB (a reset received, etc.).
+ * The remainder of tfb_do_segment() then becomes a simple call
+ * to tfb_do_segment_nounlock(), a check of the return
+ * code, and possibly an unlock.
+ *
+ * The stack must also set the flag on the INP that it supports this
+ * feature, i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
+ * this flag as well and will queue packets when it is set.
+ * There are other flags as well: INP_MBUF_QUEUE_READY and
+ * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
+ * that we are in the pacer for output so there is no
+ * need to wake up the hpts system to get immediate
+ * input. The second tells the LRO code that it is okay,
+ * if a SACK arrives, to still defer input and let
+ * the current hpts timer run (this is usually set when
+ * a rack timer is up so we know SACKs are happening
+ * on the connection already and don't want to wake up yet).
*
- * You will note that the arguments to this function look
- * a lot like tcp_do_segments's arguments. This function
- * will assure that the tcp_hpts system will
- * call the functions tfb_tcp_hpts_do_segment() from the
- * correct CPU. Note that multiple calls can get pushed
- * into the tcp_hpts system this will be indicated by
- * the next to last argument to tfb_tcp_hpts_do_segment()
- * (nxt_pkt). If nxt_pkt is a 1 then another packet is
- * coming. If nxt_pkt is a 0 then this is the last call
- * that the tcp_hpts system has available for the tcp stack.
- *
- * The other point of the input system is to be able to safely
- * drop a tcp connection without worrying about the recursive
- * locking that may be occuring on the INP_WLOCK. So if
+ * There is a common function within the rack_bbr_common
+ * code, i.e. ctf_do_queued_segments(). This function
+ * knows how to take the input queue of packets from
+ * tp->t_in_pkts and process them digging out
+ * all the arguments, calling any bpf tap and
+ * calling into tfb_do_segment_nounlock(). The common
+ * function (ctf_do_queued_segments()) requires that
+ * you have defined the tfb_do_segment_nounlock() as
+ * described above.
+ *
+ * The second feature of the input side of hpts is the
+ * dropping of a connection. This is due to the recursive
+ * locking that may have occurred on the INP_WLOCK. So if
* a stack wants to drop a connection it calls:
*
* tcp_set_inp_to_drop(tp, ETIMEDOUT)
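
The tfb_do_segment_nounlock() split described in the comment above lives in the stacks, not in this file; a minimal sketch of its shape (all names hypothetical) might be:

	/* Hypothetical sketch of the do_segment split described above. */
	static int
	foo_do_segment_nounlock(struct mbuf *m, struct tcphdr *th,
	    struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen,
	    uint8_t iptos, int nxt_pkt, struct timeval *tv)
	{
		/*
		 * Full input processing; return 1 only if the TCB was
		 * destroyed (and the INP lock released), else 0.
		 */
		return (0);
	}

	static void
	foo_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
	    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
	{
		struct timeval tv;

		microuptime(&tv);
		if (foo_do_segment_nounlock(m, th, so, tp, drop_hdrlen,
		    tlen, iptos, 0, &tv) == 0)
			INP_WUNLOCK(tp->t_inpcb);
	}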
@@ -156,6 +171,7 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_log_buf.h>
#ifdef tcpdebug
#include <netinet/tcp_debug.h>
@@ -168,8 +184,6 @@
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-#include <net/netisr.h>
-#include <net/rss_config.h>
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
@@ -176,16 +190,13 @@
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
-
-TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
-
static struct tcp_hptsi tcp_pace;
+static int hpts_does_tp_logging = 0;
static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
@@ -204,8 +215,6 @@
} \
} while (0)
-static int32_t logging_on = 0;
-static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;
struct hpts_domain_info {
@@ -219,10 +228,6 @@
&tcp_hpts_precision, 120,
"Value for PRE() precision of callout");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
- &logging_on, 0,
- "Turn on logging if compiled in");
-
counter_u64_t hpts_loops;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
@@ -233,30 +238,53 @@
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
-static int32_t in_newts_every_tcb = 0;
+counter_u64_t combined_wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
- &in_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for input");
-static int32_t in_ts_percision = 0;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+ &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
- &in_ts_percision, 0,
- "Do we use percise timestamp for clients on input");
-static int32_t out_newts_every_tcb = 0;
+counter_u64_t wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
- &out_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for output");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+ &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
+
static int32_t out_ts_percision = 0;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
&out_ts_percision, 0,
"Do we use a percise timestamp for every output cts");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &hpts_does_tp_logging, 0,
+ "Do we add to any tp that has logging on pacer logs");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+
+#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
+
+static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
+
+static int
+sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = hpts_sleep_max;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ (new > HPTS_MAX_SLEEP_ALLOWED))
+ error = EINVAL;
+ else
+ hpts_sleep_max = new;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
+ CTLTYPE_UINT | CTLFLAG_RW,
&hpts_sleep_max, 0,
- "The maximum time the hpts will sleep <1 - 254>");
+ &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
+ "Maximum time hpts will sleep");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
&tcp_min_hptsi_time, 0,
@@ -267,55 +295,35 @@
"Do we have the callout call directly to the hpts?");
static void
-__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
- uint32_t ticknow, int32_t line)
+tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
+ int ticks_to_run, int idx)
{
- struct hpts_log *pl;
-
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_log == NULL)
- return;
- pl = &hpts->p_log[hpts->p_log_at];
- hpts->p_log_at++;
- if (hpts->p_log_at >= hpts->p_logsize) {
- hpts->p_log_at = 0;
- hpts->p_log_wrapped = 1;
- }
- pl->inp = inp;
- if (inp) {
- pl->t_paceslot = inp->inp_hptsslot;
- pl->t_hptsreq = inp->inp_hpts_request;
- pl->p_onhpts = inp->inp_in_hpts;
- pl->p_oninput = inp->inp_in_input;
- } else {
- pl->t_paceslot = 0;
- pl->t_hptsreq = 0;
- pl->p_onhpts = 0;
- pl->p_oninput = 0;
- }
- pl->is_notempty = 1;
- pl->event = event;
- pl->line = line;
- pl->cts = tcp_get_usecs(NULL);
- pl->p_curtick = hpts->p_curtick;
- pl->p_prevtick = hpts->p_prevtick;
- pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
- pl->ticknow = ticknow;
- pl->slot_req = slot;
- pl->p_nxt_slot = hpts->p_nxt_slot;
- pl->p_cur_slot = hpts->p_cur_slot;
- pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
- pl->p_flags = (hpts->p_cpu & 0x7f);
- pl->p_flags <<= 7;
- pl->p_flags |= (hpts->p_num & 0x7f);
- pl->p_flags <<= 2;
- if (hpts->p_hpts_active) {
- pl->p_flags |= HPTS_HPTS_ACTIVE;
- }
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.use_lt_bw = 1;
+ log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
}
-#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
-
static void
hpts_timeout_swi(void *arg)
{
@@ -347,12 +355,6 @@
/* We are not on the hpts? */
panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
}
- if (TAILQ_EMPTY(head) &&
- (hpts->p_on_queue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p hpts bucket empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_queue_cnt);
- }
#endif
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
@@ -456,58 +458,13 @@
in_pcbref(inp);
}
-static int
-sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
-{
- struct tcp_hpts_entry *hpts;
- size_t sz;
- int32_t logging_was, i;
- int32_t error = 0;
-
- /*
- * HACK: Turn off logging so no locks are required this really needs
- * a memory barrier :)
- */
- logging_was = logging_on;
- logging_on = 0;
- if (!req->oldptr) {
- /* How much? */
- sz = 0;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- sz += (sizeof(struct hpts_log) * hpts->p_logsize);
- }
- error = SYSCTL_OUT(req, 0, sz);
- } else {
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- if (hpts->p_log_wrapped)
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- else
- sz = (sizeof(struct hpts_log) * hpts->p_log_at);
- error = SYSCTL_OUT(req, hpts->p_log, sz);
- }
- }
- logging_on = logging_was;
- return error;
-}
-
-SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-
-
static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -515,10 +472,9 @@
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -648,8 +604,8 @@
* Valid values in the flags are
* HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
* HPTS_REMOVE_INPUT - remove from the input of the hpts.
- * Note that you can or both values together and get two
- * actions.
+ * Note that you can use one or both values together
+ * and get two actions.
*/
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
@@ -670,53 +626,198 @@
}
static inline int
-hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
- return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+ /*
+ * Given a slot on the wheel, what slot
+ * is that plus ticks out?
+ */
+ KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
+ return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}
+static inline int
+tick_to_wheel(uint32_t cts_in_wticks)
+{
+ /*
+ * Given a timestamp in wheel ticks (10usec inc's)
+ * map it to our limited space wheel.
+ */
+ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+}
+
+static inline int
+hpts_ticks_diff(int prev_tick, int tick_now)
+{
+ /*
+ * Given two ticks that are someplace
+ * on our wheel, how far apart are they?
+ */
+ if (tick_now > prev_tick)
+ return (tick_now - prev_tick);
+ else if (tick_now == prev_tick)
+ /*
+ * Special case, same means we can go all of our
+ * wheel less one slot.
+ */
+ return (NUM_OF_HPTSI_SLOTS - 1);
+ else
+ return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+}
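
For a concrete feel, take NUM_OF_HPTSI_SLOTS to be 102400 (the 1.024 second wheel of 10 usec slots referenced later in this file): prev_tick = 102300 with tick_now = 50 wraps, giving (102400 - 102300) + 50 = 150 ticks, i.e. 1.5 ms; identical ticks return 102399, a full wheel less one slot.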
+
+/*
+ * Given a tick on the wheel that is the current time
+ * mapped to the wheel (wheel_tick), what is the maximum
+ * distance forward that can be obtained without
+ * wrapping past either prev_tick or running_tick
+ * depending on the hpts state? Also, if passed
+ * a uint32_t *, fill it with the tick location.
+ *
+ * Note if you do not give this function the current
+ * time (that you think it is) mapped to the wheel
+ * then the results will not be what you expect and
+ * could lead to invalid inserts.
+ */
+static inline int32_t
+max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+{
+ uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+
+ if ((hpts->p_hpts_active == 1) &&
+ (hpts->p_wheel_complete == 0)) {
+ end_tick = hpts->p_runningtick;
+ /* Back up one tick */
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ } else {
+ /*
+ * For the case where we are
+ * not active, or we have
+ * completed the pass over
+ * the wheel, we can use the
+ * prev tick and subtract one from it. This puts us
+ * as far out as possible on the wheel.
+ */
+ end_tick = hpts->p_prev_slot;
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ /*
+ * Now we have close to the full wheel left minus the
+ * time it has been since the pacer went to sleep. Note
+ * that wheel_tick, passed in, should be the current time
+ * from the perspective of the caller, mapped to the wheel.
+ */
+ if (hpts->p_prev_slot != wheel_tick)
+ dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ else
+ dis_to_travel = 1;
+ /*
+ * dis_to_travel in this case is the space from when the
+ * pacer stopped (p_prev_slot) and where our wheel_tick
+ * is now. To know how many slots we can put it in we
+ * subtract from the wheel size. We would not want
+ * to place something after p_prev_slot or it will
+ * get run too soon.
+ */
+ return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
+ }
+ /*
+ * So how many slots are open between p_runningtick -> p_cur_slot?
+ * That is what is currently un-available for insertion. Special
+ * case when we are at the last slot, this gets 1, so that
+ * the answer to how many slots are available is all but 1.
+ */
+ if (hpts->p_runningtick == hpts->p_cur_slot)
+ dis_to_travel = 1;
+ else
+ dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ /*
+ * How long has the pacer been running?
+ */
+ if (hpts->p_cur_slot != wheel_tick) {
+ /* The pacer is a bit late */
+ pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ } else {
+ /* The pacer is right on time, now == pacers start time */
+ pacer_to_now = 0;
+ }
+ /*
+ * To get the number of slots left that we can insert into, we
+ * simply subtract the distance the pacer has to run from how
+ * many slots there are.
+ */
+ avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
+ /*
+ * Now, how many of those will we eat due to the pacer's
+ * start time (p_cur_slot) being behind the
+ * real time (wheel_tick)?
+ */
+ if (avail_on_wheel <= pacer_to_now) {
+ /*
+ * Wheel wrap; we can't fit on the wheel. That
+ * is unusual: the system must be way overloaded!
+ * Insert into the assured tick, and return special
+ * "0".
+ */
+ counter_u64_add(combined_wheel_wrap, 1);
+ *target_tick = hpts->p_nxt_slot;
+ return (0);
+ } else {
+ /*
+ * We know how many slots are open
+ * on the wheel (the reverse of what
+ * is left to run). Take away the time
+ * the pacer started to now (wheel_tick)
+ * and that tells you how many slots are
+ * open that can be inserted into that won't
+ * be touched by the pacer until later.
+ */
+ return (avail_on_wheel - pacer_to_now);
+ }
+}
+
static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
- int32_t need_wake = 0;
- uint32_t ticknow = 0;
-
+ uint32_t need_wake = 0;
+
HPTS_MTX_ASSERT(hpts);
if (inp->inp_in_hpts == 0) {
/* Ok we need to set it on the hpts in the current slot */
- if (hpts->p_hpts_active == 0) {
- /* A sleeping hpts we want in next slot to run */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
- hpts_tick(hpts, 1));
- }
- inp->inp_hptsslot = hpts_tick(hpts, 1);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
- }
- need_wake = 1;
+ inp->inp_hpts_request = 0;
+ if ((hpts->p_hpts_active == 0) ||
+ (hpts->p_wheel_complete)) {
+ /*
+ * A sleeping hpts we want in next slot to run
+ * note that in this state p_prev_slot == p_cur_slot
+ */
+ inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
+ need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
/*
+ * The hpts system is running and the caller
+ * was awoken by the hpts system.
* We can't allow you to go into the same slot we
- * are in. We must put you out.
+ * are in (we don't want a loop :-D).
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_cur_slot;
+ inp->inp_hptsslot = hpts->p_runningtick;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
- }
if (need_wake) {
/*
* Activate the hpts if it is sleeping and its
* timeout is not 1.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
}
@@ -737,141 +838,129 @@
return (ret);
}
+#ifdef INVARIANTS
static void
-tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
- struct hpts_diag *diag, int32_t noref)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
- int32_t need_new_to = 0;
- int32_t need_wakeup = 0;
- uint32_t largest_slot;
- uint32_t ticknow = 0;
- uint32_t slot_calc;
+ /*
+ * Sanity checks for the pacer with invariants
+ * on insert.
+ */
+ if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
+ panic("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot);
+ if ((hpts->p_hpts_active) &&
+ (hpts->p_wheel_complete == 0)) {
+ /*
+ * If the pacer is processing an arc
+ * of the wheel, we need to make
+ * sure we are not inserting within
+ * that arc.
+ */
+ int distance, yet_to_run;
+ distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
+ if (hpts->p_runningtick != hpts->p_cur_slot)
+ yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ else
+ yet_to_run = 0; /* processing last slot */
+ if (yet_to_run > distance) {
+ panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningtick, hpts->p_cur_slot);
+ }
+ }
+}
+#endif
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
+ struct hpts_diag *diag, struct timeval *tv)
+{
+ uint32_t need_new_to = 0;
+ uint32_t wheel_cts, last_tick;
+ int32_t wheel_tick, maxticks;
+ int8_t need_wakeup = 0;
+
HPTS_MTX_ASSERT(hpts);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_prev_slot = hpts->p_prev_slot;
+ diag->p_runningtick = hpts->p_runningtick;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
+ diag->p_curtick = hpts->p_curtick;
+ diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((inp->inp_in_hpts == 0) || noref) {
- inp->inp_hpts_request = slot;
+ if (inp->inp_in_hpts == 0) {
if (slot == 0) {
/* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
return;
}
- if (hpts->p_hpts_active) {
- /*
- * Its slot - 1 since nxt_slot is the next tick that
- * will go off since the hpts is awake
- */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
- }
- /*
- * We want to make sure that we don't place a inp in
- * the range of p_cur_slot <-> p_nxt_slot. If we
- * take from p_nxt_slot to the end, plus p_cur_slot
- * and then take away 2, we will know how many is
- * the max slots we can use.
- */
- if (hpts->p_nxt_slot > hpts->p_cur_slot) {
- /*
- * Non-wrap case nxt_slot <-> cur_slot we
- * don't want to land in. So the diff gives
- * us what is taken away from the number of
- * slots.
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_tick = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
+ if (diag) {
+ diag->wheel_tick = wheel_tick;
+ diag->maxticks = maxticks;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxticks == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+ * probably does not hurt because the pacer,
+ * if this is true, is so far behind that we will
+ * be > 1 second late calling anyway.
*/
- largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
- } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- } else {
- /*
- * Wrap case so the diff gives us the number
- * of slots that we can land in.
- */
- largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ slot--;
}
- /*
- * We take away two so we never have a problem (20
- * usec's) out of 1024000 usecs
- */
- largest_slot -= 2;
- if (inp->inp_hpts_request > largest_slot) {
- /*
- * Restrict max jump of slots and remember
- * leftover
- */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* This one will run when we hit it */
- inp->inp_hpts_request = 0;
- }
- if (hpts->p_nxt_slot == hpts->p_cur_slot)
- slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
- else
- slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
- if (slot_calc == hpts->p_cur_slot) {
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request = slot;
+ } else if (maxticks >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxticks;
+ inp->inp_hptsslot = last_tick;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
#ifdef INVARIANTS
- /* TSNH */
- panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
- hpts, slot_calc, slot, largest_slot);
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
- if (slot_calc)
- slot_calc--;
- else
- slot_calc = NUM_OF_HPTSI_SLOTS - 1;
- }
- inp->inp_hptsslot = slot_calc;
- if (diag) {
- diag->inp_hptsslot = inp->inp_hptsslot;
- }
- } else {
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
/*
- * The hpts is sleeping, we need to figure out where
+ * The hpts is sleeping and not on a minimum
+ * sleep time; we need to figure out where
* it will wake up at and if we need to reschedule
* its time-out.
*/
uint32_t have_slept, yet_to_sleep;
- uint32_t slot_now;
- struct timeval tv;
- ticknow = tcp_gethptstick(&tv);
- slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
- /*
- * The user wants to be inserted at (slot_now +
- * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
- */
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- if (inp->inp_hpts_request > largest_slot) {
- /* Adjust the residual in inp_hpts_request */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* No residual it all fits */
- inp->inp_hpts_request = 0;
- }
- inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
- if (diag) {
- diag->slot_now = slot_now;
- diag->inp_hptsslot = inp->inp_hptsslot;
- diag->p_on_min_sleep = hpts->p_on_min_sleep;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
- }
/* Now do we need to restart the hpts's timer? */
- if (TSTMP_GT(ticknow, hpts->p_curtick))
- have_slept = ticknow - hpts->p_curtick;
- else
- have_slept = 0;
- if (have_slept < hpts->p_hpts_sleep_time) {
- /* This should be what happens */
+ have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (have_slept < hpts->p_hpts_sleep_time)
yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- } else {
+ else {
/* We are over-due */
yet_to_sleep = 0;
need_wakeup = 1;
@@ -879,20 +968,16 @@
if (diag) {
diag->have_slept = have_slept;
diag->yet_to_sleep = yet_to_sleep;
- diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * We need to reschedule the hptss time-out.
+ * We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
need_new_to = slot * HPTS_TICKS_PER_USEC;
}
}
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
- }
/*
* Now how far is the hpts sleeping to? if active is 1, its
* up and ticking we do nothing, otherwise we may need to
@@ -899,9 +984,6 @@
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
if (diag) {
@@ -944,9 +1026,10 @@
}
uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
struct tcp_hpts_entry *hpts;
- uint32_t slot_on, cts;
+ uint32_t slot_on;
struct timeval tv;
/*
@@ -956,12 +1039,8 @@
*/
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
- if (in_ts_percision)
- microuptime(&tv);
- else
- getmicrouptime(&tv);
- cts = tcp_tv_to_usectick(&tv);
- tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ microuptime(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
slot_on = hpts->p_nxt_slot;
mtx_unlock(&hpts->p_mtx);
return (slot_on);
@@ -971,7 +1050,6 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
-
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -986,9 +1064,6 @@
/*
* Activate the hpts if it is sleeping.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
- }
retval = 2;
hpts->p_direct_wake = 1;
tcp_wakeinput(hpts);
@@ -1001,36 +1076,14 @@
return (retval);
}
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
+int32_t
+__tcp_queue_to_input(struct inpcb *inp, int line)
{
- /* Setup packet for input first */
- INP_WLOCK_ASSERT(tp->t_inpcb);
- m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
- m->m_pkthdr.pace_tlen = (uint16_t) tlen;
- m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
- m->m_pkthdr.pace_tos = iptos;
- m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
- if (tp->t_in_pkt == NULL) {
- tp->t_in_pkt = m;
- tp->t_tail_pkt = m;
- } else {
- tp->t_tail_pkt->m_nextpkt = m;
- tp->t_tail_pkt = m;
- }
-}
-
-
-int32_t
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
struct tcp_hpts_entry *hpts;
int32_t ret;
- tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
- hpts = tcp_input_lock(tp->t_inpcb);
- ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ hpts = tcp_input_lock(inp);
+ ret = __tcp_queue_to_input_locked(inp, hpts, line);
mtx_unlock(&hpts->p_mtx);
return (ret);
}
@@ -1132,6 +1185,25 @@
#endif
}
+static void
+tcp_drop_in_pkts(struct tcpcb *tp)
+{
+ struct mbuf *m, *n;
+
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+}
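
The look-ahead bookkeeping above can equivalently be written as the conventional packet-chain walk, saving each link before the free; a sketch:

	/* Equivalent sketch: walk the packet chain, saving the link first. */
	m = tp->t_in_pkt;
	tp->t_in_pkt = NULL;
	while (m != NULL) {
		n = m->m_nextpkt;
		m_freem(m);	/* frees this packet's entire mbuf chain */
		m = n;
	}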
+
/*
* Do NOT try to optimize the processing of inp's
* by first pulling off all the inp's into a temporary
@@ -1142,7 +1214,7 @@
* but then while you were processing one of the inp's
* some other one that you switch will get a new
* packet on the different CPU. It will insert it
- * on the new hptss input list. Creating a temporary
+ * on the new hpts's input list. Creating a temporary
* link in the inp will not fix it either, since
* the other hpts will be doing the same thing and
* you will both end up using the temporary link.
@@ -1155,16 +1227,16 @@
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
- struct mbuf *m, *n;
struct tcpcb *tp;
struct inpcb *inp;
uint16_t drop_reason;
int16_t set_cpu;
uint32_t did_prefetch = 0;
- int32_t ti_locked = TI_UNLOCKED;
+ int dropped;
struct epoch_tracker et;
HPTS_MTX_ASSERT(hpts);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
HPTS_MTX_ASSERT(hpts);
hpts_sane_input_remove(hpts, inp, 0);
@@ -1178,24 +1250,14 @@
inp->inp_in_input = 0;
mtx_unlock(&hpts->p_mtx);
CURVNET_SET(inp->inp_vnet);
- if (drop_reason) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else {
- ti_locked = TI_UNLOCKED;
- }
INP_WLOCK(inp);
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out:
hpts->p_inp = NULL;
- if (ti_locked == TI_RLOCKED) {
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- }
if (in_pcbrele_wlocked(inp) == 0) {
INP_WUNLOCK(inp);
}
- ti_locked = TI_UNLOCKED;
CURVNET_RESTORE();
mtx_lock(&hpts->p_mtx);
continue;
@@ -1206,20 +1268,8 @@
}
if (drop_reason) {
/* This tcb is being destroyed for drop_reason */
- m = tp->t_in_pkt;
- if (m)
- n = m->m_nextpkt;
- else
- n = NULL;
- tp->t_in_pkt = NULL;
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
+ tcp_drop_in_pkts(tp);
tp = tcp_drop(tp, drop_reason);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
if (tp == NULL) {
INP_WLOCK(inp);
}
@@ -1246,212 +1296,168 @@
*/
tcp_set_hpts(inp);
}
- m = tp->t_in_pkt;
- n = NULL;
- if (m != NULL &&
- (m->m_pkthdr.pace_lock == TI_RLOCKED ||
- tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- m = tp->t_in_pkt;
- }
- if (in_newts_every_tcb) {
- if (in_ts_percision)
- microuptime(tv);
- else
- getmicrouptime(tv);
- }
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- /* Any input work to do, if so do it first */
- if ((m != NULL) && (m == tp->t_in_pkt)) {
- struct tcphdr *th;
- int32_t tlen, drop_hdrlen, nxt_pkt;
- uint8_t iptos;
-
- n = m->m_nextpkt;
- tp->t_in_pkt = tp->t_tail_pkt = NULL;
- while (m) {
- th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
- tlen = m->m_pkthdr.pace_tlen;
- drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
- iptos = m->m_pkthdr.pace_tos;
- m->m_nextpkt = NULL;
- if (n)
- nxt_pkt = 1;
- else
- nxt_pkt = 0;
- inp->inp_input_calls = 1;
- if (tp->t_fb->tfb_tcp_hpts_do_segment) {
- /* Use the hpts specific do_segment */
- (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos, nxt_pkt, tv);
- } else {
- /* Use the default do_segment */
- (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos);
- }
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- /*
- * Do segment returns unlocked we need the
- * lock again but we also need some kasserts
- * here.
- */
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
- INP_UNLOCK_ASSERT(inp);
- m = n;
- if (m)
- n = m->m_nextpkt;
- if (m != NULL &&
- m->m_pkthdr.pace_lock == TI_RLOCKED) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else
- ti_locked = TI_UNLOCKED;
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if (inp->inp_in_input)
+ tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
+ dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (dropped) {
+ /* Re-acquire the wlock so we can release the reference */
INP_WLOCK(inp);
- /*
- * Since we have an opening here we must
- * re-check if the tcb went away while we
- * were getting the lock(s).
- */
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
- goto out;
- }
- /*
- * Now that we hold the INP lock, check if
- * we need to upgrade our lock.
- */
- if (ti_locked == TI_UNLOCKED &&
- (tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- }
- } /** end while(m) */
- } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ }
+ } else if (tp->t_in_pkt) {
+ /*
+ * We reach here only if we had a
+ * stack that supported INP_SUPPORTS_MBUFQ
+ * and then somehow switched to a stack that
+ * does not. The packets are basically stranded
+ * and would hang with the connection until
+ * cleanup without this code. Its not the
+ * best way but I know of no other way to
+ * handle it since the stack needs functions
+ * it does not have to handle queued packets.
+ */
+ tcp_drop_in_pkts(tp);
+ }
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
INP_UNLOCK_ASSERT(inp);
- ti_locked = TI_UNLOCKED;
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
CURVNET_RESTORE();
}
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
}
-static int
-tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
-{
- int32_t ticks_to_run;
-
- if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
- ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
- if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
- }
- } else {
- if (hpts->p_prevtick == hpts->p_curtick) {
- /* This happens when we get woken up right away */
- return (-1);
- }
- ticks_to_run = 1;
- }
- /* Set in where we will be when we catch up */
- hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
- if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
- hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
- }
- return (ticks_to_run);
-}
-
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+tcp_hptsi(struct tcp_hpts_entry *hpts)
{
+ struct epoch_tracker et;
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error, tick_now, interum_tick;
+ int32_t ticks_to_run, i, error;
int32_t paced_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
- uint32_t cts;
int16_t set_cpu;
HPTS_MTX_ASSERT(hpts);
- hpts->p_curtick = tcp_tv_to_hptstick(ctick);
- cts = tcp_tv_to_usectick(ctick);
- memcpy(&tv, ctick, sizeof(struct timeval));
- hpts->p_cur_slot = hpts_tick(hpts, 1);
+ /* record previous info for any logging */
+ hpts->saved_lasttick = hpts->p_lasttick;
+ hpts->saved_curtick = hpts->p_curtick;
+ hpts->saved_curslot = hpts->p_cur_slot;
+ hpts->saved_prev_slot = hpts->p_prev_slot;
- /* Figure out if we had missed ticks */
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((hpts->p_on_queue_cnt == 0) ||
+ (hpts->p_lasttick == hpts->p_curtick)) {
+ /*
+ * No time has yet passed,
+ * or nothing to do.
+ */
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ goto no_run;
+ }
again:
+ hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = tcp_hpts_est_run(hpts);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
+ ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /*
+ * Wheel wrap is occurring, basically we
+ * are behind and the distance between
+ * runs has spread so much it has exceeded
+ * the time on the wheel (1.024 seconds). This
+ * is ugly and should NOT be happening. We
+ * need to run the entire wheel. We last processed
+ * p_prev_slot, so that needs to be the last slot
+ * we run. The next slot after that should be our
+ * reserved first slot for new entries, and after that
+ * the running position starts. Now the problem is that the
+ * reserved "not to run yet" slot does not exist
+ * and there may be inp's in there that need
+ * running. We can merge those into the
+ * first slot at the head.
+ */
+ hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ /*
+ * Adjust p_cur_slot to be where we are starting from;
+ * hopefully we will catch up (fat chance if something
+ * is broken this bad :( )
+ */
+ hpts->p_cur_slot = hpts->p_prev_slot;
+ /*
+ * The next slot has guys to run too, and that would
+ * be where we would normally start, lets move them into
+ * the next slot (p_prev_slot + 2) so that we will
+ * run them, the extra 10usecs of late (by being
+ * put behind) does not really matter in this situation.
+ */
+#ifdef INVARIANTS
+ /*
+ * To prevent a panic we need to update the inp's slot to the
+ * new location. This is safe since it takes both the
+ * INP lock and the pacer mutex to change the inp_hptsslot.
+ */
+ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
+ inp->inp_hptsslot = hpts->p_runningtick;
+ }
+#endif
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ counter_u64_add(wheel_wrap, 1);
+ } else {
+ /*
+ * Nxt slot is always one after p_runningtick, though
+ * it's not usually used unless we are doing wheel wrap.
+ */
+ hpts->p_nxt_slot = hpts->p_prev_slot;
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
}
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
HPTS_MTX_ASSERT(hpts);
- /* Reset the ticks to run and time if we need too */
- interum_tick = tcp_gethptstick(&tv);
- if (interum_tick != hpts->p_curtick) {
- /* Save off the new time we execute to */
- *ctick = tv;
- hpts->p_curtick = interum_tick;
- cts = tcp_tv_to_usectick(&tv);
- hpts->p_cur_slot = hpts_tick(hpts, 1);
- ticks_to_run = tcp_hpts_est_run(hpts);
- }
- if (ticks_to_run == -1) {
- goto no_run;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
- }
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
for (i = 0; i < ticks_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any
+ * was none (i.e. if ticks_to_run == 1, no delay).
*/
hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* For debugging */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
- }
hpts->p_inp = inp;
paced_cnt++;
- if (hpts->p_cur_slot != inp->inp_hptsslot) {
+#ifdef INVARIANTS
+ if (hpts->p_runningtick != inp->inp_hptsslot) {
panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
}
+#endif
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
@@ -1458,8 +1464,8 @@
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1467,25 +1473,36 @@
if (inp->inp_hpts_request) {
/*
* This guy is deferred out further in time
- * then our wheel had on it. Push him back
- * on the wheel.
+ * than our wheel had available on it.
+ * Push him back on the wheel, or run it,
+ * depending.
*/
- int32_t remaining_slots;
-
+ uint32_t maxticks, last_tick, remaining_slots;
+
remaining_slots = ticks_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
- * Keep INVARIANTS happy by clearing
- * the flag
+ * How far out can we go?
*/
- tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
+ if (maxticks >= inp->inp_hpts_request) {
+ /* we can place it finally to be processed */
+ inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hpts_request = 0;
+ } else {
+ /* Work off some more time */
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request -= maxticks;
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
continue;
}
inp->inp_hpts_request = 0;
+ /* Fall through, we will do it now */
}
/*
- * We clear the hpts flag here after dealing with
+ * We clear the hpts flag here after dealing with
* remaining slots. This way anyone looking with the
* TCB lock will see its on the hpts until just
* before we unlock.
@@ -1495,23 +1512,20 @@
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
hpts->p_inp = NULL;
continue;
}
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-out_now:
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_now:
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
hpts->p_inp = NULL;
continue;
}
@@ -1539,16 +1553,14 @@
*/
tcp_set_hpts(inp);
}
- if (out_newts_every_tcb) {
- struct timeval sv;
-
- if (out_ts_percision)
- microuptime(&sv);
- else
- getmicrouptime(&sv);
- cts = tcp_tv_to_usectick(&sv);
+ CURVNET_SET(inp->inp_vnet);
+#ifdef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
+			/* Let's do any logging that we might want to */
+ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
}
- CURVNET_SET(inp->inp_vnet);
/*
* There is a hole here, we get the refcnt on the
* inp so it will still be preserved but to make
@@ -1560,7 +1572,7 @@
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
if (tp->t_fb_ptr != NULL) {
@@ -1567,12 +1579,16 @@
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (error) {
+ /* The input killed the connection */
+ goto skip_pacing;
+ }
+ }
inp->inp_hpts_calls = 1;
- if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
- error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
- } else {
- error = tp->t_fb->tfb_tcp_output(tp);
- }
+ error = tp->t_fb->tfb_tcp_output(tp);
+ inp->inp_hpts_calls = 0;
if (ninp && ninp->inp_ppcb) {
/*
* If we have a nxt inp, see if we can
@@ -1609,74 +1625,92 @@
prefetch_tp = 1;
}
INP_WUNLOCK(inp);
+ skip_pacing:
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
INP_UNLOCK_ASSERT(inp);
CURVNET_RESTORE();
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
hpts->p_inp = NULL;
}
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_cur_slot++;
- if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_cur_slot = 0;
+ hpts->p_runningtick++;
+ if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningtick = 0;
}
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
no_one:
HPTS_MTX_ASSERT(hpts);
- hpts->p_prevtick = hpts->p_curtick;
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
- /* Re-run any input that may be there */
- (void)tcp_gethptstick(&tv);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
- }
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
- tick_now = tcp_gethptstick(&tv);
- if (SEQ_GT(tick_now, hpts->p_prevtick)) {
- struct timeval res;
-
- /* Did we really spend a full tick or more in here? */
- timersub(&tv, ctick, &res);
- if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if (hpts->p_lasttick != hpts->p_curtick) {
+ counter_u64_add(hpts_loops, 1);
+ goto again;
+ }
+no_run:
+ /*
+	 * Set the flag to tell any slot input that
+	 * happens while we run that this wheel pass
+	 * is done.
+ */
+ hpts->p_wheel_complete = 1;
+ /*
+	 * Run any input that may be there and was not
+	 * covered while we ran the wheel.
+ */
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ /*
+ * Now did we spend too long running
+ * input and need to run more ticks?
+ */
+ KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+ hpts->p_prev_slot, hpts->p_cur_slot));
+ KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+ hpts->p_lasttick, hpts->p_curtick));
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ if (hpts->p_lasttick != hpts->p_curtick) {
counter_u64_add(hpts_loops, 1);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
- }
- *ctick = res;
- hpts->p_curtick = tick_now;
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
-no_run:
{
uint32_t t = 0, i, fnd = 0;
if (hpts->p_on_queue_cnt) {
-
-
/*
* Find next slot that is occupied and use that to
* be the sleep time.
*/
- for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
fnd = 1;
break;
@@ -1684,27 +1718,20 @@
t = (t + 1) % NUM_OF_HPTSI_SLOTS;
}
if (fnd) {
- hpts->p_hpts_sleep_time = i;
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
} else {
- counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+ panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
+ counter_u64_add(back_tosleep, 1);
hpts->p_on_queue_cnt = 0;
goto non_found;
}
- t++;
} else {
- /* No one on the wheel sleep for all but 2 slots */
-non_found:
- if (hpts_sleep_max == 0)
- hpts_sleep_max = 1;
- hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
- t = 0;
+			/* No one on the wheel; sleep for hpts_sleep_max (default 400 slots) */
+ non_found:
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
}
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
- }
}
}
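For reference, the sleep computation above reduces to the sketch below: scan forward from the slot after p_cur_slot for the first occupied slot and clamp the result to hpts_sleep_max. The occupancy array and slot count are stand-ins (assumptions) for the p_hptss tailqs and NUM_OF_HPTSI_SLOTS.

/*
 * Sketch only: slot_occupied[] stands in for the
 * TAILQ_EMPTY(&hpts->p_hptss[t]) test above.
 */
static uint32_t
hpts_next_sleep(uint32_t cur_slot, uint32_t sleep_max,
    const int *slot_occupied, uint32_t nslots)
{
	uint32_t i, t;

	for (i = 0, t = (cur_slot + 1) % nslots; i < nslots; i++) {
		if (slot_occupied[t]) {
			/* Sleep i+1 ticks, but never past sleep_max. */
			return ((i + 1) < sleep_max ? (i + 1) : sleep_max);
		}
		t = (t + 1) % nslots;
	}
	/* Nothing queued: sleep the configured maximum. */
	return (sleep_max);
}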
@@ -1746,33 +1773,29 @@
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
/* Signaled by input */
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
callout_stop(&hpts->co);
} else {
/* Timed out */
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
mtx_unlock(&hpts->p_mtx);
return;
}
callout_deactivate(&hpts->co);
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
}
+ hpts->p_hpts_wake_scheduled = 0;
hpts->p_hpts_active = 1;
- (void)tcp_gethptstick(&tv);
- tcp_hptsi(hpts, &tv);
+ tcp_hptsi(hpts);
HPTS_MTX_ASSERT(hpts);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ hpts->overidden_sleep = tv.tv_usec;
tv.tv_usec = tcp_min_hptsi_time;
hpts->p_on_min_sleep = 1;
} else {
/* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
hpts->p_on_min_sleep = 0;
}
hpts->p_hpts_active = 0;
@@ -1811,7 +1834,8 @@
tcp_pace.rp_num_hptss = ncpus;
hpts_loops = counter_u64_alloc(M_WAITOK);
back_tosleep = counter_u64_alloc(M_WAITOK);
-
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
@@ -1850,7 +1874,7 @@
OID_AUTO, "out_qcnt", CTLFLAG_RD,
&hpts->p_on_queue_cnt, 0,
"Count TCB's awaiting output processing");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "active", CTLFLAG_RD,
&hpts->p_hpts_active, 0,
@@ -1859,29 +1883,23 @@
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curslot", CTLFLAG_RD,
&hpts->p_cur_slot, 0,
- "What the current slot is if active");
+ "What the current running pacers goal");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runtick", CTLFLAG_RD,
+ &hpts->p_runningtick, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curtick", CTLFLAG_RD,
&hpts->p_curtick, 0,
- "What the current tick on if active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "logsize", CTLFLAG_RD,
- &hpts->p_logsize, 0,
- "Hpts logging buffer size");
- hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ "What the running pacers last tick mapped to the wheel was");
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
- hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_prevtick -= 1;
- hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = 1;
- hpts->p_logsize = tcp_hpts_logging_size;
- if (hpts->p_logsize) {
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- }
+ hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
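The new p_prev_slot/p_cur_slot/p_runningtick bookkeeping leans on two small wheel helpers used throughout the hunks above. A sketch of their likely shape; the wheel size value is an assumption, the real constant lives in tcp_hpts.h.

#define NUM_OF_HPTSI_SLOTS	2048		/* assumed wheel size */

static uint32_t
tick_to_wheel(uint32_t curtick)
{
	/* Map the free-running tick counter onto the wheel. */
	return (curtick % NUM_OF_HPTSI_SLOTS);
}

static uint32_t
hpts_tick(uint32_t base_slot, uint32_t slots_to_add)
{
	/* Advance a slot with wrap-around. */
	return ((base_slot + slots_to_add) % NUM_OF_HPTSI_SLOTS);
}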
Index: netinet/tcp_log_buf.h
===================================================================
--- netinet/tcp_log_buf.h
+++ netinet/tcp_log_buf.h
@@ -175,7 +175,7 @@
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER,/* Detected reorder 7 */
- TCP_LOG_PACER, /* Pacer sending a packet 8 */
+ TCP_LOG_HPTS, /* Hpts sending a packet 8 */
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
@@ -194,31 +194,38 @@
BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
TCP_LOG_FLOWEND, /* End of a flow 25 */
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
- BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
- BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
+ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
+ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
- UNUSED_32, /* Unused 32 */
- UNUSED_33, /* Unused 33 */
+ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
+ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */
BBR_LOG_TO_PROCESS, /* A to was processed 35 */
BBR_LOG_BBRTSO, /* TSO update 36 */
- BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
+ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */
BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
BBR_LOG_PROGRESS, /* Progress timer event 39 */
TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
- BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
+ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */
BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
- BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
+ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
- TCP_LOG_END /* End (keep at end) 51 */
+ TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */
+ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
+ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */
+ TCP_LOG_CONNEND, /* End of connection 54 */
+ TCP_LOG_LRO, /* LRO entry 55 */
+ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
+ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */
+ TCP_LOG_END /* End (keep at end) 58 */
};
enum tcp_log_states {
@@ -275,8 +282,8 @@
#ifdef _KERNEL
-#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
-#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000
+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000
/*
* TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
Index: netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- netinet/tcp_stacks/rack_bbr_common.h
+++ netinet/tcp_stacks/rack_bbr_common.h
@@ -38,17 +38,8 @@
#define TCP_MSS_ACCT_SIZE 70
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
+#define DUP_ACK_THRESHOLD 3
-/* Magic flags to tell whats cooking on the pacing wheel */
-#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
-#define PACE_TMR_RACK 0x02 /* RACK timer running */
-#define PACE_TMR_TLP 0x04 /* TLP timer running */
-#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
-#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
-#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
-#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
-#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
@@ -61,8 +52,66 @@
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
+#define PACE_MAX_IP_BYTES 65536
+#define USECS_IN_SECOND 1000000
+#define MSEC_IN_SECOND 1000
+#define MS_IN_USEC 1000
+#define USEC_TO_MSEC(x) ((x) / MS_IN_USEC)
+#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */
+
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
+#ifdef KERN_TLS
+uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
#endif
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
+ struct mbuf *m, int has_pkt);
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
+uint32_t ctf_outstanding(struct tcpcb *tp);
+uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
+ struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
+ int32_t * drop_hdrlen, int32_t * ret_val);
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp);
+
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ret_val);
+
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen);
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp);
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay_percentage);
+
#endif
+#endif
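These prototypes are consumed from a stack's do_segment path. Below is a hedged sketch of the typical call shape for the drop-check helpers; my_do_segment and its locals are hypothetical, and the surrounding kernel headers are assumed.

/*
 * Hypothetical caller showing the ctf_* contract: a return of 1
 * from ctf_drop_checks means ret_val already says whether the
 * TCB was dropped (1) or is still locked (0).
 */
static int
my_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen)
{
	struct tcpopt to = { 0 };
	int32_t thflags = th->th_flags;
	int32_t ret_val = 0;

	ctf_calc_rwin(so, tp);
	if (ctf_drop_checks(&to, m, th, tp, &tlen, &thflags,
	    &drop_hdrlen, &ret_val))
		return (ret_val);
	/* ... normal segment processing continues here ... */
	return (0);
}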
Index: netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- netinet/tcp_stacks/rack_bbr_common.c
+++ netinet/tcp_stacks/rack_bbr_common.c
@@ -0,0 +1,859 @@
+/*-
+ * Copyright (c) 2016-2018
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/ethernet.h>
+#include <net/bpf.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "rack_bbr_common.h"
+
+/*
+ * Common TCP Functions - These are shared by both
+ * rack and BBR.
+ */
+
+
+#ifdef KERN_TLS
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+ struct sbtls_info *tls;
+ uint32_t len;
+
+again:
+ tls = so->so_snd.sb_tls_info;
+ len = tls->sb_params.sb_maxlen; /* max tls payload */
+ len += tls->sb_params.sb_tls_hlen; /* tls header len */
+ len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
+ if ((len * 4) > rwnd) {
+ /*
+		 * Stroke a "this will suck" counter; what
+		 * else should we do, Drew? From the
+		 * TCP perspective I am not sure
+		 * what should be done...
+ */
+ if (tls->sb_params.sb_maxlen > 4096) {
+ tls->sb_params.sb_maxlen -= 4096;
+ if (tls->sb_params.sb_maxlen < 4096)
+ tls->sb_params.sb_maxlen = 4096;
+ goto again;
+ }
+ }
+ return (len);
+}
+#endif
+
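ctf_get_opt_tls_size() shrinks the TLS record size in 4 KB steps (with a 4 KB floor) until at least four full records fit in the peer's receive window. The sizing rule in isolation, with the sbtls fields replaced by plain parameters (an assumption for illustration):

/*
 * Standalone version of the sizing rule: maxlen is the TLS
 * payload cap, overhead the header+trailer bytes; both stand
 * in for the sb_params fields above.
 */
static uint32_t
tls_size_for_rwnd(uint32_t maxlen, uint32_t overhead, uint32_t rwnd)
{
	uint32_t len;

	for (;;) {
		len = maxlen + overhead;
		if ((len * 4) <= rwnd || maxlen <= 4096)
			return (len);
		maxlen -= 4096;
		if (maxlen < 4096)
			maxlen = 4096;
	}
}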
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+ /*
+	 * We are passed a raw chain of mbuf packets
+ * that arrived in LRO. They are linked via
+ * the m_nextpkt link in the pkt-headers.
+ *
+ * We process each one by:
+ * a) saving off the next
+ * b) stripping off the ether-header
+ * c) formulating the arguments for
+ * the tfb_tcp_hpts_do_segment
+	 *    d) calling tfb_tcp_hpts_do_segment on each mbuf
+ * after adjusting the time to match the arrival time.
+ * Note that the LRO code assures no IP options are present.
+ *
+	 * The semantics for calling tfb_tcp_hpts_do_segment are the
+ * following:
+ * 1) It returns 0 if all went well and you (the caller) need
+ * to release the lock.
+	 * 2) If nxt_pkt is set, then the function will suppress calls
+ * to tfb_tcp_output() since you are promising to call again
+ * with another packet.
+ * 3) If it returns 1, then you must free all the packets being
+ * shipped in, the tcb has been destroyed (or about to be destroyed).
+ */
+ struct mbuf *m_save;
+ struct ether_header *eh;
+ struct epoch_tracker et;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip = NULL; /* Keep compiler happy. */
+#endif
+ struct ifnet *ifp;
+ struct timeval tv;
+ int32_t retval, nxt_pkt, tlen, off;
+ uint16_t etype;
+ uint16_t drop_hdrlen;
+ uint8_t iptos, no_vn=0, bpf_req=0;
+
+ /*
+ * This is a bit deceptive, we get the
+ * "info epoch" which is really the network
+ * epoch. This covers us on both any INP
+ * type change but also if the ifp goes
+ * away it covers us as well.
+ */
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+ if (m && m->m_pkthdr.rcvif)
+ ifp = m->m_pkthdr.rcvif;
+ else
+ ifp = NULL;
+ if (ifp) {
+ bpf_req = bpf_peers_present(ifp->if_bpf);
+ } else {
+ /*
+		 * We should probably KASSERT here rather than
+		 * work around it, since lro always sets rcvif.
+ */
+ no_vn = 1;
+ goto skip_vnet;
+ }
+ CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+		/* Now let's get the ether header */
+ eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ /* Let the BPF see the packet */
+ if (bpf_req && ifp)
+ ETHER_BPF_MTAP(ifp, m);
+		/* Trim off the ethernet header */
+		m_adj(m, sizeof(*eh));
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip6 = (struct ip6_hdr *)(eh + 1);
+ th = (struct tcphdr *)(ip6 + 1);
+ tlen = ntohs(ip6->ip6_plen);
+ drop_hdrlen = sizeof(*ip6);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP, m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+			 * Note that packets with unspecified IPv6 destination are
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+ == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip = (struct ip *)(eh + 1);
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ int len;
+ struct ipovly *ipov = (struct ipovly *)ip;
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = drop_hdrlen + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = htons(tlen);
+ th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(len);
+ /* Reset TOS bits */
+ ip->ip_tos = iptos;
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ }
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ break;
+ }
+#endif
+ }
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ tcp_fields_to_host(th);
+
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ TCPSTAT_INC(tcps_rcvbadoff);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ tlen -= off;
+ drop_hdrlen += off;
+ /*
+		 * Now let's set up the timeval to be when we should
+ * have been called (if we can).
+ */
+ m->m_pkthdr.lro_nsegs = 1;
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ } else {
+			/* Should not happen; should we kassert instead? */
+ tcp_get_usecs(&tv);
+ }
+ /* Now what about next packet? */
+ if (m_save || has_pkt)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+ iptos, nxt_pkt, &tv);
+ if (retval) {
+			/* We lost the lock and probably the tcb */
+ m = m_save;
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+ }
+skipped_pkt:
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+}
+
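One detail worth pulling out: when M_TSTMP_LRO is set, the arrival time is a single nanosecond counter in m_pkthdr.rcv_tstmp, and the loop above splits it into the struct timeval the stacks expect. The conversion in isolation, as done inline above:

/*
 * Split a nanosecond LRO arrival stamp into a timeval.
 */
static void
lro_tstmp_to_tv(uint64_t rcv_tstmp, struct timeval *tv)
{
	tv->tv_sec = rcv_tstmp / 1000000000;
	tv->tv_usec = (rcv_tstmp % 1000000000) / 1000;
}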
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+ struct mbuf *m;
+
+ /* First lets see if we have old packets */
+ if (tp->t_in_pkt) {
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+ /* We lost the tcpcb (maybe a RST came in)? */
+ return (1);
+ }
+ }
+ return (0);
+}
+
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+ return (tp->snd_max - tp->snd_una);
+}
+
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+ if (rc_sacked <= ctf_outstanding(tp))
+ return (ctf_outstanding(tp) - rc_sacked);
+ else {
+ /* TSNH */
+#ifdef INVARIANTS
+ panic("tp:%p rc_sacked:%d > out:%d",
+ tp, rc_sacked, ctf_outstanding(tp));
+#endif
+ return (0);
+ }
+}
+
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks returns 1 when the caller should not proceed. It
+ * places in ret_val what the caller should return (1/0). A 1 indicates
+ * that the TCB is unlocked and probably dropped. A 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+ int32_t todrop;
+ int32_t thflags;
+ int32_t tlen;
+
+ thflags = *thf;
+ tlen = *tlenp;
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ /*
+ * DSACK - add SACK block for dropped range
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
+ /*
+ * ACK now, as the next in-sequence segment
+ * will clear the DSACK block again
+ */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ *drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+ /*
+ * If segment ends after window, drop trailing data (and PUSH and
+ * FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment and
+ * ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ return (1);
+ }
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH | TH_FIN);
+ }
+ *thf = thflags;
+ *tlenp = tlen;
+ return (0);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+ /*
+ * Generate an ACK dropping incoming segment if it occupies sequence
+ * space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all paths to this
+ * code happen after packets containing RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the segment
+ * we received passes the SYN-RECEIVED ACK test. If it fails send a
+ * RST. This breaks the loop in the "LAND" DoS attack, and also
+ * prevents an ACK storm between two listening ports that have been
+ * sent forged SYN segments, each with the source address of the
+ * other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ *ret_val = 1;
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return;
+ } else
+ *ret_val = 0;
+ tp->t_flags |= TF_ACKNOW;
+ if (m)
+ m_freem(m);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ if (m)
+ m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
+ * window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should test against
+ * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
+ * of closed window, not covered by the RFC.
+ */
+ int dropped = 0;
+
+ if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ (tp->last_ack_sent == th->th_seq) ||
+ (tp->rcv_nxt == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ dropped = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ }
+ } else {
+ m_freem(m);
+ }
+ return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ *ret_val = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ *ret_val = 0;
+ ctf_do_drop(m, NULL);
+ }
+}
+
+/*
+ * ctf_ts_check returns 1 when the caller should not proceed and the
+ * state machine should return. It places in ret_val what should
+ * be returned (1/0) by the caller (hpts_do_segment). The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+ int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ *ret_val = 0;
+ if (tlen) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ }
+ return (1);
+ }
+ return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+ int32_t win;
+
+ /*
+ * Calculate amount of space in receive window, and then do TCP
+ * input processing. Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+
+ if (tp->t_inpcb) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ }
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+ int optlen;
+
+ if (tp->t_flags & TF_NOOPT)
+ return (tp->t_maxseg);
+
+ /*
+	 * Here we have simplified code from tcp_addoptions(),
+	 * without a proper loop, and with most of the padding hardcoded.
+	 * We only consider fixed options that we would send every
+	 * time, i.e. SACK is not considered.
+ *
+ */
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN);
+ return (tp->t_maxseg - optlen);
+}
+
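The PAD() macro used above rounds an option length up to the next 4-byte boundary, so the subtraction always reflects padded wire sizes; with timestamps on an established connection, optlen is TCPOLEN_TSTAMP_APPA (12) and the usable payload is t_maxseg - 12. A small userland check of the rounding, using the same macro:

#include <assert.h>

#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)

int
main(void)
{
	/* PAD() rounds up to the next multiple of 4. */
	assert(PAD(10) == 12);
	assert(PAD(12) == 12);
	assert(PAD(18) == 20);
	return (0);
}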
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex8 = num_sack_blks;
+ if (num_sack_blks > 0) {
+ log.u_bbr.flex1 = sack_blocks[0].start;
+ log.u_bbr.flex2 = sack_blocks[0].end;
+ }
+ if (num_sack_blks > 1) {
+ log.u_bbr.flex3 = sack_blocks[1].start;
+ log.u_bbr.flex4 = sack_blocks[1].end;
+ }
+ if (num_sack_blks > 2) {
+ log.u_bbr.flex5 = sack_blocks[2].start;
+ log.u_bbr.flex6 = sack_blocks[2].end;
+ }
+ if (num_sack_blks > 3) {
+ log.u_bbr.applimited = sack_blocks[3].start;
+ log.u_bbr.pkts_out = sack_blocks[3].end;
+ }
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_SACK_FILTER_RES, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+ /*
+ * Given a count, decay it by a set percentage. The
+	 * percentage is in thousandths, i.e. 100% = 1000,
+ * 19.3% = 193.
+ */
+ uint64_t perc_count, decay_per;
+ uint32_t decayed_count;
+ if (decay > 1000) {
+ /* We don't raise it */
+ return (count);
+ }
+ perc_count = count;
+ decay_per = decay;
+ perc_count *= decay_per;
+ perc_count /= 1000;
+ /*
+ * So now perc_count holds the
+ * count decay value.
+ */
+ decayed_count = count - (uint32_t)perc_count;
+ return (decayed_count);
+}
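Since the decay percentage is expressed in thousandths, a quick userland check of the arithmetic (the helper below mirrors ctf_decay_count for illustration):

#include <assert.h>
#include <stdint.h>

/* Same arithmetic as ctf_decay_count(): decay is in thousandths. */
static uint32_t
decay_count(uint32_t count, uint32_t decay)
{
	uint64_t perc = (uint64_t)count * decay / 1000;

	return (decay > 1000 ? count : count - (uint32_t)perc);
}

int
main(void)
{
	assert(decay_count(1000, 193) == 807);	/* 19.3% decay */
	assert(decay_count(500, 1000) == 0);	/* 100% decay */
	return (0);
}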
Index: netinet/tcp_var.h
===================================================================
--- netinet/tcp_var.h
+++ netinet/tcp_var.h
@@ -102,7 +102,8 @@
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
- bits_spare : 4;
+		t_fin_is_rst: 1,	/* Are FINs treated as resets */
+ bits_spare : 3;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -271,6 +272,11 @@
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
+ int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
+ int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, struct timeval *);
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
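The two new members slot into a stack's tcp_function_block next to tfb_tcp_do_segment. An illustrative initializer; the stack name and the nounlock handler are hypothetical, while ctf_do_queued_segments is the shared helper added in this diff:

/*
 * Illustrative only: wiring the new hooks into a function block.
 * my_stack_do_segment_nounlock is hypothetical; the other
 * required members are elided.
 */
static struct tcp_function_block my_fb = {
	.tfb_tcp_block_name = "my_stack",
	.tfb_do_queued_segments = ctf_do_queued_segments,
	.tfb_do_segment_nounlock = my_stack_do_segment_nounlock,
	/* ... */
};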
Index: sys/mbuf.h
===================================================================
--- sys/mbuf.h
+++ sys/mbuf.h
@@ -407,6 +407,7 @@
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
hw-stamped on port (useful for IEEE 1588
and 802.1AS) */
+#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid (in PH_loc) */
#define M_PROTO1 0x00001000 /* protocol-specific */
#define M_PROTO2 0x00002000 /* protocol-specific */
