Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F140523954
D20834.id59366.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
99 KB
Referenced Files
None
Subscribers
None
D20834.id59366.diff
View Options
Index: modules/Makefile
===================================================================
--- modules/Makefile
+++ modules/Makefile
@@ -268,7 +268,6 @@
nge \
nmdm \
nullfs \
- ${_ntb} \
${_nvd} \
${_nvdimm} \
${_nvme} \
Index: modules/tcp/rack/Makefile
===================================================================
--- modules/tcp/rack/Makefile
+++ modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
Index: netinet/in_pcb.h
===================================================================
--- netinet/in_pcb.h
+++ netinet/in_pcb.h
@@ -759,7 +759,9 @@
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
-
+#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */
+#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */
+#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */
/*
* Flags passed to in_pcblookup*() functions.
*/
Index: netinet/tcp.h
===================================================================
--- netinet/tcp.h
+++ netinet/tcp.h
@@ -201,9 +201,8 @@
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
-#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
-#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
@@ -211,14 +210,18 @@
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
-#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
-#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
-#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */
+#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */
+#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */
+#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */
+#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */
+#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
-#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */
+#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
@@ -227,11 +230,12 @@
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
-#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */
+#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
+#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
@@ -238,6 +242,15 @@
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
+#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */
+#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */
+#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */
+#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */
+#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */
+#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */
+#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */
+#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */
+#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */
/* Start of reserved space for third-party user-settable options. */
Index: netinet/tcp_hpts.h
===================================================================
--- netinet/tcp_hpts.h
+++ netinet/tcp_hpts.h
@@ -45,112 +45,80 @@
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
-#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
-#define DEFAULT_HPTS_LOG 3072
-/*
- * Log flags consist of
- * 7f 7f 1 1 bits
- * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
- *
- * So for example cpu 10, number 10 would with
- * input active would show up as:
- * p_flags = 0001010 0001010 1 0
- * <or>
- * p_flags = 0x142a
- */
-#define HPTS_HPTS_ACTIVE 0x01
-#define HPTS_INPUT_ACTIVE 0x02
-
-#define HPTSLOG_IMMEDIATE 1
-#define HPTSLOG_INSERT_NORMAL 2
-#define HPTSLOG_INSERT_SLEEPER 3
-#define HPTSLOG_SLEEP_AFTER 4
-#define HPTSLOG_SLEEP_BEFORE 5
-#define HPTSLOG_INSERTED 6
-#define HPTSLOG_WAKEUP_HPTS 7
-#define HPTSLOG_SETTORUN 8
-#define HPTSLOG_HPTSI 9
-#define HPTSLOG_TOLONG 10
-#define HPTSLOG_AWAKENS 11
-#define HPTSLOG_TIMESOUT 12
-#define HPTSLOG_SLEEPSET 13
-#define HPTSLOG_WAKEUP_INPUT 14
-#define HPTSLOG_RESCHEDULE 15
-#define HPTSLOG_AWAKE 16
-#define HPTSLOG_INP_DONE 17
-
-struct hpts_log {
- struct inpcb *inp;
- int32_t event;
- uint32_t cts;
- int32_t line;
- uint32_t ticknow;
- uint32_t t_paceslot;
- uint32_t t_hptsreq;
- uint32_t p_curtick;
- uint32_t p_prevtick;
- uint32_t slot_req;
- uint32_t p_on_queue_cnt;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t p_hpts_sleep_time;
- uint16_t p_flags;
- uint8_t p_onhpts;
- uint8_t p_oninput;
- uint8_t is_notempty;
-};
-
struct hpts_diag {
- uint32_t p_hpts_active;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t slot_req;
- uint32_t inp_hptsslot;
- uint32_t slot_now;
- uint32_t have_slept;
- uint32_t hpts_sleep_time;
- uint32_t yet_to_sleep;
- uint32_t need_new_to;
- int32_t co_ret;
- uint8_t p_on_min_sleep;
+ uint32_t p_hpts_active; /* bbr->flex7 x */
+ uint32_t p_nxt_slot; /* bbr->flex1 x */
+ uint32_t p_cur_slot; /* bbr->flex2 x */
+ uint32_t p_prev_slot; /* bbr->delivered */
+ uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t slot_req; /* bbr->flex3 x */
+ uint32_t inp_hptsslot; /* bbr->flex4 x */
+ uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t have_slept; /* bbr->epoch x */
+ uint32_t hpts_sleep_time; /* bbr->applimited x */
+ uint32_t yet_to_sleep; /* bbr->lt_epoch x */
+ uint32_t need_new_to; /* bbr->flex6 x */
+ uint32_t wheel_tick; /* bbr->bw_inuse x */
+ uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_cts; /* bbr->rttProp x */
+ int32_t co_ret; /* bbr->pkts_out x */
+ uint32_t p_curtick; /* upper bbr->cur_del_rate */
+ uint32_t p_lasttick; /* lower bbr->cur_del_rate */
+ uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
+/* Magic flags to tell whats cooking on the pacing wheel */
+#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */
+#define PACE_TMR_RACK 0x02 /* RACK timer running */
+#define PACE_TMR_TLP 0x04 /* TLP timer running */
+#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
+#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
+#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
+#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
- uint32_t p_hpts_active; /* Flag that says hpts is awake */
- uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
- uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
+ uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
* slots that the hpts is running on. */
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t enobuf_cnt;
- uint16_t p_log_at;
+ uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
- p_log_wrapped :1, /* boolean */
- p_on_min_sleep:1; /* boolean */
- uint8_t p_fill;
+ p_on_min_sleep:1, /* boolean */
+ p_avail:6;
+ uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
struct hptsh p_input; /* For the tcp-input runner */
/* Hptsi wheel */
struct hptsh *p_hptss;
- struct hpts_log *p_log;
- uint32_t p_logsize;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
+ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
+ uint32_t saved_lasttick; /* for logging */
+ uint32_t saved_curtick; /* for logging */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
uint32_t p_delayed_by; /* How much were we delayed by */
/* Cache line 0x80 */
struct sysctl_ctx_list hpts_ctx;
@@ -236,13 +204,9 @@
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
int
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
-#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+__tcp_queue_to_input(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
Index: netinet/tcp_hpts.c
===================================================================
--- netinet/tcp_hpts.c
+++ netinet/tcp_hpts.c
@@ -37,7 +37,7 @@
* pacing packets out onto the wire. It can be used in two ways
* by a given TCP stack (and those two methods can be used simultaneously).
*
- * First, and probably the main thing its used by Rack and BBR for, it can
+ * First, and probably the main thing its used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
* itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
@@ -59,42 +59,57 @@
* to prevent output processing until the time alotted has gone by.
* Of course this is a bare bones example and the stack will probably
* have more consideration then just the above.
- *
- * Now the tcp_hpts system will call tcp_output in one of two forms,
- * it will first check to see if the stack as defined a
- * tfb_tcp_output_wtime() function, if so that is the routine it
- * will call, if that function is not defined then it will call the
- * tfb_tcp_output() function. The only difference between these
- * two calls is that the former passes the time in to the function
- * so the function does not have to access the time (which tcp_hpts
- * already has). What these functions do is of course totally up
- * to the individual tcp stack.
- *
+ *
* Now the second function (actually two functions I guess :D)
* the tcp_hpts system provides is the ability to either abort
- * a connection (later) or process input on a connection.
- * Why would you want to do this? To keep processor locality.
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality
+ * and/or to avoid having to untangle any recursive
+ * locks. The input function is now hooked to the new LRO
+ * system as well.
*
- * So in order to use the input redirection function the
- * stack changes its tcp_do_segment() routine to instead
- * of process the data call the function:
+ * In order to use the input redirection function the
+ * tcp stack must define an input function for
+ * tfb_do_queued_segments(). This function understands
+ * how to dequeue an array of packets that were input and
+ * knows how to call the correct processing routine.
*
- * tcp_queue_pkt_to_input()
+ * Locking in this is important as well so most likely the
+ * stack will need to define the tfb_do_segment_nounlock()
+ * splitting tfb_do_segment() into two parts. The main processing
+ * part that does not unlock the INP and returns a value of 1 or 0.
+ * It returns 0 if all is well and the lock was not released. It
+ * returns 1 if we had to destroy the TCB (a reset received etc).
+ * The remains of tfb_do_segment() then become just a simple call
+ * to the tfb_do_segment_nounlock() function and check the return
+ * code and possibly unlock.
+ *
+ * The stack must also set the flag on the INP that it supports this
+ * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
+ * this flag as well and will queue packets when it is set.
+ * There are other flags as well INP_MBUF_QUEUE_READY and
+ * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
+ * that we are in the pacer for output so there is no
+ * need to wake up the hpts system to get immediate
+ * input. The second tells the LRO code that its okay
+ * if a SACK arrives you can still defer input and let
+ * the current hpts timer run (this is usually set when
+ * a rack timer is up so we know SACK's are happening
+ * on the connection already and don't want to wakeup yet).
*
- * You will note that the arguments to this function look
- * a lot like tcp_do_segments's arguments. This function
- * will assure that the tcp_hpts system will
- * call the functions tfb_tcp_hpts_do_segment() from the
- * correct CPU. Note that multiple calls can get pushed
- * into the tcp_hpts system this will be indicated by
- * the next to last argument to tfb_tcp_hpts_do_segment()
- * (nxt_pkt). If nxt_pkt is a 1 then another packet is
- * coming. If nxt_pkt is a 0 then this is the last call
- * that the tcp_hpts system has available for the tcp stack.
- *
- * The other point of the input system is to be able to safely
- * drop a tcp connection without worrying about the recursive
- * locking that may be occuring on the INP_WLOCK. So if
+ * There is a common implementation of this within the
+ * rack_bbr_common code, i.e. ctf_do_queued_segments(). This function
+ * knows how to take the input queue of packets from
+ * tp->t_in_pkts and process them digging out
+ * all the arguments, calling any bpf tap and
+ * calling into tfb_do_segment_nounlock(). The common
+ * function (ctf_do_queued_segments()) requires that
+ * you have defined the tfb_do_segment_nounlock() as
+ * described above.
+ *
+ * The second feature of the input side of hpts is the
+ * dropping of a connection. This is due to the way that
+ * locking may have occurred on the INP_WLOCK. So if
* a stack wants to drop a connection it calls:
*
* tcp_set_inp_to_drop(tp, ETIMEDOUT)
@@ -156,6 +171,7 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_log_buf.h>
#ifdef tcpdebug
#include <netinet/tcp_debug.h>
@@ -168,8 +184,6 @@
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-#include <net/netisr.h>
-#include <net/rss_config.h>
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
@@ -176,16 +190,13 @@
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
-
-TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
-
static struct tcp_hptsi tcp_pace;
+static int hpts_does_tp_logging = 0;
static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
@@ -204,8 +215,6 @@
} \
} while (0)
-static int32_t logging_on = 0;
-static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;
struct hpts_domain_info {
@@ -219,10 +228,6 @@
&tcp_hpts_precision, 120,
"Value for PRE() precision of callout");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
- &logging_on, 0,
- "Turn on logging if compiled in");
-
counter_u64_t hpts_loops;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
@@ -233,30 +238,53 @@
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
-static int32_t in_newts_every_tcb = 0;
+counter_u64_t combined_wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
- &in_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for input");
-static int32_t in_ts_percision = 0;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+ &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
- &in_ts_percision, 0,
- "Do we use percise timestamp for clients on input");
-static int32_t out_newts_every_tcb = 0;
+counter_u64_t wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
- &out_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for output");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+ &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
+
static int32_t out_ts_percision = 0;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
&out_ts_percision, 0,
"Do we use a percise timestamp for every output cts");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &hpts_does_tp_logging, 0,
+ "Do we add to any tp that has logging on pacer logs");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+
+#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
+
+static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
+
+static int
+sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = hpts_sleep_max;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ (new > HPTS_MAX_SLEEP_ALLOWED))
+ error = EINVAL;
+ else
+ hpts_sleep_max = new;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
+ CTLTYPE_UINT | CTLFLAG_RW,
&hpts_sleep_max, 0,
- "The maximum time the hpts will sleep <1 - 254>");
+ &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
+ "Maximum time hpts will sleep");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
&tcp_min_hptsi_time, 0,
@@ -267,55 +295,35 @@
"Do we have the callout call directly to the hpts?");
static void
-__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
- uint32_t ticknow, int32_t line)
+tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
+ int ticks_to_run, int idx)
{
- struct hpts_log *pl;
-
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_log == NULL)
- return;
- pl = &hpts->p_log[hpts->p_log_at];
- hpts->p_log_at++;
- if (hpts->p_log_at >= hpts->p_logsize) {
- hpts->p_log_at = 0;
- hpts->p_log_wrapped = 1;
- }
- pl->inp = inp;
- if (inp) {
- pl->t_paceslot = inp->inp_hptsslot;
- pl->t_hptsreq = inp->inp_hpts_request;
- pl->p_onhpts = inp->inp_in_hpts;
- pl->p_oninput = inp->inp_in_input;
- } else {
- pl->t_paceslot = 0;
- pl->t_hptsreq = 0;
- pl->p_onhpts = 0;
- pl->p_oninput = 0;
- }
- pl->is_notempty = 1;
- pl->event = event;
- pl->line = line;
- pl->cts = tcp_get_usecs(NULL);
- pl->p_curtick = hpts->p_curtick;
- pl->p_prevtick = hpts->p_prevtick;
- pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
- pl->ticknow = ticknow;
- pl->slot_req = slot;
- pl->p_nxt_slot = hpts->p_nxt_slot;
- pl->p_cur_slot = hpts->p_cur_slot;
- pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
- pl->p_flags = (hpts->p_cpu & 0x7f);
- pl->p_flags <<= 7;
- pl->p_flags |= (hpts->p_num & 0x7f);
- pl->p_flags <<= 2;
- if (hpts->p_hpts_active) {
- pl->p_flags |= HPTS_HPTS_ACTIVE;
- }
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.use_lt_bw = 1;
+ log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
}
-#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
-
static void
hpts_timeout_swi(void *arg)
{
@@ -347,12 +355,6 @@
/* We are not on the hpts? */
panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
}
- if (TAILQ_EMPTY(head) &&
- (hpts->p_on_queue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p hpts bucket empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_queue_cnt);
- }
#endif
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
@@ -456,58 +458,13 @@
in_pcbref(inp);
}
-static int
-sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
-{
- struct tcp_hpts_entry *hpts;
- size_t sz;
- int32_t logging_was, i;
- int32_t error = 0;
-
- /*
- * HACK: Turn off logging so no locks are required this really needs
- * a memory barrier :)
- */
- logging_was = logging_on;
- logging_on = 0;
- if (!req->oldptr) {
- /* How much? */
- sz = 0;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- sz += (sizeof(struct hpts_log) * hpts->p_logsize);
- }
- error = SYSCTL_OUT(req, 0, sz);
- } else {
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- if (hpts->p_log_wrapped)
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- else
- sz = (sizeof(struct hpts_log) * hpts->p_log_at);
- error = SYSCTL_OUT(req, hpts->p_log, sz);
- }
- }
- logging_on = logging_was;
- return error;
-}
-
-SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-
-
static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -515,10 +472,9 @@
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -648,8 +604,8 @@
* Valid values in the flags are
* HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
* HPTS_REMOVE_INPUT - remove from the input of the hpts.
- * Note that you can or both values together and get two
- * actions.
+ * Note that you can use one or both values together
+ * and get two actions.
*/
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
@@ -670,53 +626,198 @@
}
static inline int
-hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
- return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+ /*
+ * Given a slot on the wheel, what slot
+ * is that plus ticks out?
+ */
+ KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
+ return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}
+static inline int
+tick_to_wheel(uint32_t cts_in_wticks)
+{
+ /*
+ * Given a timestamp in wheel ticks (10usec inc's)
+ * map it to our limited space wheel.
+ */
+ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+}
+
+static inline int
+hpts_ticks_diff(int prev_tick, int tick_now)
+{
+ /*
+ * Given two ticks that are someplace
+ * on our wheel. How far are they apart?
+ */
+ if (tick_now > prev_tick)
+ return (tick_now - prev_tick);
+ else if (tick_now == prev_tick)
+ /*
+ * Special case, same means we can go all of our
+ * wheel less one slot.
+ */
+ return (NUM_OF_HPTSI_SLOTS - 1);
+ else
+ return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+}
+
+/*
+ * Given a tick on the wheel that is the current time
+ * mapped to the wheel (wheel_tick), what is the maximum
+ * distance forward that can be obtained without wrapping
+ * past either prev_tick or running_tick depending on the
+ * hpts state? If target_tick is non-NULL it is filled with
+ * the tick location. Returns 0 if the wheel has wrapped.
+ *
+ * Note if you do not give this function the current
+ * time (that you think it is) mapped to the wheel
+ * then the results will not be what you expect and
+ * could lead to invalid inserts.
+ */
+static inline int32_t
+max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+{
+ uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+
+ if ((hpts->p_hpts_active == 1) &&
+ (hpts->p_wheel_complete == 0)) {
+ end_tick = hpts->p_runningtick;
+ /* Back up one tick */
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ } else {
+ /*
+ * For the case where we are
+ * not active, or we have
+ * completed the pass over
+ * the wheel, we can use the
+ * prev tick and subtract one from it. This puts us
+ * as far out as possible on the wheel.
+ */
+ end_tick = hpts->p_prev_slot;
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ /*
+ * Now we have close to the full wheel left minus the
+ * time it has been since the pacer went to sleep. Note
+ * that wheel_tick, passed in, should be the current time
+ * from the perspective of the caller, mapped to the wheel.
+ */
+ if (hpts->p_prev_slot != wheel_tick)
+ dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ else
+ dis_to_travel = 1;
+ /*
+ * dis_to_travel in this case is the space from when the
+ * pacer stopped (p_prev_slot) and where our wheel_tick
+ * is now. To know how many slots we can put it in we
+ * subtract from the wheel size. We would not want
+ * to place something after p_prev_slot or it will
+ * get run too soon.
+ */
+ return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
+ }
+ /*
+ * So how many slots are open between p_runningtick -> p_cur_slot
+ * that is what is currently un-available for insertion. Special
+ * case when we are at the last slot, this gets 1, so that
+ * the answer to how many slots are available is all but 1.
+ */
+ if (hpts->p_runningtick == hpts->p_cur_slot)
+ dis_to_travel = 1;
+ else
+ dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ /*
+ * How long has the pacer been running?
+ */
+ if (hpts->p_cur_slot != wheel_tick) {
+ /* The pacer is a bit late */
+ pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ } else {
+ /* The pacer is right on time, now == pacers start time */
+ pacer_to_now = 0;
+ }
+ /*
+ * To get the number left we can insert into we simply
+ * subtract the distance the pacer has to run from how
+ * many slots there are.
+ */
+ avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
+ /*
+ * Now how many of those we will eat due to the pacer's
+ * time (p_cur_slot) of start being behind the
+ * real time (wheel_tick)?
+ */
+ if (avail_on_wheel <= pacer_to_now) {
+ /*
+ * Wheel wrap, we can't fit on the wheel, that
+ * is unusual the system must be way overloaded!
+ * Insert into the assured tick and return the special "0".
+ */
+ counter_u64_add(combined_wheel_wrap, 1);
+ if (target_tick)
+ *target_tick = hpts->p_nxt_slot;
+ return (0);
+ } else {
+ /*
+ * We know how many slots are open
+ * on the wheel (the reverse of what
+ * is left to run). Take away the time
+ * the pacer started to now (wheel_tick)
+ * and that tells you how many slots are
+ * open that can be inserted into that won't
+ * be touched by the pacer until later.
+ */
+ return (avail_on_wheel - pacer_to_now);
+ }
+}
+
static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
- int32_t need_wake = 0;
- uint32_t ticknow = 0;
-
+ uint32_t need_wake = 0;
+
HPTS_MTX_ASSERT(hpts);
if (inp->inp_in_hpts == 0) {
/* Ok we need to set it on the hpts in the current slot */
- if (hpts->p_hpts_active == 0) {
- /* A sleeping hpts we want in next slot to run */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
- hpts_tick(hpts, 1));
- }
- inp->inp_hptsslot = hpts_tick(hpts, 1);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
- }
- need_wake = 1;
+ inp->inp_hpts_request = 0;
+ if ((hpts->p_hpts_active == 0) ||
+ (hpts->p_wheel_complete)) {
+ /*
+ * A sleeping hpts we want in next slot to run
+ * note that in this state p_prev_slot == p_cur_slot
+ */
+ inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
+ need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
/*
+ * The hpts system is running and the caller
+ * was awoken by the hpts system.
* We can't allow you to go into the same slot we
- * are in. We must put you out.
+ * are in (we don't want a loop :-D).
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_cur_slot;
+ inp->inp_hptsslot = hpts->p_runningtick;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
- }
if (need_wake) {
/*
* Activate the hpts if it is sleeping and its
* timeout is not 1.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
}
@@ -737,141 +838,129 @@
return (ret);
}
+#ifdef INVARIANTS
static void
-tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
- struct hpts_diag *diag, int32_t noref)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
- int32_t need_new_to = 0;
- int32_t need_wakeup = 0;
- uint32_t largest_slot;
- uint32_t ticknow = 0;
- uint32_t slot_calc;
+ /*
+ * Sanity checks for the pacer with invariants
+ * on insert.
+ */
+ if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
+ panic("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot);
+ if ((hpts->p_hpts_active) &&
+ (hpts->p_wheel_complete == 0)) {
+ /*
+ * If the pacer is processing an arc
+ * of the wheel, we need to make
+ * sure we are not inserting within
+ * that arc.
+ */
+ int distance, yet_to_run;
+ distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
+ if (hpts->p_runningtick != hpts->p_cur_slot)
+ yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ else
+ yet_to_run = 0; /* processing last slot */
+ if (yet_to_run > distance) {
+ panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningtick, hpts->p_cur_slot);
+ }
+ }
+}
+#endif
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
+ struct hpts_diag *diag, struct timeval *tv)
+{
+ uint32_t need_new_to = 0;
+ uint32_t wheel_cts, last_tick;
+ int32_t wheel_tick, maxticks;
+ int8_t need_wakeup = 0;
+
HPTS_MTX_ASSERT(hpts);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_prev_slot = hpts->p_prev_slot;
+ diag->p_runningtick = hpts->p_runningtick;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
+ diag->p_curtick = hpts->p_curtick;
+ diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((inp->inp_in_hpts == 0) || noref) {
- inp->inp_hpts_request = slot;
+ if (inp->inp_in_hpts == 0) {
if (slot == 0) {
/* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
return;
}
- if (hpts->p_hpts_active) {
- /*
- * Its slot - 1 since nxt_slot is the next tick that
- * will go off since the hpts is awake
- */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
- }
- /*
- * We want to make sure that we don't place a inp in
- * the range of p_cur_slot <-> p_nxt_slot. If we
- * take from p_nxt_slot to the end, plus p_cur_slot
- * and then take away 2, we will know how many is
- * the max slots we can use.
- */
- if (hpts->p_nxt_slot > hpts->p_cur_slot) {
- /*
- * Non-wrap case nxt_slot <-> cur_slot we
- * don't want to land in. So the diff gives
- * us what is taken away from the number of
- * slots.
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_tick = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
+ if (diag) {
+ diag->wheel_tick = wheel_tick;
+ diag->maxticks = maxticks;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxticks == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+ * probably does not hurt because the pacer,
+ * if that is true, is so far behind we will be
+ * > 1 second late calling anyway.
*/
- largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
- } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- } else {
- /*
- * Wrap case so the diff gives us the number
- * of slots that we can land in.
- */
- largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ slot--;
}
- /*
- * We take away two so we never have a problem (20
- * usec's) out of 1024000 usecs
- */
- largest_slot -= 2;
- if (inp->inp_hpts_request > largest_slot) {
- /*
- * Restrict max jump of slots and remember
- * leftover
- */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* This one will run when we hit it */
- inp->inp_hpts_request = 0;
- }
- if (hpts->p_nxt_slot == hpts->p_cur_slot)
- slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
- else
- slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
- if (slot_calc == hpts->p_cur_slot) {
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request = slot;
+ } else if (maxticks >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxticks;
+ inp->inp_hptsslot = last_tick;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
#ifdef INVARIANTS
- /* TSNH */
- panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
- hpts, slot_calc, slot, largest_slot);
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
- if (slot_calc)
- slot_calc--;
- else
- slot_calc = NUM_OF_HPTSI_SLOTS - 1;
- }
- inp->inp_hptsslot = slot_calc;
- if (diag) {
- diag->inp_hptsslot = inp->inp_hptsslot;
- }
- } else {
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
/*
- * The hpts is sleeping, we need to figure out where
+ * The hpts is sleeping and not on a minimum
+ * sleep time, we need to figure out where
* it will wake up at and if we need to reschedule
* its time-out.
*/
uint32_t have_slept, yet_to_sleep;
- uint32_t slot_now;
- struct timeval tv;
- ticknow = tcp_gethptstick(&tv);
- slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
- /*
- * The user wants to be inserted at (slot_now +
- * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
- */
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- if (inp->inp_hpts_request > largest_slot) {
- /* Adjust the residual in inp_hpts_request */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* No residual it all fits */
- inp->inp_hpts_request = 0;
- }
- inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
- if (diag) {
- diag->slot_now = slot_now;
- diag->inp_hptsslot = inp->inp_hptsslot;
- diag->p_on_min_sleep = hpts->p_on_min_sleep;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
- }
/* Now do we need to restart the hpts's timer? */
- if (TSTMP_GT(ticknow, hpts->p_curtick))
- have_slept = ticknow - hpts->p_curtick;
- else
- have_slept = 0;
- if (have_slept < hpts->p_hpts_sleep_time) {
- /* This should be what happens */
+ have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (have_slept < hpts->p_hpts_sleep_time)
yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- } else {
+ else {
/* We are over-due */
yet_to_sleep = 0;
need_wakeup = 1;
@@ -879,20 +968,16 @@
if (diag) {
diag->have_slept = have_slept;
diag->yet_to_sleep = yet_to_sleep;
- diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * We need to reschedule the hptss time-out.
+ * We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
need_new_to = slot * HPTS_TICKS_PER_USEC;
}
}
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
- }
/*
* Now how far is the hpts sleeping to? if active is 1, its
* up and ticking we do nothing, otherwise we may need to
@@ -899,9 +984,6 @@
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
if (diag) {
@@ -944,9 +1026,10 @@
}
uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
struct tcp_hpts_entry *hpts;
- uint32_t slot_on, cts;
+ uint32_t slot_on;
struct timeval tv;
/*
@@ -956,12 +1039,8 @@
*/
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
- if (in_ts_percision)
- microuptime(&tv);
- else
- getmicrouptime(&tv);
- cts = tcp_tv_to_usectick(&tv);
- tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ microuptime(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
slot_on = hpts->p_nxt_slot;
mtx_unlock(&hpts->p_mtx);
return (slot_on);
@@ -971,7 +1050,6 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
-
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -986,9 +1064,6 @@
/*
* Activate the hpts if it is sleeping.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
- }
retval = 2;
hpts->p_direct_wake = 1;
tcp_wakeinput(hpts);
@@ -1001,36 +1076,14 @@
return (retval);
}
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
+int32_t
+__tcp_queue_to_input(struct inpcb *inp, int line)
{
- /* Setup packet for input first */
- INP_WLOCK_ASSERT(tp->t_inpcb);
- m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
- m->m_pkthdr.pace_tlen = (uint16_t) tlen;
- m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
- m->m_pkthdr.pace_tos = iptos;
- m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
- if (tp->t_in_pkt == NULL) {
- tp->t_in_pkt = m;
- tp->t_tail_pkt = m;
- } else {
- tp->t_tail_pkt->m_nextpkt = m;
- tp->t_tail_pkt = m;
- }
-}
-
-
-int32_t
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
struct tcp_hpts_entry *hpts;
int32_t ret;
- tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
- hpts = tcp_input_lock(tp->t_inpcb);
- ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ hpts = tcp_input_lock(inp);
+ ret = __tcp_queue_to_input_locked(inp, hpts, line);
mtx_unlock(&hpts->p_mtx);
return (ret);
}
@@ -1132,6 +1185,25 @@
#endif
}
+static void
+tcp_drop_in_pkts(struct tcpcb *tp)
+{
+ struct mbuf *m, *n;
+
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+}
+
/*
* Do NOT try to optimize the processing of inp's
* by first pulling off all the inp's into a temporary
@@ -1142,7 +1214,7 @@
* but then while you were processing one of the inp's
* some other one that you switch will get a new
* packet on the different CPU. It will insert it
- * on the new hptss input list. Creating a temporary
+ * on the new hpts's input list. Creating a temporary
* link in the inp will not fix it either, since
* the other hpts will be doing the same thing and
* you will both end up using the temporary link.
@@ -1155,16 +1227,16 @@
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
- struct mbuf *m, *n;
struct tcpcb *tp;
struct inpcb *inp;
uint16_t drop_reason;
int16_t set_cpu;
uint32_t did_prefetch = 0;
- int32_t ti_locked = TI_UNLOCKED;
+ int dropped;
struct epoch_tracker et;
HPTS_MTX_ASSERT(hpts);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
HPTS_MTX_ASSERT(hpts);
hpts_sane_input_remove(hpts, inp, 0);
@@ -1178,24 +1250,14 @@
inp->inp_in_input = 0;
mtx_unlock(&hpts->p_mtx);
CURVNET_SET(inp->inp_vnet);
- if (drop_reason) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else {
- ti_locked = TI_UNLOCKED;
- }
INP_WLOCK(inp);
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out:
hpts->p_inp = NULL;
- if (ti_locked == TI_RLOCKED) {
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- }
if (in_pcbrele_wlocked(inp) == 0) {
INP_WUNLOCK(inp);
}
- ti_locked = TI_UNLOCKED;
CURVNET_RESTORE();
mtx_lock(&hpts->p_mtx);
continue;
@@ -1206,20 +1268,8 @@
}
if (drop_reason) {
/* This tcb is being destroyed for drop_reason */
- m = tp->t_in_pkt;
- if (m)
- n = m->m_nextpkt;
- else
- n = NULL;
- tp->t_in_pkt = NULL;
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
+ tcp_drop_in_pkts(tp);
tp = tcp_drop(tp, drop_reason);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
if (tp == NULL) {
INP_WLOCK(inp);
}
@@ -1246,212 +1296,168 @@
*/
tcp_set_hpts(inp);
}
- m = tp->t_in_pkt;
- n = NULL;
- if (m != NULL &&
- (m->m_pkthdr.pace_lock == TI_RLOCKED ||
- tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- m = tp->t_in_pkt;
- }
- if (in_newts_every_tcb) {
- if (in_ts_percision)
- microuptime(tv);
- else
- getmicrouptime(tv);
- }
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- /* Any input work to do, if so do it first */
- if ((m != NULL) && (m == tp->t_in_pkt)) {
- struct tcphdr *th;
- int32_t tlen, drop_hdrlen, nxt_pkt;
- uint8_t iptos;
-
- n = m->m_nextpkt;
- tp->t_in_pkt = tp->t_tail_pkt = NULL;
- while (m) {
- th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
- tlen = m->m_pkthdr.pace_tlen;
- drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
- iptos = m->m_pkthdr.pace_tos;
- m->m_nextpkt = NULL;
- if (n)
- nxt_pkt = 1;
- else
- nxt_pkt = 0;
- inp->inp_input_calls = 1;
- if (tp->t_fb->tfb_tcp_hpts_do_segment) {
- /* Use the hpts specific do_segment */
- (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos, nxt_pkt, tv);
- } else {
- /* Use the default do_segment */
- (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos);
- }
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- /*
- * Do segment returns unlocked we need the
- * lock again but we also need some kasserts
- * here.
- */
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
- INP_UNLOCK_ASSERT(inp);
- m = n;
- if (m)
- n = m->m_nextpkt;
- if (m != NULL &&
- m->m_pkthdr.pace_lock == TI_RLOCKED) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else
- ti_locked = TI_UNLOCKED;
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if (inp->inp_in_input)
+ tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
+ dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (dropped) {
+ /* Re-acquire the wlock so we can release the reference */
INP_WLOCK(inp);
- /*
- * Since we have an opening here we must
- * re-check if the tcb went away while we
- * were getting the lock(s).
- */
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
- goto out;
- }
- /*
- * Now that we hold the INP lock, check if
- * we need to upgrade our lock.
- */
- if (ti_locked == TI_UNLOCKED &&
- (tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- }
- } /** end while(m) */
- } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ }
+ } else if (tp->t_in_pkt) {
+ /*
+ * We reach here only if we had a
+ * stack that supported INP_SUPPORTS_MBUFQ
+ * and then somehow switched to a stack that
+ * does not. The packets are basically stranded
+ * and would hang with the connection until
+ * cleanup without this code. It's not the
+ * best way but I know of no other way to
+ * handle it since the stack needs functions
+ * it does not have to handle queued packets.
+ */
+ tcp_drop_in_pkts(tp);
+ }
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
INP_UNLOCK_ASSERT(inp);
- ti_locked = TI_UNLOCKED;
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
CURVNET_RESTORE();
}
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
}
-static int
-tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
-{
- int32_t ticks_to_run;
-
- if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
- ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
- if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
- }
- } else {
- if (hpts->p_prevtick == hpts->p_curtick) {
- /* This happens when we get woken up right away */
- return (-1);
- }
- ticks_to_run = 1;
- }
- /* Set in where we will be when we catch up */
- hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
- if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
- hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
- }
- return (ticks_to_run);
-}
-
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+tcp_hptsi(struct tcp_hpts_entry *hpts)
{
+ struct epoch_tracker et;
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error, tick_now, interum_tick;
+ int32_t ticks_to_run, i, error;
int32_t paced_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
- uint32_t cts;
int16_t set_cpu;
HPTS_MTX_ASSERT(hpts);
- hpts->p_curtick = tcp_tv_to_hptstick(ctick);
- cts = tcp_tv_to_usectick(ctick);
- memcpy(&tv, ctick, sizeof(struct timeval));
- hpts->p_cur_slot = hpts_tick(hpts, 1);
+ /* record previous info for any logging */
+ hpts->saved_lasttick = hpts->p_lasttick;
+ hpts->saved_curtick = hpts->p_curtick;
+ hpts->saved_curslot = hpts->p_cur_slot;
+ hpts->saved_prev_slot = hpts->p_prev_slot;
- /* Figure out if we had missed ticks */
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((hpts->p_on_queue_cnt == 0) ||
+ (hpts->p_lasttick == hpts->p_curtick)) {
+ /*
+ * No time has yet passed,
+ * or nothing to do.
+ */
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ goto no_run;
+ }
again:
+ hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = tcp_hpts_est_run(hpts);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
+ ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /*
+ * Wheel wrap is occurring, basically we
+ * are behind and the distance between
+ * run's has spread so much it has exceeded
+ * the time on the wheel (1.024 seconds). This
+ * is ugly and should NOT be happening. We
+ * need to run the entire wheel. We last processed
+ * p_prev_slot, so that needs to be the last slot
+ * we run. The next slot after that should be our
+ * reserved first slot for new, and then starts
+ * the running position. Now the problem is the
+ * reserved "not to yet" place does not exist
+ * and there may be inp's in there that need
+ * running. We can merge those into the
+ * first slot at the head.
+ */
+ hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ /*
+ * Adjust p_cur_slot to be where we are starting from
+ * hopefully we will catch up (fat chance if something
+ * is broken this bad :( )
+ */
+ hpts->p_cur_slot = hpts->p_prev_slot;
+ /*
+ * The next slot has guys to run too, and that would
+ * be where we would normally start, lets move them into
+ * the next slot (p_prev_slot + 2) so that we will
+ * run them, the extra 10usecs of late (by being
+ * put behind) does not really matter in this situation.
+ */
+#ifdef INVARIANTS
+ /*
+ * To prevent a panic we need to update the inpslot to the
+ * new location. This is safe since it takes both the
+ * INP lock and the pacer mutex to change the inp_hptsslot.
+ */
+ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
+ inp->inp_hptsslot = hpts->p_runningtick;
+ }
+#endif
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ counter_u64_add(wheel_wrap, 1);
+ } else {
+ /*
+ * Nxt slot starts one before p_runningtick, though
+ * it's not usually used unless we are doing wheel wrap.
+ */
+ hpts->p_nxt_slot = hpts->p_prev_slot;
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
}
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
HPTS_MTX_ASSERT(hpts);
- /* Reset the ticks to run and time if we need too */
- interum_tick = tcp_gethptstick(&tv);
- if (interum_tick != hpts->p_curtick) {
- /* Save off the new time we execute to */
- *ctick = tv;
- hpts->p_curtick = interum_tick;
- cts = tcp_tv_to_usectick(&tv);
- hpts->p_cur_slot = hpts_tick(hpts, 1);
- ticks_to_run = tcp_hpts_est_run(hpts);
- }
- if (ticks_to_run == -1) {
- goto no_run;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
- }
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
for (i = 0; i < ticks_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any
+ * was not any (i.e. if ticks_to_run == 1, no delay).
*/
hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* For debugging */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
- }
hpts->p_inp = inp;
paced_cnt++;
- if (hpts->p_cur_slot != inp->inp_hptsslot) {
+#ifdef INVARIANTS
+ if (hpts->p_runningtick != inp->inp_hptsslot) {
panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
}
+#endif
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
@@ -1458,8 +1464,8 @@
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1467,25 +1473,36 @@
if (inp->inp_hpts_request) {
/*
* This guy is deferred out further in time
- * then our wheel had on it. Push him back
- * on the wheel.
+ * than our wheel had available on it.
+ * Push him back on the wheel or run it
+ * depending.
*/
- int32_t remaining_slots;
-
+ uint32_t maxticks, last_tick, remaining_slots;
+
remaining_slots = ticks_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
- * Keep INVARIANTS happy by clearing
- * the flag
+ * How far out can we go?
*/
- tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
+ if (maxticks >= inp->inp_hpts_request) {
+ /* we can place it finally to be processed */
+ inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hpts_request = 0;
+ } else {
+ /* Work off some more time */
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request-= maxticks;
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
continue;
}
inp->inp_hpts_request = 0;
+ /* Fall through, we will do it now */
}
/*
- * We clear the hpts flag here after dealing with
+ * We clear the hpts flag here after dealing with
* remaining slots. This way anyone looking with the
* TCB lock will see its on the hpts until just
* before we unlock.
@@ -1495,23 +1512,20 @@
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
hpts->p_inp = NULL;
continue;
}
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-out_now:
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_now:
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
hpts->p_inp = NULL;
continue;
}
@@ -1539,16 +1553,14 @@
*/
tcp_set_hpts(inp);
}
- if (out_newts_every_tcb) {
- struct timeval sv;
-
- if (out_ts_percision)
- microuptime(&sv);
- else
- getmicrouptime(&sv);
- cts = tcp_tv_to_usectick(&sv);
+ CURVNET_SET(inp->inp_vnet);
+#ifdef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
+ /* Lets do any logging that we might want to */
+ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
}
- CURVNET_SET(inp->inp_vnet);
/*
* There is a hole here, we get the refcnt on the
* inp so it will still be preserved but to make
@@ -1560,7 +1572,7 @@
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
if (tp->t_fb_ptr != NULL) {
@@ -1567,12 +1579,16 @@
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (error) {
+ /* The input killed the connection */
+ goto skip_pacing;
+ }
+ }
inp->inp_hpts_calls = 1;
- if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
- error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
- } else {
- error = tp->t_fb->tfb_tcp_output(tp);
- }
+ error = tp->t_fb->tfb_tcp_output(tp);
+ inp->inp_hpts_calls = 0;
if (ninp && ninp->inp_ppcb) {
/*
* If we have a nxt inp, see if we can
@@ -1609,74 +1625,92 @@
prefetch_tp = 1;
}
INP_WUNLOCK(inp);
+ skip_pacing:
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
INP_UNLOCK_ASSERT(inp);
CURVNET_RESTORE();
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
hpts->p_inp = NULL;
}
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_cur_slot++;
- if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_cur_slot = 0;
+ hpts->p_runningtick++;
+ if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningtick = 0;
}
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
no_one:
HPTS_MTX_ASSERT(hpts);
- hpts->p_prevtick = hpts->p_curtick;
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
- /* Re-run any input that may be there */
- (void)tcp_gethptstick(&tv);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
- }
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
- tick_now = tcp_gethptstick(&tv);
- if (SEQ_GT(tick_now, hpts->p_prevtick)) {
- struct timeval res;
-
- /* Did we really spend a full tick or more in here? */
- timersub(&tv, ctick, &res);
- if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if (hpts->p_lasttick != hpts->p_curtick) {
+ counter_u64_add(hpts_loops, 1);
+ goto again;
+ }
+no_run:
+ /*
+ * Set flag to tell that we are done for
+ * any slot input that happens during
+ * input.
+ */
+ hpts->p_wheel_complete = 1;
+ /*
+ * Run any input that may be there not covered
+ * in running data.
+ */
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ /*
+ * Now did we spend too long running
+ * input and need to run more ticks?
+ */
+ KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+ hpts->p_prev_slot, hpts->p_cur_slot));
+ KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+ hpts->p_lasttick, hpts->p_curtick));
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ if (hpts->p_lasttick != hpts->p_curtick) {
counter_u64_add(hpts_loops, 1);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
- }
- *ctick = res;
- hpts->p_curtick = tick_now;
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
-no_run:
{
uint32_t t = 0, i, fnd = 0;
if (hpts->p_on_queue_cnt) {
-
-
/*
* Find next slot that is occupied and use that to
* be the sleep time.
*/
- for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
fnd = 1;
break;
@@ -1684,27 +1718,20 @@
t = (t + 1) % NUM_OF_HPTSI_SLOTS;
}
if (fnd) {
- hpts->p_hpts_sleep_time = i;
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
} else {
- counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+ panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
+ counter_u64_add(back_tosleep, 1);
hpts->p_on_queue_cnt = 0;
goto non_found;
}
- t++;
} else {
- /* No one on the wheel sleep for all but 2 slots */
-non_found:
- if (hpts_sleep_max == 0)
- hpts_sleep_max = 1;
- hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
- t = 0;
+ /* No one on the wheel sleep for all but 400 slots or sleep max */
+ non_found:
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
}
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
- }
}
}
@@ -1746,33 +1773,29 @@
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
/* Signaled by input */
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
callout_stop(&hpts->co);
} else {
/* Timed out */
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
mtx_unlock(&hpts->p_mtx);
return;
}
callout_deactivate(&hpts->co);
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
}
+ hpts->p_hpts_wake_scheduled = 0;
hpts->p_hpts_active = 1;
- (void)tcp_gethptstick(&tv);
- tcp_hptsi(hpts, &tv);
+ tcp_hptsi(hpts);
HPTS_MTX_ASSERT(hpts);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ hpts->overidden_sleep = tv.tv_usec;
tv.tv_usec = tcp_min_hptsi_time;
hpts->p_on_min_sleep = 1;
} else {
/* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
hpts->p_on_min_sleep = 0;
}
hpts->p_hpts_active = 0;
@@ -1811,7 +1834,8 @@
tcp_pace.rp_num_hptss = ncpus;
hpts_loops = counter_u64_alloc(M_WAITOK);
back_tosleep = counter_u64_alloc(M_WAITOK);
-
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
@@ -1850,7 +1874,7 @@
OID_AUTO, "out_qcnt", CTLFLAG_RD,
&hpts->p_on_queue_cnt, 0,
"Count TCB's awaiting output processing");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "active", CTLFLAG_RD,
&hpts->p_hpts_active, 0,
@@ -1859,29 +1883,23 @@
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curslot", CTLFLAG_RD,
&hpts->p_cur_slot, 0,
- "What the current slot is if active");
+ "What the current running pacers goal");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "runtick", CTLFLAG_RD,
+ &hpts->p_runningtick, 0,
+ "What the running pacers current slot is");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curtick", CTLFLAG_RD,
&hpts->p_curtick, 0,
- "What the current tick on if active");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
- SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "logsize", CTLFLAG_RD,
- &hpts->p_logsize, 0,
- "Hpts logging buffer size");
- hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ "What the running pacers last tick mapped to the wheel was");
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
- hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_prevtick -= 1;
- hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = 1;
- hpts->p_logsize = tcp_hpts_logging_size;
- if (hpts->p_logsize) {
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- }
+ hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
Index: netinet/tcp_log_buf.h
===================================================================
--- netinet/tcp_log_buf.h
+++ netinet/tcp_log_buf.h
@@ -175,7 +175,7 @@
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER,/* Detected reorder 7 */
- TCP_LOG_PACER, /* Pacer sending a packet 8 */
+ TCP_LOG_HPTS, /* Hpts sending a packet 8 */
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
@@ -194,31 +194,38 @@
BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
TCP_LOG_FLOWEND, /* End of a flow 25 */
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
- BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
- BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
+ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
+ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
- UNUSED_32, /* Unused 32 */
- UNUSED_33, /* Unused 33 */
+ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
+ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */
BBR_LOG_TO_PROCESS, /* A to was processed 35 */
BBR_LOG_BBRTSO, /* TSO update 36 */
- BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
+ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */
BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
BBR_LOG_PROGRESS, /* Progress timer event 39 */
TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
- BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
+ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */
BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
- BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
+ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
- TCP_LOG_END /* End (keep at end) 51 */
+ TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */
+ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
+ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */
+ TCP_LOG_CONNEND, /* End of connection 54 */
+ TCP_LOG_LRO, /* LRO entry 55 */
+ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
+ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */
+ TCP_LOG_END /* End (keep at end) 58 */
};
enum tcp_log_states {
@@ -275,8 +282,8 @@
#ifdef _KERNEL
-#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
-#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000
+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000
/*
* TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
Index: netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- netinet/tcp_stacks/rack_bbr_common.h
+++ netinet/tcp_stacks/rack_bbr_common.h
@@ -38,17 +38,8 @@
#define TCP_MSS_ACCT_SIZE 70
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
+#define DUP_ACK_THRESHOLD 3
-/* Magic flags to tell whats cooking on the pacing wheel */
-#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
-#define PACE_TMR_RACK 0x02 /* RACK timer running */
-#define PACE_TMR_TLP 0x04 /* TLP timer running */
-#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
-#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
-#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
-#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
-#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
@@ -61,8 +52,66 @@
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
+#define PACE_MAX_IP_BYTES 65536
+#define USECS_IN_SECOND 1000000
+#define MSEC_IN_SECOND 1000
+#define MS_IN_USEC 1000 /* microseconds per millisecond, despite the name */
+#define USEC_TO_MSEC(x) ((x) / MS_IN_USEC) /* argument parenthesized so a compound expression is divided as a whole */
+#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */
+
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
+#ifdef KERN_TLS
+uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
#endif
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
+ struct mbuf *m, int has_pkt);
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
+uint32_t ctf_outstanding(struct tcpcb *tp);
+uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
+ struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
+ int32_t * drop_hdrlen, int32_t * ret_val);
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp);
+
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ret_val);
+
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen);
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp);
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay_percentage);
+
#endif
+#endif
Index: netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- netinet/tcp_stacks/rack_bbr_common.c
+++ netinet/tcp_stacks/rack_bbr_common.c
@@ -0,0 +1,859 @@
+/*-
+ * Copyright (c) 2016-2018
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include "opt_kern_tls.h"
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/ethernet.h>
+#include <net/bpf.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "rack_bbr_common.h"
+
+/*
+ * Common TCP Functions - These are shared by both
+ * rack and BBR.
+ */
+
+
+#ifdef KERN_TLS
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+ struct sbtls_info *tls;
+ uint32_t len;
+
+again:
+ tls = so->so_snd.sb_tls_info;
+ len = tls->sb_params.sb_maxlen; /* max tls payload */
+ len += tls->sb_params.sb_tls_hlen; /* tls header len */
+ len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
+ if ((len * 4) > rwnd) {
+ /*
+ * Four full records (payload + header + trailer) do not
+ * fit in rwnd: shrink the socket's max TLS payload by 4096,
+ * never going below 4096, and recompute. XXX: the original
+ * author left open whether to count this or do more here.
+ */
+ if (tls->sb_params.sb_maxlen > 4096) {
+ tls->sb_params.sb_maxlen -= 4096;
+ if (tls->sb_params.sb_maxlen < 4096)
+ tls->sb_params.sb_maxlen = 4096;
+ goto again;
+ }
+ }
+ return (len); /* may still exceed rwnd / 4 once sb_maxlen is down to 4096 */
+}
+#endif
+
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+ /*
+ * We are passed a raw chain of mbuf packets
+ * that arrived in LRO. They are linked via
+ * the m_nextpkt link in the pkt-headers.
+ *
+ * We process each one by:
+ * a) saving off the next
+ * b) stripping off the ether-header
+ * c) formulating the arguments for
+ * the tfb_do_segment_nounlock
+ * d) calling each mbuf to tfb_do_segment_nounlock
+ * after adjusting the time to match the arrival time.
+ * Note that the LRO code assures no IP options are present.
+ *
+ * The semantics for calling tfb_do_segment_nounlock are the
+ * following:
+ * 1) It returns 0 if all went well and you (the caller) need
+ * to release the lock.
+ * 2) If nxt_pkt is set, then the function will suppress calls
+ * to tfb_tcp_output() since you are promising to call again
+ * with another packet.
+ * 3) If it returns non-zero, we free the rest of the chain and
+ * return that value; the tcb has been (or is about to be) destroyed.
+ */
+ struct mbuf *m_save;
+ struct ether_header *eh;
+ struct epoch_tracker et;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip = NULL; /* Keep compiler happy. */
+#endif
+ struct ifnet *ifp;
+ struct timeval tv;
+ int32_t retval = 0, nxt_pkt, tlen, off; /* 0 is returned if nothing reaches do_segment */
+ uint16_t etype;
+ uint16_t drop_hdrlen;
+ uint8_t iptos, no_vn=0, bpf_req=0;
+
+ /*
+ * This is a bit deceptive, we get the
+ * "info epoch" which is really the network
+ * epoch. This covers us on both any INP
+ * type change but also if the ifp goes
+ * away it covers us as well.
+ */
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+ if (m && m->m_pkthdr.rcvif)
+ ifp = m->m_pkthdr.rcvif;
+ else
+ ifp = NULL;
+ if (ifp) {
+ bpf_req = bpf_peers_present(ifp->if_bpf);
+ } else {
+ /*
+ * We probably should not work around
+ * but kassert, since lro always sets rcvif.
+ */
+ no_vn = 1;
+ goto skip_vnet;
+ }
+ CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ /* Now lets get the ether header */
+ eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ /* Let the BPF see the packet */
+ if (bpf_req && ifp)
+ ETHER_BPF_MTAP(ifp, m);
+ m_adj(m, sizeof(*eh)); /* Trim off the ethernet header; eh must not be used after this */
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) { /* m_pullup already freed the chain */
+ TCPSTAT_INC(tcps_rcvshort);
+ goto skipped_pkt;
+ }
+ }
+ ip6 = mtod(m, struct ip6_hdr *); /* re-read: m_pullup may have replaced the mbuf */
+ th = (struct tcphdr *)(ip6 + 1);
+ tlen = ntohs(ip6->ip6_plen);
+ drop_hdrlen = sizeof(*ip6);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP, m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with unspecified IPv6 destination is
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+ == NULL) { /* m_pullup already freed the chain */
+ TCPSTAT_INC(tcps_rcvshort);
+ goto skipped_pkt;
+ }
+ }
+ ip = mtod(m, struct ip *); /* re-read: m_pullup may have replaced the mbuf */
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ int len;
+ struct ipovly *ipov = (struct ipovly *)ip;
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = drop_hdrlen + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = htons(tlen);
+ th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(len);
+ /* Reset TOS bits */
+ ip->ip_tos = iptos;
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ }
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ break;
+ }
+#endif
+ default:
+ m_freem(m); /* not IPv4/IPv6: th, tlen and drop_hdrlen would be unset below */
+ goto skipped_pkt;
+ }
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ tcp_fields_to_host(th);
+
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ TCPSTAT_INC(tcps_rcvbadoff);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ tlen -= off;
+ drop_hdrlen += off;
+ /*
+ * Now lets setup the timeval to be when we should
+ * have been called (if we can).
+ */
+ m->m_pkthdr.lro_nsegs = 1;
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ } else {
+ /* Should not be should we kassert instead? */
+ tcp_get_usecs(&tv);
+ }
+ /* Now what about next packet? */
+ if (m_save || has_pkt)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+ iptos, nxt_pkt, &tv);
+ if (retval) {
+ /* We lost the lock and tcb probably */
+ m = m_save;
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+ }
+skipped_pkt:
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+}
+
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+ struct mbuf *m;
+
+ /* Detach any queued input chain (t_in_pkt) from the tcb and process it */
+ if (tp->t_in_pkt) {
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+ /* Non-zero: we lost the tcpcb (maybe a RST came in), tell the caller */
+ return (1);
+ }
+ }
+ return (0); /* nothing was queued, or the whole chain was processed */
+}
+
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+ return (tp->snd_max - tp->snd_una); /* highest sequence sent minus oldest unacknowledged */
+}
+
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+ if (rc_sacked <= ctf_outstanding(tp)) /* outstanding bytes not accounted for by rc_sacked */
+ return (ctf_outstanding(tp) - rc_sacked);
+ else {
+ /* TSNH: more sacked than outstanding; both values are unsigned, hence %u */
+#ifdef INVARIANTS
+ panic("tp:%p rc_sacked:%u > out:%u",
+ tp, rc_sacked, ctf_outstanding(tp));
+#endif
+ return (0);
+ }
+}
+
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+ if (tp != NULL) { /* with a tcb: reset, then release the inpcb write lock */
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else /* no tcb: nothing to unlock */
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks trims duplicate and out-of-window data, updating *tlenp,
+ * *thf and *drop_hdrlen. It returns 1 for you should not proceed; ret_val
+ * then holds what the caller should return: 1 = TCB is unlocked and
+ * probably dropped, 0 = TCB is still valid and locked.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+ int32_t todrop;
+ int32_t thflags;
+ int32_t tlen;
+
+ thflags = *thf;
+ tlen = *tlenp;
+ todrop = tp->rcv_nxt - th->th_seq; /* bytes at the front that we already have */
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ /*
+ * DSACK - add SACK block for the dropped (duplicate) range only
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ tcp_update_sack_list(tp, th->th_seq, th->th_seq + todrop);
+ /*
+ * ACK now, as the next in-sequence segment
+ * will clear the DSACK block again
+ */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ *drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+ /*
+ * If segment ends after window, drop trailing data (and PUSH and
+ * FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment and
+ * ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ return (1);
+ }
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH | TH_FIN);
+ }
+ *thf = thflags;
+ *tlenp = tlen;
+ return (0);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it (reset path), 0 = the TCB is
+ * still locked and valid. The mbuf is consumed either way.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+ /*
+ * Generate an ACK dropping incoming segment if it occupies sequence
+ * space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all paths to this
+ * code happen after packets containing RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the segment
+ * we received passes the SYN-RECEIVED ACK test. If it fails send a
+ * RST. This breaks the loop in the "LAND" DoS attack, and also
+ * prevents an ACK storm between two listening ports that have been
+ * sent forged SYN segments, each with the source address of the
+ * other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ *ret_val = 1;
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return;
+ } else
+ *ret_val = 0;
+ tp->t_flags |= TF_ACKNOW; /* only the flag is set here; no ACK is sent from this function */
+ if (m)
+ m_freem(m);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+
+ /*
+ * Drop space held by incoming segment and return.
+ * A non-NULL tp has its inpcb write lock released first.
+ */
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ if (m)
+ m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT.
+ * - If RST is in window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we test against
+ * last_ack_sent as well as rcv_nxt. Note 2: we handle special case
+ * of closed window, not covered by the RFC. Returns 1 if we dropped.
+ */
+ int dropped = 0;
+
+ if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ (tp->last_ack_sent == th->th_seq) ||
+ (tp->rcv_nxt == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ dropped = 1;
+ ctf_do_drop(m, tp); /* frees m; unlocks only if tcp_close() handed a tp back */
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ }
+ } else {
+ m_freem(m); /* RST outside the acceptable range: silently discard */
+ }
+ return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ *ret_val = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL; /* presumably tcp_respond() took ownership of m - confirm */
+ *ret_val = 0;
+ ctf_do_drop(m, NULL); /* no-op with both arguments NULL: nothing freed, lock kept */
+ }
+}
+
+/*
+ * ctf_ts_check returns 1 for you should not proceed, the state
+ * machine should return. It places in ret_val what should
+ * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+ int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ *ret_val = 0;
+ if (tlen) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ } else {
+ ctf_do_drop(m, NULL); /* NULL tp: free the mbuf but keep the lock */
+ }
+ return (1);
+ }
+ return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+ int32_t win;
+
+ /*
+ * Calculate amount of space in receive window and store it in
+ * tp->rcv_wnd. Receive window is amount of space in rcv queue,
+ * but not less than advertised window (rcv_adv - rcv_nxt).
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+
+ if (tp->t_inpcb) { /* mark the connection to be dropped with ETIMEDOUT */
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ }
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb); /* NOTE(review): t_inpcb is NULL-checked above but used unconditionally here - confirm */
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+ int optlen;
+
+ if (tp->t_flags & TF_NOOPT) /* no options are sent: the full t_maxseg is usable */
+ return (tp->t_maxseg);
+
+ /*
+ * Here we have a simplified code from tcp_addoptions(),
+ * without a proper loop, and having most of paddings hardcoded.
+ * We only consider fixed options that we would send every
+ * time I.e. SACK is not considered.
+ * PAD() rounds an option length up to a multiple of 4 bytes.
+ */
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN); /* cap at TCP_MAXOLEN */
+ return (tp->t_maxseg - optlen); /* t_maxseg less the fixed option bytes */
+}
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) { /* log only when logging is on for this tcb */
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex8 = num_sack_blks;
+ if (num_sack_blks > 0) { /* at most the first four blocks are recorded */
+ log.u_bbr.flex1 = sack_blocks[0].start;
+ log.u_bbr.flex2 = sack_blocks[0].end;
+ }
+ if (num_sack_blks > 1) {
+ log.u_bbr.flex3 = sack_blocks[1].start;
+ log.u_bbr.flex4 = sack_blocks[1].end;
+ }
+ if (num_sack_blks > 2) {
+ log.u_bbr.flex5 = sack_blocks[2].start;
+ log.u_bbr.flex6 = sack_blocks[2].end;
+ }
+ if (num_sack_blks > 3) { /* fourth block reuses the applimited/pkts_out fields */
+ log.u_bbr.applimited = sack_blocks[3].start;
+ log.u_bbr.pkts_out = sack_blocks[3].end;
+ }
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_SACK_FILTER_RES, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+ /*
+ * Given a count, decay it by a set percentage. The
+ * percentage is in tenths of a percent, i.e. 100% = 1000,
+ * 19.3% = 193. Returns count minus that fraction of count.
+ */
+ uint64_t perc_count, decay_per;
+ uint32_t decayed_count;
+ if (decay > 1000) {
+ /* More than 100% is not valid; return the count unchanged */
+ return (count);
+ }
+ perc_count = count; /* 64-bit so count * decay cannot overflow */
+ decay_per = decay;
+ perc_count *= decay_per;
+ perc_count /= 1000;
+ /*
+ * So now perc_count holds the
+ * amount to remove (rounded down).
+ */
+ decayed_count = count - (uint32_t)perc_count;
+ return (decayed_count);
+}
Index: netinet/tcp_var.h
===================================================================
--- netinet/tcp_var.h
+++ netinet/tcp_var.h
@@ -102,7 +102,8 @@
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
- bits_spare : 4;
+ t_fin_is_rst: 1, /* Are fin's treated as resets */
+ bits_spare : 3;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -271,6 +272,11 @@
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
+ int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
+ int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, struct timeval *);
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
Index: sys/mbuf.h
===================================================================
--- sys/mbuf.h
+++ sys/mbuf.h
@@ -407,6 +407,7 @@
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
hw-stamped on port (useful for IEEE 1588
and 802.1AS) */
+#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc); XXX(review): same bit value as M_PROTO1 on the next line - confirm this is not a collision */
#define M_PROTO1 0x00001000 /* protocol-specific */
#define M_PROTO2 0x00002000 /* protocol-specific */
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Thu, Dec 25, 9:43 PM (1 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27263579
Default Alt Text
D20834.id59366.diff (99 KB)
Attached To
Mode
D20834: First step in bring hpts and infrastructure up for bbr v1 update part1
Attached
Detach File
Event Timeline
Log In to Comment