D20834.diff
Index: head/sys/modules/tcp/rack/Makefile
===================================================================
--- head/sys/modules/tcp/rack/Makefile
+++ head/sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -759,7 +759,9 @@
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
-
+#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */
+#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */
+#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */
/*
* Flags passed to in_pcblookup*() functions.
*/
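The three inp_flags2 bits added above are the hooks the LRO code and the pacer use to decide whether input may be queued. A minimal sketch of how a transport stack might set them (illustration only, not part of the patch; the example_ helper names are hypothetical, only the flag names and the inp_flags2 usage come from this diff):

/* Sketch: advertising mbuf-queue support when a stack takes over a connection. */
static void
example_stack_attach(struct tcpcb *tp)
{
	/* Tell LRO this stack can consume packets queued to the connection later. */
	tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ;
}

/* Sketch: hints set while a pacing/rack timer is running. */
static void
example_start_pacer_timer(struct tcpcb *tp, int rack_timer_running)
{
	/* Output is already scheduled on the pacer, so input can simply be queued. */
	tp->t_inpcb->inp_flags2 |= INP_MBUF_QUEUE_READY;
	if (rack_timer_running)
		/* SACKs are expected anyway; no need to be woken up for them. */
		tp->t_inpcb->inp_flags2 |= INP_DONT_SACK_QUEUE;
}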
Index: head/sys/netinet/tcp.h
===================================================================
--- head/sys/netinet/tcp.h
+++ head/sys/netinet/tcp.h
@@ -201,9 +201,8 @@
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
-#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
-#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_RECFORCE 1068 /* Enter recovery, force out a segment disregarding pacer (no longer valid) */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
@@ -211,14 +210,18 @@
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
-#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
-#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
-#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3; morphs to TSLIMITS in >= 2.3 */
+#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */
+#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */
+#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */
+#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */
+#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
-#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3; morphs to ALGORITHM in >= 2.3 */
+#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
@@ -227,17 +230,27 @@
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
-#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */
+#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 it's the GP increase */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
+#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
+#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */
+#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */
+#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */
+#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */
+#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */
+#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */
+#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */
+#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */
+#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */
/* Start of reserved space for third-party user-settable options. */
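For illustration only (not part of the diff): an application exercises these numbers through the normal socket-option path once the matching stack is active on the socket; everything below other than the option name is an assumption.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Sketch: enable hardware pacing on a socket that is already using the
 * BBR stack (selected elsewhere, e.g. via TCP_FUNCTION_BLK). */
static int
enable_hw_pacing(int fd)
{
	int on = 1;

	return (setsockopt(fd, IPPROTO_TCP, TCP_BBR_HDWR_PACE, &on, sizeof(on)));
}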
Index: head/sys/netinet/tcp_hpts.h
===================================================================
--- head/sys/netinet/tcp_hpts.h
+++ head/sys/netinet/tcp_hpts.h
@@ -45,112 +45,80 @@
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
-#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
-#define DEFAULT_HPTS_LOG 3072
-/*
- * Log flags consist of
- * 7f 7f 1 1 bits
- * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
- *
- * So for example cpu 10, number 10 would with
- * input active would show up as:
- * p_flags = 0001010 0001010 1 0
- * <or>
- * p_flags = 0x142a
- */
-#define HPTS_HPTS_ACTIVE 0x01
-#define HPTS_INPUT_ACTIVE 0x02
-
-#define HPTSLOG_IMMEDIATE 1
-#define HPTSLOG_INSERT_NORMAL 2
-#define HPTSLOG_INSERT_SLEEPER 3
-#define HPTSLOG_SLEEP_AFTER 4
-#define HPTSLOG_SLEEP_BEFORE 5
-#define HPTSLOG_INSERTED 6
-#define HPTSLOG_WAKEUP_HPTS 7
-#define HPTSLOG_SETTORUN 8
-#define HPTSLOG_HPTSI 9
-#define HPTSLOG_TOLONG 10
-#define HPTSLOG_AWAKENS 11
-#define HPTSLOG_TIMESOUT 12
-#define HPTSLOG_SLEEPSET 13
-#define HPTSLOG_WAKEUP_INPUT 14
-#define HPTSLOG_RESCHEDULE 15
-#define HPTSLOG_AWAKE 16
-#define HPTSLOG_INP_DONE 17
-
-struct hpts_log {
- struct inpcb *inp;
- int32_t event;
- uint32_t cts;
- int32_t line;
- uint32_t ticknow;
- uint32_t t_paceslot;
- uint32_t t_hptsreq;
- uint32_t p_curtick;
- uint32_t p_prevtick;
- uint32_t slot_req;
- uint32_t p_on_queue_cnt;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t p_hpts_sleep_time;
- uint16_t p_flags;
- uint8_t p_onhpts;
- uint8_t p_oninput;
- uint8_t is_notempty;
-};
-
struct hpts_diag {
- uint32_t p_hpts_active;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t slot_req;
- uint32_t inp_hptsslot;
- uint32_t slot_now;
- uint32_t have_slept;
- uint32_t hpts_sleep_time;
- uint32_t yet_to_sleep;
- uint32_t need_new_to;
- int32_t co_ret;
- uint8_t p_on_min_sleep;
+ uint32_t p_hpts_active; /* bbr->flex7 x */
+ uint32_t p_nxt_slot; /* bbr->flex1 x */
+ uint32_t p_cur_slot; /* bbr->flex2 x */
+ uint32_t p_prev_slot; /* bbr->delivered */
+ uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t slot_req; /* bbr->flex3 x */
+ uint32_t inp_hptsslot; /* bbr->flex4 x */
+ uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t have_slept; /* bbr->epoch x */
+ uint32_t hpts_sleep_time; /* bbr->applimited x */
+ uint32_t yet_to_sleep; /* bbr->lt_epoch x */
+ uint32_t need_new_to; /* bbr->flex6 x */
+ uint32_t wheel_tick; /* bbr->bw_inuse x */
+ uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_cts; /* bbr->rttProp x */
+ int32_t co_ret; /* bbr->pkts_out x */
+ uint32_t p_curtick; /* upper bbr->cur_del_rate */
+ uint32_t p_lasttick; /* lower bbr->cur_del_rate */
+ uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
+/* Magic flags to tell what's cooking on the pacing wheel */
+#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */
+#define PACE_TMR_RACK 0x02 /* RACK timer running */
+#define PACE_TMR_TLP 0x04 /* TLP timer running */
+#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
+#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
+#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
+#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+
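/*
 * Illustrative note (not part of the patch): a stack typically keeps the
 * bits above in its own pacing state (for example a field such as
 * rc_hpts_flags, name assumed here) and tests them roughly as:
 *
 *	if (flags & PACE_PKT_OUTPUT)
 *		-> a paced tcp_output() call is already scheduled;
 *	else if (flags & PACE_TMR_MASK)
 *		-> one of the protocol timers is on the wheel instead.
 */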
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
- uint32_t p_hpts_active; /* Flag that says hpts is awake */
- uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
- uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
+ uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
* slots that the hpts is running on. */
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t enobuf_cnt;
- uint16_t p_log_at;
+ uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
- p_log_wrapped :1, /* boolean */
- p_on_min_sleep:1; /* boolean */
- uint8_t p_fill;
+ p_on_min_sleep:1, /* boolean */
+ p_avail:6;
+ uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
struct hptsh p_input; /* For the tcp-input runner */
/* Hptsi wheel */
struct hptsh *p_hptss;
- struct hpts_log *p_log;
- uint32_t p_logsize;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
+ uint32_t overidden_sleep; /* what was overridden by min-sleep, for logging */
+ uint32_t saved_lasttick; /* for logging */
+ uint32_t saved_curtick; /* for logging */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
uint32_t p_delayed_by; /* How much were we delayed by */
/* Cache line 0x80 */
struct sysctl_ctx_list hpts_ctx;
@@ -236,13 +204,9 @@
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
int
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
-#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+__tcp_queue_to_input(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
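A small worked example of the slot conversions changed above (illustration only, not part of the diff); the wheel runs on 10 usec ticks, so the macros round a request up to whole slots, and HPTS_MS_TO_SLOTS() now adds one extra slot:

int a = HPTS_MS_TO_SLOTS(5);	/* (5 * 100) + 1 = 501 slots, ~5.01 ms  */
int b = HPTS_USEC_TO_SLOTS(25);	/* (25 + 9) / 10 = 3 slots, i.e. 30 usec */
int c = HPTS_USEC_TO_SLOTS(30);	/* (30 + 9) / 10 = 3 slots, exact fit   */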
Index: head/sys/netinet/tcp_hpts.c
===================================================================
--- head/sys/netinet/tcp_hpts.c
+++ head/sys/netinet/tcp_hpts.c
@@ -37,7 +37,7 @@
* pacing packets out onto the wire. It can be used in two ways
* by a given TCP stack (and those two methods can be used simultaneously).
*
- * First, and probably the main thing its used by Rack and BBR for, it can
+ * First, and probably the main thing it's used for by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
* itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
@@ -59,42 +59,57 @@
* to prevent output processing until the time alotted has gone by.
* Of course this is a bare bones example and the stack will probably
* have more consideration then just the above.
- *
- * Now the tcp_hpts system will call tcp_output in one of two forms,
- * it will first check to see if the stack as defined a
- * tfb_tcp_output_wtime() function, if so that is the routine it
- * will call, if that function is not defined then it will call the
- * tfb_tcp_output() function. The only difference between these
- * two calls is that the former passes the time in to the function
- * so the function does not have to access the time (which tcp_hpts
- * already has). What these functions do is of course totally up
- * to the individual tcp stack.
- *
+ *
* Now the second function (actually two functions I guess :D)
* the tcp_hpts system provides is the ability to either abort
- * a connection (later) or process input on a connection.
- * Why would you want to do this? To keep processor locality.
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality
+ * and/or not have to worry about untangling any recursive
+ * locks. The input function now is hooked to the new LRO
+ * system as well.
*
- * So in order to use the input redirection function the
- * stack changes its tcp_do_segment() routine to instead
- * of process the data call the function:
+ * In order to use the input redirection function the
+ * tcp stack must define an input function for
+ * tfb_do_queued_segments(). This function understands
+ * how to dequeue an array of packets that were input and
+ * knows how to call the correct processing routine.
*
- * tcp_queue_pkt_to_input()
- *
- * You will note that the arguments to this function look
- * a lot like tcp_do_segments's arguments. This function
- * will assure that the tcp_hpts system will
- * call the functions tfb_tcp_hpts_do_segment() from the
- * correct CPU. Note that multiple calls can get pushed
- * into the tcp_hpts system this will be indicated by
- * the next to last argument to tfb_tcp_hpts_do_segment()
- * (nxt_pkt). If nxt_pkt is a 1 then another packet is
- * coming. If nxt_pkt is a 0 then this is the last call
- * that the tcp_hpts system has available for the tcp stack.
+ * Locking here is important as well, so most likely the
+ * stack will need to define tfb_do_segment_nounlock(),
+ * splitting tfb_do_segment() into two parts: a main processing
+ * part that does not unlock the INP and returns a value of 1 or 0.
+ * It returns 0 if all is well and the lock was not released. It
+ * returns 1 if we had to destroy the TCB (a reset received etc).
+ * The remains of tfb_do_segment() then become just a simple call
+ * to the tfb_do_segment_nounlock() function, checking the return
+ * code and possibly unlocking.
*
- * The other point of the input system is to be able to safely
- * drop a tcp connection without worrying about the recursive
- * locking that may be occuring on the INP_WLOCK. So if
+ * The stack must also set the flag on the INP saying that it supports
+ * this feature, i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
+ * this flag as well and will queue packets when it is set.
+ * There are other flags as well, INP_MBUF_QUEUE_READY and
+ * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
+ * that we are in the pacer for output so there is no
+ * need to wake up the hpts system to get immediate
+ * input. The second tells the LRO code that it is okay,
+ * if a SACK arrives, to still defer input and let
+ * the current hpts timer run (this is usually set when
+ * a rack timer is up so we know SACKs are happening
+ * on the connection already and don't want to wake up yet).
+ *
+ * There is a common function within the rack_bbr_common code,
+ * i.e. ctf_do_queued_segments(). This function
+ * knows how to take the input queue of packets from
+ * tp->t_in_pkt and process them, digging out
+ * all the arguments, calling any bpf tap and
+ * calling into tfb_do_segment_nounlock(). The common
+ * function (ctf_do_queued_segments()) requires that
+ * you have defined the tfb_do_segment_nounlock() as
+ * described above.
+ *
+ * The second feature of the input side of hpts is the
+ * dropping of a connection. This is due to the way that
+ * locking may have occurred on the INP_WLOCK. So if
* a stack wants to drop a connection it calls:
*
* tcp_set_inp_to_drop(tp, ETIMEDOUT)
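A minimal sketch of the tfb_do_segment_nounlock() split described in the comment above (illustration only, not part of the patch; the example_ names and the exact parameter list are assumptions, the 0/1 return convention comes from the comment):

/*
 * Sketch of the wrapper pattern: the "nounlock" variant does the real
 * processing with the INP write lock held and returns 1 only if it
 * destroyed the TCB (and therefore already released the lock).
 */
static int
example_do_segment_nounlock(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen,
    uint8_t iptos, int nxt_pkt, struct timeval *tv)
{
	/* ... full segment processing ... */
	return (0);		/* 0 = TCB alive, lock still held */
}

static void
example_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
	struct timeval tv;

	microuptime(&tv);
	if (example_do_segment_nounlock(m, th, so, tp, drop_hdrlen, tlen,
	    iptos, 0, &tv) == 0)
		INP_WUNLOCK(tp->t_inpcb);	/* only unlock if the TCB survived */
}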
@@ -156,6 +171,7 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_log_buf.h>
#ifdef tcpdebug
#include <netinet/tcp_debug.h>
@@ -168,24 +184,19 @@
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-#include <net/netisr.h>
-#include <net/rss_config.h>
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
-
-TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
-
static struct tcp_hptsi tcp_pace;
+static int hpts_does_tp_logging = 0;
static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
@@ -204,8 +215,6 @@
} \
} while (0)
-static int32_t logging_on = 0;
-static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;
struct hpts_domain_info {
@@ -219,44 +228,75 @@
&tcp_hpts_precision, 120,
"Value for PRE() precision of callout");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
- &logging_on, 0,
- "Turn on logging if compiled in");
+counter_u64_t hpts_hopelessly_behind;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+ &hpts_hopelessly_behind,
+ "Number of times hpts could not catch up and was behind hopelessly");
+
counter_u64_t hpts_loops;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
&hpts_loops, "Number of times hpts had to loop to catch up");
+
counter_u64_t back_tosleep;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
-static int32_t in_newts_every_tcb = 0;
+counter_u64_t combined_wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
- &in_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for input");
-static int32_t in_ts_percision = 0;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+ &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
- &in_ts_percision, 0,
- "Do we use percise timestamp for clients on input");
-static int32_t out_newts_every_tcb = 0;
+counter_u64_t wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
- &out_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for output");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+ &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
+
static int32_t out_ts_percision = 0;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
&out_ts_percision, 0,
"Do we use a percise timestamp for every output cts");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &hpts_does_tp_logging, 0,
+ "Do we add to any tp that has logging on pacer logs");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+static int32_t max_pacer_loops = 10;
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
+ &max_pacer_loops, 10,
+ "What is the maximum number of times the pacer will loop trying to catch up");
+
+#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
+
+static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
+
+
+static int
+sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = hpts_sleep_max;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ (new > HPTS_MAX_SLEEP_ALLOWED))
+ error = EINVAL;
+ else
+ hpts_sleep_max = new;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
+ CTLTYPE_UINT | CTLFLAG_RW,
&hpts_sleep_max, 0,
- "The maximum time the hpts will sleep <1 - 254>");
+ &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
+ "Maximum time hpts will sleep");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
&tcp_min_hptsi_time, 0,
@@ -267,55 +307,35 @@
"Do we have the callout call directly to the hpts?");
static void
-__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
- uint32_t ticknow, int32_t line)
+tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
+ int ticks_to_run, int idx)
{
- struct hpts_log *pl;
-
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_log == NULL)
- return;
- pl = &hpts->p_log[hpts->p_log_at];
- hpts->p_log_at++;
- if (hpts->p_log_at >= hpts->p_logsize) {
- hpts->p_log_at = 0;
- hpts->p_log_wrapped = 1;
- }
- pl->inp = inp;
- if (inp) {
- pl->t_paceslot = inp->inp_hptsslot;
- pl->t_hptsreq = inp->inp_hpts_request;
- pl->p_onhpts = inp->inp_in_hpts;
- pl->p_oninput = inp->inp_in_input;
- } else {
- pl->t_paceslot = 0;
- pl->t_hptsreq = 0;
- pl->p_onhpts = 0;
- pl->p_oninput = 0;
- }
- pl->is_notempty = 1;
- pl->event = event;
- pl->line = line;
- pl->cts = tcp_get_usecs(NULL);
- pl->p_curtick = hpts->p_curtick;
- pl->p_prevtick = hpts->p_prevtick;
- pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
- pl->ticknow = ticknow;
- pl->slot_req = slot;
- pl->p_nxt_slot = hpts->p_nxt_slot;
- pl->p_cur_slot = hpts->p_cur_slot;
- pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
- pl->p_flags = (hpts->p_cpu & 0x7f);
- pl->p_flags <<= 7;
- pl->p_flags |= (hpts->p_num & 0x7f);
- pl->p_flags <<= 2;
- if (hpts->p_hpts_active) {
- pl->p_flags |= HPTS_HPTS_ACTIVE;
- }
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.use_lt_bw = 1;
+ log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
}
-#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
-
static void
hpts_timeout_swi(void *arg)
{
@@ -347,12 +367,6 @@
/* We are not on the hpts? */
panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
}
- if (TAILQ_EMPTY(head) &&
- (hpts->p_on_queue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p hpts bucket empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_queue_cnt);
- }
#endif
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
@@ -456,58 +470,13 @@
in_pcbref(inp);
}
-static int
-sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
-{
- struct tcp_hpts_entry *hpts;
- size_t sz;
- int32_t logging_was, i;
- int32_t error = 0;
-
- /*
- * HACK: Turn off logging so no locks are required this really needs
- * a memory barrier :)
- */
- logging_was = logging_on;
- logging_on = 0;
- if (!req->oldptr) {
- /* How much? */
- sz = 0;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- sz += (sizeof(struct hpts_log) * hpts->p_logsize);
- }
- error = SYSCTL_OUT(req, 0, sz);
- } else {
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- if (hpts->p_log_wrapped)
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- else
- sz = (sizeof(struct hpts_log) * hpts->p_log_at);
- error = SYSCTL_OUT(req, hpts->p_log, sz);
- }
- }
- logging_on = logging_was;
- return error;
-}
-
-SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-
-
static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -515,10 +484,9 @@
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -648,8 +616,8 @@
* Valid values in the flags are
* HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
* HPTS_REMOVE_INPUT - remove from the input of the hpts.
- * Note that you can or both values together and get two
- * actions.
+ * Note that you can use one or both values together
+ * and get two actions.
*/
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
@@ -670,53 +638,198 @@
}
static inline int
-hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
- return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+ /*
+ * Given a slot on the wheel, what slot
+ * is that plus ticks out?
+ */
+ KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
+ return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}
+static inline int
+tick_to_wheel(uint32_t cts_in_wticks)
+{
+ /*
+ * Given a timestamp in wheel ticks (10usec inc's)
+ * map it to our limited space wheel.
+ */
+ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+}
+
+static inline int
+hpts_ticks_diff(int prev_tick, int tick_now)
+{
+ /*
+ * Given two ticks that are someplace
+ * on our wheel, how far apart are they?
+ */
+ if (tick_now > prev_tick)
+ return (tick_now - prev_tick);
+ else if (tick_now == prev_tick)
+ /*
+ * Special case, same means we can go all of our
+ * wheel less one slot.
+ */
+ return (NUM_OF_HPTSI_SLOTS - 1);
+ else
+ return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+}
+
+/*
+ * Given a tick on the wheel that is the current time
+ * mapped to the wheel (wheel_tick), what is the maximum
+ * distance forward that can be obtained without
+ * wrapping past either prev_tick or running_tick
+ * depending on the hpts state? Also if passed
+ * a uint32_t *, fill it with the tick location.
+ *
+ * Note if you do not give this function the current
+ * time (that you think it is) mapped to the wheel
+ * then the results will not be what you expect and
+ * could lead to invalid inserts.
+ */
+static inline int32_t
+max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+{
+ uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+
+ if ((hpts->p_hpts_active == 1) &&
+ (hpts->p_wheel_complete == 0)) {
+ end_tick = hpts->p_runningtick;
+ /* Back up one tick */
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ } else {
+ /*
+ * For the case where we are
+ * not active, or we have
+ * completed the pass over
+ * the wheel, we can use the
+ * prev tick and subtract one from it. This puts us
+ * as far out as possible on the wheel.
+ */
+ end_tick = hpts->p_prev_slot;
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ /*
+ * Now we have close to the full wheel left minus the
+ * time it has been since the pacer went to sleep. Note
+ * that wheel_tick, passed in, should be the current time
+ * from the perspective of the caller, mapped to the wheel.
+ */
+ if (hpts->p_prev_slot != wheel_tick)
+ dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ else
+ dis_to_travel = 1;
+ /*
+ * dis_to_travel in this case is the space from when the
+ * pacer stopped (p_prev_slot) and where our wheel_tick
+ * is now. To know how many slots we can put it in we
+ * subtract from the wheel size. We would not want
+ * to place something after p_prev_slot or it will
+ * get run too soon.
+ */
+ return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
+ }
+ /*
+ * So how many slots are open between p_runningtick -> p_cur_slot?
+ * That is what is currently unavailable for insertion. Special
+ * case when we are at the last slot, this gets 1, so that
+ * the answer to how many slots are available is all but 1.
+ */
+ if (hpts->p_runningtick == hpts->p_cur_slot)
+ dis_to_travel = 1;
+ else
+ dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ /*
+ * How long has the pacer been running?
+ */
+ if (hpts->p_cur_slot != wheel_tick) {
+ /* The pacer is a bit late */
+ pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ } else {
+ /* The pacer is right on time, now == pacers start time */
+ pacer_to_now = 0;
+ }
+ /*
+ * To get the number left we can insert into we simply
+ * subtract the distance the pacer has to run from how
+ * many slots there are.
+ */
+ avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
+ /*
+ * Now how many of those will we eat due to the pacer's
+ * start time (p_cur_slot) being behind the
+ * real time (wheel_tick)?
+ */
+ if (avail_on_wheel <= pacer_to_now) {
+ /*
+ * Wheel wrap, we can't fit on the wheel, that
+ * is unusual; the system must be way overloaded!
+ * Insert into the assured tick, and return special
+ * "0".
+ */
+ counter_u64_add(combined_wheel_wrap, 1);
+ *target_tick = hpts->p_nxt_slot;
+ return (0);
+ } else {
+ /*
+ * We know how many slots are open
+ * on the wheel (the reverse of what
+ * is left to run). Take away the time
+ * the pacer started to now (wheel_tick)
+ * and that tells you how many slots are
+ * open that can be inserted into that won't
+ * be touched by the pacer until later.
+ */
+ return (avail_on_wheel - pacer_to_now);
+ }
+}
+
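/*
 * Worked example for max_ticks_available() (illustration only, not part
 * of the patch; the 102400-slot wheel size is an assumption derived from
 * the 1.024 second / 10 usec figures in the surrounding comments):
 * the pacer is asleep with p_prev_slot = 100 and the caller's current
 * time maps to wheel_tick = 150.  Then dis_to_travel = 150 - 100 = 50,
 * *target_tick = 99, and the function returns 102400 - 50 = 102350.
 * A request for slot = 500 therefore fits, and tcp_hpts_insert_locked()
 * places the inp at (150 + 500) % 102400 = 650.
 */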
static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
- int32_t need_wake = 0;
- uint32_t ticknow = 0;
-
+ uint32_t need_wake = 0;
+
HPTS_MTX_ASSERT(hpts);
if (inp->inp_in_hpts == 0) {
/* Ok we need to set it on the hpts in the current slot */
- if (hpts->p_hpts_active == 0) {
- /* A sleeping hpts we want in next slot to run */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
- hpts_tick(hpts, 1));
- }
- inp->inp_hptsslot = hpts_tick(hpts, 1);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
- }
- need_wake = 1;
+ inp->inp_hpts_request = 0;
+ if ((hpts->p_hpts_active == 0) ||
+ (hpts->p_wheel_complete)) {
+ /*
+ * A sleeping hpts we want in next slot to run;
+ * note that in this state p_prev_slot == p_cur_slot
+ */
+ inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
+ need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
/*
+ * The hpts system is running and the caller
+ * was awoken by the hpts system.
* We can't allow you to go into the same slot we
- * are in. We must put you out.
+ * are in (we don't want a loop :-D).
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_cur_slot;
+ inp->inp_hptsslot = hpts->p_runningtick;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
- }
if (need_wake) {
/*
* Activate the hpts if it is sleeping and its
* timeout is not 1.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
}
@@ -737,141 +850,129 @@
return (ret);
}
+#ifdef INVARIANTS
static void
-tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
- struct hpts_diag *diag, int32_t noref)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
- int32_t need_new_to = 0;
- int32_t need_wakeup = 0;
- uint32_t largest_slot;
- uint32_t ticknow = 0;
- uint32_t slot_calc;
+ /*
+ * Sanity checks for the pacer with invariants
+ * on insert.
+ */
+ if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
+ panic("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot);
+ if ((hpts->p_hpts_active) &&
+ (hpts->p_wheel_complete == 0)) {
+ /*
+ * If the pacer is processing an arc
+ * of the wheel, we need to make
+ * sure we are not inserting within
+ * that arc.
+ */
+ int distance, yet_to_run;
+ distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
+ if (hpts->p_runningtick != hpts->p_cur_slot)
+ yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ else
+ yet_to_run = 0; /* processing last slot */
+ if (yet_to_run > distance) {
+ panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningtick, hpts->p_cur_slot);
+ }
+ }
+}
+#endif
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
+ struct hpts_diag *diag, struct timeval *tv)
+{
+ uint32_t need_new_to = 0;
+ uint32_t wheel_cts, last_tick;
+ int32_t wheel_tick, maxticks;
+ int8_t need_wakeup = 0;
+
HPTS_MTX_ASSERT(hpts);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_prev_slot = hpts->p_prev_slot;
+ diag->p_runningtick = hpts->p_runningtick;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
+ diag->p_curtick = hpts->p_curtick;
+ diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((inp->inp_in_hpts == 0) || noref) {
- inp->inp_hpts_request = slot;
+ if (inp->inp_in_hpts == 0) {
if (slot == 0) {
/* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
return;
}
- if (hpts->p_hpts_active) {
- /*
- * Its slot - 1 since nxt_slot is the next tick that
- * will go off since the hpts is awake
- */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
- }
- /*
- * We want to make sure that we don't place a inp in
- * the range of p_cur_slot <-> p_nxt_slot. If we
- * take from p_nxt_slot to the end, plus p_cur_slot
- * and then take away 2, we will know how many is
- * the max slots we can use.
- */
- if (hpts->p_nxt_slot > hpts->p_cur_slot) {
- /*
- * Non-wrap case nxt_slot <-> cur_slot we
- * don't want to land in. So the diff gives
- * us what is taken away from the number of
- * slots.
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_tick = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
+ if (diag) {
+ diag->wheel_tick = wheel_tick;
+ diag->maxticks = maxticks;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxticks == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+ * probably does not hurt because if this is
+ * true the pacer is so far behind that we will
+ * be > 1 second late calling anyway.
*/
- largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
- } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- } else {
- /*
- * Wrap case so the diff gives us the number
- * of slots that we can land in.
- */
- largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ slot--;
}
- /*
- * We take away two so we never have a problem (20
- * usec's) out of 1024000 usecs
- */
- largest_slot -= 2;
- if (inp->inp_hpts_request > largest_slot) {
- /*
- * Restrict max jump of slots and remember
- * leftover
- */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* This one will run when we hit it */
- inp->inp_hpts_request = 0;
- }
- if (hpts->p_nxt_slot == hpts->p_cur_slot)
- slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
- else
- slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
- if (slot_calc == hpts->p_cur_slot) {
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request = slot;
+ } else if (maxticks >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxticks;
+ inp->inp_hptsslot = last_tick;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
#ifdef INVARIANTS
- /* TSNH */
- panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
- hpts, slot_calc, slot, largest_slot);
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
- if (slot_calc)
- slot_calc--;
- else
- slot_calc = NUM_OF_HPTSI_SLOTS - 1;
- }
- inp->inp_hptsslot = slot_calc;
- if (diag) {
- diag->inp_hptsslot = inp->inp_hptsslot;
- }
- } else {
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
/*
- * The hpts is sleeping, we need to figure out where
+ * The hpts is sleeping and not on a minimum
+ * sleep time, we need to figure out where
* it will wake up at and if we need to reschedule
* its time-out.
*/
uint32_t have_slept, yet_to_sleep;
- uint32_t slot_now;
- struct timeval tv;
- ticknow = tcp_gethptstick(&tv);
- slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
- /*
- * The user wants to be inserted at (slot_now +
- * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
- */
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- if (inp->inp_hpts_request > largest_slot) {
- /* Adjust the residual in inp_hpts_request */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* No residual it all fits */
- inp->inp_hpts_request = 0;
- }
- inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
- if (diag) {
- diag->slot_now = slot_now;
- diag->inp_hptsslot = inp->inp_hptsslot;
- diag->p_on_min_sleep = hpts->p_on_min_sleep;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
- }
/* Now do we need to restart the hpts's timer? */
- if (TSTMP_GT(ticknow, hpts->p_curtick))
- have_slept = ticknow - hpts->p_curtick;
- else
- have_slept = 0;
- if (have_slept < hpts->p_hpts_sleep_time) {
- /* This should be what happens */
+ have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (have_slept < hpts->p_hpts_sleep_time)
yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- } else {
+ else {
/* We are over-due */
yet_to_sleep = 0;
need_wakeup = 1;
@@ -879,29 +980,22 @@
if (diag) {
diag->have_slept = have_slept;
diag->yet_to_sleep = yet_to_sleep;
- diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * We need to reschedule the hptss time-out.
+ * We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
need_new_to = slot * HPTS_TICKS_PER_USEC;
}
}
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
- }
/*
* Now how far is the hpts sleeping to? if active is 1, its
* up and ticking we do nothing, otherwise we may need to
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
if (diag) {
@@ -944,9 +1038,10 @@
}
uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
struct tcp_hpts_entry *hpts;
- uint32_t slot_on, cts;
+ uint32_t slot_on;
struct timeval tv;
/*
@@ -956,12 +1051,8 @@
*/
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
- if (in_ts_percision)
- microuptime(&tv);
- else
- getmicrouptime(&tv);
- cts = tcp_tv_to_usectick(&tv);
- tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ microuptime(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
slot_on = hpts->p_nxt_slot;
mtx_unlock(&hpts->p_mtx);
return (slot_on);
@@ -971,7 +1062,6 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
-
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -986,9 +1076,6 @@
/*
* Activate the hpts if it is sleeping.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
- }
retval = 2;
hpts->p_direct_wake = 1;
tcp_wakeinput(hpts);
@@ -1001,36 +1088,14 @@
return (retval);
}
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
-{
- /* Setup packet for input first */
- INP_WLOCK_ASSERT(tp->t_inpcb);
- m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
- m->m_pkthdr.pace_tlen = (uint16_t) tlen;
- m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
- m->m_pkthdr.pace_tos = iptos;
- m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
- if (tp->t_in_pkt == NULL) {
- tp->t_in_pkt = m;
- tp->t_tail_pkt = m;
- } else {
- tp->t_tail_pkt->m_nextpkt = m;
- tp->t_tail_pkt = m;
- }
-}
-
-
int32_t
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
+__tcp_queue_to_input(struct inpcb *inp, int line)
+{
struct tcp_hpts_entry *hpts;
int32_t ret;
- tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
- hpts = tcp_input_lock(tp->t_inpcb);
- ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ hpts = tcp_input_lock(inp);
+ ret = __tcp_queue_to_input_locked(inp, hpts, line);
mtx_unlock(&hpts->p_mtx);
return (ret);
}
@@ -1132,6 +1197,25 @@
#endif
}
+static void
+tcp_drop_in_pkts(struct tcpcb *tp)
+{
+ struct mbuf *m, *n;
+
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+}
+
/*
* Do NOT try to optimize the processing of inp's
* by first pulling off all the inp's into a temporary
@@ -1142,7 +1226,7 @@
* but then while you were processing one of the inp's
* some other one that you switch will get a new
* packet on the different CPU. It will insert it
- * on the new hptss input list. Creating a temporary
+ * on the new hpts's input list. Creating a temporary
* link in the inp will not fix it either, since
* the other hpts will be doing the same thing and
* you will both end up using the temporary link.
@@ -1155,16 +1239,18 @@
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
- struct mbuf *m, *n;
struct tcpcb *tp;
struct inpcb *inp;
uint16_t drop_reason;
int16_t set_cpu;
uint32_t did_prefetch = 0;
- int32_t ti_locked = TI_UNLOCKED;
+ int dropped;
struct epoch_tracker et;
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
HPTS_MTX_ASSERT(hpts);
hpts_sane_input_remove(hpts, inp, 0);
@@ -1177,26 +1263,22 @@
drop_reason = inp->inp_hpts_drop_reas;
inp->inp_in_input = 0;
mtx_unlock(&hpts->p_mtx);
- CURVNET_SET(inp->inp_vnet);
- if (drop_reason) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else {
- ti_locked = TI_UNLOCKED;
- }
INP_WLOCK(inp);
+#ifdef VIMAGE
+ CURVNET_SET(inp->inp_vnet);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out:
hpts->p_inp = NULL;
- if (ti_locked == TI_RLOCKED) {
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- }
if (in_pcbrele_wlocked(inp) == 0) {
INP_WUNLOCK(inp);
}
- ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
continue;
}
@@ -1206,26 +1288,17 @@
}
if (drop_reason) {
/* This tcb is being destroyed for drop_reason */
- m = tp->t_in_pkt;
- if (m)
- n = m->m_nextpkt;
- else
- n = NULL;
- tp->t_in_pkt = NULL;
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
+ tcp_drop_in_pkts(tp);
tp = tcp_drop(tp, drop_reason);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
if (tp == NULL) {
INP_WLOCK(inp);
}
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
continue;
}
@@ -1246,220 +1319,184 @@
*/
tcp_set_hpts(inp);
}
- m = tp->t_in_pkt;
- n = NULL;
- if (m != NULL &&
- (m->m_pkthdr.pace_lock == TI_RLOCKED ||
- tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- m = tp->t_in_pkt;
- }
- if (in_newts_every_tcb) {
- if (in_ts_percision)
- microuptime(tv);
- else
- getmicrouptime(tv);
- }
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- /* Any input work to do, if so do it first */
- if ((m != NULL) && (m == tp->t_in_pkt)) {
- struct tcphdr *th;
- int32_t tlen, drop_hdrlen, nxt_pkt;
- uint8_t iptos;
-
- n = m->m_nextpkt;
- tp->t_in_pkt = tp->t_tail_pkt = NULL;
- while (m) {
- th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
- tlen = m->m_pkthdr.pace_tlen;
- drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
- iptos = m->m_pkthdr.pace_tos;
- m->m_nextpkt = NULL;
- if (n)
- nxt_pkt = 1;
- else
- nxt_pkt = 0;
- inp->inp_input_calls = 1;
- if (tp->t_fb->tfb_tcp_hpts_do_segment) {
- /* Use the hpts specific do_segment */
- (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos, nxt_pkt, tv);
- } else {
- /* Use the default do_segment */
- (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos);
- }
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- /*
- * Do segment returns unlocked we need the
- * lock again but we also need some kasserts
- * here.
- */
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
- INP_UNLOCK_ASSERT(inp);
- m = n;
- if (m)
- n = m->m_nextpkt;
- if (m != NULL &&
- m->m_pkthdr.pace_lock == TI_RLOCKED) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else
- ti_locked = TI_UNLOCKED;
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if (inp->inp_in_input)
+ tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
+ dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (dropped) {
+ /* Re-acquire the wlock so we can release the reference */
INP_WLOCK(inp);
- /*
- * Since we have an opening here we must
- * re-check if the tcb went away while we
- * were getting the lock(s).
- */
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
- goto out;
- }
- /*
- * Now that we hold the INP lock, check if
- * we need to upgrade our lock.
- */
- if (ti_locked == TI_UNLOCKED &&
- (tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- }
- } /** end while(m) */
- } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ }
+ } else if (tp->t_in_pkt) {
+ /*
+ * We reach here only if we had a
+ * stack that supported INP_SUPPORTS_MBUFQ
+ * and then somehow switched to a stack that
+ * does not. The packets are basically stranded
+ * and would hang with the connection until
+ * cleanup without this code. It's not the
+ * best way but I know of no other way to
+ * handle it, since the stack lacks the functions
+ * it needs to handle queued packets.
+ */
+ tcp_drop_in_pkts(tp);
+ }
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
INP_UNLOCK_ASSERT(inp);
- ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
- CURVNET_RESTORE();
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
}
-static int
-tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
-{
- int32_t ticks_to_run;
-
- if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
- ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
- if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
- }
- } else {
- if (hpts->p_prevtick == hpts->p_curtick) {
- /* This happens when we get woken up right away */
- return (-1);
- }
- ticks_to_run = 1;
- }
- /* Set in where we will be when we catch up */
- hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
- if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
- hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
- }
- return (ticks_to_run);
-}
-
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+tcp_hptsi(struct tcp_hpts_entry *hpts)
{
+ struct epoch_tracker et;
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error, tick_now, interum_tick;
+ int32_t ticks_to_run, i, error;
int32_t paced_cnt = 0;
+ int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
- uint32_t cts;
+ int32_t wrap_loop_cnt = 0;
int16_t set_cpu;
HPTS_MTX_ASSERT(hpts);
- hpts->p_curtick = tcp_tv_to_hptstick(ctick);
- cts = tcp_tv_to_usectick(ctick);
- memcpy(&tv, ctick, sizeof(struct timeval));
- hpts->p_cur_slot = hpts_tick(hpts, 1);
+ /* record previous info for any logging */
+ hpts->saved_lasttick = hpts->p_lasttick;
+ hpts->saved_curtick = hpts->p_curtick;
+ hpts->saved_curslot = hpts->p_cur_slot;
+ hpts->saved_prev_slot = hpts->p_prev_slot;
- /* Figure out if we had missed ticks */
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((hpts->p_on_queue_cnt == 0) ||
+ (hpts->p_lasttick == hpts->p_curtick)) {
+ /*
+ * No time has yet passed,
+ * or nothing to do.
+ */
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ goto no_run;
+ }
again:
+ hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = tcp_hpts_est_run(hpts);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
+ ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /*
+ * Wheel wrap is occurring, basically we
+ * are behind and the distance between
+ * runs has spread so much it has exceeded
+ * the time on the wheel (1.024 seconds). This
+ * is ugly and should NOT be happening. We
+ * need to run the entire wheel. We last processed
+ * p_prev_slot, so that needs to be the last slot
+ * we run. The next slot after that should be our
+ * reserved first slot for new, and then starts
+ * the running position. Now the problem is the
+ * reserved "not to yet" place does not exist
+ * and there may be inp's in there that need
+ * running. We can merge those into the
+ * first slot at the head.
+ */
+ wrap_loop_cnt++;
+ hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ /*
+ * Adjust p_cur_slot to be where we are starting from
+ * hopefully we will catch up (fat chance if something
+ * is broken this bad :( )
+ */
+ hpts->p_cur_slot = hpts->p_prev_slot;
+ /*
+ * The next slot has guys to run too, and that would
+ * be where we would normally start, so let's move them into
+ * the next slot (p_prev_slot + 2) so that we will
+ * run them; the extra 10 usecs of lateness (from being
+ * put behind) does not really matter in this situation.
+ */
+#ifdef INVARIANTS
+ /*
+ * To prevent a panic we need to update the inpslot to the
+ * new location. This is safe since it takes both the
+ * INP lock and the pacer mutex to change the inp_hptsslot.
+ */
+ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
+ inp->inp_hptsslot = hpts->p_runningtick;
+ }
+#endif
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ counter_u64_add(wheel_wrap, 1);
+ } else {
+ /*
+ * Nxt slot is always one after p_runningtick though
+ * its not used usually unless we are doing wheel wrap.
+ */
+ hpts->p_nxt_slot = hpts->p_prev_slot;
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
}
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
HPTS_MTX_ASSERT(hpts);
- /* Reset the ticks to run and time if we need too */
- interum_tick = tcp_gethptstick(&tv);
- if (interum_tick != hpts->p_curtick) {
- /* Save off the new time we execute to */
- *ctick = tv;
- hpts->p_curtick = interum_tick;
- cts = tcp_tv_to_usectick(&tv);
- hpts->p_cur_slot = hpts_tick(hpts, 1);
- ticks_to_run = tcp_hpts_est_run(hpts);
- }
- if (ticks_to_run == -1) {
- goto no_run;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
- }
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
for (i = 0; i < ticks_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any
+ * was not any (i.e. if ticks_to_run == 1, no delay).
*/
hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* For debugging */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
- }
hpts->p_inp = inp;
paced_cnt++;
- if (hpts->p_cur_slot != inp->inp_hptsslot) {
+#ifdef INVARIANTS
+ if (hpts->p_runningtick != inp->inp_hptsslot) {
panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
}
+#endif
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1467,25 +1504,36 @@
if (inp->inp_hpts_request) {
/*
* This guy is deferred out further in time
- * then our wheel had on it. Push him back
- * on the wheel.
+ * than our wheel had available on it.
+ * Push him back on the wheel or run it
+ * depending.
*/
- int32_t remaining_slots;
-
+ uint32_t maxticks, last_tick, remaining_slots;
+
remaining_slots = ticks_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
- * Keep INVARIANTS happy by clearing
- * the flag
+ * How far out can we go?
*/
- tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
+ if (maxticks >= inp->inp_hpts_request) {
+ /* We can finally place it to be processed */
+ inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hpts_request = 0;
+ } else {
+ /* Work off some more time */
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request -= maxticks;
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
continue;
}
inp->inp_hpts_request = 0;
+ /* Fall through, we will do it now */
}
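/*
 * Worked example, not part of the patch: suppose 10 slots of headroom
 * remain past p_runningtick (maxticks == 10) and this connection asked
 * to be woken 25 ticks out (inp_hpts_request == 25).  Since 25 > 10 it
 * is parked on last_tick and inp_hpts_request becomes 15, to be worked
 * off on a later pass; had it asked for 8 ticks it would have been
 * placed directly at hpts_tick(p_runningtick, 8) with the request
 * cleared, as the code above does.
 */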
/*
- * We clear the hpts flag here after dealing with
+ * We clear the hpts flag here after dealing with
* remaining slots. This way anyone looking with the
* TCB lock will see its on the hpts until just
* before we unlock.
@@ -1495,23 +1543,20 @@
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
hpts->p_inp = NULL;
continue;
}
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-out_now:
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_now:
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
hpts->p_inp = NULL;
continue;
}
@@ -1539,16 +1584,14 @@
*/
tcp_set_hpts(inp);
}
- if (out_newts_every_tcb) {
- struct timeval sv;
-
- if (out_ts_percision)
- microuptime(&sv);
- else
- getmicrouptime(&sv);
- cts = tcp_tv_to_usectick(&sv);
- }
+#ifdef VIMAGE
CURVNET_SET(inp->inp_vnet);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
+ /* Let's do any logging that we might want to */
+ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+ }
/*
* There is a hole here, we get the refcnt on the
* inp so it will still be preserved but to make
@@ -1560,19 +1603,23 @@
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- inp->inp_hpts_calls = 1;
- if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
- error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
- } else {
- error = tp->t_fb->tfb_tcp_output(tp);
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (error) {
+ /* The input killed the connection */
+ goto skip_pacing;
+ }
}
+ inp->inp_hpts_calls = 1;
+ error = tp->t_fb->tfb_tcp_output(tp);
+ inp->inp_hpts_calls = 0;
if (ninp && ninp->inp_ppcb) {
/*
* If we have a nxt inp, see if we can
@@ -1609,74 +1656,112 @@
prefetch_tp = 1;
}
INP_WUNLOCK(inp);
- INP_UNLOCK_ASSERT(inp);
+ skip_pacing:
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
+ INP_UNLOCK_ASSERT(inp);
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
hpts->p_inp = NULL;
}
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_cur_slot++;
- if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_cur_slot = 0;
+ hpts->p_runningtick++;
+ if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningtick = 0;
}
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
no_one:
HPTS_MTX_ASSERT(hpts);
- hpts->p_prevtick = hpts->p_curtick;
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
- /* Re-run any input that may be there */
- (void)tcp_gethptstick(&tv);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
- }
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
- tick_now = tcp_gethptstick(&tv);
- if (SEQ_GT(tick_now, hpts->p_prevtick)) {
- struct timeval res;
-
- /* Did we really spend a full tick or more in here? */
- timersub(&tv, ctick, &res);
- if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ if (loop_cnt > max_pacer_loops) {
+ /*
+ * Something is seriously slow: we have
+ * looped through processing the wheel,
+ * and by the time we cleared what
+ * needed to run max_pacer_loops times,
+ * we still needed to run. That means
+ * the system is hopelessly behind and
+ * can never catch up :(
+ *
+ * We will just lie to this thread and
+ * let it think p_curtick is correct.
+ * When it next awakens it will find
+ * itself further behind.
+ */
+ counter_u64_add(hpts_hopelessly_behind, 1);
+ goto no_run;
+ }
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((wrap_loop_cnt < 2) &&
+ (hpts->p_lasttick != hpts->p_curtick)) {
+ counter_u64_add(hpts_loops, 1);
+ loop_cnt++;
+ goto again;
+ }
+no_run:
+ /*
+ * Set the flag to tell any inserts that happen
+ * while we are processing input that the
+ * wheel pass is complete.
+ */
+ hpts->p_wheel_complete = 1;
+ /*
+ * Run any input that may be there that was not
+ * covered while we were running the wheel data.
+ */
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ /*
+ * Now did we spend too long running
+ * input and need to run more ticks?
+ */
+ KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+ hpts->p_prev_slot, hpts->p_cur_slot));
+ KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+ hpts->p_lasttick, hpts->p_curtick));
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ if (hpts->p_lasttick != hpts->p_curtick) {
counter_u64_add(hpts_loops, 1);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
- }
- *ctick = res;
- hpts->p_curtick = tick_now;
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
-no_run:
{
uint32_t t = 0, i, fnd = 0;
- if (hpts->p_on_queue_cnt) {
-
-
+ if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
/*
* Find next slot that is occupied and use that to
* be the sleep time.
*/
- for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
fnd = 1;
break;
@@ -1684,27 +1769,23 @@
t = (t + 1) % NUM_OF_HPTSI_SLOTS;
}
if (fnd) {
- hpts->p_hpts_sleep_time = i;
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
} else {
- counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+ panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
+ counter_u64_add(back_tosleep, 1);
hpts->p_on_queue_cnt = 0;
goto non_found;
}
- t++;
+ } else if (wrap_loop_cnt >= 2) {
+ /* Special case handling */
+ hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
} else {
- /* No one on the wheel sleep for all but 2 slots */
-non_found:
- if (hpts_sleep_max == 0)
- hpts_sleep_max = 1;
- hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
- t = 0;
+ /* No one on the wheel; sleep for all but 400 slots, or the sleep max */
+ non_found:
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
}
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
- }
}
}
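/*
 * Worked example, not part of the patch: if the first occupied slot is
 * found on the third iteration of the scan above (i == 2), then
 * p_hpts_sleep_time becomes min(3, hpts_sleep_max); the pacer thread
 * later converts that into a callout of p_hpts_sleep_time *
 * HPTS_TICKS_PER_USEC microseconds before it runs the wheel again.
 */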
@@ -1746,33 +1827,29 @@
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
/* Signaled by input */
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
callout_stop(&hpts->co);
} else {
/* Timed out */
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
mtx_unlock(&hpts->p_mtx);
return;
}
callout_deactivate(&hpts->co);
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
}
+ hpts->p_hpts_wake_scheduled = 0;
hpts->p_hpts_active = 1;
- (void)tcp_gethptstick(&tv);
- tcp_hptsi(hpts, &tv);
+ tcp_hptsi(hpts);
HPTS_MTX_ASSERT(hpts);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ hpts->overidden_sleep = tv.tv_usec;
tv.tv_usec = tcp_min_hptsi_time;
hpts->p_on_min_sleep = 1;
} else {
/* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
hpts->p_on_min_sleep = 0;
}
hpts->p_hpts_active = 0;
@@ -1809,9 +1886,11 @@
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
+ hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
hpts_loops = counter_u64_alloc(M_WAITOK);
back_tosleep = counter_u64_alloc(M_WAITOK);
-
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
@@ -1850,7 +1929,7 @@
OID_AUTO, "out_qcnt", CTLFLAG_RD,
&hpts->p_on_queue_cnt, 0,
"Count TCB's awaiting output processing");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "active", CTLFLAG_RD,
&hpts->p_hpts_active, 0,
@@ -1859,29 +1938,23 @@
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curslot", CTLFLAG_RD,
&hpts->p_cur_slot, 0,
- "What the current slot is if active");
+ "What the current running pacers goal");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curtick", CTLFLAG_RD,
- &hpts->p_curtick, 0,
- "What the current tick on if active");
+ OID_AUTO, "runtick", CTLFLAG_RD,
+ &hpts->p_runningtick, 0,
+ "What the running pacers current slot is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "logsize", CTLFLAG_RD,
- &hpts->p_logsize, 0,
- "Hpts logging buffer size");
- hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ OID_AUTO, "curtick", CTLFLAG_RD,
+ &hpts->p_curtick, 0,
+ "What the running pacers last tick mapped to the wheel was");
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
- hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_prevtick -= 1;
- hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = 1;
- hpts->p_logsize = tcp_hpts_logging_size;
- if (hpts->p_logsize) {
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- }
+ hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
Index: head/sys/netinet/tcp_log_buf.h
===================================================================
--- head/sys/netinet/tcp_log_buf.h
+++ head/sys/netinet/tcp_log_buf.h
@@ -175,7 +175,7 @@
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER,/* Detected reorder 7 */
- TCP_LOG_PACER, /* Pacer sending a packet 8 */
+ TCP_LOG_HPTS, /* Hpts sending a packet 8 */
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
@@ -194,31 +194,38 @@
BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
TCP_LOG_FLOWEND, /* End of a flow 25 */
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
- BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
- BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
+ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
+ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
- UNUSED_32, /* Unused 32 */
- UNUSED_33, /* Unused 33 */
+ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
+ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */
BBR_LOG_TO_PROCESS, /* A to was processed 35 */
BBR_LOG_BBRTSO, /* TSO update 36 */
- BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
+ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */
BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
BBR_LOG_PROGRESS, /* Progress timer event 39 */
TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
- BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
+ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */
BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
- BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
+ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
- TCP_LOG_END /* End (keep at end) 51 */
+ TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */
+ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
+ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */
+ TCP_LOG_CONNEND, /* End of connection 54 */
+ TCP_LOG_LRO, /* LRO entry 55 */
+ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
+ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */
+ TCP_LOG_END /* End (keep at end) 58 */
};
enum tcp_log_states {
@@ -275,8 +282,8 @@
#ifdef _KERNEL
-#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
-#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000
+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000
/*
* TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
Index: head/sys/netinet/tcp_stacks/rack.c
===================================================================
--- head/sys/netinet/tcp_stacks/rack.c
+++ head/sys/netinet/tcp_stacks/rack.c
@@ -1,5 +1,6 @@
/*-
- * Copyright (c) 2016-2019 Netflix, Inc.
+ * Copyright (c) 2016
+ * Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -44,12 +45,16 @@
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
+#ifdef NETFLIX_STATS
+#include <sys/qmath.h>
+#endif
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
+#include <sys/tree.h>
#ifdef NETFLIX_STATS
-#include <sys/stats.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
@@ -74,8 +79,8 @@
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
-#include <netinet/tcp.h>
#define TCPOUTFLAGS
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
@@ -84,9 +89,6 @@
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
-#ifdef NETFLIX_CWV
-#include <netinet/tcp_newcwv.h>
-#endif
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
@@ -126,6 +128,10 @@
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
+#ifndef TCPHPTS
+fatal error missing option TCPHPTS in the build;
+#endif
+
#define CUM_ACKED 1
#define SACKED 2
@@ -178,6 +184,9 @@
static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
+static int32_t rack_map_entries_limit = 1024;
+static int32_t rack_map_split_limit = 256;
+
/*
* Currently regular tcp has a rto_min of 30ms
* the backoff goes 12 times so that ends up
@@ -202,7 +211,6 @@
static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
-static uint32_t rack_map_split_limit = 0; /* unlimited by default */
/* Rack specific counters */
counter_u64_t rack_badfr;
@@ -228,6 +236,7 @@
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
+counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
@@ -248,12 +257,21 @@
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
+/*
+ * This was originally defined in tcp_timer.c, but is now reproduced here given
+ * the unification of the SYN and non-SYN retransmit timer exponents combined
+ * with wanting to retain previous behaviour for previously deployed stack
+ * versions.
+ */
+int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
+
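/*
 * Illustrative note, not part of the patch: with this table the
 * SYN/SYN|ACK retransmit timeout computed later as
 * MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift])
 * keeps the initial timeout for t_rxtshift 0 through 4 before backing
 * off exponentially, while established-state retransmits continue to
 * use the regular tcp_backoff[] table.
 */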
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
- struct socket *so, struct tcpcb *tp, struct tcpopt *to,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
@@ -351,14 +369,13 @@
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
static void
-rack_do_drop(struct mbuf *m, struct tcpcb *tp);
-static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t rstreason, int32_t tlen);
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -449,6 +466,7 @@
counter_u64_zero(rack_sack_proc_short);
counter_u64_zero(rack_sack_proc_restart);
counter_u64_zero(rack_to_alloc);
+ counter_u64_zero(rack_to_alloc_limited);
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
counter_u64_zero(rack_find_high);
@@ -470,6 +488,18 @@
{
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "map_limit", CTLFLAG_RW,
+ &rack_map_entries_limit , 1024,
+ "Is there a limit on how big the sendmap can grow? ");
+
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "map_splitlimit", CTLFLAG_RW,
+ &rack_map_split_limit , 256,
+ "Is there a limit on how much splitting a peer can do?");
+
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "rate_sample_method", CTLFLAG_RW,
&rack_rate_sample_method , USE_RTT_LOW,
"What method should we use for rate sampling 0=high, 1=low ");
@@ -628,11 +658,6 @@
OID_AUTO, "pktdelay", CTLFLAG_RW,
&rack_pkt_delay, 1,
"Extra RACK time (in ms) besides reordering thresh");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "split_limit", CTLFLAG_RW,
- &rack_map_split_limit, 0,
- "Is there a limit on the number of map split entries (0=unlimited)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "inc_var", CTLFLAG_RW,
@@ -769,6 +794,12 @@
OID_AUTO, "allocemerg", CTLFLAG_RD,
&rack_to_alloc_emerg,
"Total allocations done from emergency cache");
+ rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "alloc_limited", CTLFLAG_RD,
+ &rack_to_alloc_limited,
+ "Total allocations dropped due to limit");
rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -859,6 +890,7 @@
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
+#ifdef NETFLIX_PROGRESS
if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
/*
@@ -869,13 +901,12 @@
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
counter_u64_add(rack_progress_drops, 1);
-#ifdef NETFLIX_STATS
TCPSTAT_INC(tcps_progdrops);
-#endif
rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
return (1);
}
}
+#endif
return (0);
}
@@ -962,6 +993,7 @@
union tcp_log_stackspecific log;
struct timeval tv;
+ memset(&log, 0, sizeof(log));
/* Convert our ms to a microsecond */
log.u_bbr.flex1 = rtt * 1000;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -1021,6 +1053,8 @@
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = did_out;
log.u_bbr.flex2 = nxt_pkt;
log.u_bbr.flex3 = way_out;
@@ -1127,6 +1161,8 @@
counter_u64_free(rack_sack_proc_short);
counter_u64_free(rack_sack_proc_restart);
counter_u64_free(rack_to_alloc);
+ counter_u64_free(rack_to_alloc_limited);
+ counter_u64_free(rack_split_limited);
counter_u64_free(rack_find_high);
counter_u64_free(rack_runt_sacks);
counter_u64_free(rack_enter_tlp_calc);
@@ -1146,9 +1182,8 @@
rsm = uma_zalloc(rack_zone, M_NOWAIT);
if (rsm) {
-alloc_done:
- counter_u64_add(rack_to_alloc, 1);
rack->r_ctl.rc_num_maps_alloced++;
+ counter_u64_add(rack_to_alloc, 1);
return (rsm);
}
if (rack->rc_free_cnt) {
@@ -1156,11 +1191,26 @@
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
rack->rc_free_cnt--;
- goto alloc_done;
+ return (rsm);
}
return (NULL);
}
+static struct rack_sendmap *
+rack_alloc_full_limit(struct tcp_rack *rack)
+{
+ if ((rack_map_entries_limit > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+ counter_u64_add(rack_to_alloc_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ return (NULL);
+ }
+ return (rack_alloc(rack));
+}
+
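/*
 * Illustrative note, not part of the patch: rack_alloc_full_limit()
 * enforces the overall per-connection sendmap cap
 * (rack_map_entries_limit, default 1024, the map_limit sysctl above),
 * while rack_alloc_limit() below additionally enforces the split cap
 * (rack_map_split_limit, default 256, the map_splitlimit sysctl) for
 * entries created by splitting an existing block.
 */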
/* wrapper to allocate a sendmap entry, subject to a specific limit */
static struct rack_sendmap *
rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
@@ -1196,7 +1246,6 @@
/* currently there is only one limit type */
rack->r_ctl.rc_num_split_allocs--;
}
- rack->r_ctl.rc_num_maps_alloced--;
if (rack->r_ctl.rc_tlpsend == rsm)
rack->r_ctl.rc_tlpsend = NULL;
if (rack->r_ctl.rc_next == rsm)
@@ -1206,9 +1255,11 @@
if (rack->rc_free_cnt < rack_free_cache) {
memset(rsm, 0, sizeof(struct rack_sendmap));
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ rsm->r_limit_type = 0;
rack->rc_free_cnt++;
return;
}
+ rack->r_ctl.rc_num_maps_alloced--;
uma_zfree(rack_zone, rsm);
}
@@ -1222,11 +1273,9 @@
#ifdef NETFLIX_STATS
int32_t gput;
#endif
-#ifdef NETFLIX_CWV
- u_long old_cwnd = tp->snd_cwnd;
-#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
+
tp->ccv->nsegs = nsegs;
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
@@ -1264,7 +1313,6 @@
tp->t_stats_gput_prev);
tp->t_flags &= ~TF_GPUTINPROG;
tp->t_stats_gput_prev = gput;
-#ifdef NETFLIX_CWV
if (tp->t_maxpeakrate) {
/*
* We update t_peakrate_thr. This gives us roughly
@@ -1272,7 +1320,6 @@
*/
tcp_update_peakrate_thr(tp);
}
-#endif
}
#endif
if (tp->snd_cwnd > tp->snd_ssthresh) {
@@ -1298,39 +1345,10 @@
if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
}
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- /*
- * Per RFC 7661: The behaviour in the non-validated phase is
- * specified as: o A sender determines whether to increase
- * the cwnd based upon whether it is cwnd-limited (see
- * Section 4.5.3): * A sender that is cwnd-limited MAY use
- * the standard TCP method to increase cwnd (i.e., the
- * standard method permits a TCP sender that fully utilises
- * the cwnd to increase the cwnd each time it receives an
- * ACK). * A sender that is not cwnd-limited MUST NOT
- * increase the cwnd when ACK packets are received in this
- * phase (i.e., needs to avoid growing the cwnd when it has
- * not recently sent using the current size of cwnd).
- */
- if ((tp->snd_cwnd > old_cwnd) &&
- (tp->cwv_cwnd_valid == 0) &&
- (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
- tp->snd_cwnd = old_cwnd;
- }
- /* Try to update pipeAck and NCWV state */
- if (TCPS_HAVEESTABLISHED(tp->t_state) &&
- !IN_RECOVERY(tp->t_flags)) {
- uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
-
- tcp_newcwv_update_pipeack(tp, data);
- }
- }
/* we enforce max peak rate if it is set. */
if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
tp->snd_cwnd = tp->t_peakrate_thr;
}
-#endif
}
static void
@@ -1379,16 +1397,8 @@
tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
rack->r_ctl.rc_prr_sndcnt = 0;
}
+ tp->snd_recover = tp->snd_una;
EXIT_RECOVERY(tp->t_flags);
-
-
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- (tp->snd_cwv.in_recovery))
- tcp_newcwv_end_recovery(tp);
- }
-#endif
}
static void
@@ -1450,16 +1460,6 @@
tp->ccv->curack = th->th_ack;
CC_ALGO(tp)->cong_signal(tp->ccv, type);
}
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
- tcp_newcwv_enter_recovery(tp);
- }
- if (type == CC_RTO) {
- tcp_newcwv_reset(tp);
- }
- }
-#endif
}
@@ -1479,11 +1479,21 @@
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp->ccv);
- if (tp->snd_cwnd == 1)
- i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
- else
- i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
-
+ if (V_tcp_initcwnd_segments)
+ i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
+ max(2 * tp->t_maxseg, 14600));
+ else if (V_tcp_do_rfc3390)
+ i_cwnd = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+ else {
+ /* Per RFC5681 Section 3.1 */
+ if (tp->t_maxseg > 2190)
+ i_cwnd = 2 * tp->t_maxseg;
+ else if (tp->t_maxseg > 1095)
+ i_cwnd = 3 * tp->t_maxseg;
+ else
+ i_cwnd = 4 * tp->t_maxseg;
+ }
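/*
 * Worked example, not part of the patch: with t_maxseg = 1460 and both
 * knobs off, the RFC 5681 branch gives i_cwnd = 3 * 1460 = 4380 bytes
 * (1095 < 1460 <= 2190); with V_tcp_do_rfc3390 set the clamp yields
 * min(4 * 1460, max(2 * 1460, 4380)) = 4380 as well, and
 * V_tcp_initcwnd_segments = 10 gives min(10 * 1460,
 * max(2 * 1460, 14600)) = 14600 bytes.
 */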
if (reduce_largest) {
/*
* Do we reduce the largest cwnd to make
@@ -1549,8 +1559,7 @@
}
static void
-rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
- int32_t rstreason, int32_t tlen)
+rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen)
{
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -1736,7 +1745,7 @@
* TCB is still valid and locked.
*/
static int
-rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
{
int32_t todrop;
int32_t thflags;
@@ -1778,17 +1787,6 @@
TCPSTAT_INC(tcps_rcvpartduppack);
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
}
- /*
- * DSACK - add SACK block for dropped range
- */
- if (tp->t_flags & TF_SACK_PERMIT) {
- tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
- /*
- * ACK now, as the next in-sequence segment
- * will clear the DSACK block again
- */
- tp->t_flags |= TF_ACKNOW;
- }
*drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
tlen -= todrop;
@@ -2124,8 +2122,6 @@
/* We can't start any timer in persists */
return (rack_get_persists_timer_val(tp, rack));
}
- if (tp->t_state < TCPS_ESTABLISHED)
- goto activate_rxt;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm == NULL) {
/* Nothing on the send map */
@@ -2184,6 +2180,12 @@
*/
goto activate_rxt;
}
+ if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) {
+ /*
+ * Peer collapsed rwnd, don't do TLP.
+ */
+ goto activate_rxt;
+ }
rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
if (rsm == NULL) {
/* We found no rsm to TLP with. */
@@ -2288,7 +2290,9 @@
/* A previous call is already set up */
return;
}
- if (tp->t_state == TCPS_CLOSED) {
+
+ if ((tp->t_state == TCPS_CLOSED) ||
+ (tp->t_state == TCPS_LISTEN)) {
return;
}
stopped = rack->rc_tmr_stopped;
@@ -2307,8 +2311,8 @@
* We are still left on the hpts when the to goes
* it will be for output.
*/
- if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
- slot = cts - rack->r_ctl.rc_last_output_to;
+ if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
+ slot = rack->r_ctl.rc_last_output_to - cts;
else
slot = 1;
}
@@ -2330,7 +2334,7 @@
}
hpts_timeout = rack_timer_start(tp, rack, cts);
if (tp->t_flags & TF_DELACK) {
- delayed_ack = TICKS_2_MSEC(tcp_delacktime);
+ delayed_ack = tcp_delacktime;
rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
}
if (delayed_ack && ((hpts_timeout == 0) ||
@@ -2487,6 +2491,43 @@
return (0);
}
+static struct rack_sendmap *
+rack_merge_rsm(struct tcp_rack *rack,
+ struct rack_sendmap *l_rsm,
+ struct rack_sendmap *r_rsm)
+{
+ /*
+ * We are merging two ack'd RSM's,
+ * the l_rsm is on the left (lower seq
+ * values) and the r_rsm is on the right
+ * (higher seq value). The simplest way
+ * to merge these is to move the right
+ * one into the left. I don't think there
+ * is any reason we need to try to find
+ * the oldest (or last oldest retransmitted).
+ */
+ l_rsm->r_end = r_rsm->r_end;
+ if (r_rsm->r_rtr_bytes)
+ l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
+ if (r_rsm->r_in_tmap) {
+ /* This really should not happen */
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
+ }
+ /* Now the flags */
+ if (r_rsm->r_flags & RACK_HAS_FIN)
+ l_rsm->r_flags |= RACK_HAS_FIN;
+ if (r_rsm->r_flags & RACK_TLP)
+ l_rsm->r_flags |= RACK_TLP;
+ TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next);
+ if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
+ /* Transfer the split limit to the map we free */
+ r_rsm->r_limit_type = l_rsm->r_limit_type;
+ l_rsm->r_limit_type = 0;
+ }
+ rack_free(rack, r_rsm);
+ return(l_rsm);
+}
+
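/*
 * Illustrative note, not part of the patch: if the sendmap holds two
 * already-ACKed entries covering [100, 200) and [200, 350), the caller
 * passes the lower one as l_rsm and the higher as r_rsm and gets back
 * a single entry covering [100, 350); the right-hand entry's
 * retransmitted byte count and FIN/TLP flags are folded into the
 * survivor, and any split-limit accounting on the survivor is handed
 * to the entry being freed so it is released along with it.
 */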
/*
* TLP Timer, here we simply setup what segment we want to
* have the TLP expire on, the normal rack_output() will then
@@ -2590,7 +2631,7 @@
int32_t idx;
struct rack_sendmap *nrsm;
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
/*
* No memory to split, we will just exit and punt
@@ -2937,7 +2978,7 @@
TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
+ rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
@@ -3281,7 +3322,7 @@
* Here we retransmitted less than the whole thing which means we
* have to split this into what was transmitted and what was not.
*/
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
/*
* We can't get memory, so lets not proceed.
@@ -3415,9 +3456,6 @@
* Hmm out of memory and the tcb got destroyed while
* we tried to wait.
*/
-#ifdef INVARIANTS
- panic("Out of memory when we should not be rack:%p", rack);
-#endif
return;
}
if (th_flags & TH_FIN) {
@@ -3428,15 +3466,8 @@
rsm->r_tim_lastsent[0] = ts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
- if (th_flags & TH_SYN) {
- /* The data space is one beyond snd_una */
- rsm->r_start = seq_out + 1;
- rsm->r_end = rsm->r_start + (len - 1);
- } else {
- /* Normal case */
- rsm->r_start = seq_out;
- rsm->r_end = rsm->r_start + len;
- }
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
rsm->r_sndcnt = 0;
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -3486,11 +3517,8 @@
* Ok we must split off the front and then let the
* update do the rest
*/
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
-#ifdef INVARIANTS
- panic("Ran out of memory that was preallocated? rack:%p", rack);
-#endif
rack_update_rsm(tp, rack, rsm, ts);
return;
}
@@ -3908,6 +3936,14 @@
if (nrsm->r_flags & RACK_ACKED) {
/* Skip ack'd segments */
continue;
+ }
+ if (nrsm->r_flags & RACK_SACK_PASSED) {
+ /*
+ * We found one that is already marked
+ * passed, we have been here before and
+ * so all others below this are marked.
+ */
+ break;
}
idx = nrsm->r_rtr_cnt - 1;
if (ts == nrsm->r_tim_lastsent[idx]) {
@@ -4114,6 +4150,26 @@
rsm->r_in_tmap = 0;
}
out:
+ if (rsm && (rsm->r_flags & RACK_ACKED)) {
+ /*
+ * Now can we merge this newly acked
+ * block with either the previous or
+ * next block?
+ */
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & RACK_ACKED)) {
+ /* yep this and next can be merged */
+ rsm = rack_merge_rsm(rack, rsm, nrsm);
+ }
+ /* Now what about the previous? */
+ nrsm = TAILQ_PREV(rsm, rack_head, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & RACK_ACKED)) {
+ /* yep the previous and this can be merged */
+ rsm = rack_merge_rsm(rack, nrsm, rsm);
+ }
+ }
if (used_ref == 0) {
counter_u64_add(rack_sack_proc_all, 1);
} else {
@@ -4353,16 +4409,13 @@
}
sack_blocks[num_sack_blks] = sack;
num_sack_blks++;
-#ifdef NETFLIX_STATS
} else if (SEQ_LEQ(sack.start, th_ack) &&
SEQ_LEQ(sack.end, th_ack)) {
/*
* Its a D-SACK block.
*/
- tcp_record_dsack(sack.start, sack.end);
-#endif
+/* tcp_record_dsack(sack.start, sack.end); */
}
-
}
if (num_sack_blks == 0)
goto out;
@@ -4371,7 +4424,9 @@
* just one pass.
*/
if (rack_use_sack_filter) {
- num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
+ num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
+ num_sack_blks, th->th_ack);
+ ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
}
if (num_sack_blks < 2) {
goto do_sack_work;
@@ -4620,8 +4675,9 @@
return (0);
}
if (rack->r_ctl.rc_early_recovery) {
- if (IN_FASTRECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
tcp_rack_partialack(tp, th);
} else {
rack_post_recovery(tp, th);
@@ -4648,8 +4704,9 @@
sowwakeup_locked(so);
m_freem(mfree);
if (rack->r_ctl.rc_early_recovery == 0) {
- if (IN_FASTRECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
tcp_rack_partialack(tp, th);
} else {
rack_post_recovery(tp, th);
@@ -4707,7 +4764,11 @@
* send garbage on first SYN.
*/
int32_t nsegs;
+#ifdef TCP_RFC7413
int32_t tfo_syn;
+#else
+#define tfo_syn (FALSE)
+#endif
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -4816,8 +4877,10 @@
* PRU_RCVD). If a FIN has already been received on this connection
* then we just ignore the text.
*/
+#ifdef TCP_RFC7413
tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
- IS_FASTOPEN(tp->t_flags));
+ (tp->t_flags & TF_FASTOPEN));
+#endif
if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
@@ -5024,8 +5087,9 @@
/* Clean receiver SACK report if present */
- if (tp->rcv_numsacks)
- tcp_clean_sackreport(tp);
+/* if (tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+*/
TCPSTAT_INC(tcps_preddat);
tp->rcv_nxt += tlen;
/*
@@ -5284,8 +5348,6 @@
tp->irs = th->th_seq;
tcp_rcvseqinit(tp);
if (thflags & TH_ACK) {
- int tfo_partial = 0;
-
TCPSTAT_INC(tcps_connects);
soisconnected(so);
#ifdef MAC
@@ -5299,19 +5361,10 @@
tp->rcv_adv += min(tp->rcv_wnd,
TCP_MAXWIN << tp->rcv_scale);
/*
- * If not all the data that was sent in the TFO SYN
- * has been acked, resend the remainder right away.
- */
- if (IS_FASTOPEN(tp->t_flags) &&
- (tp->snd_una != tp->snd_max)) {
- tp->snd_nxt = th->th_ack;
- tfo_partial = 1;
- }
- /*
* If there's data, delay ACK; if there's also a FIN ACKNOW
* will be turned on later.
*/
- if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
+ if (DELAY_ACK(tp, tlen) && tlen != 0) {
rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
tp->t_flags |= TF_DELACK;
@@ -5320,26 +5373,10 @@
tp->t_flags |= TF_ACKNOW;
}
- if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
- V_tcp_do_ecn) {
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
tp->t_flags |= TF_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
- if (SEQ_GT(th->th_ack, tp->snd_una)) {
- /*
- * We advance snd_una for the
- * fast open case. If th_ack is
- * acknowledging data beyond
- * snd_una we can't just call
- * ack-processing since the
- * data stream in our send-map
- * will start at snd_una + 1 (one
- * beyond the SYN). If its just
- * equal we don't need to do that
- * and there is no send_map.
- */
- tp->snd_una++;
- }
/*
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
* SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
@@ -5423,7 +5460,7 @@
}
}
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
- tiwin, thflags, nxt_pkt));
+ tiwin, thflags, nxt_pkt));
}
/*
@@ -5447,13 +5484,13 @@
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
- if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
/*
- * When a TFO connection is in SYN_RECEIVED, the
- * only valid packets are the initial SYN, a
- * retransmit/copy of the initial SYN (possibly with
- * a subset of the original data), a valid ACK, a
- * FIN, or a RST.
+ * When a TFO connection is in SYN_RECEIVED, the only valid
+ * packets are the initial SYN, a retransmit/copy of the
+ * initial SYN (possibly with a subset of the original
+ * data), a valid ACK, a FIN, or a RST.
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
@@ -5474,9 +5511,18 @@
return (0);
}
}
+#endif
if (thflags & TH_RST)
return (rack_process_rst(m, th, so, tp));
/*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
* RFC 1323 PAWS: If we have a timestamp reply on this segment and
* it's less than ts_recent, drop it.
*/
@@ -5520,16 +5566,18 @@
tp->ts_recent_age = tcp_ts_getticks();
tp->ts_recent = to->to_tsval;
}
- tp->snd_wnd = tiwin;
/*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
* is on (half-synchronized state), then queue data for later
* processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
- if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ tp->snd_wnd = tiwin;
cc_conn_init(tp);
}
+#endif
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
}
@@ -5539,22 +5587,13 @@
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
(TF_RCVD_SCALE | TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
}
/*
* Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
* FIN-WAIT-1
*/
tp->t_starttime = ticks;
- if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
- tcp_fastopen_decrement_counter(tp->t_tfo_pending);
- tp->t_tfo_pending = NULL;
-
- /*
- * Account for the ACK of our SYN prior to
- * regular ACK processing below.
- */
- tp->snd_una++;
- }
if (tp->t_flags & TF_NEEDFIN) {
tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
@@ -5562,13 +5601,25 @@
tcp_state_change(tp, TCPS_ESTABLISHED);
TCP_PROBE5(accept__established, NULL, tp,
mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to regular
+ * ACK processing below.
+ */
+ tp->snd_una++;
+ }
/*
* TFO connections call cc_conn_init() during SYN
* processing. Calling it again here for such connections
* is not harmless as it would undo the snd_cwnd reduction
* that occurs when a TFO SYN|ACK is retransmitted.
*/
- if (!IS_FASTOPEN(tp->t_flags))
+ if (!(tp->t_flags & TF_FASTOPEN))
+#endif
cc_conn_init(tp);
}
/*
@@ -5576,7 +5627,7 @@
* not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
- (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
+ (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tp->snd_wl1 = th->th_seq - 1;
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
@@ -5836,7 +5887,7 @@
rack_check_data_after_close(struct mbuf *m,
struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
{
- struct tcp_rack *rack;
+ struct tcp_rack *rack;
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -6353,7 +6404,6 @@
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
rack->r_ctl.rc_min_to = rack_min_to;
rack->r_ctl.rc_prr_inc_var = rack_inc_var;
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
if (tp->snd_una != tp->snd_max) {
/* Create a send map for the current outstanding data */
struct rack_sendmap *rsm;
@@ -6375,6 +6425,8 @@
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
}
+ rack_stop_all_timers(tp);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
return (0);
}
@@ -6431,6 +6483,8 @@
uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
tp->t_fb_ptr = NULL;
}
+ /* Make sure snd_nxt is correctly set */
+ tp->snd_nxt = tp->snd_max;
}
static void
@@ -6473,9 +6527,6 @@
case TCPS_CLOSED:
case TCPS_TIME_WAIT:
default:
-#ifdef INVARIANTS
- panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
-#endif
break;
};
}
@@ -6585,10 +6636,6 @@
* allow the tcbinfo to be in either locked or unlocked, as the
* caller may have unnecessarily acquired a lock due to a race.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tp->t_state != TCPS_ESTABLISHED) {
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- }
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
@@ -6600,37 +6647,17 @@
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
tlen, &log, true);
}
- if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
- way_out = 4;
- goto done_with_input;
- }
/*
- * If a segment with the ACK-bit set arrives in the SYN-SENT state
- * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
- */
- if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
- (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
- return;
- }
- /*
* Segment received on connection. Reset idle time and keep-alive
* timer. XXX: This should be done after segment validation to
* ignore broken/spoofed segs.
*/
if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
-#ifdef NETFLIX_CWV
- if ((tp->cwv_enabled) &&
- ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
- tcp_newcwv_nvp_closedown(tp);
- } else
-#endif
- if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
+ if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
counter_u64_add(rack_input_idle_reduces, 1);
rack_cc_after_idle(tp,
(rack->r_idle_reduce_largest ? 1 :0));
@@ -6639,14 +6666,6 @@
rack->r_ctl.rc_rcvtime = cts;
tp->t_rcvtime = ticks;
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
- tcp_newcwv_nvp_closedown(tp);
- }
-#endif
/*
* Unscale the window into a 32-bit value. For the SYN_SENT state
* the scale is zero.
@@ -6737,22 +6756,6 @@
if ((tp->t_flags & TF_SACK_PERMIT) &&
(to.to_flags & TOF_SACKPERM) == 0)
tp->t_flags &= ~TF_SACK_PERMIT;
- if (IS_FASTOPEN(tp->t_flags)) {
- if (to.to_flags & TOF_FASTOPEN) {
- uint16_t mss;
-
- if (to.to_flags & TOF_MSS)
- mss = to.to_mss;
- else
- if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
- mss = TCP6_MSS;
- else
- mss = TCP_MSS;
- tcp_fastopen_update_cache(tp, mss,
- to.to_tfo_len, to.to_tfo_cookie);
- } else
- tcp_fastopen_disable_path(tp);
- }
}
/*
* At this point we are at the initial call. Here we decide
@@ -6769,7 +6772,6 @@
/* Set the flag */
rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
tcp_set_hpts(tp->t_inpcb);
- rack_stop_all_timers(tp);
sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
}
/*
@@ -6801,24 +6803,6 @@
*/
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_rack_xmit_timer_commit(rack, tp);
- if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
- (rack->rc_in_persist == 0)){
- /*
- * The peer shrunk its window on us to the point
- * where we have sent too much. The only thing
- * we can do here is stop any timers and
- * enter persist. We most likely lost the last
- * bytes we sent but oh well, we will have to
- * retransmit them after the peer is caught up.
- */
- if (rack->rc_inp->inp_in_hpts)
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
- rack_timer_cancel(tp, rack, cts, __LINE__);
- rack_enter_persist(tp, rack, cts);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
- way_out = 3;
- goto done_with_input;
- }
if (nxt_pkt == 0) {
if (rack->r_wanted_output != 0) {
did_out = 1;
@@ -6848,7 +6832,6 @@
rack_timer_audit(tp, rack, &so->so_snd);
way_out = 2;
}
- done_with_input:
rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
if (did_out)
rack->r_wanted_output = 0;
@@ -6871,7 +6854,7 @@
#ifdef RSS
struct tcp_function_block *tfb;
struct tcp_rack *rack;
- struct epoch_tracker et;
+ struct inpcb *inp;
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (rack->r_state == 0) {
@@ -6879,11 +6862,9 @@
* Initial input (ACK to SYN-ACK etc)lets go ahead and get
* it processed
*/
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
tcp_get_usecs(&tv);
rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
tlen, iptos, 0, &tv);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
return;
}
tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
@@ -6959,13 +6940,17 @@
#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
#endif
+#ifdef NETFLIX_TCP_O_UDP
struct udphdr *udp = NULL;
+#endif
struct tcp_rack *rack;
struct tcphdr *th;
uint8_t pass = 0;
- uint8_t wanted_cookie = 0;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen, ulen=0;
+ unsigned ipoptlen, optlen, hdrlen;
+#ifdef NETFLIX_TCP_O_UDP
+ unsigned ulen;
+#endif
uint32_t rack_seq;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -7004,6 +6989,18 @@
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
#endif
+
+#ifdef TCP_RFC7413
+ /*
+ * For TFO connections in SYN_RECEIVED, only allow the initial
+ * SYN|ACK and those sent by the retransmit timer.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
+ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
+ return (0);
+#endif
#ifdef INET6
if (rack->r_state) {
/* Use the cache line loaded if possible */
@@ -7046,31 +7043,12 @@
rack->r_wanted_output = 0;
rack->r_timer_override = 0;
/*
- * For TFO connections in SYN_SENT or SYN_RECEIVED,
- * only allow the initial SYN or SYN|ACK and those sent
- * by the retransmit timer.
- */
- if (IS_FASTOPEN(tp->t_flags) &&
- ((tp->t_state == TCPS_SYN_RECEIVED) ||
- (tp->t_state == TCPS_SYN_SENT)) &&
- SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
- (tp->t_rxtshift == 0)) /* not a retransmit */
- return (0);
- /*
* Determine length of data that should be transmitted, and flags
* that will be used. If there is some data or critical controls
* (SYN, RST) to send, then transmit; otherwise, investigate
* further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
- tcp_newcwv_nvp_closedown(tp);
- } else
-#endif
if (tp->t_idle_reduce) {
if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
rack_cc_after_idle(tp,
@@ -7141,10 +7119,12 @@
tlen = rsm->r_end - rsm->r_start;
if (tlen > tp->t_maxseg)
tlen = tp->t_maxseg;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
+#ifdef INVARIANTS
+ if (SEQ_GT(tp->snd_una, rsm->r_start)) {
+ panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
+ tp, rack, tp->snd_una, rsm, rsm->r_start);
+ }
+#endif
sb_offset = rsm->r_start - tp->snd_una;
cwin = min(tp->snd_wnd, tlen);
len = cwin;
@@ -7155,14 +7135,12 @@
len = rsm->r_end - rsm->r_start;
sack_rxmit = 1;
sendalot = 0;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
if (len >= tp->t_maxseg) {
len = tp->t_maxseg;
}
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
} else if ((rack->rc_in_persist == 0) &&
((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
long tlen;
@@ -7187,10 +7165,6 @@
}
#endif
tlen = rsm->r_end - rsm->r_start;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
if (tlen > rack->r_ctl.rc_prr_sndcnt) {
len = rack->r_ctl.rc_prr_sndcnt;
@@ -7212,6 +7186,8 @@
goto just_return_nolock;
}
}
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
if (len > 0) {
sub_from_prr = 1;
sack_rxmit = 1;
@@ -7236,6 +7212,20 @@
/* For debugging */
rack->r_ctl.rc_rsm_at_retran = rsm;
#endif
+ /*
+ * Enforce a connection sendmap count limit if set
+ * as long as we are not retransmitting.
+ */
+ if ((rsm == NULL) &&
+ (rack_map_entries_limit > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+ counter_u64_add(rack_to_alloc_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ goto just_return_nolock;
+ }
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
@@ -7306,7 +7296,7 @@
uint32_t avail;
avail = sbavail(sb);
- if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
+ if (SEQ_GT(tp->snd_nxt, tp->snd_una))
sb_offset = tp->snd_nxt - tp->snd_una;
else
sb_offset = 0;
@@ -7347,9 +7337,18 @@
* data possible so far in the scoreboard.
*/
outstanding = tp->snd_max - tp->snd_una;
- if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
- len = 0;
- else if (avail > sb_offset)
+ if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
+ if (tp->snd_wnd > outstanding) {
+ len = tp->snd_wnd - outstanding;
+ /* Check to see if we have the data */
+ if (((sb_offset + len) > avail) &&
+ (avail > sb_offset))
+ len = avail - sb_offset;
+ else
+ len = 0;
+ } else
+ len = 0;
+ } else if (avail > sb_offset)
len = avail - sb_offset;
else
len = 0;
@@ -7398,18 +7397,22 @@
* SYN-SENT state and if segment contains data and if we don't know
* that foreign host supports TAO, suppress sending segment.
*/
- if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
- ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
- if (tp->t_state != TCPS_SYN_RECEIVED)
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+ if ((tp->t_state != TCPS_SYN_RECEIVED) &&
+ (tp->t_state != TCPS_SYN_SENT))
flags &= ~TH_SYN;
+#ifdef TCP_RFC7413
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
+ if ((tp->t_flags & TF_FASTOPEN) &&
(tp->t_state == TCPS_SYN_RECEIVED))
flags &= ~TH_SYN;
+#endif
sb_offset--, len++;
+ if (sbavail(sb) == 0)
+ len = 0;
}
/*
* Be careful not to send data and/or FIN on SYN segments. This
@@ -7420,29 +7423,16 @@
len = 0;
flags &= ~TH_FIN;
}
+#ifdef TCP_RFC7413
/*
- * On TFO sockets, ensure no data is sent in the following cases:
- *
- * - When retransmitting SYN|ACK on a passively-created socket
- *
- * - When retransmitting SYN on an actively created socket
- *
- * - When sending a zero-length cookie (cookie request) on an
- * actively created socket
- *
- * - When the socket is in the CLOSED state (RST is being sent)
+ * When retransmitting SYN|ACK on a passively-created TFO socket,
+ * don't include data, as the presence of data may have caused the
+ * original SYN|ACK to have been dropped by a middlebox.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
- (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
- ((tp->t_state == TCPS_SYN_SENT) &&
- (tp->t_tfo_client_cookie_len == 0)) ||
- (flags & TH_RST))) {
- sack_rxmit = 0;
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
len = 0;
- }
- /* Without fast-open there should never be data sent on a SYN */
- if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
- len = 0;
+#endif
if (len <= 0) {
/*
* If FIN has been sent but not acked, but we haven't been
@@ -7519,7 +7509,9 @@
ipoptlen += ipsec_optlen;
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+#ifdef NETFLIX_TCP_O_UDP
(tp->t_port == 0) &&
+#endif
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0)
@@ -7688,10 +7680,13 @@
* If our state indicates that FIN should be sent and we have not
* yet done so, then we need to send.
*/
- if ((flags & TH_FIN) &&
- (tp->snd_nxt == tp->snd_una)) {
- pass = 11;
- goto send;
+ if (flags & TH_FIN) {
+ if ((tp->t_flags & TF_SENTFIN) ||
+ (((tp->t_flags & TF_SENTFIN) == 0) &&
+ (tp->snd_nxt == tp->snd_una))) {
+ pass = 11;
+ goto send;
+ }
}
/*
* No reason to send a segment, just return.
@@ -7750,44 +7745,27 @@
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
#endif
to.to_flags |= TOF_MSS;
-
+#ifdef TCP_RFC7413
/*
- * On SYN or SYN|ACK transmits on TFO connections,
- * only include the TFO option if it is not a
- * retransmit, as the presence of the TFO option may
- * have caused the original SYN or SYN|ACK to have
- * been dropped by a middlebox.
+ * Only include the TFO option on the first
+ * transmission of the SYN|ACK on a
+ * passively-created TFO socket, as the presence of
+ * the TFO option may have caused the original
+ * SYN|ACK to have been dropped by a middlebox.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_rxtshift == 0)) {
- if (tp->t_state == TCPS_SYN_RECEIVED) {
- to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
- to.to_tfo_cookie =
- (u_int8_t *)&tp->t_tfo_cookie.server;
- to.to_flags |= TOF_FASTOPEN;
- wanted_cookie = 1;
- } else if (tp->t_state == TCPS_SYN_SENT) {
- to.to_tfo_len =
- tp->t_tfo_client_cookie_len;
- to.to_tfo_cookie =
- tp->t_tfo_cookie.client;
- to.to_flags |= TOF_FASTOPEN;
- wanted_cookie = 1;
- /*
- * If we wind up having more data to
- * send with the SYN than can fit in
- * one segment, don't send any more
- * until the SYN|ACK comes back from
- * the other end.
- */
- sendalot = 0;
- }
+ to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
+ to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
+ to.to_flags |= TOF_FASTOPEN;
}
+#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
@@ -7822,15 +7800,8 @@
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
- /*
- * If we wanted a TFO option to be added, but it was unable
- * to fit, ensure no data is sent.
- */
- if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
- !(to.to_flags & TOF_FASTOPEN))
- len = 0;
}
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
@@ -7996,8 +7967,8 @@
msb = NULL;
else
msb = sb;
- m->m_next = tcp_m_copym(mb, moff, &len,
- if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
+ m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len,
+ if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/);
if (len <= (tp->t_maxseg - optlen)) {
/*
* Must have ran out of mbufs for the copy
@@ -8031,6 +8002,8 @@
* TLP should not count in retran count, but
* in its own bin
*/
+/* tp->t_sndtlppack++;*/
+/* tp->t_sndtlpbyte += len;*/
counter_u64_add(rack_tlp_retran, 1);
counter_u64_add(rack_tlp_retran_bytes, len);
} else {
@@ -8156,7 +8129,7 @@
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8164,10 +8137,10 @@
ulen = hdrlen + len - sizeof(struct ip6_hdr);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
+ } else
#endif
th = (struct tcphdr *)(ip6 + 1);
- tcpip_fillheaders(inp, ip6, th);
+ tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th);
} else
#endif /* INET6 */
{
@@ -8175,7 +8148,7 @@
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8186,7 +8159,7 @@
} else
#endif
th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(inp, ip, th);
+ tcpip_fillheaders(inp,/*tp->t_port, */ ip, th);
}
/*
* Fill in fields, remembering maximum advertised window for use in
@@ -8277,20 +8250,15 @@
/*
* Calculate receive window. Don't shrink window, but avoid silly
* window syndrome.
- * If a RST segment is sent, advertise a window of zero.
*/
- if (flags & TH_RST) {
+ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+ recwin < (long)tp->t_maxseg)
recwin = 0;
- } else {
- if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
- recwin < (long)tp->t_maxseg)
- recwin = 0;
- if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
- recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
- recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
- if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
- recwin = (long)TCP_MAXWIN << tp->rcv_scale;
- }
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+ recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+ recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+ recwin = (long)TCP_MAXWIN << tp->rcv_scale;
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN> or
@@ -8357,18 +8325,23 @@
* ip6_plen is not need to be filled now, and will be filled
* in ip6_output.
*/
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
} else {
+#endif
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in6_cksum_pseudo(ip6,
sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
0);
+#ifdef NETFLIX_TCP_O_UDP
}
+#endif
}
#endif
#if defined(INET6) && defined(INET)
@@ -8376,19 +8349,24 @@
#endif
#ifdef INET
{
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
} else {
+#endif
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
IPPROTO_TCP + len + optlen));
+#ifdef NETFLIX_TCP_O_UDP
}
+#endif
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
@@ -8559,6 +8537,10 @@
* retransmit. In persist state, just set snd_max.
*/
if (error == 0) {
+/* if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_flags & TF_SACK_PERMIT) &&
+ tp->rcv_numsacks > 0)
+ tcp_clean_dsack_blocks(tp);*/
if (len == 0)
counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
else if (len == 1) {
@@ -8574,15 +8556,19 @@
}
}
if (sub_from_prr && (error == 0)) {
- rack->r_ctl.rc_prr_sndcnt -= len;
+ if (rack->r_ctl.rc_prr_sndcnt >= len)
+ rack->r_ctl.rc_prr_sndcnt -= len;
+ else
+ rack->r_ctl.rc_prr_sndcnt = 0;
}
sub_from_prr = 0;
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
pass, rsm);
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
(rack->rc_in_persist == 0)) {
+#ifdef NETFLIX_STATS
tcp_seq startseq = tp->snd_nxt;
-
+#endif
/*
* Advance snd_nxt over sequence space of this segment.
*/
@@ -8613,17 +8599,6 @@
tp->t_acktime = ticks;
}
tp->snd_max = tp->snd_nxt;
- /*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- * This is only relevant in case of switching back to
- * the base stack.
- */
- if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
- tp->t_rtseq = startseq;
- TCPSTAT_INC(tcps_segstimed);
- }
#ifdef NETFLIX_STATS
if (!(tp->t_flags & TF_GPUTINPROG) && len) {
tp->t_flags |= TF_GPUTINPROG;
@@ -8996,9 +8971,7 @@
return (tcp_default_ctloutput(so, sopt, inp, tp));
break;
}
-#ifdef NETFLIX_STATS
- tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
-#endif
+/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/
INP_WUNLOCK(inp);
return (error);
}
@@ -9131,7 +9104,6 @@
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = rack_output,
.tfb_tcp_do_segment = rack_do_segment,
- .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
.tfb_tcp_ctloutput = rack_ctloutput,
.tfb_tcp_fb_init = rack_init,
.tfb_tcp_fb_fini = rack_fini,
@@ -9241,4 +9213,3 @@
MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
-MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
Index: head/sys/netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- head/sys/netinet/tcp_stacks/rack_bbr_common.h
+++ head/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -38,17 +38,8 @@
#define TCP_MSS_ACCT_SIZE 70
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
+#define DUP_ACK_THRESHOLD 3
-/* Magic flags to tell whats cooking on the pacing wheel */
-#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
-#define PACE_TMR_RACK 0x02 /* RACK timer running */
-#define PACE_TMR_TLP 0x04 /* TLP timer running */
-#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
-#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
-#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
-#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
-#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
@@ -61,8 +52,66 @@
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
+#define PACE_MAX_IP_BYTES 65536
+#define USECS_IN_SECOND 1000000
+#define MSEC_IN_SECOND 1000
+#define MS_IN_USEC 1000
+#define USEC_TO_MSEC(x) (x / MS_IN_USEC)
+#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */
+
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
+#ifdef KERN_TLS
+uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
+#endif
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
+ struct mbuf *m, int has_pkt);
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
+uint32_t ctf_outstanding(struct tcpcb *tp);
+uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
+ struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
+ int32_t * drop_hdrlen, int32_t * ret_val);
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp);
+
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ret_val);
+
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen);
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp);
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay_percentage);
+
#endif
#endif
Index: head/sys/netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- head/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ head/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -0,0 +1,859 @@
+/*-
+ * Copyright (c) 2016-2018
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+/*#include "opt_kern_tls.h"*/
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/ethernet.h>
+#include <net/bpf.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "rack_bbr_common.h"
+
+/*
+ * Common TCP Functions - These are shared by both
+ * rack and BBR.
+ */
+
+
+#ifdef KERN_TLS
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+ struct sbtls_info *tls;
+ uint32_t len;
+
+again:
+ tls = so->so_snd.sb_tls_info;
+ len = tls->sb_params.sb_maxlen; /* max tls payload */
+ len += tls->sb_params.sb_tls_hlen; /* tls header len */
+ len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
+ if ((len * 4) > rwnd) {
+ /*
+ * Stroke a "this will suck" counter? And what
+ * else should we do, Drew? From the
+ * TCP perspective I am not sure
+ * what should be done...
+ */
+ if (tls->sb_params.sb_maxlen > 4096) {
+ tls->sb_params.sb_maxlen -= 4096;
+ if (tls->sb_params.sb_maxlen < 4096)
+ tls->sb_params.sb_maxlen = 4096;
+ goto again;
+ }
+ }
+ return (len);
+}
+#endif
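
The sizing loop above keeps four full TLS records (payload plus header and trailer) inside the peer's receive window by shaving the payload in 4 KB steps, never dropping below a 4 KB payload. A minimal userland sketch of the same arithmetic, with the sockbuf fields replaced by plain parameters (the names are illustrative, not the kernel's):

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the record-sizing loop: shrink the TLS payload in 4 KB
 * steps until four full records (payload + header + trailer) fit in
 * the receive window, flooring the payload at 4 KB.
 */
static uint32_t
tls_size_for_rwnd(uint32_t maxlen, uint32_t hlen, uint32_t tlen, uint32_t rwnd)
{
	uint32_t len;

	for (;;) {
		len = maxlen + hlen + tlen;
		if ((len * 4) <= rwnd || maxlen <= 4096)
			return (len);
		maxlen -= 4096;
		if (maxlen < 4096)
			maxlen = 4096;
	}
}

int
main(void)
{
	/* A 16 KB record against a 64 KB window shrinks to a 12 KB payload. */
	printf("%u\n", (unsigned)tls_size_for_rwnd(16384, 13, 16, 65536));	/* 12317 */
	return (0);
}
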
+
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+ /*
+ * We are passed a raw chain of mbuf packets
+ * that arrived in LRO. They are linked via
+ * the m_nextpkt link in the pkt-headers.
+ *
+ * We process each one by:
+ * a) saving off the next
+ * b) stripping off the ether-header
+ * c) formulating the arguments for
+ * the tfb_do_segment_nounlock call
+ * d) calling tfb_do_segment_nounlock for each mbuf
+ * after adjusting the time to match the arrival time.
+ * Note that the LRO code ensures no IP options are present.
+ *
+ * The semantics for calling tfb_do_segment_nounlock are the
+ * following:
+ * 1) It returns 0 if all went well and you (the caller) need
+ * to release the lock.
+ * 2) If nxt_pkt is set, then the function will suppress calls
+ * to tfb_tcp_output() since you are promising to call again
+ * with another packet.
+ * 3) If it returns 1, then you must free all the packets being
+ * shipped in; the tcb has been destroyed (or is about to be destroyed).
+ */
+ struct mbuf *m_save;
+ struct ether_header *eh;
+ struct epoch_tracker et;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip = NULL; /* Keep compiler happy. */
+#endif
+ struct ifnet *ifp;
+ struct timeval tv;
+ int32_t retval, nxt_pkt, tlen, off;
+ uint16_t etype;
+ uint16_t drop_hdrlen;
+ uint8_t iptos, no_vn=0, bpf_req=0;
+
+ /*
+ * This is a bit deceptive: we get the
+ * "info epoch", which is really the network
+ * epoch. It covers us both against any INP
+ * type change and against the ifp going
+ * away.
+ */
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+ if (m && m->m_pkthdr.rcvif)
+ ifp = m->m_pkthdr.rcvif;
+ else
+ ifp = NULL;
+ if (ifp) {
+ bpf_req = bpf_peers_present(ifp->if_bpf);
+ } else {
+ /*
+ * We should probably KASSERT here rather than
+ * work around this, since LRO always sets rcvif.
+ */
+ no_vn = 1;
+ goto skip_vnet;
+ }
+ CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ /* Now lets get the ether header */
+ eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ /* Let the BPF see the packet */
+ if (bpf_req && ifp)
+ ETHER_BPF_MTAP(ifp, m);
+ m_adj(m, sizeof(*eh));
+ /* Trim off the ethernet header */
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip6 = (struct ip6_hdr *)(eh + 1);
+ th = (struct tcphdr *)(ip6 + 1);
+ tlen = ntohs(ip6->ip6_plen);
+ drop_hdrlen = sizeof(*ip6);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP, m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+ * Note that packets with unspecified IPv6 destination are
+ * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+ == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip = (struct ip *)(eh + 1);
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ int len;
+ struct ipovly *ipov = (struct ipovly *)ip;
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = drop_hdrlen + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = htons(tlen);
+ th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(len);
+ /* Reset TOS bits */
+ ip->ip_tos = iptos;
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ }
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ break;
+ }
+#endif
+ }
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ tcp_fields_to_host(th);
+
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ TCPSTAT_INC(tcps_rcvbadoff);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ tlen -= off;
+ drop_hdrlen += off;
+ /*
+ * Now lets setup the timeval to be when we should
+ * have been called (if we can).
+ */
+ m->m_pkthdr.lro_nsegs = 1;
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ } else {
+ /* Should not happen; should we KASSERT instead? */
+ tcp_get_usecs(&tv);
+ }
+ /* Now what about next packet? */
+ if (m_save || has_pkt)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+ iptos, nxt_pkt, &tv);
+ if (retval) {
+ /* We lost the lock and tcb probably */
+ m = m_save;
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+ }
+skipped_pkt:
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+}
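
The driver loop above boils down to: detach the head of the m_nextpkt chain, hand it to the stack's do-segment callback with nxt_pkt set whenever more input follows, and, if the callback reports the tcpcb is gone, free the rest of the chain. A stripped-down sketch of that control flow, using a hypothetical pkt type and handler in place of mbufs and tfb_do_segment_nounlock:

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for the mbuf chain and the per-segment callback. */
struct pkt {
	struct pkt *next;		/* plays the role of m_nextpkt */
	/* headers and payload would live here */
};

/* Consumes p; returns nonzero if the connection was torn down. */
typedef int (*segment_handler)(struct pkt *p, int nxt_pkt);

static int
process_chain(struct pkt *head, int has_pkt, segment_handler handler)
{
	struct pkt *next, *tmp;
	int nxt_pkt, ret;

	while (head != NULL) {
		next = head->next;
		head->next = NULL;
		/* nxt_pkt tells the handler more input follows, so it may defer output. */
		nxt_pkt = (next != NULL || has_pkt) ? 1 : 0;
		ret = handler(head, nxt_pkt);
		if (ret != 0) {
			/* Connection is gone: free whatever remains of the chain. */
			while (next != NULL) {
				tmp = next->next;
				free(next);
				next = tmp;
			}
			return (ret);
		}
		head = next;
	}
	return (0);
}

static int
dummy_handler(struct pkt *p, int nxt_pkt)
{
	printf("segment processed, nxt_pkt=%d\n", nxt_pkt);
	free(p);			/* the handler consumes the packet */
	return (0);			/* 0: connection still alive */
}

int
main(void)
{
	struct pkt *a = calloc(1, sizeof(*a));
	struct pkt *b = calloc(1, sizeof(*b));

	a->next = b;
	return (process_chain(a, 0, dummy_handler));
}
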
+
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+ struct mbuf *m;
+
+ /* First lets see if we have old packets */
+ if (tp->t_in_pkt) {
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+ /* We lost the tcpcb (maybe a RST came in)? */
+ return (1);
+ }
+ }
+ return (0);
+}
+
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+ return (tp->snd_max - tp->snd_una);
+}
+
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+ if (rc_sacked <= ctf_outstanding(tp))
+ return (ctf_outstanding(tp) - rc_sacked);
+ else {
+ /* TSNH */
+#ifdef INVARIANTS
+ panic("tp:%p rc_sacked:%d > out:%d",
+ tp, rc_sacked, ctf_outstanding(tp));
+#endif
+ return (0);
+ }
+}
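
ctf_outstanding() and ctf_flight_size() are the usual bookkeeping: outstanding bytes are snd_max - snd_una, and flight size subtracts what SACK has already accounted for (clamped at zero outside INVARIANTS kernels). A trivial check of that arithmetic with illustrative sequence numbers:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	/* Illustrative values: 10000 bytes sent beyond snd_una, 3000 SACKed. */
	uint32_t snd_una = 1000, snd_max = 11000, sacked = 3000;
	uint32_t outstanding = snd_max - snd_una;	/* ctf_outstanding() */
	uint32_t flight = outstanding - sacked;		/* ctf_flight_size() */

	assert(outstanding == 10000);
	assert(flight == 7000);
	return (0);
}
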
+
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks returns 1 if you should not proceed. It places
+ * in ret_val what should be returned 1/0 by the caller. The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+ int32_t todrop;
+ int32_t thflags;
+ int32_t tlen;
+
+ thflags = *thf;
+ tlen = *tlenp;
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ /*
+ * DSACK - add SACK block for dropped range
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
+ /*
+ * ACK now, as the next in-sequence segment
+ * will clear the DSACK block again
+ */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ *drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+ /*
+ * If segment ends after window, drop trailing data (and PUSH and
+ * FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment and
+ * ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ return (1);
+ }
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH | TH_FIN);
+ }
+ *thf = thflags;
+ *tlenp = tlen;
+ return (0);
+}
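
The front-trim half of ctf_drop_checks() computes todrop = rcv_nxt - th_seq, the portion of the segment we have already received, and shaves that many bytes off the front by advancing th_seq and drop_hdrlen (queueing a DSACK block when SACK is permitted). A small worked example of just the trim arithmetic, with made-up sequence numbers and header lengths:

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * The peer retransmits a 300-byte segment starting at 900, but we
	 * have already received everything below 1000 (all values made up).
	 */
	uint32_t rcv_nxt = 1000, th_seq = 900;
	int32_t tlen = 300, drop_hdrlen = 52;	/* link + IP + TCP headers, say */
	int32_t todrop = (int32_t)(rcv_nxt - th_seq);

	if (todrop > 0 && todrop < tlen) {
		drop_hdrlen += todrop;		/* skip the duplicate bytes */
		th_seq += todrop;
		tlen -= todrop;
	}
	assert(th_seq == 1000 && tlen == 200 && drop_hdrlen == 152);
	return (0);
}
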
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+ /*
+ * Generate an ACK dropping incoming segment if it occupies sequence
+ * space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all paths to this
+ * code happen after packets containing RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the segment
+ * we received passes the SYN-RECEIVED ACK test. If it fails send a
+ * RST. This breaks the loop in the "LAND" DoS attack, and also
+ * prevents an ACK storm between two listening ports that have been
+ * sent forged SYN segments, each with the source address of the
+ * other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ *ret_val = 1;
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return;
+ } else
+ *ret_val = 0;
+ tp->t_flags |= TF_ACKNOW;
+ if (m)
+ m_freem(m);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ if (m)
+ m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
+ * window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should test against
+ * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
+ * of closed window, not covered by the RFC.
+ */
+ int dropped = 0;
+
+ if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ (tp->last_ack_sent == th->th_seq) ||
+ (tp->rcv_nxt == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ dropped = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ }
+ } else {
+ m_freem(m);
+ }
+ return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ *ret_val = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ *ret_val = 0;
+ ctf_do_drop(m, NULL);
+ }
+}
+
+/*
+ * ctf_ts_check returns 1 if you should not proceed and the state
+ * machine should return. It places in ret_val what should
+ * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+ int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ *ret_val = 0;
+ if (tlen) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ }
+ return (1);
+ }
+ return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+ int32_t win;
+
+ /*
+ * Calculate amount of space in receive window, and then do TCP
+ * input processing. Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+
+ if (tp->t_inpcb) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ }
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+ int optlen;
+
+ if (tp->t_flags & TF_NOOPT)
+ return (tp->t_maxseg);
+
+ /*
+ * Here we have simplified code from tcp_addoptions(),
+ * without a proper loop, and with most of the padding hardcoded.
+ * We only consider fixed options that we would send on every
+ * segment, i.e. SACK is not considered.
+ *
+ */
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN);
+ return (tp->t_maxseg - optlen);
+}
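
ctf_fixed_maxseg() subtracts from t_maxseg only the options that go out on (nearly) every segment, padding each to a 4-byte boundary and deliberately ignoring SACK blocks. A quick sketch of the PAD arithmetic using the well-known TCP option lengths (12 bytes for the NOP,NOP,Timestamps layout, 18 for TCP-MD5), stated here from memory rather than pulled from the headers:

#include <assert.h>
#include <stdio.h>

#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)

int
main(void)
{
	int tcpolen_tstamp_appa = 12;	/* NOP,NOP,Timestamps: already aligned */
	int tcpolen_signature = 18;	/* TCP-MD5 option, pads to 20 */
	int t_maxseg = 1460;
	int optlen;

	/* Established connection sending timestamps plus TCP-MD5. */
	optlen = tcpolen_tstamp_appa + PAD(tcpolen_signature);
	assert(PAD(tcpolen_signature) == 20);
	printf("fixed maxseg = %d\n", t_maxseg - optlen);	/* 1428 */
	return (0);
}
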
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex8 = num_sack_blks;
+ if (num_sack_blks > 0) {
+ log.u_bbr.flex1 = sack_blocks[0].start;
+ log.u_bbr.flex2 = sack_blocks[0].end;
+ }
+ if (num_sack_blks > 1) {
+ log.u_bbr.flex3 = sack_blocks[1].start;
+ log.u_bbr.flex4 = sack_blocks[1].end;
+ }
+ if (num_sack_blks > 2) {
+ log.u_bbr.flex5 = sack_blocks[2].start;
+ log.u_bbr.flex6 = sack_blocks[2].end;
+ }
+ if (num_sack_blks > 3) {
+ log.u_bbr.applimited = sack_blocks[3].start;
+ log.u_bbr.pkts_out = sack_blocks[3].end;
+ }
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_SACK_FILTER_RES, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+ /*
+ * Given a count, decay it by a set percentage. The
+ * percentage is in thousands i.e. 100% = 1000,
+ * 19.3% = 193.
+ */
+ uint64_t perc_count, decay_per;
+ uint32_t decayed_count;
+ if (decay > 1000) {
+ /* We don't raise it */
+ return (count);
+ }
+ perc_count = count;
+ decay_per = decay;
+ perc_count *= decay_per;
+ perc_count /= 1000;
+ /*
+ * So now perc_count holds the
+ * count decay value.
+ */
+ decayed_count = count - (uint32_t)perc_count;
+ return (decayed_count);
+}
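
ctf_decay_count() expresses the decay in thousandths: decayed = count - count * decay / 1000, computed in 64 bits to avoid overflow, with anything above 1000 (more than 100%) treated as no decay. So decaying a count of 1000 by 19.3% (decay = 193) leaves 807. A quick userland check of that arithmetic:

#include <assert.h>
#include <stdint.h>

static uint32_t
decay_count(uint32_t count, uint32_t decay)
{
	uint64_t perc;

	if (decay > 1000)		/* more than 100% would grow the count */
		return (count);
	perc = (uint64_t)count * decay / 1000;
	return (count - (uint32_t)perc);
}

int
main(void)
{
	assert(decay_count(1000, 193) == 807);	/* 19.3% decay */
	assert(decay_count(500, 1000) == 0);	/* 100% decay */
	assert(decay_count(500, 1500) == 500);	/* >100% is ignored */
	return (0);
}
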
Index: head/sys/netinet/tcp_var.h
===================================================================
--- head/sys/netinet/tcp_var.h
+++ head/sys/netinet/tcp_var.h
@@ -102,7 +102,8 @@
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
- bits_spare : 4;
+ t_fin_is_rst: 1, /* Are fin's treated as resets */
+ bits_spare : 3;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -271,6 +272,11 @@
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
+ int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
+ int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, struct timeval *);
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h
+++ head/sys/sys/mbuf.h
@@ -407,6 +407,7 @@
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
hw-stamped on port (useful for IEEE 1588
and 802.1AS) */
+#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
#define M_PROTO1 0x00001000 /* protocol-specific */
#define M_PROTO2 0x00002000 /* protocol-specific */