D15020.diff
Index: head/sys/conf/files
===================================================================
--- head/sys/conf/files
+++ head/sys/conf/files
@@ -4355,6 +4355,7 @@
netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
+netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap
netinet/tcp_reass.c optional inet | inet6
netinet/tcp_sack.c optional inet | inet6
Index: head/sys/conf/options
===================================================================
--- head/sys/conf/options
+++ head/sys/conf/options
@@ -218,6 +218,7 @@
SYSVSEM opt_sysvipc.h
SYSVSHM opt_sysvipc.h
SW_WATCHDOG opt_watchdog.h
+TCPHPTS opt_inet.h
TURNSTILE_PROFILING
UMTX_PROFILING
UMTX_CHAINS opt_global.h
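
With the build glue above in place, the feature is compiled into a kernel by
setting the new option. A minimal sketch of a kernel configuration that
enables it (the ident is illustrative, not part of this diff):

	include	GENERIC
	ident	HPTS-TEST
	options	TCPHPTS
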
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -156,6 +156,7 @@
* from the global list.
*
* Key:
+ * (b) - Protected by the hpts lock.
* (c) - Constant after initialization
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
@@ -164,7 +165,52 @@
* (h) - Protected by the pcbhash lock for the inpcb
* (s) - Protected by another subsystem's locks
* (x) - Undefined locking
+ *
+ * Notes on the tcp_hpts:
+ *
+ * First, the hpts lock order is:
+ * 1) INP_WLOCK()
+ * 2) HPTS_LOCK() i.e. hpts->pmtx
*
+ * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
+ * You may check the inp->inp_in_hpts flag without the hpts lock.
+ * The hpts is the only one that will clear this flag, and it does so
+ * holding only the hpts lock. This means that in your tcp_output()
+ * routine, when you test the inp_in_hpts flag and see 1,
+ * it may be transitioning to 0 (by the hpts).
+ * That's ok, since it just means an extra call to tcp_output:
+ * most likely the call that was executing
+ * (when the mismatch occurred) will have put the TCB back
+ * on the hpts and the extra call will return. If your
+ * call did not add the inp back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Usually
+ * you are either doing this from a timer, where you need and have
+ * the INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ *
+ * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
+ * inp_input_cpu_set fields are controlled completely by
+ * the hpts. Do not ever set these. The inp_hpts_cpu_set
+ * and inp_input_cpu_set fields indicate whether the hpts has
+ * set up the respective cpu field. If the _set field is 0,
+ * it is advised to enqueue the packet with the appropriate
+ * hpts_immediate() call. If the _set field is 1, then
+ * you may compare the inp_*_cpu field to the curcpu and
+ * may want to again insert onto the hpts if these fields
+ * are not equal (i.e. you are not on the expected CPU).
+ *
+ * A note on inp_hpts_calls and inp_input_calls: these
+ * flags are set when the hpts calls either the output
+ * or do_segment routines respectively. If the routine
+ * being called wants to use this, then it needs to
+ * clear the flag before returning. The hpts will not
+ * clear the flag. The flags can be used to tell if
+ * the hpts is the function calling the respective
+ * routine.
+ *
* A few other notes:
*
* When a read lock is held, stability of the field is guaranteed; to write
@@ -190,14 +236,45 @@
LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
-#define inp_start_zero inp_refcount
+#define inp_start_zero inp_hpts
#define inp_zero_size (sizeof(struct inpcb) - \
offsetof(struct inpcb, inp_start_zero))
+ TAILQ_ENTRY(inpcb) inp_hpts; /* pacing out queue next lock(b) */
+
+ uint32_t inp_hpts_request; /* Current hpts request, zero if
+ * fits in the pacing window (i&b). */
+	/*
+	 * Note the next fields are protected by a
+	 * different lock (the hpts lock). This means that
+	 * they must correspond in size to the smallest
+	 * protectable bit field (uint8_t on x86, and
+	 * potentially uint32_t on other platforms). Also,
+	 * since CPU switches can occur at different times, the two
+	 * fields can *not* be collapsed into a single bit field.
+	 */
+#if defined(__amd64__) || defined(__i386__)
+ volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+#else
+ volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
+ volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+#endif
+ volatile uint16_t inp_hpts_cpu; /* Lock (i) */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
+ volatile uint16_t inp_input_cpu; /* Lock (i) */
+ volatile uint8_t inp_hpts_cpu_set :1, /* on output hpts (i) */
+ inp_input_cpu_set : 1, /* on input hpts (i) */
+ inp_hpts_calls :1, /* (i) from output hpts */
+ inp_input_calls :1, /* (i) from input hpts */
+ inp_spare_bits2 : 4;
+ uint8_t inp_spare_byte; /* Compiler hole */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
+	uint32_t inp_hptsslot;	/* Hpts wheel slot this tcb is on, Lock(i&b) */
+ uint32_t inp_hpts_drop_reas; /* reason we are dropping the PCB (lock i&b) */
+ TAILQ_ENTRY(inpcb) inp_input; /* pacing in queue next lock(b) */
struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */
struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@@ -638,6 +715,7 @@
#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */
#define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
+#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
/*
* Flags passed to in_pcblookup*() functions.
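
The locking notes in this header reduce to a small guard pattern in a stack's
output path. A minimal sketch (example_stack_output() is hypothetical;
tcp_hpts_insert() and HPTS_USEC_TO_SLOTS() are declared in tcp_hpts.h later
in this diff):

	static int
	example_stack_output(struct tcpcb *tp)
	{
		struct inpcb *inp = tp->t_inpcb;

		INP_WLOCK_ASSERT(inp);
		/* Racy read is fine: only the hpts clears this flag. */
		if (inp->inp_in_hpts)
			return (0);	/* hpts will call us when the slot fires */
		/* ... send whatever cwnd and the pacing budget allow ... */
		tcp_hpts_insert(inp, HPTS_USEC_TO_SLOTS(550));
		return (0);
	}

If the racy flag test loses the race described above, the worst case is one
extra tcp_output() call that finds nothing to do.
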
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c
+++ head/sys/netinet/in_pcb.c
@@ -58,6 +58,7 @@
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/rmlock.h>
+#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
@@ -87,6 +88,9 @@
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
+#ifdef TCPHPTS
+#include <netinet/tcp_hpts.h>
+#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#endif
@@ -1224,9 +1228,28 @@
}
return (0);
}
-
+
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+ * this point in any form. we must
+	 * this point in any form. We must
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_RUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1255,7 +1278,26 @@
}
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+ if (inp->inp_in_hpts || inp->inp_in_input) {
+ struct tcp_hpts_entry *hpts;
+ /*
+ * We should not be on the hpts at
+ * this point in any form. we must
+	 * this point in any form. We must
+ */
+ hpts = tcp_hpts_lock(inp);
+ if (inp->inp_in_hpts)
+ panic("Hpts:%p inp:%p at free still on hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if (inp->inp_in_input)
+ panic("Hpts:%p inp:%p at free still on input hpts",
+ hpts, inp);
+ mtx_unlock(&hpts->p_mtx);
+ }
+#endif
INP_WUNLOCK(inp);
pcbinfo = inp->inp_pcbinfo;
uma_zfree(pcbinfo->ipi_zone, inp);
Index: head/sys/netinet/tcp_hpts.h
===================================================================
--- head/sys/netinet/tcp_hpts.h
+++ head/sys/netinet/tcp_hpts.h
@@ -0,0 +1,304 @@
+#ifndef __tcp_hpts_h__
+#define __tcp_hpts_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+
+/*
+ * The hpts uses a 102400-slot wheel. The wheel
+ * defines the time in 10 usec increments (102400 x 10).
+ * This gives a range of 10 usec - 1024 ms to place
+ * an entry within. If the user requests more than
+ * 1.024 seconds, a remainder is attached and the hpts,
+ * when seeing the remainder, will re-insert the
+ * inpcb forward in time from where it is until
+ * the remainder is zero.
+ */
+
+#define NUM_OF_HPTSI_SLOTS 102400
+
+TAILQ_HEAD(hptsh, inpcb);
+
+/* Number of usecs in an hpts tick */
+#define HPTS_TICKS_PER_USEC 10
+#define HPTS_MS_TO_SLOTS(x)	((x) * 100)
+#define HPTS_USEC_TO_SLOTS(x)	(((x) + 9) / 10)
+#define HPTS_USEC_IN_SEC 1000000
+#define HPTS_MSEC_IN_SEC 1000
+#define HPTS_USEC_IN_MSEC 1000
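
To make the conversions concrete: HPTS_MS_TO_SLOTS(1) is 100 slots (1 ms at
10 usec per slot), and HPTS_USEC_TO_SLOTS(550) is (550 + 9) / 10 = 55 slots;
the + 9 rounds a partial slot up so a request is never scheduled early. A
request beyond the wheel's 1.024 second span is capped and the leftover is
carried in inp_hpts_request, as the comment above describes.
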
+
+#define DEFAULT_HPTS_LOG 3072
+
+/*
+ * Log flags consist of
+ * 7f 7f 1 1 bits
+ * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
+ *
+ * So for example cpu 10, number 10, with
+ * input active, would show up as:
+ * p_flags = 0001010 0001010 1 0
+ * <or>
+ * p_flags = 0x142a
+ */
+#define HPTS_HPTS_ACTIVE 0x01
+#define HPTS_INPUT_ACTIVE 0x02
+
+#define HPTSLOG_IMMEDIATE 1
+#define HPTSLOG_INSERT_NORMAL 2
+#define HPTSLOG_INSERT_SLEEPER 3
+#define HPTSLOG_SLEEP_AFTER 4
+#define HPTSLOG_SLEEP_BEFORE 5
+#define HPTSLOG_INSERTED 6
+#define HPTSLOG_WAKEUP_HPTS 7
+#define HPTSLOG_SETTORUN 8
+#define HPTSLOG_HPTSI 9
+#define HPTSLOG_TOLONG 10
+#define HPTSLOG_AWAKENS 11
+#define HPTSLOG_TIMESOUT 12
+#define HPTSLOG_SLEEPSET 13
+#define HPTSLOG_WAKEUP_INPUT 14
+#define HPTSLOG_RESCHEDULE 15
+#define HPTSLOG_AWAKE 16
+#define HPTSLOG_INP_DONE 17
+
+struct hpts_log {
+ struct inpcb *inp;
+ int32_t event;
+ uint32_t cts;
+ int32_t line;
+ uint32_t ticknow;
+ uint32_t t_paceslot;
+ uint32_t t_hptsreq;
+ uint32_t p_curtick;
+ uint32_t p_prevtick;
+ uint32_t slot_req;
+ uint32_t p_on_queue_cnt;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t p_hpts_sleep_time;
+ uint16_t p_flags;
+ uint8_t p_onhpts;
+ uint8_t p_oninput;
+ uint8_t is_notempty;
+};
+
+struct hpts_diag {
+ uint32_t p_hpts_active;
+ uint32_t p_nxt_slot;
+ uint32_t p_cur_slot;
+ uint32_t slot_req;
+ uint32_t inp_hptsslot;
+ uint32_t slot_now;
+ uint32_t have_slept;
+ uint32_t hpts_sleep_time;
+ uint32_t yet_to_sleep;
+ uint32_t need_new_to;
+ int32_t co_ret;
+ uint8_t p_on_min_sleep;
+};
+
+#ifdef _KERNEL
+/* Each hpts has its own p_mtx which is used for locking */
+struct tcp_hpts_entry {
+ /* Cache line 0x00 */
+ struct mtx p_mtx; /* Mutex for hpts */
+ uint32_t p_hpts_active; /* Flag that says hpts is awake */
+ uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
+ uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
+ uint32_t p_nxt_slot; /* The next slot outside the current range of
+ * slots that the hpts is running on. */
+ int32_t p_on_queue_cnt; /* Count on queue in this hpts */
+ uint32_t enobuf_cnt;
+ uint16_t p_log_at;
+ uint8_t p_direct_wake :1, /* boolean */
+ p_log_wrapped :1, /* boolean */
+ p_on_min_sleep:1; /* boolean */
+ uint8_t p_fill;
+ /* Cache line 0x40 */
+ void *p_inp;
+ struct hptsh p_input; /* For the tcp-input runner */
+ /* Hptsi wheel */
+ struct hptsh *p_hptss;
+ struct hpts_log *p_log;
+ uint32_t p_logsize;
+ int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
+ uint32_t hit_no_enobuf;
+ uint32_t p_dyn_adjust;
+ uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
+ * of 255ms */
+ uint32_t p_delayed_by; /* How much were we delayed by */
+ /* Cache line 0x80 */
+ struct sysctl_ctx_list hpts_ctx;
+ struct sysctl_oid *hpts_root;
+ struct intr_event *ie;
+ void *ie_cookie;
+ uint16_t p_num; /* The hpts number one per cpu */
+ uint16_t p_cpu; /* The hpts CPU */
+ /* There is extra space in here */
+ /* Cache line 0x100 */
+ struct callout co __aligned(CACHE_LINE_SIZE);
+} __aligned(CACHE_LINE_SIZE);
+
+struct tcp_hptsi {
+ struct proc *rp_proc; /* Process structure for hpts */
+ struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t rp_num_hptss; /* Number of hpts threads */
+};
+
+#endif
+
+#define HPTS_REMOVE_INPUT 0x01
+#define HPTS_REMOVE_OUTPUT 0x02
+#define HPTS_REMOVE_ALL (HPTS_REMOVE_INPUT | HPTS_REMOVE_OUTPUT)
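
The two flags may be ORed; for example (a sketch, with the INP_WLOCK() held),
tcp_hpts_remove(inp, HPTS_REMOVE_ALL) takes the connection off both the
pacing wheel and the input queue in one call.
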
+
+/*
+ * When using the hpts, a TCP stack must make sure
+ * that once an INP_DROPPED flag is applied to an INP,
+ * it does not expect tcp_output() to ever be
+ * called by the hpts. The hpts will *not* call
+ * any output (or input) functions on a TCB that
+ * is in the DROPPED state.
+ *
+ * This implies final ACKs and RSTs that might
+ * be sent while a TCB is still around must be
+ * sent from a routine like tcp_respond().
+ */
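
Concretely, a stack timer that decides a connection is dead defers the
teardown to the hpts rather than calling tcp_drop() itself. A sketch
(ETIMEDOUT is just an example reason):

	tcp_set_inp_to_drop(inp, ETIMEDOUT);

The hpts later calls tcp_drop() outside the stack's own call chain, which
avoids the recursive INP_WLOCK concerns noted in tcp_hpts.c.
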
+#define DEFAULT_MIN_SLEEP 250	/* How many usecs the hpts sleeps by default;
+				 * this determines the min granularity of the
+				 * hpts. If 0, granularity is 10 usecs at
+				 * the cost of more CPU (context switching). */
+#ifdef _KERNEL
+#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
+struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
+struct tcp_hpts_entry *tcp_input_lock(struct inpcb *inp);
+int __tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_hpts_immediate(a) __tcp_queue_to_hpts_immediate(a, __LINE__)
+
+struct tcp_hpts_entry *tcp_cur_hpts(struct inpcb *inp);
+#define tcp_hpts_remove(a, b) __tcp_hpts_remove(a, b, __LINE__)
+void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line);
+
+/*
+ * To insert a TCB on the hpts you *must* be holding the
+ * INP_WLOCK(). The hpts insert code will then acquire
+ * the hpts's lock and insert the TCB on the requested
+ * slot, possibly waking up the hpts if you are requesting
+ * a time earlier than what the hpts is sleeping to (if
+ * the hpts is sleeping). You may check the inp->inp_in_hpts
+ * flag without the hpts lock. The hpts is the only one
+ * that will clear this flag holding only the hpts lock. This
+ * means that in your tcp_output() routine when you test for
+ * it to be 1 (so you won't call output) it may be transitioning
+ * to 0 (by the hpts). That will be fine since that will just
+ * mean an extra call to tcp_output that most likely will find
+ * the call you executed (when the mismatch occurred) will have
+ * put the TCB back on the hpts and it will return. If your
+ * call did not add it back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Usually
+ * you are either doing this from a timer, where you need
+ * that INP_WLOCK(), or from destroying your TCB, where again
+ * you should already have the INP_WLOCK().
+ */
+uint32_t __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line);
+#define tcp_hpts_insert(a, b) __tcp_hpts_insert(a, b, __LINE__)
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag);
+
+int
+ __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
+#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__)
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked);
+int
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line);
+#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+
+uint16_t tcp_hpts_delayedby(struct inpcb *inp);
+
+void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+#define tcp_set_hpts(a) __tcp_set_hpts(a, __LINE__)
+
+void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
+#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+
+extern int32_t tcp_min_hptsi_time;
+
+static __inline uint32_t
+tcp_tv_to_hptstick(struct timeval *sv)
+{
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
+}
+
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
+static __inline uint32_t
+tcp_tv_to_usectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+static __inline uint32_t
+tcp_tv_to_mssectick(struct timeval *sv)
+{
+ return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
+}
+
+static __inline void
+tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
+{
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static __inline uint32_t
+tcp_get_usecs(struct timeval *tv)
+{
+ struct timeval tvd;
+
+ if (tv == NULL)
+ tv = &tvd;
+ microuptime(tv);
+ return (tcp_tv_to_usectick(tv));
+}
+
+#endif
+#endif
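
The inline converters above are easy to sanity-check by hand. A sketch using
the constants from this header:

	struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };

	tcp_tv_to_hptstick(&tv);	/* 1 * 100000 + 250 / 10 = 100025 ticks */
	tcp_tv_to_usectick(&tv);	/* 1 * 1000000 + 250 = 1000250 usecs */
	tcp_gethptstick(NULL);		/* current uptime in 10 usec ticks */
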
Index: head/sys/netinet/tcp_hpts.c
===================================================================
--- head/sys/netinet/tcp_hpts.c
+++ head/sys/netinet/tcp_hpts.c
@@ -0,0 +1,1964 @@
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+/**
+ * Some notes about usage.
+ *
+ * The tcp_hpts system is designed to provide a high precision timer
+ * system for tcp. Its main purpose is to provide a mechanism for
+ * pacing packets out onto the wire. It can be used in two ways
+ * by a given TCP stack (and those two methods can be used simultaneously).
+ *
+ * First, and probably the main thing it's used by Rack and BBR for, it can
+ * be used to call tcp_output() of a transport stack at some time in the future.
+ * The normal way this is done is that tcp_output() of the stack schedules
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
+ * slot is the time from now that the stack wants to be called but it
+ * must be converted to tcp_hpts's notion of slot. This is done with
+ * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * call from the tcp_output() routine might look like:
+ *
+ * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ *
+ * The above would schedule tcp_output() to be called in 550 useconds.
+ * Note that if using this mechanism the stack will want to add near
+ * its top a check to prevent unwanted calls (from user land or the
+ * arrival of incoming acks). So it would add something like:
+ *
+ * if (inp->inp_in_hpts)
+ *	return;
+ *
+ * to prevent output processing until the time allotted has gone by.
+ * Of course this is a bare bones example and the stack will probably
+ * have more considerations than just the above.
+ *
+ * Now the tcp_hpts system will call tcp_output in one of two forms.
+ * It will first check to see if the stack has defined a
+ * tfb_tcp_output_wtime() function; if so, that is the routine it
+ * will call. If that function is not defined then it will call the
+ * tfb_tcp_output() function. The only difference between these
+ * two calls is that the former passes the time in to the function
+ * so the function does not have to access the time (which tcp_hpts
+ * already has). What these functions do is of course totally up
+ * to the individual tcp stack.
+ *
+ * Now the second function (actually two functions I guess :D)
+ * the tcp_hpts system provides is the ability to either abort
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality.
+ *
+ * So in order to use the input redirection function the
+ * stack changes its tcp_do_segment() routine to,
+ * instead of processing the data, call the function:
+ *
+ * tcp_queue_pkt_to_input()
+ *
+ * You will note that the arguments to this function look
+ * a lot like tcp_do_segment's arguments. This function
+ * will assure that the tcp_hpts system will
+ * call the function tfb_tcp_hpts_do_segment() from the
+ * correct CPU. Note that multiple calls can get pushed
+ * into the tcp_hpts system; this will be indicated by
+ * the next to last argument to tfb_tcp_hpts_do_segment()
+ * (nxt_pkt). If nxt_pkt is a 1 then another packet is
+ * coming. If nxt_pkt is a 0 then this is the last call
+ * that the tcp_hpts system has available for the tcp stack.
+ *
+ * The other point of the input system is to be able to safely
+ * drop a tcp connection without worrying about the recursive
+ * locking that may be occurring on the INP_WLOCK. So if
+ * a stack wants to drop a connection it calls:
+ *
+ * tcp_set_inp_to_drop(tp, ETIMEDOUT)
+ *
+ * to schedule the tcp_hpts system to call
+ *
+ * tcp_drop(tp, drop_reason)
+ *
+ * at a future point. This is quite handy to prevent locking
+ * issues when dropping connections.
+ *
+ */
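
In code, the input redirection described above amounts to a thin wrapper in
the stack's do_segment path. A minimal sketch (example_stack_do_segment() is
hypothetical; tcp_queue_to_input() is declared in tcp_hpts.h):

	static void
	example_stack_do_segment(struct mbuf *m, struct tcphdr *th,
	    struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen,
	    uint8_t iptos, uint8_t ti_locked)
	{
		/*
		 * Queue the segment; the hpts later calls
		 * tfb_tcp_hpts_do_segment() on the connection's input CPU,
		 * with nxt_pkt indicating whether more packets follow.
		 */
		tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos,
		    ti_locked);
	}
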
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h> /* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/counter.h>
+#include <sys/time.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_hpts.h>
+
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif				/* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif				/* IPSEC */
+#include "opt_rss.h"
+
+MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+#ifdef RSS
+static int tcp_bind_threads = 1;
+#else
+static int tcp_bind_threads = 0;
+#endif
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+
+static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
+
+TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
+
+static struct tcp_hptsi tcp_pace;
+
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp,
+ struct tcpcb **tp);
+static void tcp_wakehpts(struct tcp_hpts_entry *p);
+static void tcp_wakeinput(struct tcp_hpts_entry *p);
+static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hpts_thread(void *ctx);
+static void tcp_init_hptsi(void *st);
+
+int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
+static int32_t tcp_hpts_callout_skip_swi = 0;
+
+SYSCTL_DECL(_net_inet_tcp);
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
+
+#define timersub(tvp, uvp, vvp) \
+ do { \
+ (vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec; \
+ (vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec; \
+ if ((vvp)->tv_usec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_usec += 1000000; \
+ } \
+ } while (0)
+
+static int32_t logging_on = 0;
+static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
+static int32_t tcp_hpts_precision = 120;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+    "Value for C_PREL() precision of callout");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &logging_on, 0,
+ "Turn on logging if compiled in");
+
+counter_u64_t hpts_loops;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+ &hpts_loops, "Number of times hpts had to loop to catch up");
+
+counter_u64_t back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+ &back_tosleep, "Number of times hpts found no tcbs");
+
+static int32_t in_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
+ &in_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for input");
+static int32_t in_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
+ &in_ts_percision, 0,
+    "Do we use a precise timestamp for clients on input");
+static int32_t out_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
+ &out_newts_every_tcb, 0,
+ "Do we have a new cts every tcb we process for output");
+static int32_t out_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
+ &out_ts_percision, 0,
+    "Do we use a precise timestamp for every output cts");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+ &hpts_sleep_max, 0,
+ "The maximum time the hpts will sleep <1 - 254>");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+ &tcp_min_hptsi_time, 0,
+ "The minimum time the hpts must sleep before processing more slots");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
+ &tcp_hpts_callout_skip_swi, 0,
+    "Does the callout bypass the SWI and call the hpts directly?");
+
+static void
+__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
+ uint32_t ticknow, int32_t line)
+{
+ struct hpts_log *pl;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_log == NULL)
+ return;
+ pl = &hpts->p_log[hpts->p_log_at];
+ hpts->p_log_at++;
+ if (hpts->p_log_at >= hpts->p_logsize) {
+ hpts->p_log_at = 0;
+ hpts->p_log_wrapped = 1;
+ }
+ pl->inp = inp;
+ if (inp) {
+ pl->t_paceslot = inp->inp_hptsslot;
+ pl->t_hptsreq = inp->inp_hpts_request;
+ pl->p_onhpts = inp->inp_in_hpts;
+ pl->p_oninput = inp->inp_in_input;
+ } else {
+ pl->t_paceslot = 0;
+ pl->t_hptsreq = 0;
+ pl->p_onhpts = 0;
+ pl->p_oninput = 0;
+ }
+ pl->is_notempty = 1;
+ pl->event = event;
+ pl->line = line;
+ pl->cts = tcp_get_usecs(NULL);
+ pl->p_curtick = hpts->p_curtick;
+ pl->p_prevtick = hpts->p_prevtick;
+ pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
+ pl->ticknow = ticknow;
+ pl->slot_req = slot;
+ pl->p_nxt_slot = hpts->p_nxt_slot;
+ pl->p_cur_slot = hpts->p_cur_slot;
+ pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
+ pl->p_flags = (hpts->p_cpu & 0x7f);
+ pl->p_flags <<= 7;
+ pl->p_flags |= (hpts->p_num & 0x7f);
+ pl->p_flags <<= 2;
+ if (hpts->p_hpts_active) {
+ pl->p_flags |= HPTS_HPTS_ACTIVE;
+ }
+}
+
+#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
+
+static void
+hpts_timeout_swi(void *arg)
+{
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
+}
+
+static void
+hpts_timeout_dir(void *arg)
+{
+ tcp_hpts_thread(arg);
+}
+
+static inline void
+hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_hpts == 0) {
+ /* We are not on the hpts? */
+ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
+ }
+ if (TAILQ_EMPTY(head) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p hpts bucket empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_queue_cnt);
+ }
+#endif
+ TAILQ_REMOVE(head, inp, inp_hpts);
+ hpts->p_on_queue_cnt--;
+ if (hpts->p_on_queue_cnt < 0) {
+ /* Count should not go negative .. */
+#ifdef INVARIANTS
+ panic("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ }
+ if (clear) {
+ inp->inp_hpts_request = 0;
+ inp->inp_in_hpts = 0;
+ }
+}
+
+static inline void
+hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_hpts_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if ((noref == 0) && (inp->inp_in_hpts == 1)) {
+ /* We are already on the hpts? */
+ panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(head, inp, inp_hpts);
+ inp->inp_in_hpts = 1;
+ hpts->p_on_queue_cnt++;
+ if (noref == 0) {
+ in_pcbref(inp);
+ }
+}
+
+static inline void
+hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 0) {
+ /* We are not on the input hpts? */
+ panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
+ hpts->p_on_inqueue_cnt--;
+ if (hpts->p_on_inqueue_cnt < 0) {
+#ifdef INVARIANTS
+ panic("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts);
+#endif
+ hpts->p_on_inqueue_cnt = 0;
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ /* We should not be empty with a queue count */
+ panic("%s hpts:%p in_hpts input empty but cnt:%d",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ if (clear)
+ inp->inp_in_input = 0;
+}
+
+static inline void
+hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
+{
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx) == 0) {
+ /* We don't own the mutex? */
+ panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+ }
+ if (hpts->p_cpu != inp->inp_input_cpu) {
+ /* It is not the right cpu/mutex? */
+ panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+ }
+ if (inp->inp_in_input == 1) {
+ /* We are already on the input hpts? */
+ panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
+ }
+#endif
+ TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
+ inp->inp_in_input = 1;
+ hpts->p_on_inqueue_cnt++;
+ in_pcbref(inp);
+}
+
+static int
+sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
+{
+ struct tcp_hpts_entry *hpts;
+ size_t sz;
+ int32_t logging_was, i;
+ int32_t error = 0;
+
+	/*
+	 * HACK: Turn off logging so no locks are required. This really
+	 * needs a memory barrier :)
+	 */
+ logging_was = logging_on;
+ logging_on = 0;
+ if (!req->oldptr) {
+ /* How much? */
+ sz = 0;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ sz += (sizeof(struct hpts_log) * hpts->p_logsize);
+ }
+ error = SYSCTL_OUT(req, 0, sz);
+ } else {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ hpts = tcp_pace.rp_ent[i];
+ if (hpts->p_log == NULL)
+ continue;
+ if (hpts->p_log_wrapped)
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ else
+ sz = (sizeof(struct hpts_log) * hpts->p_log_at);
+ error = SYSCTL_OUT(req, hpts->p_log, sz);
+ }
+ }
+ logging_on = logging_was;
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+ 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
+
+
+/*
+ * Try to get the INP_INFO lock.
+ *
+ * This function always succeeds in getting the lock. It will clear
+ * *tpp and return (1) if something critical changed while the inpcb
+ * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
+ *
+ * This function relies on the fact that the hpts always holds a
+ * reference on the inpcb while the segment is on the hptsi wheel and
+ * in the input queue.
+ *
+ */
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
+{
+ struct tcp_function_block *tfb;
+ struct tcpcb *tp;
+ void *ptr;
+
+ /* Try the easy way. */
+ if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
+ return (0);
+
+ /*
+ * OK, let's try the hard way. We'll save the function pointer block
+ * to make sure that doesn't change while we aren't holding the
+ * lock.
+ */
+ tp = *tpp;
+ tfb = tp->t_fb;
+ ptr = tp->t_fb_ptr;
+ INP_WUNLOCK(inp);
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ /* If the session went away, return an error. */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ *tpp = NULL;
+ return (1);
+ }
+ /*
+ * If the function block or stack-specific data block changed,
+ * report an error.
+ */
+ tp = intotcpcb(inp);
+	if ((tp->t_fb != tfb) || (tp->t_fb_ptr != ptr)) {
+ *tpp = NULL;
+ return (1);
+ }
+ return (0);
+}
+
+
+static void
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+static void
+tcp_wakeinput(struct tcp_hpts_entry *hpts)
+{
+ HPTS_MTX_ASSERT(hpts);
+ swi_sched(hpts->ie_cookie, 0);
+ if (hpts->p_hpts_active == 2) {
+ /* Rare sleeping on a ENOBUF */
+ wakeup_one(hpts);
+ }
+}
+
+struct tcp_hpts_entry *
+tcp_cur_hpts(struct inpcb *inp)
+{
+ int32_t hpts_num;
+ struct tcp_hpts_entry *hpts;
+
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_hpts_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_hpts_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_hpts_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+struct tcp_hpts_entry *
+tcp_input_lock(struct inpcb *inp)
+{
+ struct tcp_hpts_entry *hpts;
+ int32_t hpts_num;
+
+again:
+ hpts_num = inp->inp_input_cpu;
+ hpts = tcp_pace.rp_ent[hpts_num];
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (hpts_num != inp->inp_input_cpu) {
+ mtx_unlock(&hpts->p_mtx);
+ goto again;
+ }
+ return (hpts);
+}
+
+static void
+tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
+{
+ int32_t add_freed;
+
+ if (inp->inp_flags2 & INP_FREED) {
+ /*
+ * Need to play a special trick so that in_pcbrele_wlocked
+ * does not return 1 when it really should have returned 0.
+ */
+ add_freed = 1;
+ inp->inp_flags2 &= ~INP_FREED;
+ } else {
+ add_freed = 0;
+ }
+#ifndef INP_REF_DEBUG
+ if (in_pcbrele_wlocked(inp)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#else
+ if (__in_pcbrele_wlocked(inp, line)) {
+ /*
+ * This should not happen. We have the inpcb referred to by
+ * the main socket (why we are called) and the hpts. It
+ * should always return 0.
+ */
+ panic("inpcb:%p release ret 1",
+ inp);
+ }
+#endif
+ if (add_freed) {
+ inp->inp_flags2 |= INP_FREED;
+ }
+}
+
+static void
+tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ if (inp->inp_in_hpts) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+static void
+tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input) {
+ hpts_sane_input_remove(hpts, inp, 1);
+ tcp_remove_hpts_ref(inp, hpts, line);
+ }
+}
+
+/*
+ * Called normally with the INP_LOCKED but it
+ * does not matter, the hpts lock is the key
+ * but the lock order allows us to hold the
+ * INP lock and then get the hpts lock.
+ *
+ * Valid values in the flags are
+ * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
+ * HPTS_REMOVE_INPUT - remove from the input of the hpts.
+ * Note that you can OR both values together and get two
+ * actions.
+ */
+void
+__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ if (flags & HPTS_REMOVE_OUTPUT) {
+ hpts = tcp_hpts_lock(inp);
+ tcp_hpts_remove_locked_output(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+ if (flags & HPTS_REMOVE_INPUT) {
+ hpts = tcp_input_lock(inp);
+ tcp_hpts_remove_locked_input(hpts, inp, flags, line);
+ mtx_unlock(&hpts->p_mtx);
+ }
+}
+
+static inline int
+hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+{
+ return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+}
+
+static int
+tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
+{
+ int32_t need_wake = 0;
+ uint32_t ticknow = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_hpts == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ if (hpts->p_hpts_active == 0) {
+ /* A sleeping hpts we want in next slot to run */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
+ hpts_tick(hpts, 1));
+ }
+ inp->inp_hptsslot = hpts_tick(hpts, 1);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
+ }
+ need_wake = 1;
+ } else if ((void *)inp == hpts->p_inp) {
+ /*
+ * We can't allow you to go into the same slot we
+ * are in. We must put you out.
+ */
+ inp->inp_hptsslot = hpts->p_nxt_slot;
+ } else
+ inp->inp_hptsslot = hpts->p_cur_slot;
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ inp->inp_hpts_request = 0;
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
+ }
+ if (need_wake) {
+ /*
+ * Activate the hpts if it is sleeping and its
+ * timeout is not 1.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ }
+ }
+ return (need_wake);
+}
+
+int
+__tcp_queue_to_hpts_immediate(struct inpcb *inp, int32_t line)
+{
+ int32_t ret;
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ ret = tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
+ struct hpts_diag *diag, int32_t noref)
+{
+ int32_t need_new_to = 0;
+ int32_t need_wakeup = 0;
+ uint32_t largest_slot;
+ uint32_t ticknow = 0;
+ uint32_t slot_calc;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (diag) {
+ memset(diag, 0, sizeof(struct hpts_diag));
+ diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_nxt_slot = hpts->p_nxt_slot;
+ diag->p_cur_slot = hpts->p_cur_slot;
+ diag->slot_req = slot;
+ }
+ if ((inp->inp_in_hpts == 0) || noref) {
+ inp->inp_hpts_request = slot;
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ return;
+ }
+ if (hpts->p_hpts_active) {
+ /*
+			 * It's slot - 1 since nxt_slot is the next tick that
+ * will go off since the hpts is awake
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
+ }
+ /*
+			 * We want to make sure that we don't place an inp in
+ * the range of p_cur_slot <-> p_nxt_slot. If we
+ * take from p_nxt_slot to the end, plus p_cur_slot
+ * and then take away 2, we will know how many is
+ * the max slots we can use.
+ */
+ if (hpts->p_nxt_slot > hpts->p_cur_slot) {
+ /*
+ * Non-wrap case nxt_slot <-> cur_slot we
+ * don't want to land in. So the diff gives
+ * us what is taken away from the number of
+ * slots.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
+ } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ } else {
+ /*
+ * Wrap case so the diff gives us the number
+ * of slots that we can land in.
+ */
+ largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ }
+ /*
+			 * We take away two so we never have a problem (20
+			 * usecs out of 1024000 usecs)
+ */
+ largest_slot -= 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /*
+ * Restrict max jump of slots and remember
+ * leftover
+ */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* This one will run when we hit it */
+ inp->inp_hpts_request = 0;
+ }
+ if (hpts->p_nxt_slot == hpts->p_cur_slot)
+ slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
+ else
+ slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
+ if (slot_calc == hpts->p_cur_slot) {
+#ifdef INVARIANTS
+ /* TSNH */
+ panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
+ hpts, slot_calc, slot, largest_slot);
+#endif
+ if (slot_calc)
+ slot_calc--;
+ else
+ slot_calc = NUM_OF_HPTSI_SLOTS - 1;
+ }
+ inp->inp_hptsslot = slot_calc;
+ if (diag) {
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+ } else {
+ /*
+ * The hpts is sleeping, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+ uint32_t slot_now;
+ struct timeval tv;
+
+ ticknow = tcp_gethptstick(&tv);
+ slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
+ /*
+ * The user wants to be inserted at (slot_now +
+			 * slot) % NUM_OF_HPTSI_SLOTS, so let's set that up.
+ */
+ largest_slot = NUM_OF_HPTSI_SLOTS - 2;
+ if (inp->inp_hpts_request > largest_slot) {
+ /* Adjust the residual in inp_hpts_request */
+ slot = largest_slot;
+ inp->inp_hpts_request -= largest_slot;
+ } else {
+ /* No residual it all fits */
+ inp->inp_hpts_request = 0;
+ }
+ inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
+ if (diag) {
+ diag->slot_now = slot_now;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
+ }
+ /* Now do we need to restart the hpts's timer? */
+ if (TSTMP_GT(ticknow, hpts->p_curtick))
+ have_slept = ticknow - hpts->p_curtick;
+ else
+ have_slept = 0;
+ if (have_slept < hpts->p_hpts_sleep_time) {
+ /* This should be what happens */
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ } else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
+ }
+ if (diag) {
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
+ }
+ if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ /*
+				 * We need to reschedule the hpts's time-out.
+ */
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_USEC;
+ }
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
+ }
+	/*
+	 * Now how far is the hpts sleeping to? If active is 1, it's
+	 * up and ticking and we do nothing; otherwise we may need to
+	 * reschedule its callout if need_new_to is set from above.
+	 */
+ if (need_wakeup) {
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
+ }
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
+
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
+ }
+ }
+ } else {
+#ifdef INVARIANTS
+ panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
+#endif
+ }
+}
+
+uint32_t
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+ struct tcp_hpts_entry *hpts;
+ uint32_t slot_on, cts;
+ struct timeval tv;
+
+ /*
+ * We now return the next-slot the hpts will be on, beyond its
+ * current run (if up) or where it was when it stopped if it is
+ * sleeping.
+ */
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if (in_ts_percision)
+ microuptime(&tv);
+ else
+ getmicrouptime(&tv);
+ cts = tcp_tv_to_usectick(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ slot_on = hpts->p_nxt_slot;
+ mtx_unlock(&hpts->p_mtx);
+ return (slot_on);
+}
+
+uint32_t
+__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
+ return (tcp_hpts_insert_diag(inp, slot, line, NULL));
+}
+
+int
+__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
+{
+ int32_t retval = 0;
+
+ HPTS_MTX_ASSERT(hpts);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ retval = 1;
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
+ }
+ retval = 2;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ retval = 4;
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ return (retval);
+}
+
+void
+tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked)
+{
+ /* Setup packet for input first */
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
+ m->m_pkthdr.pace_tlen = (uint16_t) tlen;
+ m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
+ m->m_pkthdr.pace_tos = iptos;
+ m->m_pkthdr.pace_lock = (uint8_t) ti_locked;
+ if (tp->t_in_pkt == NULL) {
+ tp->t_in_pkt = m;
+ tp->t_tail_pkt = m;
+ } else {
+ tp->t_tail_pkt->m_nextpkt = m;
+ tp->t_tail_pkt = m;
+ }
+}
+
+
+int32_t
+__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
+ int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, uint8_t ti_locked, int32_t line){
+ struct tcp_hpts_entry *hpts;
+ int32_t ret;
+
+ tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos, ti_locked);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ mtx_unlock(&hpts->p_mtx);
+ return (ret);
+}
+
+void
+__tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+ struct tcpcb *tp;
+
+ tp = intotcpcb(inp);
+ hpts = tcp_input_lock(tp->t_inpcb);
+ if (inp->inp_in_input == 0) {
+ /* Ok we need to set it on the hpts in the current slot */
+ hpts_sane_input_insert(hpts, inp, line);
+ if (hpts->p_hpts_active == 0) {
+ /*
+ * Activate the hpts if it is sleeping.
+ */
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ } else if (hpts->p_hpts_active == 0) {
+ hpts->p_direct_wake = 1;
+ tcp_wakeinput(hpts);
+ }
+ inp->inp_hpts_drop_reas = reason;
+ mtx_unlock(&hpts->p_mtx);
+}
+
+static uint16_t
+hpts_random_cpu(struct inpcb *inp){
+ /*
+	 * No flow type set; distribute the load randomly.
+ */
+ uint16_t cpuid;
+ uint32_t ran;
+
+ /*
+	 * If one has been set, use it, i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* Nothing set use a random number */
+ ran = arc4random();
+ cpuid = (ran & 0xffff) % mp_ncpus;
+ return (cpuid);
+}
+
+static uint16_t
+hpts_cpuid(struct inpcb *inp){
+ uint16_t cpuid;
+
+
+ /*
+	 * If one has been set, use it, i.e. we want both in and out on the
+ * same hpts.
+ */
+ if (inp->inp_input_cpu_set) {
+ return (inp->inp_input_cpu);
+ } else if (inp->inp_hpts_cpu_set) {
+ return (inp->inp_hpts_cpu);
+ }
+ /* If one is set the other must be the same */
+#ifdef RSS
+ cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
+ if (cpuid == NETISR_CPUID_NONE)
+ return (hpts_random_cpu(inp));
+ else
+ return (cpuid);
+#else
+ /*
+ * We don't have a flowid -> cpuid mapping, so cheat and just map
+ * unknown cpuids to curcpu. Not the best, but apparently better
+ * than defaulting to swi 0.
+ */
+ if (inp->inp_flowtype != M_HASHTYPE_NONE) {
+ cpuid = inp->inp_flowid % mp_ncpus;
+ return (cpuid);
+ }
+ cpuid = hpts_random_cpu(inp);
+ return (cpuid);
+#endif
+}
+
+/*
+ * Do NOT try to optimize the processing of inp's
+ * by first pulling off all the inp's into a temporary
+ * list (e.g. TAILQ_CONCAT). If you do that the subtle
+ * interactions of switching CPUs will kill you because of
+ * problems in the linked list manipulation. Basically
+ * you would switch cpu's with the hpts mutex locked
+ * but then while you were processing one of the inp's
+ * some other one that you switched will get a new
+ * packet on the different CPU. It will insert it
+ * on the new hpts's input list. Creating a temporary
+ * link in the inp will not fix it either, since
+ * the other hpts will be doing the same thing and
+ * you will both end up using the temporary link.
+ *
+ * You will die in an ASSERT for tailq corruption if you
+ * run INVARIANTS or you will die horribly without
+ * INVARIANTS in some unknown way with a corrupt linked
+ * list.
+ */
+static void
+tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
+{
+ struct mbuf *m, *n;
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ uint16_t drop_reason;
+ int16_t set_cpu;
+ uint32_t did_prefetch = 0;
+ int32_t ti_locked = TI_UNLOCKED;
+
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
+ hpts_sane_input_remove(hpts, inp, 0);
+ if (inp->inp_input_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts->p_inp = inp;
+ drop_reason = inp->inp_hpts_drop_reas;
+ inp->inp_in_input = 0;
+ mtx_unlock(&hpts->p_mtx);
+ if (drop_reason) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else {
+ ti_locked = TI_UNLOCKED;
+ }
+ INP_WLOCK(inp);
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+out:
+ hpts->p_inp = NULL;
+ if (ti_locked == TI_RLOCKED) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ }
+ if (in_pcbrele_wlocked(inp) == 0) {
+ INP_WUNLOCK(inp);
+ }
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out;
+ }
+ if (drop_reason) {
+ /* This tcb is being destroyed for drop_reason */
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ tp = tcp_drop(tp, drop_reason);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (tp == NULL) {
+ INP_WLOCK(inp);
+ }
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ continue;
+ }
+ if (set_cpu) {
+			/*
+			 * Setup so the next time we will move to the right
+			 * CPU. This should be a rare event. It will
+			 * sometimes happen when we are the client side
+			 * (usually not the server). Somehow tcp_output()
+			 * gets called before tcp_do_segment() sets the
+			 * initial state. This means the r_cpu and r_hpts_cpu
+			 * are 0. We get on the hpts, and then tcp_input()
+			 * gets called setting up the r_cpu to the correct
+			 * value. The hpts goes off and sees the mismatch.
+			 * We simply correct it here and the CPU will switch
+			 * to the new hpts next time the tcb gets added to
+			 * the hpts (not this time) :-)
+			 */
+ tcp_set_hpts(inp);
+ }
+ CURVNET_SET(tp->t_vnet);
+ m = tp->t_in_pkt;
+ n = NULL;
+ if (m != NULL &&
+ (m->m_pkthdr.pace_lock == TI_RLOCKED ||
+ tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
+ CURVNET_RESTORE();
+ goto out;
+ }
+ m = tp->t_in_pkt;
+ }
+ if (in_newts_every_tcb) {
+ if (in_ts_percision)
+ microuptime(tv);
+ else
+ getmicrouptime(tv);
+ }
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+ /* Any input work to do, if so do it first */
+ if ((m != NULL) && (m == tp->t_in_pkt)) {
+ struct tcphdr *th;
+ int32_t tlen, drop_hdrlen, nxt_pkt;
+ uint8_t iptos;
+
+ n = m->m_nextpkt;
+ tp->t_in_pkt = tp->t_tail_pkt = NULL;
+ while (m) {
+ th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
+ tlen = m->m_pkthdr.pace_tlen;
+ drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
+ iptos = m->m_pkthdr.pace_tos;
+ m->m_nextpkt = NULL;
+ if (n)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ inp->inp_input_calls = 1;
+ if (tp->t_fb->tfb_tcp_hpts_do_segment) {
+ /* Use the hpts specific do_segment */
+ (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked, nxt_pkt, tv);
+ } else {
+ /* Use the default do_segment */
+ (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
+ tp, drop_hdrlen,
+ tlen, iptos, ti_locked);
+ }
+				/*
+				 * Do segment returns unlocked. We need the
+				 * lock again, but we also need some kasserts
+				 * here.
+				 */
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ if (m != NULL &&
+ m->m_pkthdr.pace_lock == TI_RLOCKED) {
+ INP_INFO_RLOCK(&V_tcbinfo);
+ ti_locked = TI_RLOCKED;
+ } else
+ ti_locked = TI_UNLOCKED;
+ INP_WLOCK(inp);
+ /*
+ * Since we have an opening here, we must
+ * re-check whether the tcb went away while we
+ * were getting the lock(s).
+ */
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_free:
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+ CURVNET_RESTORE();
+ goto out;
+ }
+ /*
+ * Now that we hold the INP lock, check if
+ * we need to upgrade our lock.
+ */
+ if (ti_locked == TI_UNLOCKED &&
+ (tp->t_state != TCPS_ESTABLISHED)) {
+ ti_locked = TI_RLOCKED;
+ if (tcp_hptsi_lock_inpinfo(inp, &tp))
+ goto out_free;
+ }
+ } /** end while(m) */
+ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ if (in_pcbrele_wlocked(inp) == 0)
+ INP_WUNLOCK(inp);
+ if (ti_locked == TI_RLOCKED)
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+ INP_UNLOCK_ASSERT(inp);
+ ti_locked = TI_UNLOCKED;
+ mtx_lock(&hpts->p_mtx);
+ hpts->p_inp = NULL;
+ CURVNET_RESTORE();
+ }
+}
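The loop above replays deferred input by pulling the TCP header offset, length, and lock state back out of the mbuf packet header (the pace_* fields added to sys/sys/mbuf.h later in this diff). For orientation, here is a minimal kernel-side sketch of the producer end of that queue, assuming the INP_WLOCK convention this code relies on; example_defer_segment is a hypothetical name, not a function in this diff:

static void
example_defer_segment(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
    int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t ti_locked)
{
	/* Stash what tcp_input_data() needs to replay do_segment(). */
	m->m_pkthdr.pace_thoff = (uint16_t)((caddr_t)th - mtod(m, caddr_t));
	m->m_pkthdr.pace_tlen = (uint16_t)tlen;
	m->m_pkthdr.pace_drphdrlen = (uint16_t)drop_hdrlen;
	m->m_pkthdr.pace_tos = iptos;
	m->m_pkthdr.pace_lock = (uint8_t)ti_locked;
	m->m_nextpkt = NULL;
	/* Append to the tcb's deferred-input chain. */
	if (tp->t_in_pkt == NULL)
		tp->t_in_pkt = m;
	else
		tp->t_tail_pkt->m_nextpkt = m;
	tp->t_tail_pkt = m;
}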
+
+static int
+tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
+{
+ int32_t ticks_to_run;
+
+ if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
+ ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
+ if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
+ }
+ } else {
+ if (hpts->p_prevtick == hpts->p_curtick) {
+ /* This happens when we get woken up right away */
+ return (-1);
+ }
+ ticks_to_run = 1;
+ }
+ /* Set where we will be when we catch up */
+ hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
+ if (hpts->p_nxt_slot == hpts->p_cur_slot) {
+ panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
+ hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
+ }
+ return (ticks_to_run);
+}
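tcp_hpts_est_run() decides how many wheel slots the sweep in tcp_hptsi() below must cover to catch up with real time. A small self-contained illustration of that arithmetic; the slot count and tick values are assumptions for the example, not values taken from this diff:

#include <stdio.h>
#include <stdint.h>

#define EX_SLOTS 2048			/* stand-in for NUM_OF_HPTSI_SLOTS */

int
main(void)
{
	uint32_t p_prevtick = 1000, p_curtick = 1007, p_cur_slot = 40;
	int32_t ticks_to_run = p_curtick - p_prevtick;	/* 7 missed ticks */
	uint32_t p_nxt_slot = (p_cur_slot + ticks_to_run) % EX_SLOTS;

	/* The sweep covers slots 41..47; p_nxt_slot marks the catch-up point. */
	printf("run %d ticks, next slot %u\n", ticks_to_run, p_nxt_slot);
	return (0);
}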
+
+static void
+tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp = NULL, *ninp;
+ struct timeval tv;
+ int32_t ticks_to_run, i, error, tick_now, interum_tick;
+ int32_t paced_cnt = 0;
+ int32_t did_prefetch = 0;
+ int32_t prefetch_ninp = 0;
+ int32_t prefetch_tp = 0;
+ uint32_t cts;
+ int16_t set_cpu;
+
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_curtick = tcp_tv_to_hptstick(ctick);
+ cts = tcp_tv_to_usectick(ctick);
+ memcpy(&tv, ctick, sizeof(struct timeval));
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+
+ /* Figure out if we had missed ticks */
+again:
+ HPTS_MTX_ASSERT(hpts);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ HPTS_MTX_ASSERT(hpts);
+ /* Reset the ticks to run and the time if we need to */
+ interum_tick = tcp_gethptstick(&tv);
+ if (interum_tick != hpts->p_curtick) {
+ /* Save off the new time we execute to */
+ *ctick = tv;
+ hpts->p_curtick = interum_tick;
+ cts = tcp_tv_to_usectick(&tv);
+ hpts->p_cur_slot = hpts_tick(hpts, 1);
+ ticks_to_run = tcp_hpts_est_run(hpts);
+ }
+ if (ticks_to_run == -1) {
+ goto no_run;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
+ }
+ if (hpts->p_on_queue_cnt == 0) {
+ goto no_one;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ for (i = 0; i < ticks_to_run; i++) {
+ /*
+ * Calculate our delay; if there are no extra ticks,
+ * there was no delay.
+ */
+ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ HPTS_MTX_ASSERT(hpts);
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* For debugging */
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
+ }
+ hpts->p_inp = inp;
+ paced_cnt++;
+ if (hpts->p_cur_slot != inp->inp_hptsslot) {
+ panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ }
+ /* Now pull it */
+ if (inp->inp_hpts_cpu_set == 0) {
+ set_cpu = 1;
+ } else {
+ set_cpu = 0;
+ }
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ /* We prefetch the next inp if possible */
+ kern_prefetch(ninp, &prefetch_ninp);
+ prefetch_ninp = 1;
+ }
+ if (inp->inp_hpts_request) {
+ /*
+ * This inp is deferred further out in time
+ * than our wheel can hold. Push it back
+ * onto the wheel.
+ */
+ int32_t remaining_slots;
+
+ remaining_slots = ticks_to_run - (i + 1);
+ if (inp->inp_hpts_request > remaining_slots) {
+ /*
+ * Keep INVARIANTS happy by clearing
+ * the flag
+ */
+ tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ inp->inp_hpts_request = 0;
+ }
+ /*
+ * We clear the hpts flag here after dealing with
+ * remaining slots. This way anyone looking with the
+ * TCB lock will see it is on the hpts until just
+ * before we unlock.
+ */
+ inp->inp_in_hpts = 0;
+ mtx_unlock(&hpts->p_mtx);
+ INP_WLOCK(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
+out_now:
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ INP_WUNLOCK(inp);
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
+ hpts->p_inp = NULL;
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if ((tp == NULL) || (tp->t_inpcb == NULL)) {
+ goto out_now;
+ }
+ if (set_cpu) {
+ /*
+ * Setup so the next time we will move to
+ * the right CPU. This should be a rare
+ * event. It will sometimes happen when we
+ * are the client side (usually not the
+ * server). Somehow tcp_output() gets called
+ * before tcp_do_segment() sets the
+ * initial state. This means the r_cpu and
+ * r_hpts_cpu are 0. We get on the hpts, and
+ * then tcp_input() gets called setting up
+ * the r_cpu to the correct value. The hpts
+ * goes off and sees the mismatch. We
+ * simply correct it here and the CPU will
+ * switch to the new hpts next time the tcb
+ * gets added to the hpts (not this one)
+ * :-)
+ */
+ tcp_set_hpts(inp);
+ }
+ if (out_newts_every_tcb) {
+ struct timeval sv;
+
+ if (out_ts_percision)
+ microuptime(&sv);
+ else
+ getmicrouptime(&sv);
+ cts = tcp_tv_to_usectick(&sv);
+ }
+ CURVNET_SET(tp->t_vnet);
+ /*
+ * There is a hole here: we get the refcnt on the
+ * inp so it will still be preserved, but to make
+ * sure we can get at the INP we need to hold the
+ * p_mtx above while we pull out the tp/inp. As
+ * long as fini gets the lock first, we are assured
+ * of having a sane INP we can lock and test.
+ */
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx before tcp-output:%d",
+ hpts, __LINE__);
+ }
+#endif
+ if (tp->t_fb_ptr != NULL) {
+ kern_prefetch(tp->t_fb_ptr, &did_prefetch);
+ did_prefetch = 1;
+ }
+ inp->inp_hpts_calls = 1;
+ if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
+ error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
+ } else {
+ error = tp->t_fb->tfb_tcp_output(tp);
+ }
+ if (ninp && ninp->inp_ppcb) {
+ /*
+ * If we have a nxt inp, see if we can
+ * prefetch its ppcb. Note this may seem
+ * "risky" since we have no locks (other
+ * than the previous inp) and there is no
+ * assurance that ninp was not pulled while
+ * we were processing inp and freed. If this
+ * occurred it could mean that either:
+ *
+ * a) It's NULL (which is fine, we won't go
+ * here) <or> b) It's valid (which is cool,
+ * we will prefetch it) <or> c) The inp got
+ * freed back to the slab which was
+ * reallocated. Then the piece of memory was
+ * re-used and something else (not an
+ * address) is in inp_ppcb. If that occurs
+ * we don't crash, but take a TLB shootdown
+ * performance hit (same as if it was NULL
+ * and we tried to pre-fetch it).
+ *
+ * Considering that <c> is quite rare, we
+ * will take the risk of doing the prefetch.
+ * If performance drops after testing
+ * we can always take this out. NB: the
+ * kern_prefetch on amd64 actually has
+ * protection against a bad address now via
+ * the DMAP_() tests. This will prevent the
+ * TLB hit, and instead if <c> occurs just
+ * cause us to load cache with a useless
+ * address (to us).
+ */
+ kern_prefetch(ninp->inp_ppcb, &prefetch_tp);
+ prefetch_tp = 1;
+ }
+ INP_WUNLOCK(inp);
+ INP_UNLOCK_ASSERT(inp);
+ CURVNET_RESTORE();
+#ifdef INVARIANTS
+ if (mtx_owned(&hpts->p_mtx)) {
+ panic("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__);
+ }
+#endif
+ mtx_lock(&hpts->p_mtx);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
+ hpts->p_inp = NULL;
+ }
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_inp = NULL;
+ hpts->p_cur_slot++;
+ if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_cur_slot = 0;
+ }
+ }
+no_one:
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_prevtick = hpts->p_curtick;
+ hpts->p_delayed_by = 0;
+ /*
+ * Check to see if we took an excess amount of time and need to run
+ * more ticks (if we did not hit ENOBUFS).
+ */
+ /* Re-run any input that may be there */
+ (void)tcp_gethptstick(&tv);
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ }
+#ifdef INVARIANTS
+ if (TAILQ_EMPTY(&hpts->p_input) &&
+ (hpts->p_on_inqueue_cnt != 0)) {
+ panic("tp:%p in_hpts input empty but cnt:%d",
+ hpts, hpts->p_on_inqueue_cnt);
+ }
+#endif
+ tick_now = tcp_gethptstick(&tv);
+ if (SEQ_GT(tick_now, hpts->p_prevtick)) {
+ struct timeval res;
+
+ /* Did we really spend a full tick or more in here? */
+ timersub(&tv, ctick, &res);
+ if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ counter_u64_add(hpts_loops, 1);
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
+ }
+ *ctick = res;
+ hpts->p_curtick = tick_now;
+ goto again;
+ }
+ }
+no_run:
+ {
+ uint32_t t = 0, i, fnd = 0;
+
+ if (hpts->p_on_queue_cnt) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ if (fnd) {
+ hpts->p_hpts_sleep_time = i;
+ } else {
+ counter_u64_add(back_tosleep, 1);
+#ifdef INVARIANTS
+ panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+#endif
+ hpts->p_on_queue_cnt = 0;
+ goto non_found;
+ }
+ t++;
+ } else {
+ /* No one is on the wheel; sleep for all but 2 slots */
+non_found:
+ if (hpts_sleep_max == 0)
+ hpts_sleep_max = 1;
+ hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
+ t = 0;
+ }
+ if (logging_on) {
+ tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
+ }
+ }
+}
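The sweep in tcp_hptsi() is what actually paces output: a stack parks a connection some number of wheel slots in the future, and its output routine is invoked when the wheel reaches that slot. A hedged sketch of how a pacing stack might compute that slot, assuming the tcp_hpts_insert() wrapper declared in tcp_hpts.h (not shown in this section) and using HPTS_TICKS_PER_USEC as the usec-per-slot constant it serves as above; pace_gap_usecs and the function name are illustrative:

static void
example_pace_next_send(struct tcpcb *tp, uint32_t pace_gap_usecs)
{
	uint32_t slots;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	/* Convert the desired inter-send gap into wheel slots. */
	slots = pace_gap_usecs / HPTS_TICKS_PER_USEC;
	if (slots == 0)
		slots = 1;		/* always land at least one tick out */
	tcp_hpts_insert(tp->t_inpcb, slots);
}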
+
+void
+__tcp_set_hpts(struct inpcb *inp, int32_t line)
+{
+ struct tcp_hpts_entry *hpts;
+
+ INP_WLOCK_ASSERT(inp);
+ hpts = tcp_hpts_lock(inp);
+ if ((inp->inp_in_hpts == 0) &&
+ (inp->inp_hpts_cpu_set == 0)) {
+ inp->inp_hpts_cpu = hpts_cpuid(inp);
+ inp->inp_hpts_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+ hpts = tcp_input_lock(inp);
+ if ((inp->inp_input_cpu_set == 0) &&
+ (inp->inp_in_input == 0)) {
+ inp->inp_input_cpu = hpts_cpuid(inp);
+ inp->inp_input_cpu_set = 1;
+ }
+ mtx_unlock(&hpts->p_mtx);
+}
+
+uint16_t
+tcp_hpts_delayedby(struct inpcb *inp)
+{
+ return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
+}
+
+static void
+tcp_hpts_thread(void *ctx)
+{
+ struct tcp_hpts_entry *hpts;
+ struct timeval tv;
+ sbintime_t sb;
+
+ hpts = (struct tcp_hpts_entry *)ctx;
+ mtx_lock(&hpts->p_mtx);
+ if (hpts->p_direct_wake) {
+ /* Signaled by input */
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
+ callout_stop(&hpts->co);
+ } else {
+ /* Timed out */
+ if (callout_pending(&hpts->co) ||
+ !callout_active(&hpts->co)) {
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
+ mtx_unlock(&hpts->p_mtx);
+ return;
+ }
+ callout_deactivate(&hpts->co);
+ if (logging_on)
+ tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
+ }
+ hpts->p_hpts_active = 1;
+ (void)tcp_gethptstick(&tv);
+ tcp_hptsi(hpts, &tv);
+ HPTS_MTX_ASSERT(hpts);
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->p_on_min_sleep = 0;
+ }
+ hpts->p_hpts_active = 0;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ hpts->p_direct_wake = 0;
+ mtx_unlock(&hpts->p_mtx);
+}
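Before sleeping, tcp_hpts_thread() converts p_hpts_sleep_time from wheel ticks to microseconds and clamps it below by tcp_min_hptsi_time. A tiny stand-alone illustration of that conversion; the 10 usec tick size and the sample values are assumptions for the example:

#include <stdio.h>
#include <stdint.h>

#define EX_TICK_USEC 10			/* assumed HPTS_TICKS_PER_USEC */

int
main(void)
{
	uint32_t p_hpts_sleep_time = 512;	/* ticks to next occupied slot */
	uint32_t tcp_min_hptsi_time = 250;	/* floor, in usec */
	uint32_t usec = p_hpts_sleep_time * EX_TICK_USEC;
	int p_on_min_sleep = 0;

	if (tcp_min_hptsi_time && usec < tcp_min_hptsi_time) {
		usec = tcp_min_hptsi_time;	/* the min-sleep clamp */
		p_on_min_sleep = 1;
	}
	printf("callout armed %u usec out (min_sleep=%d)\n", usec,
	    p_on_min_sleep);
	return (0);
}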
+
+#undef timersub
+
+static void
+tcp_init_hptsi(void *st)
+{
+ int32_t i, j, error, bound = 0, created = 0;
+ size_t sz, asz;
+ struct timeval tv;
+ sbintime_t sb;
+ struct tcp_hpts_entry *hpts;
+ char unit[16];
+ uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
+
+ tcp_pace.rp_proc = NULL;
+ tcp_pace.rp_num_hptss = ncpus;
+ hpts_loops = counter_u64_alloc(M_WAITOK);
+ back_tosleep = counter_u64_alloc(M_WAITOK);
+
+ sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
+ tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
+ M_TCPHPTS, M_WAITOK | M_ZERO);
+ tcp_pace.rp_ent[i]->p_hptss = malloc(asz,
+ M_TCPHPTS, M_WAITOK);
+ hpts = tcp_pace.rp_ent[i];
+ /*
+ * Init all the hpts structures that are not specifically
+ * zero'd by the allocations. Also attach them to the
+ * appropriate sysctl block.
+ */
+ mtx_init(&hpts->p_mtx, "tcp_hpts_lck",
+ "hpts", MTX_DEF | MTX_DUPOK);
+ TAILQ_INIT(&hpts->p_input);
+ for (j = 0; j < NUM_OF_HPTSI_SLOTS; j++) {
+ TAILQ_INIT(&hpts->p_hptss[j]);
+ }
+ sysctl_ctx_init(&hpts->hpts_ctx);
+ sprintf(unit, "%d", i);
+ hpts->hpts_root = SYSCTL_ADD_NODE(&hpts->hpts_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_hpts),
+ OID_AUTO,
+ unit,
+ CTLFLAG_RW, 0,
+ "");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "in_qcnt", CTLFLAG_RD,
+ &hpts->p_on_inqueue_cnt, 0,
+ "Count TCB's awaiting input processing");
+ SYSCTL_ADD_INT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "out_qcnt", CTLFLAG_RD,
+ &hpts->p_on_queue_cnt, 0,
+ "Count TCB's awaiting output processing");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "active", CTLFLAG_RD,
+ &hpts->p_hpts_active, 0,
+ "Is the hpts active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curslot", CTLFLAG_RD,
+ &hpts->p_cur_slot, 0,
+ "What the current slot is if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "curtick", CTLFLAG_RD,
+ &hpts->p_curtick, 0,
+ "What the current tick on if active");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "logsize", CTLFLAG_RD,
+ &hpts->p_logsize, 0,
+ "Hpts logging buffer size");
+ hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ hpts->p_num = i;
+ hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prevtick -= 1;
+ hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_cpu = 0xffff;
+ hpts->p_nxt_slot = 1;
+ hpts->p_logsize = tcp_hpts_logging_size;
+ if (hpts->p_logsize) {
+ sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+ hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ }
+ callout_init(&hpts->co, 1);
+ }
+ /*
+ * Now let's start ithreads to handle the hptss.
+ */
+ CPU_FOREACH(i) {
+ hpts = tcp_pace.rp_ent[i];
+ hpts->p_cpu = i;
+ error = swi_add(&hpts->ie, "hpts",
+ tcp_hpts_thread, (void *)hpts,
+ SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
+ if (error) {
+ panic("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error);
+ }
+ created++;
+ if (tcp_bind_threads) {
+ if (intr_event_bind(hpts->ie, i) == 0)
+ bound++;
+ }
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ sb = tvtosbt(tv);
+ if (tcp_hpts_callout_skip_swi == 0) {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, hpts->p_cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else {
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_dir, hpts,
+ hpts->p_cpu,
+ C_PREL(tcp_hpts_precision));
+ }
+ }
+ printf("TCP Hpts created %d swi interrupt thread and bound %d\n",
+ created, bound);
+ return;
+}
+
+SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
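tcp_init_hptsi() hangs a per-hpts sysctl node off net.inet.tcp.hpts, one per CPU, so queue depths and activity can be observed from userland. A small userspace sketch of reading one of those nodes, assuming hpts unit 0 exists; the OID string follows the node layout created above:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int qcnt;
	size_t len = sizeof(qcnt);

	if (sysctlbyname("net.inet.tcp.hpts.0.out_qcnt", &qcnt, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("hpts 0: %d TCBs awaiting output processing\n", qcnt);
	return (0);
}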
Index: head/sys/netinet/tcp_stacks/fastpath.c
===================================================================
--- head/sys/netinet/tcp_stacks/fastpath.c
+++ head/sys/netinet/tcp_stacks/fastpath.c
@@ -2404,7 +2404,7 @@
err = register_tcp_functions(&__tcp_fastslow, M_WAITOK);
if (err) {
printf("Failed to register fastslow module -- err:%d\n", err);
- deregister_tcp_functions(&__tcp_fastack);
+ deregister_tcp_functions(&__tcp_fastack, false, true);
return(err);
}
break;
@@ -2412,12 +2412,12 @@
if ((__tcp_fastslow.tfb_refcnt) ||( __tcp_fastack.tfb_refcnt)) {
return(EBUSY);
}
+ err = deregister_tcp_functions(&__tcp_fastack, true, false);
+ err = deregister_tcp_functions(&__tcp_fastslow, true, false);
break;
case MOD_UNLOAD:
- err = deregister_tcp_functions(&__tcp_fastack);
- if (err == EBUSY)
- break;
- err = deregister_tcp_functions(&__tcp_fastslow);
+ err = deregister_tcp_functions(&__tcp_fastack, false, true);
+ err = deregister_tcp_functions(&__tcp_fastslow, false, true);
if (err == EBUSY)
break;
err = 0;
Index: head/sys/netinet/tcp_subr.c
===================================================================
--- head/sys/netinet/tcp_subr.c
+++ head/sys/netinet/tcp_subr.c
@@ -232,6 +232,9 @@
VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
#endif
+static int tcp_default_fb_init(struct tcpcb *tp);
+static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
+static int tcp_default_handoff_ok(struct tcpcb *tp);
static struct inpcb *tcp_notify(struct inpcb *, int);
static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
static void tcp_mtudisc(struct inpcb *, int);
@@ -240,18 +243,13 @@
static struct tcp_function_block tcp_def_funcblk = {
- "default",
- tcp_output,
- tcp_do_segment,
- tcp_default_ctloutput,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- NULL,
- 0,
- 0
+ .tfb_tcp_block_name = "freebsd",
+ .tfb_tcp_output = tcp_output,
+ .tfb_tcp_do_segment = tcp_do_segment,
+ .tfb_tcp_ctloutput = tcp_default_ctloutput,
+ .tfb_tcp_handoff_ok = tcp_default_handoff_ok,
+ .tfb_tcp_fb_init = tcp_default_fb_init,
+ .tfb_tcp_fb_fini = tcp_default_fb_fini,
};
int t_functions_inited = 0;
@@ -328,7 +326,89 @@
return(rblk);
}
+static struct tcp_function_block *
+find_and_ref_tcp_default_fb(void)
+{
+ struct tcp_function_block *rblk;
+ rw_rlock(&tcp_function_lock);
+ rblk = tcp_func_set_ptr;
+ refcount_acquire(&rblk->tfb_refcnt);
+ rw_runlock(&tcp_function_lock);
+ return (rblk);
+}
+
+void
+tcp_switch_back_to_default(struct tcpcb *tp)
+{
+ struct tcp_function_block *tfb;
+
+ KASSERT(tp->t_fb != &tcp_def_funcblk,
+ ("%s: called by the built-in default stack", __func__));
+
+ /*
+ * Release the old stack. This function will either find a new one
+ * or panic.
+ */
+ if (tp->t_fb->tfb_tcp_fb_fini != NULL)
+ (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+ refcount_release(&tp->t_fb->tfb_refcnt);
+
+ /*
+ * Now, we'll find a new function block to use.
+ * Start by trying the current user-selected
+ * default, unless this stack is the user-selected
+ * default.
+ */
+ tfb = find_and_ref_tcp_default_fb();
+ if (tfb == tp->t_fb) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Does the stack accept this connection? */
+ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
+ (*tfb->tfb_tcp_handoff_ok)(tp)) {
+ refcount_release(&tfb->tfb_refcnt);
+ tfb = NULL;
+ }
+ /* Try to use that stack. */
+ if (tfb != NULL) {
+ /* Initialize the new stack. If it succeeds, we are done. */
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init == NULL ||
+ (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
+ return;
+
+ /*
+ * Initialization failed. Release the reference count on
+ * the stack.
+ */
+ refcount_release(&tfb->tfb_refcnt);
+ }
+
+ /*
+ * If that wasn't feasible, use the built-in default
+ * stack which is not allowed to reject anyone.
+ */
+ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
+ if (tfb == NULL) {
+ /* there always should be a default */
+ panic("Can't refer to tcp_def_funcblk");
+ }
+ if (tfb->tfb_tcp_handoff_ok != NULL) {
+ if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
+ /* The default stack cannot say no */
+ panic("Default stack rejects a new session?");
+ }
+ }
+ tp->t_fb = tfb;
+ if (tp->t_fb->tfb_tcp_fb_init != NULL &&
+ (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
+ /* The default stack cannot fail */
+ panic("Default stack initialization failed");
+ }
+}
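A stack's tfb_tcp_handoff_ok returns non-zero to refuse a connection, which is what lets the fallback chain above terminate at the built-in default (the only stack that may never say no). A sketch of a refusing implementation for a hypothetical alternate stack; nothing like it is part of this diff:

static int
example_handoff_ok(struct tcpcb *tp)
{
	/*
	 * Only adopt fully established connections; anything else
	 * falls through to the default stack, which must take it.
	 */
	if (tp->t_state != TCPS_ESTABLISHED)
		return (EINVAL);	/* non-zero means "rejected" */
	return (0);
}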
+
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
{
@@ -507,6 +587,89 @@
"List TCP function block name-to-ID mappings");
/*
+ * tfb_tcp_handoff_ok() function for the default stack.
+ * Note that we'll basically try to take all comers.
+ */
+static int
+tcp_default_handoff_ok(struct tcpcb *tp)
+{
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_init() function for the default stack.
+ *
+ * This handles making sure we have appropriate timers set if you are
+ * transitioning a socket that has some amount of setup done.
+ *
+ * The init() function for the default stack can *never* return non-zero, i.e.
+ * it is required to always succeed since it is the stack of last resort!
+ */
+static int
+tcp_default_fb_init(struct tcpcb *tp)
+{
+
+ struct socket *so;
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
+ ("%s: connection %p in unexpected state %d", __func__, tp,
+ tp->t_state));
+
+ /*
+ * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
+ * know what to do for unexpected states (which includes TIME_WAIT).
+ */
+ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
+ return (0);
+
+ /*
+ * Make sure some kind of transmission timer is set if there is
+ * outstanding data.
+ */
+ so = tp->t_inpcb->inp_socket;
+ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
+ tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
+ tcp_timer_active(tp, TT_PERSIST))) {
+ /*
+ * If the session is established and it looks like it should
+ * be in the persist state, set the persist timer. Otherwise,
+ * set the retransmit timer.
+ */
+ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
+ (int32_t)(tp->snd_nxt - tp->snd_una) <
+ (int32_t)sbavail(&so->so_snd))
+ tcp_setpersist(tp);
+ else
+ tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
+ }
+
+ /* All non-embryonic sessions get a keepalive timer. */
+ if (!tcp_timer_active(tp, TT_KEEP))
+ tcp_timer_activate(tp, TT_KEEP,
+ TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
+ TP_KEEPINIT(tp));
+
+ return (0);
+}
+
+/*
+ * tfb_tcp_fb_fini() function for the default stack.
+ *
+ * This changes state as necessary (or prudent) to prepare for another stack
+ * to assume responsibility for the connection.
+ */
+static void
+tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
+{
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+ return;
+}
+
+/*
* Target size of TCP PCB hash tables. Must be a power of two.
*
* Note that this can be overridden by the kernel environment
@@ -732,11 +895,28 @@
return (register_tcp_functions_as_name(blk, NULL, wait));
}
+/*
+ * Deregister all names associated with a function block. This
+ * functionally removes the function block from use within the system.
+ *
+ * When called with a true quiesce argument, mark the function block
+ * as being removed so no more stacks will use it and determine
+ * whether the removal would succeed.
+ *
+ * When called with a false quiesce argument, actually attempt the
+ * removal.
+ *
+ * When called with a force argument, attempt to switch all TCBs to
+ * use the default stack instead of returning EBUSY.
+ *
+ * Returns 0 on success (or if the removal would succeed), or an error
+ * code on failure.
+ */
int
-deregister_tcp_functions(struct tcp_function_block *blk)
+deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force)
{
struct tcp_function *f;
- int error=ENOENT;
if (strcmp(blk->tfb_tcp_block_name, "default") == 0) {
/* You can't un-register the default */
@@ -748,22 +928,63 @@
rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ /* Mark the block so no more stacks can use it. */
+ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
+ /*
+ * If TCBs are still attached to the stack, attempt to switch them
+ * to the default stack.
+ */
+ if (force && blk->tfb_refcnt) {
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ VNET_ITERATOR_DECL(vnet_iter);
+
+ rw_wunlock(&tcp_function_lock);
+
+ VNET_LIST_RLOCK();
+ VNET_FOREACH(vnet_iter) {
+ CURVNET_SET(vnet_iter);
+ INP_INFO_WLOCK(&V_tcbinfo);
+ LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) {
+ INP_WLOCK(inp);
+ if (inp->inp_flags & INP_TIMEWAIT) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tp = intotcpcb(inp);
+ if (tp == NULL || tp->t_fb != blk) {
+ INP_WUNLOCK(inp);
+ continue;
+ }
+ tcp_switch_back_to_default(tp);
+ INP_WUNLOCK(inp);
+ }
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ }
+ VNET_LIST_RUNLOCK();
+
+ rw_wlock(&tcp_function_lock);
+ }
if (blk->tfb_refcnt) {
- /* Still tcb attached, mark it. */
- blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
- rw_wunlock(&tcp_function_lock);
+ /* TCBs still attached. */
+ rw_wunlock(&tcp_function_lock);
return (EBUSY);
}
+ if (quiesce) {
+ /* Skip removal. */
+ rw_wunlock(&tcp_function_lock);
+ return (0);
+ }
+ /* Remove any function names that map to this function block. */
while (find_tcp_fb_locked(blk, &f) != NULL) {
- /* Found */
TAILQ_REMOVE(&t_functions, f, tf_next);
tcp_fb_cnt--;
f->tf_fb = NULL;
free(f, M_TCPFUNCTIONS);
- error = 0;
}
rw_wunlock(&tcp_function_lock);
- return (error);
+ return (0);
}
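With the new quiesce/force arguments, a stack module can split its teardown into a reversible MOD_QUIESCE check and a forced MOD_UNLOAD, exactly as the fastpath.c hunk above does. A condensed sketch of such a module event handler, with example_blk and example_mod_event standing in for a real module's symbols:

static int
example_mod_event(module_t mod, int type, void *data)
{
	int err = 0;

	switch (type) {
	case MOD_QUIESCE:
		/* Mark the block unusable; report whether removal would work. */
		err = deregister_tcp_functions(&example_blk, true, false);
		break;
	case MOD_UNLOAD:
		/* Really remove, switching any remaining TCBs to the default. */
		err = deregister_tcp_functions(&example_blk, false, true);
		break;
	default:
		err = EOPNOTSUPP;
		break;
	}
	return (err);
}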
void
Index: head/sys/netinet/tcp_syncache.c
===================================================================
--- head/sys/netinet/tcp_syncache.c
+++ head/sys/netinet/tcp_syncache.c
@@ -852,6 +852,12 @@
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = rblk;
+ /*
+ * XXXrrs this is quite dangerous: it is possible
+ * for the new function to fail to init. We also
+ * are not asking if the handoff is ok, though at
+ * the very start that's probably ok.
+ */
if (tp->t_fb->tfb_tcp_fb_init) {
(*tp->t_fb->tfb_tcp_fb_init)(tp);
}
Index: head/sys/netinet/tcp_usrreq.c
===================================================================
--- head/sys/netinet/tcp_usrreq.c
+++ head/sys/netinet/tcp_usrreq.c
@@ -1521,17 +1521,34 @@
*/
(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
}
+#ifdef TCPHPTS
+ /* Ensure that we are not on any hpts */
+ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_ALL);
+#endif
+ if (blk->tfb_tcp_fb_init) {
+ error = (*blk->tfb_tcp_fb_init)(tp);
+ if (error) {
+ refcount_release(&blk->tfb_refcnt);
+ if (tp->t_fb->tfb_tcp_fb_init) {
+ if ((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0) {
+ /* Fall back failed, drop the connection */
+ INP_WUNLOCK(inp);
+ soabort(so);
+ return(error);
+ }
+ }
+ goto err_out;
+ }
+ }
refcount_release(&tp->t_fb->tfb_refcnt);
tp->t_fb = blk;
- if (tp->t_fb->tfb_tcp_fb_init) {
- (*tp->t_fb->tfb_tcp_fb_init)(tp);
- }
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_ctloutput(tp, sopt->sopt_dir,
sopt->sopt_name);
}
#endif
+err_out:
INP_WUNLOCK(inp);
return (error);
} else if ((sopt->sopt_dir == SOPT_GET) &&
Index: head/sys/netinet/tcp_var.h
===================================================================
--- head/sys/netinet/tcp_var.h
+++ head/sys/netinet/tcp_var.h
@@ -90,6 +90,8 @@
int t_segqlen; /* segment reassembly queue length */
int t_dupacks; /* consecutive dup acks recd */
+ struct mbuf *t_in_pkt; /* head of the input packet queue for the tcp_hpts system */
+ struct mbuf *t_tail_pkt; /* tail of the input packet queue for the tcp_hpts system */
struct tcp_timer *t_timers; /* All the TCP timers in one struct */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@@ -257,14 +259,19 @@
struct tcp_function_block {
char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
int (*tfb_tcp_output)(struct tcpcb *);
+ int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
int);
+ void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, int, struct timeval *);
int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt,
struct inpcb *inp, struct tcpcb *tp);
/* Optional memory allocation/free routine */
- void (*tfb_tcp_fb_init)(struct tcpcb *);
+ int (*tfb_tcp_fb_init)(struct tcpcb *);
void (*tfb_tcp_fb_fini)(struct tcpcb *, int);
/* Optional timers, must define all if you define one */
int (*tfb_tcp_timer_stop_all)(struct tcpcb *);
@@ -274,6 +281,7 @@
void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
void (*tfb_tcp_rexmit_tmr)(struct tcpcb *);
int (*tfb_tcp_handoff_ok)(struct tcpcb *);
+ void (*tfb_tcp_mtu_chg)(struct tcpcb *);
volatile uint32_t tfb_refcnt;
uint32_t tfb_flags;
uint8_t tfb_id;
@@ -851,9 +859,12 @@
int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
const char *name, int wait);
-int deregister_tcp_functions(struct tcp_function_block *blk);
+int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
+ bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
-struct tcp_function_block *find_and_ref_tcp_fb(struct tcp_function_block *blk);
+void tcp_switch_back_to_default(struct tcpcb *tp);
+struct tcp_function_block *
+find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp);
uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
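The tcp_function_block changes above add optional time-aware hooks (tfb_tcp_output_wtime, tfb_tcp_hpts_do_segment) and make tfb_tcp_fb_init return int so a stack may refuse initialization. A sketch of how an hpts-aware stack might populate the struct, using designated initializers as the tcp_subr.c hunk now does; every example_* symbol is a placeholder, not part of this diff:

/* Placeholder prototypes; a real stack supplies these. */
static int example_output(struct tcpcb *);
static int example_output_wtime(struct tcpcb *, const struct timeval *);
static void example_do_segment(struct mbuf *, struct tcphdr *,
    struct socket *, struct tcpcb *, int, int, uint8_t, int);
static void example_hpts_do_segment(struct mbuf *, struct tcphdr *,
    struct socket *, struct tcpcb *, int, int, uint8_t, int, int,
    struct timeval *);
static int example_fb_init(struct tcpcb *);
static void example_fb_fini(struct tcpcb *, int);

static struct tcp_function_block example_funcblk = {
	.tfb_tcp_block_name = "example",
	.tfb_tcp_output = example_output,
	.tfb_tcp_output_wtime = example_output_wtime,
	.tfb_tcp_do_segment = example_do_segment,
	.tfb_tcp_hpts_do_segment = example_hpts_do_segment,
	.tfb_tcp_ctloutput = tcp_default_ctloutput,
	.tfb_tcp_fb_init = example_fb_init,	/* now returns int; may fail */
	.tfb_tcp_fb_fini = example_fb_fini,
};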
Index: head/sys/sys/kern_prefetch.h
===================================================================
--- head/sys/sys/kern_prefetch.h
+++ head/sys/sys/kern_prefetch.h
@@ -0,0 +1,50 @@
+#ifndef __kern_prefetch_h__
+/*-
+ * Copyright (c) 2016-8
+ * Netflix Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * __FBSDID("$FreeBSD$")
+ */
+#define __kern_prefetch_h__
+#ifdef _KERNEL
+#if defined(__amd64__)
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+#endif
+
+static __inline void
+kern_prefetch(const volatile void *addr, void* before)
+{
+#if defined(__amd64__)
+ __asm __volatile("prefetcht1 (%1)":"=rm"(*((int32_t *)before)):"r"(addr):);
+#else
+ __builtin_prefetch(addr);
+#endif
+}
+
+#endif
+#endif
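The second argument of kern_prefetch() is a scratch location the amd64 inline asm declares as an output; that artificial data dependency keeps the compiler from dropping or hoisting the prefetch. A usage sketch mirroring the call sites in tcp_hpts.c above, with struct foo and example_walk as illustrative names:

struct foo {
	struct foo *next;
	int val;
};

static int
example_walk(struct foo *cur)
{
	int scratch;

	if (cur->next != NULL)
		kern_prefetch(cur->next, &scratch);	/* warm the next node */
	return (cur->val);	/* work on the current node meanwhile */
}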
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h
+++ head/sys/sys/mbuf.h
@@ -196,6 +196,11 @@
#define lro_nsegs tso_segsz
#define csum_phsum PH_per.sixteen[2]
#define csum_data PH_per.thirtytwo[1]
+#define pace_thoff PH_loc.sixteen[0]
+#define pace_tlen PH_loc.sixteen[1]
+#define pace_drphdrlen PH_loc.sixteen[2]
+#define pace_tos PH_loc.eight[6]
+#define pace_lock PH_loc.eight[7]
/*
* Description of external storage mapped into mbuf; valid only if M_EXT is
D15020: Add the high-precision timer system (hpts) used by rack and bbr for pacing. Also sync up the function block differences needed by bbr/rack.