diff --git a/sys/netinet/tcp_log_buf.c b/sys/netinet/tcp_log_buf.c index c533f8329fbe..fb8cb4453136 100644 --- a/sys/netinet/tcp_log_buf.c +++ b/sys/netinet/tcp_log_buf.c @@ -1,2973 +1,2973 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Must come after qmath.h and tree.h */ #include #include #include #include #include #include #include #include #include #include #include #include /* Default expiry time */ #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S) /* Max interval at which to run the expiry timer */ #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S) bool tcp_log_verbose; static uma_zone_t tcp_log_id_bucket_zone, tcp_log_id_node_zone, tcp_log_zone; static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT; static uint32_t tcp_log_version = TCP_LOG_BUF_VER; RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket); static struct tcp_log_id_tree tcp_log_id_head; static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head = STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head); static struct mtx tcp_log_expireq_mtx; static struct callout tcp_log_expireq_callout; static u_long tcp_log_auto_ratio = 0; static volatile u_long tcp_log_auto_ratio_cur = 0; static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL; static bool tcp_log_auto_all = false; static uint32_t tcp_disable_all_bb_logs = 0; RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Black Box controls"); SYSCTL_NODE(_net_inet_tcp_bb, OID_AUTO, tp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "TCP Black Box Trace Point controls"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose, 0, "Force verbose logging for TCP traces"); SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, CTLFLAG_RW, &tcp_log_session_limit, 0, "Maximum number of events maintained for each TCP session"); uint32_t tcp_trace_point_config = 0; SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, number, CTLFLAG_RW, &tcp_trace_point_config, TCP_LOG_STATE_HEAD_AUTO, "What is 
    the trace point number to activate (0=none, 0xffffffff = all)?");
uint32_t tcp_trace_point_bb_mode = TCP_LOG_STATE_CONTINUAL;
SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, bbmode, CTLFLAG_RW,
    &tcp_trace_point_bb_mode, TCP_LOG_STATE_HEAD_AUTO,
    "What is the BB logging mode that is activated?");
int32_t tcp_trace_point_count = 0;
SYSCTL_U32(_net_inet_tcp_bb_tp, OID_AUTO, count, CTLFLAG_RW,
    &tcp_trace_point_count, TCP_LOG_STATE_HEAD_AUTO,
    "How many connections that hit the trace point will have BB logging turned on?");
SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
    &tcp_log_zone,
    "Maximum number of events maintained for all TCP sessions");
SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
    &tcp_log_zone,
    "Current number of events maintained for all TCP sessions");
SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
    &tcp_log_id_bucket_zone, "Maximum number of log IDs");
SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
    &tcp_log_id_bucket_zone, "Current number of log IDs");
SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
    &tcp_log_id_node_zone, "Maximum number of tcpcbs with log IDs");
SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
    &tcp_log_id_node_zone, "Current number of tcpcbs with log IDs");
SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD,
    &tcp_log_version, 0, "Version of log formats exported");
SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW,
    &tcp_disable_all_bb_logs, 0,
    "Disable all BB logging for all connections");
SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
    &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
    &tcp_log_auto_mode, 0,
    "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)");
SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
    &tcp_log_auto_all, 0,
    "Auto-select from all sessions (rather than just those with IDs)");

#ifdef TCPLOG_DEBUG_COUNTERS
counter_u64_t tcp_log_queued;
counter_u64_t tcp_log_que_fail1;
counter_u64_t tcp_log_que_fail2;
counter_u64_t tcp_log_que_fail3;
counter_u64_t tcp_log_que_fail4;
counter_u64_t tcp_log_que_fail5;
counter_u64_t tcp_log_que_copyout;
counter_u64_t tcp_log_que_read;
counter_u64_t tcp_log_que_freed;

SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
    &tcp_log_queued, "Number of entries queued");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
    &tcp_log_que_fail1, "Number of entries queued but fail 1");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
    &tcp_log_que_fail2, "Number of entries queued but fail 2");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
    &tcp_log_que_fail3, "Number of entries queued but fail 3");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
    &tcp_log_que_fail4, "Number of entries queued but fail 4");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
    &tcp_log_que_fail5, "Number of entries queued but fail 5");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
    &tcp_log_que_copyout, "Number of entries copied out");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
    &tcp_log_que_read, "Number of entries read from the queue");
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
    &tcp_log_que_freed, "Number of entries freed after reading");
#endif

#ifdef INVARIANTS
#define
TCPLOG_DEBUG_RINGBUF #endif /* Number of requests to consider a PBCID "active". */ #define ACTIVE_REQUEST_COUNT 10 /* Statistic tracking for "active" PBCIDs. */ static counter_u64_t tcp_log_pcb_ids_cur; static counter_u64_t tcp_log_pcb_ids_tot; SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD, &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD, &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated"); struct tcp_log_mem { STAILQ_ENTRY(tcp_log_mem) tlm_queue; struct tcp_log_buffer tlm_buf; struct tcp_log_verbose tlm_v; #ifdef TCPLOG_DEBUG_RINGBUF volatile int tlm_refcnt; #endif }; /* 60 bytes for the header, + 16 bytes for padding */ static uint8_t zerobuf[76]; /* * Lock order: * 1. TCPID_TREE * 2. TCPID_BUCKET * 3. INP * * Rules: * A. You need a lock on the Tree to add/remove buckets. * B. You need a lock on the bucket to add/remove nodes from the bucket. * C. To change information in a node, you need the INP lock if the tln_closed * field is false. Otherwise, you need the bucket lock. (Note that the * tln_closed field can change at any point, so you need to recheck the * entry after acquiring the INP lock.) * D. To remove a node from the bucket, you must have that entry locked, * according to the criteria of Rule C. Also, the node must not be on * the expiry queue. * E. The exception to C is the expiry queue fields, which are locked by * the TCPLOG_EXPIREQ lock. * * Buckets have a reference count. Each node is a reference. Further, * other callers may add reference counts to keep a bucket from disappearing. * You can add a reference as long as you own a lock sufficient to keep the * bucket from disappearing. For example, a common use is: * a. Have a locked INP, but need to lock the TCPID_BUCKET. * b. Add a refcount on the bucket. (Safe because the INP lock prevents * the TCPID_BUCKET from going away.) * c. Drop the INP lock. * d. Acquire a lock on the TCPID_BUCKET. * e. Acquire a lock on the INP. * f. Drop the refcount on the bucket. * (At this point, the bucket may disappear.) * * Expire queue lock: * You can acquire this with either the bucket or INP lock. Don't reverse it. * When the expire code has committed to freeing a node, it resets the expiry * time to SBT_MAX. That is the signal to everyone else that they should * leave that node alone. 
*/ static struct rwlock tcp_id_tree_lock; #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock) #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock) #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock) #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock) #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock) #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED) #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED) #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED) #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF) #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED) #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED) #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt)) #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt)) #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx) #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx) SLIST_HEAD(tcp_log_id_head, tcp_log_id_node); struct tcp_log_id_bucket { /* * tlb_id must be first. This lets us use strcmp on * (struct tcp_log_id_bucket *) and (char *) interchangeably. */ char tlb_id[TCP_LOG_ID_LEN]; char tlb_tag[TCP_LOG_TAG_LEN]; RB_ENTRY(tcp_log_id_bucket) tlb_rb; struct tcp_log_id_head tlb_head; struct mtx tlb_mtx; volatile u_int tlb_refcnt; volatile u_int tlb_reqcnt; uint32_t tlb_loglimit; int8_t tlb_logstate; }; struct tcp_log_id_node { SLIST_ENTRY(tcp_log_id_node) tln_list; STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */ sbintime_t tln_expiretime; /* Locked by the expireq lock */ /* * If INP is NULL, that means the connection has closed. We've * saved the connection endpoint information and the log entries * in the tln_ie and tln_entries members. We've also saved a pointer * to the enclosing bucket here. If INP is not NULL, the information is * in the PCB and not here. */ struct inpcb *tln_inp; struct tcpcb *tln_tp; struct tcp_log_id_bucket *tln_bucket; struct in_endpoints tln_ie; struct tcp_log_stailq tln_entries; int tln_count; volatile int tln_closed; uint8_t tln_af; }; enum tree_lock_state { TREE_UNLOCKED = 0, TREE_RLOCKED, TREE_WLOCKED, }; /* Do we want to select this session for auto-logging? */ static __inline bool tcp_log_selectauto(void) { /* * If we are doing auto-capturing, figure out whether we will capture * this session. 
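 * The check below is purely ratio based: tcp_log_auto_ratio_cur is
 * atomically post-incremented for every candidate session, and a
 * session is selected when the previous counter value is a multiple of
 * tcp_log_auto_ratio. For example, with net.inet.tcp.bb.log_auto_ratio
 * set to 100, roughly one out of every 100 candidate sessions is
 * auto-selected; a ratio of 0 disables auto-selection, as does setting
 * net.inet.tcp.bb.disable_all.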
*/ if (tcp_log_auto_ratio && (tcp_disable_all_bb_logs == 0) && (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) % tcp_log_auto_ratio) == 0) return (true); return (false); } static __inline int tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b) { KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL")); KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL")); return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN); } RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) static __inline void tcp_log_id_validate_tree_lock(int tree_locked) { #ifdef INVARIANTS switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WLOCK_ASSERT(); break; case TREE_RLOCKED: TCPID_TREE_RLOCK_ASSERT(); break; case TREE_UNLOCKED: TCPID_TREE_UNLOCK_ASSERT(); break; default: kassert_panic("%s:%d: unknown tree lock state", __func__, __LINE__); } #endif } static __inline void tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb) { TCPID_TREE_WLOCK_ASSERT(); KASSERT(SLIST_EMPTY(&tlb->tlb_head), ("%s: Attempt to remove non-empty bucket", __func__)); if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) { #ifdef INVARIANTS kassert_panic("%s:%d: error removing element from tree", __func__, __LINE__); #endif } TCPID_BUCKET_LOCK_DESTROY(tlb); counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); uma_zfree(tcp_log_id_bucket_zone, tlb); } /* * Call with a referenced and locked bucket. * Will return true if the bucket was freed; otherwise, false. * tlb: The bucket to unreference. * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * inp: If not NULL and the function needs to drop the inp lock to relock the * tree, it will do so. (The caller must ensure inp will not become invalid, * probably by holding a reference to it.) */ static bool tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked, struct inpcb *inp) { KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); tcp_log_id_validate_tree_lock(*tree_locked); /* * Did we hold the last reference on the tlb? If so, we may need * to free it. (Note that we can realistically only execute the * loop twice: once without a write lock and once with a write * lock.) */ while (TCPID_BUCKET_UNREF(tlb)) { /* * We need a write lock on the tree to free this. * If we can upgrade the tree lock, this is "easy". If we * can't upgrade the tree lock, we need to do this the * "hard" way: unwind all our locks and relock everything. * In the meantime, anything could have changed. We even * need to validate that we still need to free the bucket. */ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE()) *tree_locked = TREE_WLOCKED; else if (*tree_locked != TREE_WLOCKED) { TCPID_BUCKET_REF(tlb); if (inp != NULL) INP_WUNLOCK(inp); TCPID_BUCKET_UNLOCK(tlb); if (*tree_locked == TREE_RLOCKED) TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); *tree_locked = TREE_WLOCKED; TCPID_BUCKET_LOCK(tlb); if (inp != NULL) INP_WLOCK(inp); continue; } /* * We have an empty bucket and a write lock on the tree. * Remove the empty bucket. */ tcp_log_remove_bucket(tlb); return (true); } return (false); } /* * Call with a locked bucket. This function will release the lock on the * bucket before returning. * * The caller is responsible for freeing the tp->t_lin/tln node! * * Note: one of tp or both tlb and tln must be supplied. * * inp: A pointer to the inp. 
If the function needs to drop the inp lock to * acquire the tree write lock, it will do so. (The caller must ensure inp * will not become invalid, probably by holding a reference to it.) * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored) * tlb: A pointer to the bucket. (optional; ignored if tp is specified) * tln: A pointer to the node. (optional; ignored if tp is specified) * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * * Will return true if the INP lock was reacquired; otherwise, false. */ static bool tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp, struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln, int *tree_locked) { int orig_tree_locked; KASSERT(tp != NULL || (tlb != NULL && tln != NULL), ("%s: called with tp=%p, tlb=%p, tln=%p", __func__, tp, tlb, tln)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); if (tp != NULL) { tlb = tp->t_lib; tln = tp->t_lin; KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__)); KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__)); } tcp_log_id_validate_tree_lock(*tree_locked); TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Remove the node, clear the log bucket and node from the TCPCB, and * decrement the bucket refcount. In the process, if this is the * last reference, the bucket will be freed. */ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list); if (tp != NULL) { tp->t_lib = NULL; tp->t_lin = NULL; } orig_tree_locked = *tree_locked; if (!tcp_log_unref_bucket(tlb, tree_locked, inp)) TCPID_BUCKET_UNLOCK(tlb); return (*tree_locked != orig_tree_locked); } #define RECHECK_INP_CLEAN(cleanup) do { \ if (inp->inp_flags & INP_DROPPED) { \ rv = ECONNRESET; \ cleanup; \ goto done; \ } \ tp = intotcpcb(inp); \ } while (0) #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */) static void tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef STATS if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif } static void tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb) { atomic_fetchadd_int(&tlb->tlb_reqcnt, 1); } int tcp_log_apply_ratio(struct tcpcb *tp, int ratio) { struct tcp_log_id_bucket *tlb; struct inpcb *inp = tptoinpcb(tp); uint32_t hash, ratio_hash_thresh; int rv, tree_locked; rv = 0; tree_locked = TREE_UNLOCKED; tlb = tp->t_lib; INP_WLOCK_ASSERT(inp); if (tlb == NULL) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } if (ratio) ratio_hash_thresh = max(1, UINT32_MAX / ratio); else ratio_hash_thresh = 0; TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); TCPID_BUCKET_LOCK(tlb); hash = hash32_buf(tlb->tlb_id, strlen(tlb->tlb_id), 0); if (hash > ratio_hash_thresh && tp->_t_logstate == TCP_LOG_STATE_OFF && tlb->tlb_logstate == TCP_LOG_STATE_OFF) { /* * Ratio decision not to log this log ID (and this connection by * way of association). We only apply a log ratio log disable * decision if it would not interfere with a log enable decision * made elsewhere e.g. tcp_log_selectauto() or setsockopt(). 
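 * The decision is deterministic per log ID: the ID string is hashed
 * with hash32_buf() and compared against ratio_hash_thresh, which is
 * roughly UINT32_MAX / ratio. For example, with a ratio of 4, only the
 * quarter of log IDs whose hash falls at or below the threshold keep
 * logging enabled; the rest are marked TCP_LOG_STATE_RATIO_OFF, which
 * disables logging for every connection sharing that ID.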
*/ tlb->tlb_logstate = TCP_LOG_STATE_RATIO_OFF; INP_WLOCK(inp); RECHECK_INP(); (void)tcp_log_state_change(tp, TCP_LOG_STATE_OFF); done: INP_WUNLOCK(inp); } INP_UNLOCK_ASSERT(inp); if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); return (rv); } /* * Associate the specified tag with a particular TCP log ID. * Called with INPCB locked. Returns with it unlocked. * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID. */ int tcp_log_set_tag(struct tcpcb *tp, char *tag) { struct inpcb *inp = tptoinpcb(tp); struct tcp_log_id_bucket *tlb; int tree_locked; INP_WLOCK_ASSERT(inp); tree_locked = TREE_UNLOCKED; tlb = tp->t_lib; if (tlb == NULL) { INP_WUNLOCK(inp); return (EOPNOTSUPP); } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); TCPID_BUCKET_LOCK(tlb); strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN); if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); return (0); } /* * Set the TCP log ID for a TCPCB. * Called with INPCB locked. Returns with it unlocked. */ int tcp_log_set_id(struct tcpcb *tp, char *id) { struct tcp_log_id_bucket *tlb, *tmp_tlb; struct tcp_log_id_node *tln; struct inpcb *inp = tptoinpcb(tp); int tree_locked, rv; bool bucket_locked, same; tlb = NULL; tln = NULL; tree_locked = TREE_UNLOCKED; bucket_locked = false; restart: INP_WLOCK_ASSERT(inp); /* See if the ID is unchanged. */ same = ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) || (tp->t_lib == NULL && *id == 0)); if (tp->_t_logstate && STAILQ_FIRST(&tp->t_logs) && !same) { /* * There are residual logs left we may * be changing id's so dump what we can. */ switch(tp->_t_logstate) { case TCP_LOG_STATE_HEAD_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at id switch", M_NOWAIT, false); break; case TCP_LOG_STATE_TAIL_AUTO: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at id switch", M_NOWAIT, false); break; case TCP_LOG_STATE_CONTINUAL: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual at id switch", M_NOWAIT, false); break; case TCP_LOG_VIA_BBPOINTS: (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints at id switch", M_NOWAIT, false); break; } } if (same) { if (tp->t_lib != NULL) { tcp_log_increment_reqcnt(tp->t_lib); if ((tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) && (tp->t_log_state_set == 0)) { /* Clone in any logging */ tp->_t_logstate = tp->t_lib->tlb_logstate; } if ((tp->t_lib->tlb_loglimit) && (tp->t_log_state_set == 0)) { /* We also have a limit set */ tp->t_loglimit = tp->t_lib->tlb_loglimit; } } rv = 0; goto done; } /* * If the TCPCB had a previous ID, we need to extricate it from * the previous list. * * Drop the TCPCB lock and lock the tree and the bucket. * Because this is called in the socket context, we (theoretically) * don't need to worry about the INPCB completely going away * while we are gone. */ if (tp->t_lib != NULL) { tlb = tp->t_lib; TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } TCPID_BUCKET_LOCK(tlb); bucket_locked = true; INP_WLOCK(inp); /* * Unreference the bucket. 
If our bucket went away, it is no * longer locked or valid. */ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) { bucket_locked = false; tlb = NULL; } /* Validate the INP. */ RECHECK_INP(); /* * Evaluate whether the bucket changed while we were unlocked. * * Possible scenarios here: * 1. Bucket is unchanged and the same one we started with. * 2. The TCPCB no longer has a bucket and our bucket was * freed. * 3. The TCPCB has a new bucket, whether ours was freed. * 4. The TCPCB no longer has a bucket and our bucket was * not freed. * * In cases 2-4, we will start over. In case 1, we will * proceed here to remove the bucket. */ if (tlb == NULL || tp->t_lib != tlb) { KASSERT(bucket_locked || tlb == NULL, ("%s: bucket_locked (%d) and tlb (%p) are " "inconsistent", __func__, bucket_locked, tlb)); if (bucket_locked) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; } goto restart; } /* * Store the (struct tcp_log_id_node) for reuse. Then, remove * it from the bucket. In the process, we may end up relocking. * If so, we need to validate that the INP is still valid, and * the TCPCB entries match we expect. * * We will clear tlb and change the bucket_locked state just * before calling tcp_log_remove_id_node(), since that function * will unlock the bucket. */ if (tln != NULL) uma_zfree(tcp_log_id_node_zone, tln); tln = tp->t_lin; tlb = NULL; bucket_locked = false; if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) { RECHECK_INP(); /* * If the TCPCB moved to a new bucket while we had * dropped the lock, restart. */ if (tp->t_lib != NULL || tp->t_lin != NULL) goto restart; } /* * Yay! We successfully removed the TCPCB from its old * bucket. Phew! * * On to bigger and better things... */ } /* At this point, the TCPCB should not be in any bucket. */ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__)); /* * If the new ID is not empty, we need to now assign this TCPCB to a * new bucket. */ if (*id) { /* Get a new tln, if we don't already have one to reuse. */ if (tln == NULL) { tln = uma_zalloc(tcp_log_id_node_zone, M_NOWAIT | M_ZERO); if (tln == NULL) { rv = ENOBUFS; goto done; } tln->tln_inp = inp; tln->tln_tp = tp; } /* * Drop the INP lock for a bit. We don't need it, and dropping * it prevents lock order reversals. */ INP_WUNLOCK(inp); /* Make sure we have at least a read lock on the tree. */ tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } refind: /* * Remember that we constructed (struct tcp_log_id_node) so * we can safely cast the id to it for the purposes of finding. */ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL", __func__, __LINE__)); tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head, (struct tcp_log_id_bucket *) id); /* * If we didn't find a matching bucket, we need to add a new * one. This requires a write lock. But, of course, we will * need to recheck some things when we re-acquire the lock. */ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) { tree_locked = TREE_WLOCKED; if (!TCPID_TREE_UPGRADE()) { TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); /* * The tree may have changed while we were * unlocked. */ goto refind; } } /* If we need to add a new bucket, do it now. */ if (tmp_tlb == NULL) { /* Allocate new bucket. 
*/ tlb = uma_zalloc(tcp_log_id_bucket_zone, M_NOWAIT); if (tlb == NULL) { rv = ENOBUFS; goto done_noinp; } counter_u64_add(tcp_log_pcb_ids_cur, 1); counter_u64_add(tcp_log_pcb_ids_tot, 1); if ((tcp_log_auto_all == false) && tcp_log_auto_mode && tcp_log_selectauto()) { /* Save off the log state */ tlb->tlb_logstate = tcp_log_auto_mode; } else tlb->tlb_logstate = TCP_LOG_STATE_OFF; tlb->tlb_loglimit = 0; tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */ /* * Copy the ID to the bucket. * NB: Don't use strlcpy() unless you are sure * we've always validated NULL termination. * * TODO: When I'm done writing this, see if we * we have correctly validated NULL termination and * can use strlcpy(). :-) */ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1); tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0'; /* * Take the refcount for the first node and go ahead * and lock this. Note that we zero the tlb_mtx * structure, since 0xdeadc0de flips the right bits * for the code to think that this mutex has already * been initialized. :-( */ SLIST_INIT(&tlb->tlb_head); refcount_init(&tlb->tlb_refcnt, 1); tlb->tlb_reqcnt = 1; memset(&tlb->tlb_mtx, 0, sizeof(struct mtx)); TCPID_BUCKET_LOCK_INIT(tlb); TCPID_BUCKET_LOCK(tlb); bucket_locked = true; #define FREE_NEW_TLB() do { \ TCPID_BUCKET_LOCK_DESTROY(tlb); \ uma_zfree(tcp_log_id_bucket_zone, tlb); \ counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); \ counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1); \ bucket_locked = false; \ tlb = NULL; \ } while (0) /* * Relock the INP and make sure we are still * unassigned. */ INP_WLOCK(inp); RECHECK_INP_CLEAN(FREE_NEW_TLB()); if (tp->t_lib != NULL) { FREE_NEW_TLB(); goto restart; } /* Add the new bucket to the tree. */ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head, tlb); KASSERT(tmp_tlb == NULL, ("%s: Unexpected conflicting bucket (%p) while " "adding new bucket (%p)", __func__, tmp_tlb, tlb)); /* * If we found a conflicting bucket, free the new * one we made and fall through to use the existing * bucket. */ if (tmp_tlb != NULL) { FREE_NEW_TLB(); INP_WUNLOCK(inp); } #undef FREE_NEW_TLB } /* If we found an existing bucket, use it. */ if (tmp_tlb != NULL) { tlb = tmp_tlb; TCPID_BUCKET_LOCK(tlb); bucket_locked = true; /* * Relock the INP and make sure we are still * unassigned. */ INP_UNLOCK_ASSERT(inp); INP_WLOCK(inp); RECHECK_INP(); if (tp->t_lib != NULL) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; goto restart; } /* Take a reference on the bucket. */ TCPID_BUCKET_REF(tlb); /* Record the request. */ tcp_log_increment_reqcnt(tlb); } tcp_log_grow_tlb(tlb->tlb_id, tp); /* Add the new node to the list. */ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list); tp->t_lib = tlb; tp->t_lin = tln; if (tp->t_lib->tlb_logstate > TCP_LOG_STATE_OFF) { /* Clone in any logging */ tp->_t_logstate = tp->t_lib->tlb_logstate; } if (tp->t_lib->tlb_loglimit) { /* The loglimit too */ tp->t_loglimit = tp->t_lib->tlb_loglimit; } tln = NULL; } rv = 0; done: /* Unlock things, as needed, and return. */ INP_WUNLOCK(inp); done_noinp: INP_UNLOCK_ASSERT(inp); if (bucket_locked) { TCPID_BUCKET_LOCK_ASSERT(tlb); TCPID_BUCKET_UNLOCK(tlb); } else if (tlb != NULL) TCPID_BUCKET_UNLOCK_ASSERT(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); if (tln != NULL) uma_zfree(tcp_log_id_node_zone, tln); return (rv); } /* * Get the TCP log ID for a TCPCB. * Called with INPCB locked. 
* 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_id(struct tcpcb *tp, char *buf) { size_t len; INP_LOCK_ASSERT(tptoinpcb(tp)); if (tp->t_lib != NULL) { len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); KASSERT(len < TCP_LOG_ID_LEN, ("%s:%d: tp->t_lib->tlb_id too long (%zu)", __func__, __LINE__, len)); } else { *buf = '\0'; len = 0; } return (len); } /* * Get the tag associated with the TCPCB's log ID. * Called with INPCB locked. Returns with it unlocked. * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_tag(struct tcpcb *tp, char *buf) { struct inpcb *inp = tptoinpcb(tp); struct tcp_log_id_bucket *tlb; size_t len; int tree_locked; INP_WLOCK_ASSERT(inp); tree_locked = TREE_UNLOCKED; tlb = tp->t_lib; if (tlb != NULL) { TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); TCPID_BUCKET_LOCK(tlb); len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN); KASSERT(len < TCP_LOG_TAG_LEN, ("%s:%d: tp->t_lib->tlb_tag too long (%zu)", __func__, __LINE__, len)); if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); } else { INP_WUNLOCK(inp); *buf = '\0'; len = 0; } return (len); } /* * Get number of connections with the same log ID. * Log ID is taken from given TCPCB. * Called with INPCB locked. */ u_int tcp_log_get_id_cnt(struct tcpcb *tp) { INP_WLOCK_ASSERT(tptoinpcb(tp)); return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Functions/macros to increment/decrement reference count for a log * entry. This should catch when we do a double-free/double-remove or * a double-add. */ static inline void _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1); if (refcnt != 0) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_add(l) \ _tcp_log_entry_refcnt_add((l), __func__, __LINE__) static inline void _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1); if (refcnt != 1) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_rem(l) \ _tcp_log_entry_refcnt_rem((l), __func__, __LINE__) #else /* !TCPLOG_DEBUG_RINGBUF */ #define tcp_log_entry_refcnt_add(l) #define tcp_log_entry_refcnt_rem(l) #endif /* * Cleanup after removing a log entry, but only decrement the count if we * are running INVARIANTS. */ static inline void tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused) { uma_zfree(tcp_log_zone, log_entry); #ifdef INVARIANTS (*count)--; KASSERT(*count >= 0, ("%s: count unexpectedly negative", __func__)); #endif } static void tcp_log_free_entries(struct tcp_log_stailq *head, int *count) { struct tcp_log_mem *log_entry; /* Free the entries. */ while ((log_entry = STAILQ_FIRST(head)) != NULL) { STAILQ_REMOVE_HEAD(head, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_free_log_common(log_entry, count); } } /* Cleanup after removing a log entry. 
*/ static inline void tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry) { uma_zfree(tcp_log_zone, log_entry); tp->t_lognum--; KASSERT(tp->t_lognum >= 0, ("%s: tp->t_lognum unexpectedly negative", __func__)); } /* Remove a log entry from the head of a list. */ static inline void tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry) { KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs), ("%s: attempt to remove non-HEAD log entry", __func__)); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_remove_log_cleanup(tp, log_entry); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Initialize the log entry's reference count, which we want to * survive allocations. */ static int tcp_log_zone_init(void *mem, int size, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; tlm->tlm_refcnt = 0; return (0); } /* * Double check that the refcnt is zero on allocation and return. */ static int tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); return (0); } static void tcp_log_zone_dtor(void *mem, int size, void *args __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); } #endif /* TCPLOG_DEBUG_RINGBUF */ /* Do global initialization. */ void tcp_log_init(void) { tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem), #ifdef TCPLOG_DEBUG_RINGBUF tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init, #else NULL, NULL, NULL, #endif NULL, UMA_ALIGN_PTR, 0); (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT); tcp_log_id_bucket_zone = uma_zcreate("tcp_log_id_bucket", sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_log_id_node_zone = uma_zcreate("tcp_log_id_node", sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); #ifdef TCPLOG_DEBUG_COUNTERS tcp_log_queued = counter_u64_alloc(M_WAITOK); tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK); tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK); tcp_log_que_copyout = counter_u64_alloc(M_WAITOK); tcp_log_que_read = counter_u64_alloc(M_WAITOK); tcp_log_que_freed = counter_u64_alloc(M_WAITOK); #endif tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK); tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK); rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW); mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF); callout_init(&tcp_log_expireq_callout, 1); } /* Do per-TCPCB initialization. */ void tcp_log_tcpcbinit(struct tcpcb *tp) { /* A new TCPCB should start out zero-initialized. */ STAILQ_INIT(&tp->t_logs); /* * If we are doing auto-capturing, figure out whether we will capture * this session. 
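 * Auto-capture of brand new sessions requires net.inet.tcp.bb.log_auto_all
 * to be true, a non-zero mode in net.inet.tcp.bb.log_auto_mode (the
 * default is TCP_LOG_STATE_TAIL), and winning the log_auto_ratio
 * lottery in tcp_log_selectauto(); e.g., to sample roughly one in every
 * thousand sessions into the default tail buffer one might set
 * (illustrative values only):
 *
 *	sysctl net.inet.tcp.bb.log_auto_all=1
 *	sysctl net.inet.tcp.bb.log_auto_ratio=1000
 *
 * Selected sessions are flagged TF2_LOG_AUTO, which lets tcp_log_event()
 * sacrifice them first if log buffers run short. When log_auto_all is
 * false, sampling instead happens in tcp_log_set_id() once a connection
 * is assigned a log ID.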
*/ tp->t_loglimit = tcp_log_session_limit; if ((tcp_log_auto_all == true) && tcp_log_auto_mode && tcp_log_selectauto()) { tp->_t_logstate = tcp_log_auto_mode; tp->t_flags2 |= TF2_LOG_AUTO; } } /* Remove entries */ static void tcp_log_expire(void *unused __unused) { struct tcp_log_id_bucket *tlb; struct tcp_log_id_node *tln; sbintime_t expiry_limit; int tree_locked; TCPLOG_EXPIREQ_LOCK(); if (callout_pending(&tcp_log_expireq_callout)) { /* Callout was reset. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Process entries until we reach one that expires too far in the * future. Look one second in the future. */ expiry_limit = getsbinuptime() + SBT_1S; tree_locked = TREE_UNLOCKED; while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL && tln->tln_expiretime <= expiry_limit) { if (!callout_active(&tcp_log_expireq_callout)) { /* * Callout was stopped. I guess we should * just quit at this point. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Remove the node from the head of the list and unlock * the list. Change the expiry time to SBT_MAX as a signal * to other threads that we now own this. */ STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq); tln->tln_expiretime = SBT_MAX; TCPLOG_EXPIREQ_UNLOCK(); /* * Remove the node from the bucket. */ tlb = tln->tln_bucket; TCPID_BUCKET_LOCK(tlb); if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) { tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_WLOCKED) TCPID_TREE_WUNLOCK(); else TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; } /* Drop the INP reference. */ INP_WLOCK(tln->tln_inp); if (!in_pcbrele_wlocked(tln->tln_inp)) INP_WUNLOCK(tln->tln_inp); /* Free the log records. */ tcp_log_free_entries(&tln->tln_entries, &tln->tln_count); /* Free the node. */ uma_zfree(tcp_log_id_node_zone, tln); /* Relock the expiry queue. */ TCPLOG_EXPIREQ_LOCK(); } /* * We've expired all the entries we can. Do we need to reschedule * ourselves? */ callout_deactivate(&tcp_log_expireq_callout); if (tln != NULL) { /* * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and * set the next callout to that. (This helps ensure we generally * run the callout no more often than desired.) */ expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL; if (expiry_limit < tln->tln_expiretime) expiry_limit = tln->tln_expiretime; callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* We're done. */ TCPLOG_EXPIREQ_UNLOCK(); return; } /* * Move log data from the TCPCB to a new node. This will reset the TCPCB log * entries and log count; however, it will not touch other things from the * TCPCB (e.g. t_lin, t_lib). * * NOTE: Must hold a lock on the INP. */ static void tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); tln->tln_ie = inp->inp_inc.inc_ie; if (inp->inp_inc.inc_flags & INC_ISIPV6) tln->tln_af = AF_INET6; else tln->tln_af = AF_INET; tln->tln_entries = tp->t_logs; tln->tln_count = tp->t_lognum; tln->tln_bucket = tp->t_lib; /* Clear information from the PCB. 
 */
	STAILQ_INIT(&tp->t_logs);
	tp->t_lognum = 0;
}

/* Do per-TCPCB cleanup */
void
tcp_log_tcpcbfini(struct tcpcb *tp)
{
	struct tcp_log_id_node *tln, *tln_first;
	struct tcp_log_mem *log_entry;
	sbintime_t callouttime;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

-#ifdef TCP_ACCOUNTING
	if (tp->_t_logstate) {
-		struct tcp_log_buffer *lgb;
		union tcp_log_stackspecific log;
		struct timeval tv;
+#ifdef TCP_ACCOUNTING
+		struct tcp_log_buffer *lgb;
		int i;

		memset(&log, 0, sizeof(log));
		if (tp->t_flags2 & TF2_TCP_ACCOUNTING) {
			for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
				log.u_raw.u64_flex[i] = tp->tcp_cnt_counters[i];
			}
			lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_ACCOUNTING, 0,
			    0, &log, false, NULL, NULL, 0, &tv);
			if (lgb != NULL) {
				lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
				lgb->tlb_flex2 = 1;
			} else
				goto skip_out;
			for (i = 0; i < TCP_NUM_CNT_COUNTERS; i++) {
				log.u_raw.u64_flex[i] = tp->tcp_proc_time[i];
			}
			lgb = tcp_log_event(tp, NULL, NULL, NULL, TCP_LOG_ACCOUNTING, 0,
			    0, &log, false, NULL, NULL, 0, &tv);
			if (lgb != NULL) {
				lgb->tlb_flex1 = TCP_NUM_CNT_COUNTERS;
				lgb->tlb_flex2 = 2;
			}
		}
skip_out:
+#endif
		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
		log.u_bbr.cur_del_rate = tp->t_end_info;
-		TCP_LOG_EVENTP(tp, NULL,
-		    NULL,
-		    NULL,
-		    TCP_LOG_CONNEND, 0,
-		    0, &log, false, &tv);
+		(void)tcp_log_event(tp, NULL,
+		    NULL,
+		    NULL,
+		    TCP_LOG_CONNEND, 0,
+		    0, &log, false, NULL, NULL, 0, &tv);
	}
-#endif
	/*
	 * If we were gathering packets to be automatically dumped, try to do
	 * it now. If this succeeds, the log information in the TCPCB will be
	 * cleared. Otherwise, we'll handle the log information as we do
	 * for other states.
	 */
	switch(tp->_t_logstate) {
	case TCP_LOG_STATE_HEAD_AUTO:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
		    M_NOWAIT, false);
		break;
	case TCP_LOG_STATE_TAIL_AUTO:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
		    M_NOWAIT, false);
		break;
	case TCP_LOG_VIA_BBPOINTS:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints",
		    M_NOWAIT, false);
		break;
	case TCP_LOG_STATE_CONTINUAL:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
		    M_NOWAIT, false);
		break;
	}

	/*
	 * There are two ways we could keep logs: per-socket or per-ID. If
	 * we are tracking logs with an ID, then the logs survive the
	 * destruction of the TCPCB.
	 *
	 * If the TCPCB is associated with an ID node, move the logs from the
	 * TCPCB to the ID node. In theory, this is safe, for reasons which I
	 * will now explain for my own benefit when I next need to figure out
	 * this code. :-)
	 *
	 * We own the INP lock. Therefore, no one else can change the contents
	 * of this node (Rule C). Further, no one can remove this node from
	 * the bucket while we hold the lock (Rule D). Basically, no one can
	 * mess with this node. That leaves two states in which we could be:
	 *
	 * 1. Another thread is currently waiting to acquire the INP lock, with
	 *    plans to do something with this node. When we drop the INP lock,
	 *    they will have a chance to do that. They will recheck the
	 *    tln_closed field (see note to Rule C) and then acquire the
	 *    bucket lock before proceeding further.
	 *
	 * 2. Another thread will try to acquire a lock at some point in the
	 *    future. If they try to acquire a lock before we set the
	 *    tln_closed field, they will follow state #1. If they try to
	 *    acquire a lock after we set the tln_closed field, they will be
	 *    able to make changes to the node, at will, following Rule C.
	 *
	 * Therefore, we currently own this node and can make any changes
	 * we want. But, as soon as we set the tln_closed field to true, we
	 * have effectively dropped our lock on the node.
(For this reason, we * also need to make sure our writes are ordered correctly. An atomic * operation with "release" semantics should be sufficient.) */ if (tp->t_lin != NULL) { struct inpcb *inp = tptoinpcb(tp); /* Copy the relevant information to the log entry. */ tln = tp->t_lin; KASSERT(tln->tln_inp == inp, ("%s: Mismatched inp (tln->tln_inp=%p, tp inpcb=%p)", __func__, tln->tln_inp, inp)); tcp_log_move_tp_to_node(tp, tln); /* Clear information from the PCB. */ tp->t_lin = NULL; tp->t_lib = NULL; /* * Take a reference on the INP. This ensures that the INP * remains valid while the node is on the expiry queue. This * ensures the INP is valid for other threads that may be * racing to lock this node when we move it to the expire * queue. */ in_pcbref(inp); /* * Store the entry on the expiry list. The exact behavior * depends on whether we have entries to keep. If so, we * put the entry at the tail of the list and expire in * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put * the entry at the head of the list. (Handling the cleanup * via the expiry timer lets us avoid locking messy-ness here.) */ tln->tln_expiretime = getsbinuptime(); TCPLOG_EXPIREQ_LOCK(); if (tln->tln_count) { tln->tln_expiretime += TCP_LOG_EXPIRE_TIME; if (STAILQ_EMPTY(&tcp_log_expireq_head) && !callout_active(&tcp_log_expireq_callout)) { /* * We are adding the first entry and a callout * is not currently scheduled; therefore, we * need to schedule one. */ callout_reset_sbt(&tcp_log_expireq_callout, tln->tln_expiretime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln, tln_expireq); } else { callouttime = tln->tln_expiretime + TCP_LOG_EXPIRE_INTVL; tln_first = STAILQ_FIRST(&tcp_log_expireq_head); if ((tln_first == NULL || callouttime < tln_first->tln_expiretime) && (callout_pending(&tcp_log_expireq_callout) || !callout_active(&tcp_log_expireq_callout))) { /* * The list is empty, or we want to run the * expire code before the first entry's timer * fires. Also, we are in a case where a callout * is not actively running. We want to reset * the callout to occur sooner. */ callout_reset_sbt(&tcp_log_expireq_callout, callouttime, SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); } /* * Insert to the head, or just after the head, as * appropriate. (This might result in small * mis-orderings as a bunch of "expire now" entries * gather at the start of the list, but that should * not produce big problems, since the expire timer * will walk through all of them.) */ if (tln_first == NULL || tln->tln_expiretime < tln_first->tln_expiretime) STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln, tln_expireq); else STAILQ_INSERT_AFTER(&tcp_log_expireq_head, tln_first, tln, tln_expireq); } TCPLOG_EXPIREQ_UNLOCK(); /* * We are done messing with the tln. After this point, we * can't touch it. (Note that the "release" semantics should * be included with the TCPLOG_EXPIREQ_UNLOCK() call above. * Therefore, they should be unnecessary here. However, it * seems like a good idea to include them anyway, since we * really are releasing a lock here.) */ atomic_store_rel_int(&tln->tln_closed, 1); } else { /* Remove log entries. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); } /* * Change the log state to off (just in case anything tries to sneak * in a last-minute log). 
*/ tp->_t_logstate = TCP_LOG_STATE_OFF; } static void tcp_log_purge_tp_logbuf(struct tcpcb *tp) { struct tcp_log_mem *log_entry; INP_WLOCK_ASSERT(tptoinpcb(tp)); if (tp->t_lognum == 0) return; while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->_t_logstate = TCP_LOG_STATE_OFF; } /* * This logs an event for a TCP socket. Normally, this is called via * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for * TCP_LOG_EVENT(). */ struct tcp_log_buffer * tcp_log_event(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *itv) { struct tcp_log_mem *log_entry; struct tcp_log_buffer *log_buf; int attempt_count = 0; struct tcp_log_verbose *log_verbose; uint32_t logsn; KASSERT((func == NULL && line == 0) || (func != NULL && line > 0), ("%s called with inconsistent func (%p) and line (%d) arguments", __func__, func, line)); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (tcp_disable_all_bb_logs) { /* * The global shutdown logging * switch has been thrown. Call * the purge function that frees * purges out the logs and * turns off logging. */ tcp_log_purge_tp_logbuf(tp); return (NULL); } KASSERT(tp->_t_logstate == TCP_LOG_STATE_HEAD || tp->_t_logstate == TCP_LOG_STATE_TAIL || tp->_t_logstate == TCP_LOG_STATE_CONTINUAL || tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO || tp->_t_logstate == TCP_LOG_VIA_BBPOINTS || tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO, ("%s called with unexpected tp->_t_logstate (%d)", __func__, tp->_t_logstate)); /* * Get the serial number. We do this early so it will * increment even if we end up skipping the log entry for some * reason. */ logsn = tp->t_logsn++; /* * Can we get a new log entry? If so, increment the lognum counter * here. */ retry: if (tp->t_lognum < tp->t_loglimit) { if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL) tp->t_lognum++; } else log_entry = NULL; /* Do we need to try to reuse? */ if (log_entry == NULL) { /* * Sacrifice auto-logged sessions without a log ID if * tcp_log_auto_all is false. (If they don't have a log * ID by now, it is probable that either they won't get one * or we are resource-constrained.) */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all) { if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) { #ifdef INVARIANTS panic("%s:%d: tcp_log_state_change() failed " "to set tp %p to TCP_LOG_STATE_CLEAR", __func__, __LINE__, tp); #endif tp->_t_logstate = TCP_LOG_STATE_OFF; } return (NULL); } /* * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump * the buffers. If successful, deactivate tracing. Otherwise, * leave it active so we will retry. 
*/ if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", M_NOWAIT, false)) { tp->_t_logstate = TCP_LOG_STATE_OFF; return(NULL); } else if ((tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false)) { if (attempt_count == 0) { attempt_count++; goto retry; } #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail4, 1); #endif return(NULL); } else if ((tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) && !tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints", M_NOWAIT, false)) { if (attempt_count == 0) { attempt_count++; goto retry; } #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail4, 1); #endif return(NULL); } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) return(NULL); /* If in HEAD state, just deactivate the tracing and return. */ if (tp->_t_logstate == TCP_LOG_STATE_HEAD) { tp->_t_logstate = TCP_LOG_STATE_OFF; return(NULL); } /* * Get a buffer to reuse. If that fails, just give up. * (We can't log anything without a buffer in which to * put it.) * * Note that we don't change the t_lognum counter * here. Because we are re-using the buffer, the total * number won't change. */ if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL) return(NULL); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); } KASSERT(log_entry != NULL, ("%s: log_entry unexpectedly NULL", __func__)); /* Extract the log buffer and verbose buffer pointers. */ log_buf = &log_entry->tlm_buf; log_verbose = &log_entry->tlm_v; /* Basic entries. */ if (itv == NULL) microuptime(&log_buf->tlb_tv); else memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval)); log_buf->tlb_ticks = ticks; log_buf->tlb_sn = logsn; log_buf->tlb_stackid = tp->t_fb->tfb_id; log_buf->tlb_eventid = eventid; log_buf->tlb_eventflags = 0; log_buf->tlb_errno = errornum; /* Socket buffers */ if (rxbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_RXBUF; log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc; log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc; log_buf->tlb_rxbuf.tls_sb_spare = 0; } else { log_buf->tlb_rxbuf.tls_sb_acc = 0; log_buf->tlb_rxbuf.tls_sb_ccc = 0; } if (txbuf != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_TXBUF; log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc; log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc; log_buf->tlb_txbuf.tls_sb_spare = 0; } else { log_buf->tlb_txbuf.tls_sb_acc = 0; log_buf->tlb_txbuf.tls_sb_ccc = 0; } /* Copy values from tp to the log entry. */ #define COPY_STAT(f) log_buf->tlb_ ## f = tp->f #define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f COPY_STAT_T(state); COPY_STAT_T(starttime); COPY_STAT(iss); COPY_STAT_T(flags); COPY_STAT(snd_una); COPY_STAT(snd_max); COPY_STAT(snd_cwnd); COPY_STAT(snd_nxt); COPY_STAT(snd_recover); COPY_STAT(snd_wnd); COPY_STAT(snd_ssthresh); COPY_STAT_T(srtt); COPY_STAT_T(rttvar); COPY_STAT(rcv_up); COPY_STAT(rcv_adv); COPY_STAT(rcv_nxt); COPY_STAT(rcv_wnd); COPY_STAT_T(dupacks); COPY_STAT_T(segqlen); COPY_STAT(snd_numholes); COPY_STAT(snd_scale); COPY_STAT(rcv_scale); COPY_STAT_T(flags2); COPY_STAT_T(fbyte_in); COPY_STAT_T(fbyte_out); #undef COPY_STAT #undef COPY_STAT_T /* Copy stack-specific info. 
*/ if (stackinfo != NULL) { memcpy(&log_buf->tlb_stackinfo, stackinfo, sizeof(log_buf->tlb_stackinfo)); log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO; } /* The packet */ log_buf->tlb_len = len; if (th) { int optlen; log_buf->tlb_eventflags |= TLB_FLAG_HDR; log_buf->tlb_th = *th; if (th_hostorder) tcp_fields_to_net(&log_buf->tlb_th); optlen = (th->th_off << 2) - sizeof (struct tcphdr); if (optlen > 0) memcpy(log_buf->tlb_opts, th + 1, optlen); } else { memset(&log_buf->tlb_th, 0, sizeof(*th)); } /* Verbose information */ if (func != NULL) { log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE; if (output_caller != NULL) strlcpy(log_verbose->tlv_snd_frm, output_caller, TCP_FUNC_LEN); else *log_verbose->tlv_snd_frm = 0; strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN); log_verbose->tlv_trace_line = line; } /* Insert the new log at the tail. */ STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_add(log_entry); return (log_buf); } /* * Change the logging state for a TCPCB. Returns 0 on success or an * error code on failure. */ int tcp_log_state_change(struct tcpcb *tp, int state) { struct tcp_log_mem *log_entry; int rv; INP_WLOCK_ASSERT(tptoinpcb(tp)); rv = 0; switch(state) { case TCP_LOG_STATE_CLEAR: while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); /* Fall through */ case TCP_LOG_STATE_OFF: tp->_t_logstate = TCP_LOG_STATE_OFF; break; case TCP_LOG_STATE_TAIL: case TCP_LOG_STATE_HEAD: case TCP_LOG_STATE_CONTINUAL: case TCP_LOG_VIA_BBPOINTS: case TCP_LOG_STATE_HEAD_AUTO: case TCP_LOG_STATE_TAIL_AUTO: /* * When the RATIO_OFF state is set for the bucket, the log ID * this tp is associated with has been probabilistically opted * out of logging per tcp_log_apply_ratio(). */ if (tp->t_lib == NULL || tp->t_lib->tlb_logstate != TCP_LOG_STATE_RATIO_OFF) { tp->_t_logstate = state; } else { rv = ECANCELED; tp->_t_logstate = TCP_LOG_STATE_OFF; } break; default: return (EINVAL); } if (tcp_disable_all_bb_logs) { /* We are prohibited from doing any logs */ tp->_t_logstate = TCP_LOG_STATE_OFF; rv = EBUSY; } tp->t_flags2 &= ~(TF2_LOG_AUTO); return (rv); } /* If tcp_drain() is called, flush half the log entries. */ void tcp_log_drain(struct tcpcb *tp) { struct tcp_log_mem *log_entry, *next; int target, skip; INP_WLOCK_ASSERT(tptoinpcb(tp)); if ((target = tp->t_lognum / 2) == 0) return; /* * XXXRRS: At this I don't think this is wise that * we do this. All that a drain call means is that * we are hitting one of the system mbuf limits. BB * logging, or freeing of them, will not create any * more mbufs and really has nothing to do with * the system running out of mbufs. For now I * am changing this to free any "AUTO" by dumping * them out. But this should either be changed * so that it gets called when we hit the BB limit * or it should just not get called (one of the two) * since I don't think the mbuf <-> BB log cleanup * is the right thing to do here. */ /* * If we are logging the "head" packets, we want to discard * from the tail of the queue. Otherwise, we want to discard * from the head. 
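 * For example, with 100 queued entries the target is 50: in HEAD mode
 * the first 50 entries are kept and the 50 at the tail are freed,
 * whereas in the default case the 50 oldest entries at the head are
 * freed so that the most recent ones survive. The *_AUTO, CONTINUAL
 * and BBPOINTS modes sidestep the trim entirely by dumping their
 * buffers instead.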
*/ if (tp->_t_logstate == TCP_LOG_STATE_HEAD) { skip = tp->t_lognum - target; STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue) if (!--skip) break; KASSERT(log_entry != NULL, ("%s: skipped through all entries!", __func__)); if (log_entry == NULL) return; while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) { STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_rem(next); tcp_log_remove_log_cleanup(tp, next); #ifdef INVARIANTS target--; #endif } KASSERT(target == 0, ("%s: After removing from tail, target was %d", __func__, target)); } else if (tp->_t_logstate == TCP_LOG_STATE_HEAD_AUTO) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head at drain", M_NOWAIT, false); } else if (tp->_t_logstate == TCP_LOG_STATE_TAIL_AUTO) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail at drain", M_NOWAIT, false); } else if (tp->_t_logstate == TCP_LOG_VIA_BBPOINTS) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from bbpoints", M_NOWAIT, false); } else if (tp->_t_logstate == TCP_LOG_STATE_CONTINUAL) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); } else { while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL && target--) tcp_log_remove_log_head(tp, log_entry); KASSERT(target <= 0, ("%s: After removing from head, target was %d", __func__, target)); KASSERT(tp->t_lognum > 0, ("%s: After removing from head, tp->t_lognum was %d", __func__, target)); KASSERT(log_entry != NULL, ("%s: After removing from head, the tailq was empty", __func__)); } } static inline int tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len) { if (sopt->sopt_td != NULL) return (copyout(src, dst, len)); bcopy(src, dst, len); return (0); } static int tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp, struct tcp_log_buffer **end, int count) { struct tcp_log_buffer *out_entry; struct tcp_log_mem *log_entry; size_t entrysize; int error; #ifdef INVARIANTS int orig_count = count; #endif /* Copy the data out. */ error = 0; out_entry = (struct tcp_log_buffer *) sopt->sopt_val; STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) { count--; KASSERT(count >= 0, ("%s:%d: Exceeded expected count (%d) processing list %p", __func__, __LINE__, orig_count, log_tailqp)); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_copyout, 1); #endif /* * Skip copying out the header if it isn't present. * Instead, copy out zeros (to ensure we don't leak info). * TODO: Make sure we truly do zero everything we don't * explicitly set. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR) entrysize = sizeof(struct tcp_log_buffer); else entrysize = offsetof(struct tcp_log_buffer, tlb_th); error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry, entrysize); if (error) break; if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) { error = tcp_log_copyout(sopt, zerobuf, ((uint8_t *)out_entry) + entrysize, sizeof(struct tcp_log_buffer) - entrysize); } /* * Copy out the verbose bit, if needed. Either way, * increment the output pointer the correct amount. 
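 * The output stream is therefore a packed sequence of variable-size
 * records: each struct tcp_log_buffer is followed by a struct
 * tcp_log_verbose only when TLB_FLAG_VERBOSE is set for that entry.
 * A userspace reader has to walk it the same way, roughly like the
 * sketch below (consume() is a placeholder for whatever processing
 * the reader does; buf/len are the returned buffer and its length):
 *
 *	struct tcp_log_buffer *tlb = (struct tcp_log_buffer *)buf;
 *	while ((char *)tlb < buf + len) {
 *		consume(tlb);
 *		if (tlb->tlb_eventflags & TLB_FLAG_VERBOSE)
 *			tlb = (struct tcp_log_buffer *)((char *)(tlb + 1) +
 *			    sizeof(struct tcp_log_verbose));
 *		else
 *			tlb++;
 *	}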
*/ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) { error = tcp_log_copyout(sopt, &log_entry->tlm_v, out_entry->tlb_verbose, sizeof(struct tcp_log_verbose)); if (error) break; out_entry = (struct tcp_log_buffer *) (((uint8_t *) (out_entry + 1)) + sizeof(struct tcp_log_verbose)); } else out_entry++; } *end = out_entry; KASSERT(error || count == 0, ("%s:%d: Less than expected count (%d) processing list %p" " (%d remain)", __func__, __LINE__, orig_count, log_tailqp, count)); return (error); } /* * Copy out the buffer. Note that we do incremental copying, so * sooptcopyout() won't work. However, the goal is to produce the same * end result as if we copied in the entire user buffer, updated it, * and then used sooptcopyout() to copy it out. * * NOTE: This should be called with a write lock on the PCB; however, * the function will drop it after it extracts the data from the TCPCB. */ int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp) { struct tcp_log_stailq log_tailq; struct tcp_log_mem *log_entry, *log_next; struct tcp_log_buffer *out_entry; struct inpcb *inp = tptoinpcb(tp); size_t outsize, entrysize; int error, outnum; INP_WLOCK_ASSERT(inp); /* * Determine which log entries will fit in the buffer. As an * optimization, skip this if all the entries will clearly fit * in the buffer. (However, get an exact size if we are using * INVARIANTS.) */ #ifndef INVARIANTS if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)) >= tp->t_lognum) { log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue); log_next = NULL; outsize = 0; outnum = tp->t_lognum; } else { #endif outsize = outnum = 0; log_entry = NULL; STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) { entrysize = sizeof(struct tcp_log_buffer); if (log_next->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) entrysize += sizeof(struct tcp_log_verbose); if ((sopt->sopt_valsize - outsize) < entrysize) break; outsize += entrysize; outnum++; log_entry = log_next; } KASSERT(outsize <= sopt->sopt_valsize, ("%s: calculated output size (%zu) greater than available" "space (%zu)", __func__, outsize, sopt->sopt_valsize)); #ifndef INVARIANTS } #endif /* * Copy traditional sooptcopyout() behavior: if sopt->sopt_val * is NULL, silently skip the copy. However, in this case, we * will leave the list alone and return. Functionally, this * gives userspace a way to poll for an approximate buffer * size they will need to get the log entries. */ if (sopt->sopt_val == NULL) { INP_WUNLOCK(inp); if (outsize == 0) { outsize = outnum * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); } if (sopt->sopt_valsize > outsize) sopt->sopt_valsize = outsize; return (0); } /* * Break apart the list. We'll save the ones we want to copy * out locally and remove them from the TCPCB list. We can * then drop the INPCB lock while we do the copyout. * * There are roughly three cases: * 1. There was nothing to copy out. That's easy: drop the * lock and return. * 2. We are copying out the entire list. Again, that's easy: * move the whole list. * 3. We are copying out a partial list. That's harder. We * need to update the list book-keeping entries. */ if (log_entry != NULL && log_next == NULL) { /* Move entire list. */ KASSERT(outnum == tp->t_lognum, ("%s:%d: outnum (%d) should match tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); log_tailq = tp->t_logs; tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); } else if (log_entry != NULL) { /* Move partial list. 
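 * The first outnum entries (up to and including log_entry) move to
 * log_tailq; tp->t_logs is re-headed at the entry following
 * log_entry, and both lists' tail pointers are patched so each
 * STAILQ remains self-consistent.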
*/ KASSERT(outnum < tp->t_lognum, ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs); STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue); KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL, ("%s:%d: tp->t_logs is unexpectedly shorter than expected" "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)", __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum)); STAILQ_NEXT(log_entry, tlm_queue) = NULL; log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue); tp->t_lognum -= outnum; } else STAILQ_INIT(&log_tailq); /* Drop the PCB lock. */ INP_WUNLOCK(inp); /* Copy the data out. */ error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum); if (error) { /* Restore list */ INP_WLOCK(inp); if ((inp->inp_flags & INP_DROPPED) == 0) { tp = intotcpcb(inp); /* Merge the two lists. */ STAILQ_CONCAT(&log_tailq, &tp->t_logs); tp->t_logs = log_tailq; tp->t_lognum += outnum; } INP_WUNLOCK(inp); } else { /* Sanity check entries */ KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) == outsize, ("%s: Actual output size (%zu) != " "calculated output size (%zu)", __func__, (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val), outsize)); /* Free the entries we just copied out. */ STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) { tcp_log_entry_refcnt_rem(log_entry); uma_zfree(tcp_log_zone, log_entry); } } sopt->sopt_valsize = (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val); return (error); } static void tcp_log_free_queue(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; KASSERT(param != NULL, ("%s: called with NULL param", __func__)); if (param == NULL) return; entry = (struct tcp_log_dev_log_queue *)param; /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); /* Free the buffer, if it is allocated. */ if (entry->tldl_common.tldq_buf != NULL) free(entry->tldl_common.tldq_buf, M_TCPLOGDEV); /* Free the queue entry. */ free(entry, M_TCPLOGDEV); } static struct tcp_log_common_header * tcp_log_expandlogbuf(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; struct tcp_log_header *hdr; uint8_t *end; struct sockopt sopt; int error; entry = (struct tcp_log_dev_log_queue *)param; /* Take a worst-case guess at space needs. */ sopt.sopt_valsize = sizeof(struct tcp_log_header) + entry->tldl_count * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT); if (hdr == NULL) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail5, entry->tldl_count); #endif return (NULL); } sopt.sopt_val = hdr + 1; sopt.sopt_valsize -= sizeof(struct tcp_log_header); sopt.sopt_td = NULL; error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries, (struct tcp_log_buffer **)&end, entry->tldl_count); if (error) { free(hdr, M_TCPLOGDEV); return (NULL); } /* Free the entries. 
*/ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); entry->tldl_count = 0; memset(hdr, 0, sizeof(struct tcp_log_header)); hdr->tlh_version = TCP_LOG_BUF_VER; hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR; hdr->tlh_length = end - (uint8_t *)hdr; hdr->tlh_ie = entry->tldl_ie; hdr->tlh_af = entry->tldl_af; getboottime(&hdr->tlh_offset); strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN); strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN); strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN); return ((struct tcp_log_common_header *)hdr); } /* * Queue the tcpcb's log buffer for transmission via the log buffer facility. * * NOTE: This should be called with a write lock on the PCB. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the INP lock if it needs to do so. * * If force is false, this will only dump auto-logged sessions if * tcp_log_auto_all is true or if there is a log ID defined for the session. */ int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force) { struct tcp_log_dev_log_queue *entry; struct inpcb *inp = tptoinpcb(tp); #ifdef TCPLOG_DEBUG_COUNTERS int num_entries; #endif INP_WLOCK_ASSERT(inp); /* If there are no log entries, there is nothing to do. */ if (tp->t_lognum == 0) return (0); /* Check for a log ID. */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all && !force) { struct tcp_log_mem *log_entry; /* * We needed a log ID and none was found. Free the log entries * and return success. Also, cancel further logging. If the * session doesn't have a log ID by now, we'll assume it isn't * going to get one. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->_t_logstate = TCP_LOG_STATE_OFF; return (0); } /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail3, 1); #endif return (ENOBUFS); } if (entry == NULL) { INP_WUNLOCK(inp); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); INP_WLOCK(inp); /* * Note that this check is slightly overly-restrictive in * that the TCB can survive either of these events. * However, there is currently not a good way to ensure * that is the case. So, if we hit this M_WAIT path, we * may end up dropping some entries. That seems like a * small price to pay for safety. */ if (inp->inp_flags & INP_DROPPED) { free(entry, M_TCPLOGDEV); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail2, 1); #endif return (ECONNRESET); } tp = intotcpcb(inp); if (tp->t_lognum == 0) { free(entry, M_TCPLOGDEV); return (0); } } /* Fill in the unique parts of the queue entry. 
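 * If the connection has no log ID bucket, fall back to "UNKNOWN" for
 * both the id and tag fields rather than leaving them uninitialized.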
*/ if (tp->t_lib != NULL) { strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN); } else { strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN); } if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = inp->inp_inc.inc_ie; if (inp->inp_inc.inc_flags & INC_ISIPV6) entry->tldl_af = AF_INET6; else entry->tldl_af = AF_INET; entry->tldl_entries = tp->t_logs; entry->tldl_count = tp->t_lognum; /* Fill in the common parts of the queue entry. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Clear the log data from the TCPCB. */ #ifdef TCPLOG_DEBUG_COUNTERS num_entries = tp->t_lognum; #endif tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) { tcp_log_free_queue((struct tcp_log_dev_queue *)entry); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail1, num_entries); } else { counter_u64_add(tcp_log_queued, num_entries); #endif } return (0); } /* * Queue the log_id_node's log buffers for transmission via the log buffer * facility. * * NOTE: This should be called with the bucket locked and referenced. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the bucket lock if it needs to do so. (The caller must * ensure that the tln is no longer on any lists so no one else will mess * with this while the lock is dropped!) */ static int tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how) { struct tcp_log_dev_log_queue *entry; struct tcp_log_id_bucket *tlb; tlb = tln->tln_bucket; TCPID_BUCKET_LOCK_ASSERT(tlb); KASSERT(tlb->tlb_refcnt > 0, ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)", __func__, __LINE__, tln, tlb)); KASSERT(tln->tln_closed, ("%s:%d: Called for node with tln_closed==false (tln=%p)", __func__, __LINE__, tln)); /* If there are no log entries, there is nothing to do. */ if (tln->tln_count == 0) return (0); /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) return (ENOBUFS); if (entry == NULL) { TCPID_BUCKET_UNLOCK(tlb); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); TCPID_BUCKET_LOCK(tlb); } /* Fill in the common parts of the queue entry.. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Fill in the unique parts of the queue entry. */ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN); strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN); if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = tln->tln_ie; entry->tldl_entries = tln->tln_entries; entry->tldl_count = tln->tln_count; entry->tldl_af = tln->tln_af; /* Add the entry. If no one is listening, free the entry. 
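 * On success the device queue takes ownership of the entry and
 * releases it later through the tldq_dtor callback
 * (tcp_log_free_queue); on failure we free it here ourselves.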
*/ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) tcp_log_free_queue((struct tcp_log_dev_queue *)entry); return (0); } /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked bucket; however, the function * will drop the lock. */ #define LOCAL_SAVE 10 static void tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason) { struct tcp_log_id_node local_entries[LOCAL_SAVE]; struct inpcb *inp; struct tcpcb *tp; struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln; int i, num_local_entries, tree_locked; bool expireq_locked; TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Take a reference on the bucket to keep it from disappearing until * we are done. */ TCPID_BUCKET_REF(tlb); /* * We'll try to create these without dropping locks. However, we * might very well need to drop locks to get memory. If that's the * case, we'll save up to 10 on the stack, and sacrifice the rest. * (Otherwise, we need to worry about finding our place again in a * potentially changed list. It just doesn't seem worth the trouble * to do that. */ expireq_locked = false; num_local_entries = 0; prev_tln = NULL; tree_locked = TREE_UNLOCKED; SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) { /* * If this isn't associated with a TCPCB, we can pull it off * the list now. We need to be careful that the expire timer * hasn't already taken ownership (tln_expiretime == SBT_MAX). * If so, we let the expire timer code free the data. */ if (cur_tln->tln_closed) { no_inp: /* * Get the expireq lock so we can get a consistent * read of tln_expiretime and so we can remove this * from the expireq. */ if (!expireq_locked) { TCPLOG_EXPIREQ_LOCK(); expireq_locked = true; } /* * We ignore entries with tln_expiretime == SBT_MAX. * The expire timer code already owns those. */ KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0, ("%s:%d: node on the expire queue without positive " "expire time", __func__, __LINE__)); if (cur_tln->tln_expiretime == SBT_MAX) { prev_tln = cur_tln; continue; } /* Remove the entry from the expireq. */ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln, tcp_log_id_node, tln_expireq); /* Remove the entry from the bucket. */ if (prev_tln != NULL) SLIST_REMOVE_AFTER(prev_tln, tln_list); else SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list); /* * Drop the INP and bucket reference counts. Due to * lock-ordering rules, we need to drop the expire * queue lock. */ TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; /* Drop the INP reference. */ INP_WLOCK(cur_tln->tln_inp); if (!in_pcbrele_wlocked(cur_tln->tln_inp)) INP_WUNLOCK(cur_tln->tln_inp); if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { #ifdef INVARIANTS panic("%s: Bucket refcount unexpectedly 0.", __func__); #endif /* * Recover as best we can: free the entry we * own. */ tcp_log_free_entries(&cur_tln->tln_entries, &cur_tln->tln_count); uma_zfree(tcp_log_id_node_zone, cur_tln); goto done; } if (tcp_log_dump_node_logbuf(cur_tln, reason, M_NOWAIT)) { /* * If we have sapce, save the entries locally. * Otherwise, free them. */ if (num_local_entries < LOCAL_SAVE) { local_entries[num_local_entries] = *cur_tln; num_local_entries++; } else { tcp_log_free_entries( &cur_tln->tln_entries, &cur_tln->tln_count); } } /* No matter what, we are done with the node now. */ uma_zfree(tcp_log_id_node_zone, cur_tln); /* * Because we removed this entry from the list, prev_tln * (which tracks the previous entry still on the tlb * list) remains unchanged. 
*/ continue; } /* * If we get to this point, the session data is still held in * the TCPCB. So, we need to pull the data out of that. * * We will need to drop the expireq lock so we can lock the INP. * We can then try to extract the data the "easy" way. If that * fails, we'll save the log entries for later. */ if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* Lock the INP and then re-check the state. */ inp = cur_tln->tln_inp; INP_WLOCK(inp); /* * If we caught this while it was transitioning, the data * might have moved from the TCPCB to the tln (signified by * setting tln_closed to true. If so, treat this like an * inactive connection. */ if (cur_tln->tln_closed) { /* * It looks like we may have caught this connection * while it was transitioning from active to inactive. * Treat this like an inactive connection. */ INP_WUNLOCK(inp); goto no_inp; } /* * Try to dump the data from the tp without dropping the lock. * If this fails, try to save off the data locally. */ tp = cur_tln->tln_tp; if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) && num_local_entries < LOCAL_SAVE) { tcp_log_move_tp_to_node(tp, &local_entries[num_local_entries]); local_entries[num_local_entries].tln_closed = 1; KASSERT(local_entries[num_local_entries].tln_bucket == tlb, ("%s: %d: bucket mismatch for node %p", __func__, __LINE__, cur_tln)); num_local_entries++; } INP_WUNLOCK(inp); /* * We are goint to leave the current tln on the list. It will * become the previous tln. */ prev_tln = cur_tln; } /* Drop our locks, if any. */ KASSERT(tree_locked == TREE_UNLOCKED, ("%s: %d: tree unexpectedly locked", __func__, __LINE__)); switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); tree_locked = TREE_UNLOCKED; break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; break; } if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* * Try again for any saved entries. tcp_log_dump_node_logbuf() is * guaranteed to free the log entries within the node. And, since * the node itself is on our stack, we don't need to free it. */ for (i = 0; i < num_local_entries; i++) tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK); /* Drop our reference. */ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); done: /* Drop our locks, if any. */ switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } if (expireq_locked) TCPLOG_EXPIREQ_UNLOCK(); } #undef LOCAL_SAVE /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked INP; however, the function * will drop the lock. */ void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason) { struct inpcb *inp = tptoinpcb(tp); struct tcp_log_id_bucket *tlb; int tree_locked; /* Figure out our bucket and lock it. */ INP_WLOCK_ASSERT(inp); tlb = tp->t_lib; if (tlb == NULL) { /* * No bucket; treat this like a request to dump a single * session's traces. */ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true); INP_WUNLOCK(inp); return; } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); TCPID_BUCKET_LOCK(tlb); /* If we are the last reference, we have nothing more to do here. 
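 * In that case tcp_log_unref_bucket() has already torn the bucket
 * down for us, so we must not touch (or unlock) tlb again.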
*/ tree_locked = TREE_UNLOCKED; if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } return; } /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */ tcp_log_dumpbucketlogs(tlb, reason); } /* * Mark the end of a flow with the current stack. A stack can add * stack-specific info to this trace event by overriding this * function (see bbr_log_flowend() for example). */ void tcp_log_flowend(struct tcpcb *tp) { if (tp->_t_logstate != TCP_LOG_STATE_OFF) { struct socket *so = tptosocket(tp); TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FLOWEND, 0, 0, NULL, false); } } void tcp_log_sendfile(struct socket *so, off_t offset, size_t nbytes, int flags) { struct inpcb *inp; struct tcpcb *tp; #ifdef TCP_REQUEST_TRK struct http_sendfile_track *ent; int i, fnd; #endif inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_log_sendfile: inp == NULL")); /* quick check to see if logging is enabled for this connection */ tp = intotcpcb(inp); if ((inp->inp_flags & INP_DROPPED) || (tp->_t_logstate == TCP_LOG_STATE_OFF)) { return; } INP_WLOCK(inp); /* double check log state now that we have the lock */ if (inp->inp_flags & INP_DROPPED) goto done; if (tp->_t_logstate != TCP_LOG_STATE_OFF) { struct timeval tv; tcp_log_eventspecific_t log; microuptime(&tv); log.u_sf.offset = offset; log.u_sf.length = nbytes; log.u_sf.flags = flags; TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_SENDFILE, 0, 0, &log, false, &tv); } #ifdef TCP_REQUEST_TRK if (tp->t_http_req == 0) { /* No http requests to track */ goto done; } fnd = 0; if (tp->t_http_closed == 0) { /* No closed end req to track */ goto skip_closed_req; } for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { /* Lets see if this one can be found */ ent = &tp->t_http_info[i]; if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { /* Not used */ continue; } if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { /* This pass does not consider open requests */ continue; } if (ent->flags & TCP_HTTP_TRACK_FLG_COMP) { /* Don't look at what we have completed */ continue; } /* If we reach here its a allocated closed end request */ - if ((ent->start == offset) || + if ((ent->start == offset) || ((offset > ent->start) && (offset < ent->end))){ /* Its within this request?? */ fnd = 1; } if (fnd) { /* * It is at or past the end, its complete. */ ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; /* * When an entry completes we can take (snd_una + sb_cc) and know where * the end of the range really is. Note that this works since two * requests must be sequential and sendfile now is complete for *this* request. * we must use sb_ccc since the data may still be in-flight in TLS. * * We always cautiously move the end_seq only if our calculations * show it happened (just in case sf has the call to here at the wrong * place). When we go COMP we will stop coming here and hopefully be * left with the correct end_seq. 
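 * For example, if snd_una is 1000 and 500 bytes are still queued in
 * the socket buffer when this sendfile call completes, the request's
 * data ends at sequence 1500, so end_seq is advanced to 1500 if that
 * is beyond its current value (it is only ever moved forward).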
*/ if (SEQ_GT((tp->snd_una + so->so_snd.sb_ccc), ent->end_seq)) ent->end_seq = tp->snd_una + so->so_snd.sb_ccc; if ((offset + nbytes) >= ent->end) { ent->flags |= TCP_HTTP_TRACK_FLG_COMP; tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_COMPLETE, offset, nbytes); } else { tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_MOREYET, offset, nbytes); } /* We assume that sendfile never sends overlapping requests */ goto done; } } skip_closed_req: if (!fnd) { /* Ok now lets look for open requests */ for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { ent = &tp->t_http_info[i]; if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { /* Not used */ continue; } if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) continue; /* If we reach here its an allocated open request */ if (ent->start == offset) { /* It begins this request */ ent->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; break; } else if (offset > ent->start) { ent->flags |= TCP_HTTP_TRACK_FLG_SEQV; break; } } } #endif done: INP_WUNLOCK(inp); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index d88fc48fc021..db0c71fdbfff 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4633 +1,4633 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95 */ #include __FBSDID("$FreeBSD$"); #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_kern_tls.h" #include #include #include #include #include #ifdef TCP_HHOOK #include #endif #include #ifdef TCP_HHOOK #include #endif #ifdef KERN_TLS #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #include #include #include #include #include #include #include #endif #include #ifdef INVARIANTS #define TCPSTATES #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef TCPPCAP #include #endif #ifdef TCP_OFFLOAD #include #endif #include #include #ifdef INET6 #include #endif #include #include #include #include #ifdef INET6 static ip6proto_ctlinput_t tcp6_ctlinput; static udp_tun_icmp_t tcp6_ctlinput_viaudp; #endif VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS; #ifdef INET6 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS; #endif #ifdef TCP_SAD_DETECTION /* Sack attack detection thresholds and such */ SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Sack Attack detection thresholds"); int32_t tcp_force_detection = 0; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection, CTLFLAG_RW, &tcp_force_detection, 0, "Do we force detection even if the INP has it off?"); int32_t tcp_sad_limit = 10000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, limit, CTLFLAG_RW, &tcp_sad_limit, 10000, "If SaD is enabled, what is the limit to sendmap entries (0 = unlimited)?"); int32_t tcp_sack_to_ack_thresh = 700; /* 70 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh, CTLFLAG_RW, &tcp_sack_to_ack_thresh, 700, "Percentage of sacks to acks we must see above (10.1 percent is 101)?"); int32_t tcp_sack_to_move_thresh = 600; /* 60 % */ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh, CTLFLAG_RW, &tcp_sack_to_move_thresh, 600, "Percentage of sack moves we must see above (10.1 percent is 101)"); int32_t tcp_restoral_thresh = 450; /* 45 % (sack:2:ack -25%) (mv:ratio -15%) **/ SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh, CTLFLAG_RW, &tcp_restoral_thresh, 450, "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)"); int32_t tcp_sad_decay_val = 800; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per, CTLFLAG_RW, &tcp_sad_decay_val, 800, "The decay percentage (10.1 percent equals 101 )"); int32_t tcp_map_minimum = 500; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps, CTLFLAG_RW, &tcp_map_minimum, 500, "Number of Map enteries before we start detection"); int32_t tcp_sad_pacing_interval = 2000; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int, CTLFLAG_RW, &tcp_sad_pacing_interval, 2000, "What is the minimum pacing interval for a classified attacker?"); int32_t tcp_sad_low_pps = 100; SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps, CTLFLAG_RW, &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif uint32_t tcp_ack_war_time_window = 1000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, CTLFLAG_RW, &tcp_ack_war_time_window, 1000, "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?"); uint32_t tcp_ack_war_cnt = 5; 
SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, CTLFLAG_RW, &tcp_ack_war_cnt, 5, "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; static int sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I", "Default TCP Maximum Segment Size"); #ifdef INET6 static int sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_v6mssdflt; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if (new < TCP_MINMSS) error = EINVAL; else V_tcp_v6mssdflt = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I", "Default TCP Maximum Segment Size for IPv6"); #endif /* INET6 */ /* * Minimum MSS we accept and use. This prevents DoS attacks where * we are forced to a ridiculous low MSS like 20 and send hundreds * of packets instead of one. The effect scales with the available * bandwidth and quickly saturates the CPU and network interface * with packet generation and sending. Set to zero to disable MINMSS * checking. This setting prevents us from sending too small packets. */ VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS; SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_minmss), 0, "Minimum TCP Maximum Segment Size"); VNET_DEFINE(int, tcp_do_rfc1323) = 1; SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_rfc1323), 0, "Enable rfc1323 (high performance TCP) extensions"); /* * As of June 2021, several TCP stacks violate RFC 7323 from September 2014. * Some stacks negotiate TS, but never send them after connection setup. Some * stacks negotiate TS, but don't send them when sending keep-alive segments. * These include modern widely deployed TCP stacks. * Therefore tolerating violations for now... 
*/ VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_tolerate_missing_ts), 0, "Tolerate missing TCP timestamps"); VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); /* How many connections are pacing */ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static counter_u64_t tcp_pacing_failures; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD, &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit"); static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); static int tcp_tcbhashsize; SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable"); static int do_tcpdrain = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0, "Enable tcp_drain routine for extra help when low on mbufs"); SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD, &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs"); VNET_DEFINE_STATIC(int, icmp_may_rst) = 1; #define V_icmp_may_rst VNET(icmp_may_rst) SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp_may_rst), 0, "Certain ICMP unreachable messages may abort connections in SYN_SENT"); VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0; #define V_tcp_isn_reseed_interval VNET(tcp_isn_reseed_interval) SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_isn_reseed_interval), 0, "Seconds between reseeding of ISN secret"); static int tcp_soreceive_stream; SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN, &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets"); VNET_DEFINE(uma_zone_t, sack_hole_zone); #define V_sack_hole_zone VNET(sack_hole_zone) VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0; /* unlimited */ static int sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t new; new = V_tcp_map_entries_limit; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { /* only allow "0" and value > minimum */ if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT) error = EINVAL; else V_tcp_map_entries_limit = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit, CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, &VNET_NAME(tcp_map_entries_limit), 0, &sysctl_net_inet_tcp_map_limit_check, "IU", "Total sendmap entries limit"); VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0; /* unlimited */ SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_map_split_limit), 0, "Total sendmap split entries limit"); #ifdef TCP_HHOOK VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]); 
#endif #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]); #define V_ts_offset_secret VNET(ts_offset_secret) static int tcp_default_fb_init(struct tcpcb *tp, void **ptr); static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged); static int tcp_default_handoff_ok(struct tcpcb *tp); static struct inpcb *tcp_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int); static struct inpcb *tcp_mtudisc(struct inpcb *, int); static struct inpcb *tcp_drop_syn_sent(struct inpcb *, int); static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr); static void tcp_default_switch_failed(struct tcpcb *tp); static ipproto_ctlinput_t tcp_ctlinput; static udp_tun_icmp_t tcp_ctlinput_viaudp; static struct tcp_function_block tcp_def_funcblk = { .tfb_tcp_block_name = "freebsd", .tfb_tcp_output = tcp_default_output, .tfb_tcp_do_segment = tcp_do_segment, .tfb_tcp_ctloutput = tcp_default_ctloutput, .tfb_tcp_handoff_ok = tcp_default_handoff_ok, .tfb_tcp_fb_init = tcp_default_fb_init, .tfb_tcp_fb_fini = tcp_default_fb_fini, .tfb_switch_failed = tcp_default_switch_failed, }; static int tcp_fb_cnt = 0; struct tcp_funchead t_functions; VNET_DEFINE_STATIC(struct tcp_function_block *, tcp_func_set_ptr) = &tcp_def_funcblk; #define V_tcp_func_set_ptr VNET(tcp_func_set_ptr) void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp) { TCPSTAT_INC(tcps_dsack_count); tp->t_dsack_pack++; if (tlp == 0) { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_bytes, (start - end)); } } else { if (SEQ_GT(end, start)) { tp->t_dsack_bytes += (end - start); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start)); } else { tp->t_dsack_tlp_bytes += (start - end); TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end)); } } } static struct tcp_function_block * find_tcp_functions_locked(struct tcp_function_set *fs) { struct tcp_function *f; struct tcp_function_block *blk=NULL; TAILQ_FOREACH(f, &t_functions, tf_next) { if (strcmp(f->tf_name, fs->function_set_name) == 0) { blk = f->tf_fb; break; } } return(blk); } static struct tcp_function_block * find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s) { struct tcp_function_block *rblk=NULL; struct tcp_function *f; TAILQ_FOREACH(f, &t_functions, tf_next) { if (f->tf_fb == blk) { rblk = blk; if (s) { *s = f; } break; } } return (rblk); } struct tcp_function_block * find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } struct tcp_function_block * find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); } /* Find a matching alias for the given tcp_function_block. 
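 * That is, report (in fs->function_set_name) a registered name for
 * blk other than its canonical tfb_tcp_block_name. Returns 1 if such
 * an alias exists and 0 otherwise.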
*/ int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs) { struct tcp_function *f; int found; found = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { if ((f->tf_fb == blk) && (strncmp(f->tf_name, blk->tfb_tcp_block_name, TCP_FUNCTION_NAME_LEN_MAX) != 0)) { /* Matching function block with different name. */ strncpy(fs->function_set_name, f->tf_name, TCP_FUNCTION_NAME_LEN_MAX); found = 1; break; } } /* Null terminate the string appropriately. */ if (found) { fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0'; } else { fs->function_set_name[0] = '\0'; } rw_runlock(&tcp_function_lock); return (found); } static struct tcp_function_block * find_and_ref_tcp_default_fb(void) { struct tcp_function_block *rblk; rw_rlock(&tcp_function_lock); rblk = V_tcp_func_set_ptr; refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return (rblk); } void tcp_switch_back_to_default(struct tcpcb *tp) { struct tcp_function_block *tfb; void *ptr = NULL; KASSERT(tp->t_fb != &tcp_def_funcblk, ("%s: called by the built-in default stack", __func__)); /* * Now, we'll find a new function block to use. * Start by trying the current user-selected * default, unless this stack is the user-selected * default. */ tfb = find_and_ref_tcp_default_fb(); if (tfb == tp->t_fb) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Does the stack accept this connection? */ if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL && (*tfb->tfb_tcp_handoff_ok)(tp)) { refcount_release(&tfb->tfb_refcnt); tfb = NULL; } /* Try to use that stack. */ if (tfb != NULL) { /* Initialize the new stack. If it succeeds, we are done. */ if (tfb->tfb_tcp_fb_init == NULL || (*tfb->tfb_tcp_fb_init)(tp, &ptr) == 0) { /* Release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* Now set in all the pointers */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; return; } /* * Initialization failed. Release the reference count on * the looked up default stack. */ refcount_release(&tfb->tfb_refcnt); } /* * If that wasn't feasible, use the built-in default * stack which is not allowed to reject anyone. 
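 * Its handoff_ok and fb_init handlers are required to succeed, which
 * is why any refusal or failure below is treated as a panic.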
*/ tfb = find_and_ref_tcp_fb(&tcp_def_funcblk); if (tfb == NULL) { /* there always should be a default */ panic("Can't refer to tcp_def_funcblk"); } if (tfb->tfb_tcp_handoff_ok != NULL) { if ((*tfb->tfb_tcp_handoff_ok) (tp)) { /* The default stack cannot say no */ panic("Default stack rejects a new session?"); } } if (tfb->tfb_tcp_fb_init != NULL && (*tfb->tfb_tcp_fb_init)(tp, &ptr)) { /* The default stack cannot fail */ panic("Default stack initialization failed"); } /* Now release the old stack */ if (tp->t_fb->tfb_tcp_fb_fini != NULL) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 0); refcount_release(&tp->t_fb->tfb_refcnt); /* And set in the pointers to the new */ tp->t_fb = tfb; tp->t_fb_ptr = ptr; } static bool tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { struct ip *iph; #ifdef INET6 struct ip6_hdr *ip6; #endif struct udphdr *uh; struct tcphdr *th; int thlen; uint16_t port; TCPSTAT_INC(tcps_tunneled_pkts); if ((m->m_flags & M_PKTHDR) == 0) { /* Can't handle one that is not a pkt hdr */ TCPSTAT_INC(tcps_tunneled_errs); goto out; } thlen = sizeof(struct tcphdr); if (m->m_len < off + sizeof(struct udphdr) + thlen && (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); thlen = th->th_off << 2; if (m->m_len < off + sizeof(struct udphdr) + thlen) { m = m_pullup(m, off + sizeof(struct udphdr) + thlen); if (m == NULL) { TCPSTAT_INC(tcps_tunneled_errs); goto out; } else { iph = mtod(m, struct ip *); uh = (struct udphdr *)((caddr_t)iph + off); th = (struct tcphdr *)(uh + 1); } } m->m_pkthdr.tcp_tun_port = port = uh->uh_sport; bcopy(th, uh, m->m_len - off); m->m_len -= sizeof(struct udphdr); m->m_pkthdr.len -= sizeof(struct udphdr); /* * We use the same algorithm for * both UDP and TCP for c-sum. So * the code in tcp_input will skip * the checksum. So we do nothing * with the flag (m->m_pkthdr.csum_flags). 
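 * We do, however, shrink ip_len/ip6_plen by sizeof(struct udphdr)
 * below so that the embedded TCP segment parses as if it had arrived
 * natively.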
*/ switch (iph->ip_v) { #ifdef INET case IPVERSION: iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr)); tcp_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif #ifdef INET6 case IPV6_VERSION >> 4: ip6 = mtod(m, struct ip6_hdr *); ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr)); tcp6_input_with_port(&m, &off, IPPROTO_TCP, port); break; #endif default: goto out; break; } return (true); out: m_freem(m); return (true); } static int sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS) { int error=ENOENT; struct tcp_function_set fs; struct tcp_function_block *blk; memset(&fs, 0, sizeof(fs)); rw_rlock(&tcp_function_lock); blk = find_tcp_fb_locked(V_tcp_func_set_ptr, NULL); if (blk) { /* Found him */ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); /* Check for error or no change */ if (error != 0 || req->newptr == NULL) return(error); rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { error = ENOENT; goto done; } V_tcp_func_set_ptr = blk; done: rw_wunlock(&tcp_function_lock); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_default_tcp_functions, "A", "Set/get the default TCP functions"); static int sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS) { int error, cnt, linesz; struct tcp_function *f; char *buffer, *cp; size_t bufsz, outsz; bool alias; cnt = 0; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { cnt++; } rw_runlock(&tcp_function_lock); bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1; buffer = malloc(bufsz, M_TEMP, M_WAITOK); error = 0; cp = buffer; linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D', "Alias", "PCB count"); cp += linesz; bufsz -= linesz; outsz = linesz; rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", f->tf_fb->tfb_tcp_block_name, (f->tf_fb == V_tcp_func_set_ptr) ? '*' : ' ', alias ? 
f->tf_name : "-", f->tf_fb->tfb_refcnt); if (linesz >= bufsz) { error = EOVERFLOW; break; } cp += linesz; bufsz -= linesz; outsz += linesz; } rw_runlock(&tcp_function_lock); if (error == 0) error = sysctl_handle_string(oidp, buffer, outsz + 1, req); free(buffer, M_TEMP); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available, CTLFLAG_VNET | CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_net_inet_list_available, "A", "list available TCP Function sets"); VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT; #ifdef INET VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL; #define V_udp4_tun_socket VNET(udp4_tun_socket) #endif #ifdef INET6 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL; #define V_udp6_tun_socket VNET(udp6_tun_socket) #endif static struct sx tcpoudp_lock; static void tcp_over_udp_stop(void) { sx_assert(&tcpoudp_lock, SA_XLOCKED); #ifdef INET if (V_udp4_tun_socket != NULL) { soclose(V_udp4_tun_socket); V_udp4_tun_socket = NULL; } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { soclose(V_udp6_tun_socket); V_udp6_tun_socket = NULL; } #endif } static int tcp_over_udp_start(void) { uint16_t port; int ret; #ifdef INET struct sockaddr_in sin; #endif #ifdef INET6 struct sockaddr_in6 sin6; #endif sx_assert(&tcpoudp_lock, SA_XLOCKED); port = V_tcp_udp_tunneling_port; if (ntohs(port) == 0) { /* Must have a port set */ return (EINVAL); } #ifdef INET if (V_udp4_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET6 if (V_udp6_tun_socket != NULL) { /* Already running -- must stop first */ return (EALREADY); } #endif #ifdef INET if ((ret = socreate(PF_INET, &V_udp4_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket, tcp_recv_udp_tunneled_packet, tcp_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. */ memset(&sin, 0, sizeof(struct sockaddr_in)); sin.sin_len = sizeof(struct sockaddr_in); sin.sin_family = AF_INET; sin.sin_port = htons(port); if ((ret = sobind(V_udp4_tun_socket, (struct sockaddr *)&sin, curthread))) { tcp_over_udp_stop(); return (ret); } #endif #ifdef INET6 if ((ret = socreate(PF_INET6, &V_udp6_tun_socket, SOCK_DGRAM, IPPROTO_UDP, curthread->td_ucred, curthread))) { tcp_over_udp_stop(); return (ret); } /* Call the special UDP hook. */ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket, tcp_recv_udp_tunneled_packet, tcp6_ctlinput_viaudp, NULL))) { tcp_over_udp_stop(); return (ret); } /* Ok, we have a socket, bind it to the port. 
*/ memset(&sin6, 0, sizeof(struct sockaddr_in6)); sin6.sin6_len = sizeof(struct sockaddr_in6); sin6.sin6_family = AF_INET6; sin6.sin6_port = htons(port); if ((ret = sobind(V_udp6_tun_socket, (struct sockaddr *)&sin6, curthread))) { tcp_over_udp_stop(); return (ret); } #endif return (0); } static int sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS) { int error; uint32_t old, new; old = V_tcp_udp_tunneling_port; new = old; error = sysctl_handle_int(oidp, &new, 0, req); if ((error == 0) && (req->newptr != NULL)) { if ((new < TCP_TUNNELING_PORT_MIN) || (new > TCP_TUNNELING_PORT_MAX)) { error = EINVAL; } else { sx_xlock(&tcpoudp_lock); V_tcp_udp_tunneling_port = new; if (old != 0) { tcp_over_udp_stop(); } if (new != 0) { error = tcp_over_udp_start(); if (error != 0) { V_tcp_udp_tunneling_port = 0; } } sx_xunlock(&tcpoudp_lock); } } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_port), 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU", "Tunneling port for tcp over udp"); VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT; static int sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS) { int error, new; new = V_tcp_udp_tunneling_overhead; error = sysctl_handle_int(oidp, &new, 0, req); if (error == 0 && req->newptr) { if ((new < TCP_TUNNELING_OVERHEAD_MIN) || (new > TCP_TUNNELING_OVERHEAD_MAX)) error = EINVAL; else V_tcp_udp_tunneling_overhead = new; } return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead, CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &VNET_NAME(tcp_udp_tunneling_overhead), 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU", "MSS reduction when using tcp over udp"); /* * Exports one (struct tcp_function_info) for each alias/name. */ static int sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS) { int cnt, error; struct tcp_function *f; struct tcp_function_info tfi; /* * We don't allow writes. */ if (req->newptr != NULL) return (EINVAL); /* * Wire the old buffer so we can directly copy the functions to * user space without dropping the lock. */ if (req->oldptr != NULL) { error = sysctl_wire_old_buffer(req, 0); if (error) return (error); } /* * Walk the list and copy out matching entries. If INVARIANTS * is compiled in, also walk the list to verify the length of * the list matches what we have recorded. */ rw_rlock(&tcp_function_lock); cnt = 0; #ifndef INVARIANTS if (req->oldptr == NULL) { cnt = tcp_fb_cnt; goto skip_loop; } #endif TAILQ_FOREACH(f, &t_functions, tf_next) { #ifdef INVARIANTS cnt++; #endif if (req->oldptr != NULL) { bzero(&tfi, sizeof(tfi)); tfi.tfi_refcnt = f->tf_fb->tfb_refcnt; tfi.tfi_id = f->tf_fb->tfb_id; (void)strlcpy(tfi.tfi_alias, f->tf_name, sizeof(tfi.tfi_alias)); (void)strlcpy(tfi.tfi_name, f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name)); error = SYSCTL_OUT(req, &tfi, sizeof(tfi)); /* * Don't stop on error, as that is the * mechanism we use to accumulate length * information if the buffer was too short. 
*/ } } KASSERT(cnt == tcp_fb_cnt, ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt)); #ifndef INVARIANTS skip_loop: #endif rw_runlock(&tcp_function_lock); if (req->oldptr == NULL) error = SYSCTL_OUT(req, NULL, (cnt + 1) * sizeof(struct tcp_function_info)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info, CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info", "List TCP function block name-to-ID mappings"); /* * tfb_tcp_handoff_ok() function for the default stack. * Note that we'll basically try to take all comers. */ static int tcp_default_handoff_ok(struct tcpcb *tp) { return (0); } /* * tfb_tcp_fb_init() function for the default stack. * * This handles making sure we have appropriate timers set if you are * transitioning a socket that has some amount of setup done. * * The init() fuction from the default can *never* return non-zero i.e. * it is required to always succeed since it is the stack of last resort! */ static int tcp_default_fb_init(struct tcpcb *tp, void **ptr) { struct socket *so = tptosocket(tp); int rexmt; INP_WLOCK_ASSERT(tptoinpcb(tp)); /* We don't use the pointer */ *ptr = NULL; KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT, ("%s: connection %p in unexpected state %d", __func__, tp, tp->t_state)); /* Make sure we get no interesting mbuf queuing behavior */ /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tptoinpcb(tp)); /* Cancel the GP measurement in progress */ tp->t_flags &= ~TF_GPUTINPROG; /* Validate the timers are not in usec, if they are convert */ tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) rexmt = tcp_rexmit_initial * tcp_backoff[tp->t_rxtshift]; else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; if (tp->t_rxtshift == 0) tp->t_rxtcur = rexmt; else TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX); /* * Nothing to do for ESTABLISHED or LISTEN states. And, we don't * know what to do for unexpected states (which includes TIME_WAIT). */ if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT) return (0); /* * Make sure some kind of transmission timer is set if there is * outstanding data. */ if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) || tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) || tcp_timer_active(tp, TT_PERSIST))) { /* * If the session has established and it looks like it should * be in the persist state, set the persist timer. Otherwise, * set the retransmit timer. */ if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 && (int32_t)(tp->snd_nxt - tp->snd_una) < (int32_t)sbavail(&so->so_snd)) tcp_setpersist(tp); else tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } /* All non-embryonic sessions get a keepalive timer. */ if (!tcp_timer_active(tp, TT_KEEP)) tcp_timer_activate(tp, TT_KEEP, TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) : TP_KEEPINIT(tp)); /* * Make sure critical variables are initialized * if transitioning while in Recovery. */ if IN_FASTRECOVERY(tp->t_flags) { if (tp->sackhint.recover_fs == 0) tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } return (0); } /* * tfb_tcp_fb_fini() function for the default stack. * * This changes state as necessary (or prudent) to prepare for another stack * to assume responsibility for the connection. 
*/ static void tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged) { INP_WLOCK_ASSERT(tptoinpcb(tp)); #ifdef TCP_BLACKBOX tcp_log_flowend(tp); #endif tp->t_acktime = 0; return; } /* * Target size of TCP PCB hash tables. Must be a power of two. * * Note that this can be overridden by the kernel environment * variable net.inet.tcp.tcbhashsize */ #ifndef TCBHASHSIZE #define TCBHASHSIZE 0 #endif MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers"); MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory"); static struct mtx isn_mtx; #define ISN_LOCK_INIT() mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF) #define ISN_LOCK() mtx_lock(&isn_mtx) #define ISN_UNLOCK() mtx_unlock(&isn_mtx) INPCBSTORAGE_DEFINE(tcpcbstor, tcpcb, "tcpinp", "tcp_inpcb", "tcp", "tcphash"); /* * Take a value and get the next power of 2 that doesn't overflow. * Used to size the tcp_inpcb hash buckets. */ static int maketcp_hashsize(int size) { int hashsize; /* * auto tune. * get the next power of 2 higher than maxsockets. */ hashsize = 1 << fls(size); /* catch overflow, and just go one power of 2 smaller */ if (hashsize < size) { hashsize = 1 << (fls(size) - 1); } return (hashsize); } static volatile int next_tcp_stack_id = 1; /* * Register a TCP function block with the name provided in the names * array. (Note that this function does NOT automatically register * blk->tfb_tcp_block_name as a stack name. Therefore, you should * explicitly include blk->tfb_tcp_block_name in the list of names if * you wish to register the stack with that name.) * * Either all name registrations will succeed or all will fail. If * a name registration fails, the function will update the num_names * argument to point to the array index of the name that encountered * the failure. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_names(struct tcp_function_block *blk, int wait, const char *names[], int *num_names) { struct tcp_function *n; struct tcp_function_set fs; int error, i; KASSERT(names != NULL && *num_names > 0, ("%s: Called with 0-length name list", __func__)); KASSERT(names != NULL, ("%s: Called with NULL name list", __func__)); KASSERT(rw_initialized(&tcp_function_lock), ("%s: called too early", __func__)); if ((blk->tfb_tcp_output == NULL) || (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { /* * These functions are required and you * need a name. */ *num_names = 0; return (EINVAL); } if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) { *num_names = 0; return (EINVAL); } refcount_init(&blk->tfb_refcnt, 0); blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1); for (i = 0; i < *num_names; i++) { n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait); if (n == NULL) { error = ENOMEM; goto cleanup; } n->tf_fb = blk; (void)strlcpy(fs.function_set_name, names[i], sizeof(fs.function_set_name)); rw_wlock(&tcp_function_lock); if (find_tcp_functions_locked(&fs) != NULL) { /* Duplicate name space not allowed */ rw_wunlock(&tcp_function_lock); free(n, M_TCPFUNCTIONS); error = EALREADY; goto cleanup; } (void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name)); TAILQ_INSERT_TAIL(&t_functions, n, tf_next); tcp_fb_cnt++; rw_wunlock(&tcp_function_lock); } return(0); cleanup: /* * Deregister the names we just added. Because registration failed * for names[i], we don't need to deregister that name. 
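 * *num_names is set to i so the caller can tell which name triggered
 * the failure.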
*/ *num_names = i; rw_wlock(&tcp_function_lock); while (--i >= 0) { TAILQ_FOREACH(n, &t_functions, tf_next) { if (!strncmp(n->tf_name, names[i], TCP_FUNCTION_NAME_LEN_MAX)) { TAILQ_REMOVE(&t_functions, n, tf_next); tcp_fb_cnt--; n->tf_fb = NULL; free(n, M_TCPFUNCTIONS); break; } } } rw_wunlock(&tcp_function_lock); return (error); } /* * Register a TCP function block using the name provided in the name * argument. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name, int wait) { const char *name_list[1]; int num_names, rv; num_names = 1; if (name != NULL) name_list[0] = name; else name_list[0] = blk->tfb_tcp_block_name; rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names); return (rv); } /* * Register a TCP function block using the name defined in * blk->tfb_tcp_block_name. * * Returns 0 on success, or an error code on failure. */ int register_tcp_functions(struct tcp_function_block *blk, int wait) { return (register_tcp_functions_as_name(blk, NULL, wait)); } /* * Deregister all names associated with a function block. This * functionally removes the function block from use within the system. * * When called with a true quiesce argument, mark the function block * as being removed so no more stacks will use it and determine * whether the removal would succeed. * * When called with a false quiesce argument, actually attempt the * removal. * * When called with a force argument, attempt to switch all TCBs to * use the default stack instead of returning EBUSY. * * Returns 0 on success (or if the removal would succeed), or an error * code on failure. */ int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce, bool force) { struct tcp_function *f; VNET_ITERATOR_DECL(vnet_iter); if (blk == &tcp_def_funcblk) { /* You can't un-register the default */ return (EPERM); } rw_wlock(&tcp_function_lock); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); if (blk == V_tcp_func_set_ptr) { /* You can't free the current default in some vnet. */ CURVNET_RESTORE(); VNET_LIST_RUNLOCK_NOSLEEP(); rw_wunlock(&tcp_function_lock); return (EBUSY); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); /* Mark the block so no more stacks can use it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; /* * If TCBs are still attached to the stack, attempt to switch them * to the default stack. */ if (force && blk->tfb_refcnt) { struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); rw_wunlock(&tcp_function_lock); VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); while ((inp = inp_next(&inpi)) != NULL) { tp = intotcpcb(inp); if (tp == NULL || tp->t_fb != blk) continue; tcp_switch_back_to_default(tp); } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); rw_wlock(&tcp_function_lock); } if (blk->tfb_refcnt) { /* TCBs still attached. */ rw_wunlock(&tcp_function_lock); return (EBUSY); } if (quiesce) { /* Skip removal. */ rw_wunlock(&tcp_function_lock); return (0); } /* Remove any function names that map to this function block. 
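 *
 * For example (illustrative), a stack registered under both its
 * tfb_tcp_block_name and an alias has both list entries located by
 * find_tcp_fb_locked() and freed in the loop below.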
*/ while (find_tcp_fb_locked(blk, &f) != NULL) { TAILQ_REMOVE(&t_functions, f, tf_next); tcp_fb_cnt--; f->tf_fb = NULL; free(f, M_TCPFUNCTIONS); } rw_wunlock(&tcp_function_lock); return (0); } static void tcp_drain(void) { struct epoch_tracker et; VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) return; NET_EPOCH_ENTER(et); VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_WLOCKPCB); struct inpcb *inpb; struct tcpcb *tcpb; /* * Walk the tcpbs, if existing, and flush the reassembly queue, * if there is one... * XXX: The "Net/3" implementation doesn't imply that the TCP * reassembly queue should be flushed, but in a situation * where we're really low on mbufs, this is potentially * useful. */ while ((inpb = inp_next(&inpi)) != NULL) { if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); #ifdef TCP_BLACKBOX tcp_log_drain(tcpb); #endif #ifdef TCPPCAP if (tcp_pcap_aggressive_free) { /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tcpb->t_inpkts)); tcp_pcap_drain(&(tcpb->t_outpkts)); } #endif } } CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); NET_EPOCH_EXIT(et); } static void tcp_vnet_init(void *arg __unused) { #ifdef TCP_HHOOK if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) printf("%s: WARNING: unable to register helper hook\n", __func__); #endif #ifdef STATS if (tcp_stats_init()) printf("%s: WARNING: unable to initialise TCP stats\n", __func__); #endif in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize, tcp_tcbhashsize); syncache_init(); tcp_hc_init(); TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack); V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); tcp_fastopen_init(); COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK); VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK); V_tcp_msl = TCPTV_MSL; } VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_vnet_init, NULL); static void tcp_init(void *arg __unused) { const char *tcbhash_tuneable; int hashsize; tcp_reass_global_init(); /* XXX virtualize those below? */ tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; tcp_keepidle = TCPTV_KEEP_IDLE; tcp_keepintvl = TCPTV_KEEPINTVL; tcp_maxpersistidle = TCPTV_KEEP_IDLE; tcp_rexmit_initial = TCPTV_RTOBASE; if (tcp_rexmit_initial < 1) tcp_rexmit_initial = 1; tcp_rexmit_min = TCPTV_MIN; if (tcp_rexmit_min < 1) tcp_rexmit_min = 1; tcp_persmin = TCPTV_PERSMIN; tcp_persmax = TCPTV_PERSMAX; tcp_rexmit_slop = TCPTV_CPU_VAR; tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT; /* Setup the tcp function block list */ TAILQ_INIT(&t_functions); rw_init(&tcp_function_lock, "tcp_func_lock"); register_tcp_functions(&tcp_def_funcblk, M_WAITOK); sx_init(&tcpoudp_lock, "TCP over UDP configuration"); #ifdef TCP_BLACKBOX /* Initialize the TCP logging data. 
*/ tcp_log_init(); #endif arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0); if (tcp_soreceive_stream) { #ifdef INET tcp_protosw.pr_soreceive = soreceive_stream; #endif #ifdef INET6 tcp6_protosw.pr_soreceive = soreceive_stream; #endif /* INET6 */ } #ifdef INET6 max_protohdr_grow(sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); #else /* INET6 */ max_protohdr_grow(sizeof(struct tcpiphdr)); #endif /* INET6 */ ISN_LOCK_INIT(); EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL, SHUTDOWN_PRI_DEFAULT); EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT); tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK); tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK); tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK); tcp_extra_mbuf = counter_u64_alloc(M_WAITOK); tcp_would_have_but = counter_u64_alloc(M_WAITOK); tcp_comp_total = counter_u64_alloc(M_WAITOK); tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif hashsize = TCBHASHSIZE; tcbhash_tuneable = "net.inet.tcp.tcbhashsize"; TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize); if (hashsize == 0) { /* * Auto tune the hash size based on maxsockets. * A perfect hash would have a 1:1 mapping * (hashsize = maxsockets) however it's been * suggested that O(2) average is better. */ hashsize = maketcp_hashsize(maxsockets / 4); /* * Our historical default is 512, * do not autotune lower than this. */ if (hashsize < 512) hashsize = 512; if (bootverbose) printf("%s: %s auto tuned to %d\n", __func__, tcbhash_tuneable, hashsize); } /* * We require a hashsize to be a power of two. * Previously if it was not a power of two we would just reset it * back to 512, which could be a nasty surprise if you did not notice * the error message. * Instead what we do is clip it to the closest power of two lower * than the specified hash value. */ if (!powerof2(hashsize)) { int oldhashsize = hashsize; hashsize = maketcp_hashsize(hashsize); /* prevent absurdly low value */ if (hashsize < 16) hashsize = 16; printf("%s: WARNING: TCB hash size not a power of 2, " "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } tcp_tcbhashsize = hashsize; #ifdef INET IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput); #endif #ifdef INET6 IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput); #endif } SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL); #ifdef VIMAGE static void tcp_destroy(void *unused __unused) { int n; #ifdef TCP_HHOOK int error; #endif /* * All our processes are gone, all our sockets should be cleaned * up, which means, we should be past the tcp_discardcb() calls. * Sleep to let all tcpcb timers really disappear and cleanup. */ for (;;) { INP_INFO_WLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; INP_INFO_WUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); } tcp_hc_destroy(); syncache_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); /* * Cannot free the zone until all tcpcbs are released as we attach * the allocations to them. 
*/ tcp_fastopen_destroy(); COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES); VNET_PCPUSTAT_FREE(tcpstat); #ifdef TCP_HHOOK error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error); } error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]); if (error != 0) { printf("%s: WARNING: unable to deregister helper hook " "type=%d, id=%d: error %d returned\n", __func__, HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error); } #endif } VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL); #endif void tcp_fini(void *xtp) { } /* * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb. * tcp_template used to store this data in mbufs, but we now recopy it out * of the tcpcb each time to conserve mbufs. */ void tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr) { struct tcphdr *th = (struct tcphdr *)tcp_ptr; INP_WLOCK_ASSERT(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV6) != 0) { struct ip6_hdr *ip6; ip6 = (struct ip6_hdr *)ip_ptr; ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) | (inp->inp_flow & IPV6_FLOWINFO_MASK); ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) | (IPV6_VERSION & IPV6_VERSION_MASK); if (port == 0) ip6->ip6_nxt = IPPROTO_TCP; else ip6->ip6_nxt = IPPROTO_UDP; ip6->ip6_plen = htons(sizeof(struct tcphdr)); ip6->ip6_src = inp->in6p_laddr; ip6->ip6_dst = inp->in6p_faddr; } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { struct ip *ip; ip = (struct ip *)ip_ptr; ip->ip_v = IPVERSION; ip->ip_hl = 5; ip->ip_tos = inp->inp_ip_tos; ip->ip_len = 0; ip->ip_id = 0; ip->ip_off = 0; ip->ip_ttl = inp->inp_ip_ttl; ip->ip_sum = 0; if (port == 0) ip->ip_p = IPPROTO_TCP; else ip->ip_p = IPPROTO_UDP; ip->ip_src = inp->inp_laddr; ip->ip_dst = inp->inp_faddr; } #endif /* INET */ th->th_sport = inp->inp_lport; th->th_dport = inp->inp_fport; th->th_seq = 0; th->th_ack = 0; th->th_off = 5; tcp_set_flags(th, 0); th->th_win = 0; th->th_urp = 0; th->th_sum = 0; /* in_pseudo() is called later for ipv4 */ } /* * Create template to be used to send tcp packets on a connection. * Allocates an mbuf and fills in a skeletal tcp/ip header. The only * use for this function is in keepalives, which use tcp_respond. */ struct tcptemp * tcpip_maketemplate(struct inpcb *inp) { struct tcptemp *t; t = malloc(sizeof(*t), M_TEMP, M_NOWAIT); if (t == NULL) return (NULL); tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t); return (t); } /* * Send a single message to the TCP at address specified by * the given TCP/IP header. If m == NULL, then we make a copy * of the tcpiphdr at th and send directly to the addressed host. * This is used to force keep alive messages out using the TCP * template for a connection. If flags are given then we send * a message back to the TCP which originated the segment th, * and discard the mbuf containing it and any other attached mbufs. * * In any case the ack and sequence number of the transmitted * segment are as specified by the parameters. * * NOTE: If m != NULL, then th must point to *inside* the mbuf. 
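 *
 * Illustrative call (editorial sketch, based on the keepalive usage noted
 * above): the keepalive timer builds a template with tcpip_maketemplate()
 * and then sends a bare ACK probe with something like
 *
 *	tcp_respond(tp, t->tt_ipgen, &t->tt_t, NULL,
 *	    tp->rcv_nxt, tp->snd_una - 1, 0);
 *
 * i.e. m == NULL and flags == 0, with the ack/seq arguments chosen so the
 * peer answers with an ACK.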
*/ void tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m, tcp_seq ack, tcp_seq seq, uint16_t flags) { struct tcpopt to; struct inpcb *inp; struct ip *ip; struct mbuf *optm; struct udphdr *uh = NULL; struct tcphdr *nth; struct tcp_log_buffer *lgb; u_char *optp; #ifdef INET6 struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int optlen, tlen, win, ulen; int ect = 0; bool incl_opts; uint16_t port; int output_ret; #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); #ifdef INET6 isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4); ip6 = ipgen; #endif /* INET6 */ ip = ipgen; if (tp != NULL) { inp = tptoinpcb(tp); INP_LOCK_ASSERT(inp); } else inp = NULL; if (m != NULL) { #ifdef INET6 if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else #endif if (ip && (ip->ip_p == IPPROTO_UDP)) port = m->m_pkthdr.tcp_tun_port; else port = 0; } else port = tp->t_port; incl_opts = false; win = 0; if (tp != NULL) { if (!(flags & TH_RST)) { win = sbspace(&inp->inp_socket->so_rcv); if (win > TCP_MAXWIN << tp->rcv_scale) win = TCP_MAXWIN << tp->rcv_scale; } if ((tp->t_flags & TF_NOOPT) == 0) incl_opts = true; } if (m == NULL) { m = m_gethdr(M_NOWAIT, MT_DATA); if (m == NULL) return; m->m_data += max_linkhdr; #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(m, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(m, struct ip6_hdr *); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip)); ip = mtod(m, struct ip *); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); flags = TH_ACK; } else if ((!M_WRITABLE(m)) || (port != 0)) { struct mbuf *n; /* Can't reuse 'm', allocate a new mbuf. */ n = m_gethdr(M_NOWAIT, MT_DATA); if (n == NULL) { m_freem(m); return; } if (!m_dup_pkthdr(n, m, M_NOWAIT)) { m_freem(m); m_freem(n); return; } n->m_data += max_linkhdr; /* m_len is set later */ #define xchg(a,b,type) { type t; t=a; a=b; b=t; } #ifdef INET6 if (isipv6) { bcopy((caddr_t)ip6, mtod(n, caddr_t), sizeof(struct ip6_hdr)); ip6 = mtod(n, struct ip6_hdr *); xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } else #endif /* INET6 */ { bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip)); ip = mtod(n, struct ip *); xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); if (port) { /* Insert a UDP header */ uh = (struct udphdr *)nth; uh->uh_sport = htons(V_tcp_udp_tunneling_port); uh->uh_dport = port; nth = (struct tcphdr *)(uh + 1); } } bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr)); xchg(nth->th_dport, nth->th_sport, uint16_t); th = nth; m_freem(m); m = n; } else { /* * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. 
*/ m_freem(m->m_next); m->m_next = NULL; m->m_data = (caddr_t)ipgen; /* clear any receive flags for proper bpf timestamping */ m->m_flags &= ~(M_TSTMP | M_TSTMP_LRO); /* m_len is set later */ #ifdef INET6 if (isipv6) { xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr); nth = (struct tcphdr *)(ip6 + 1); } else #endif /* INET6 */ { xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t); nth = (struct tcphdr *)(ip + 1); } if (th != nth) { /* * this is usually a case when an extension header * exists between the IPv6 header and the * TCP header. */ nth->th_sport = th->th_sport; nth->th_dport = th->th_dport; } xchg(nth->th_dport, nth->th_sport, uint16_t); #undef xchg } tlen = 0; #ifdef INET6 if (isipv6) tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr); #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET tlen = sizeof (struct tcpiphdr); #endif if (port) tlen += sizeof (struct udphdr); #ifdef INVARIANTS m->m_len = 0; KASSERT(M_TRAILINGSPACE(m) >= tlen, ("Not enough trailing space for message (m=%p, need=%d, have=%ld)", m, tlen, (long)M_TRAILINGSPACE(m))); #endif m->m_len = tlen; to.to_flags = 0; if (incl_opts) { ect = tcp_ecn_output_established(tp, &flags, 0, false); /* Make sure we have room. */ if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) { m->m_next = m_get(M_NOWAIT, MT_DATA); if (m->m_next) { optp = mtod(m->m_next, u_char *); optm = m->m_next; } else incl_opts = false; } else { optp = (u_char *) (nth + 1); optm = m; } } if (incl_opts) { /* Timestamps. */ if (tp->t_flags & TF_RCVD_TSTMP) { to.to_tsval = tcp_ts_getticks() + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else optlen = 0; #ifdef INET6 if (isipv6) { if (uh) { ulen = tlen - sizeof(struct ip6_hdr); uh->uh_ulen = htons(ulen); } ip6->ip6_flow = htonl(ect << IPV6_FLOWLABEL_LEN); ip6->ip6_vfc = IPV6_VERSION; if (port) ip6->ip6_nxt = IPPROTO_UDP; else ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } #endif #if defined(INET) && defined(INET6) else #endif #ifdef INET { if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); } ip->ip_tos = ect; ip->ip_len = htons(tlen); ip->ip_ttl = V_ip_defttl; if (port) { ip->ip_p = IPPROTO_UDP; } else { ip->ip_p = IPPROTO_TCP; } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); } #endif m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC if (inp != NULL) { /* * Packet is associated with a socket, so allow the * label of the response to reflect the socket label. */ INP_LOCK_ASSERT(inp); mac_inpcb_create_mbuf(inp, m); } else { /* * Packet is not associated with a socket, so possibly * update the label in place. 
*/ mac_netinet_tcp_reply(m); } #endif nth->th_seq = htonl(seq); nth->th_ack = htonl(ack); nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2; tcp_set_flags(nth, flags); if (tp && (flags & TH_RST)) { /* Log the reset */ tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); } if (tp != NULL) nth->th_win = htons((u_short) (win >> tp->rcv_scale)); else nth->th_win = htons((u_short)win); nth->th_urp = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { if (!TCPMD5_ENABLED() || TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) { m_freem(m); return; } } #endif #ifdef INET6 if (isipv6) { if (port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in6_cksum_pseudo(ip6, tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0); } ip6->ip6_hlim = in6_selecthlim(inp, NULL); } #endif /* INET6 */ #if defined(INET6) && defined(INET) else #endif #ifdef INET { if (port) { uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); nth->th_sum = 0; } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p))); } } #endif /* INET */ TCP_PROBE3(debug__output, tp, th, m); if (flags & TH_RST) TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth); lgb = NULL; if ((tp != NULL) && tcp_bblogging_on(tp)) { if (INP_WLOCKED(inp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = inp->inp_in_hpts; log.u_bbr.flex8 = 4; log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.delivered = 0; lgb = tcp_log_event(tp, nth, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv); } else { /* * We can not log the packet, since we only own the * read lock, but a write lock is needed. The read lock * is not upgraded to a write lock, since only getting * the read lock was done intentionally to improve the * handling of SYN flooding attacks. * This happens only for pure SYN segments received in * the initial CLOSED state, or received in a more * advanced state than listen and the UDP encapsulation * port is unexpected. * The incoming SYN segments do not really belong to * the TCP connection and the handling does not change * the state of the TCP connection. Therefore, the * sending of the RST segments is not logged. Please * note that also the incoming SYN segments are not * logged. * * The following code ensures that the above description * is and stays correct. 
*/ KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN && (tp->t_state == TCPS_CLOSED || (tp->t_state > TCPS_LISTEN && tp->t_port != port)), ("%s: Logging of TCP segment with flags 0x%b and " "UDP encapsulation port %u skipped in state %s", __func__, thflags, PRINT_TH_FLAGS, ntohs(port), tcpstates[tp->t_state])); } } if (flags & TH_ACK) TCPSTAT_INC(tcps_sndacks); else if (flags & (TH_SYN|TH_FIN|TH_RST)) TCPSTAT_INC(tcps_sndctrl); TCPSTAT_INC(tcps_sndtotal); #ifdef INET6 if (isipv6) { TCP_PROBE5(send, NULL, tp, ip6, tp, nth); output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); } #endif /* INET6 */ #if defined(INET) && defined(INET6) else #endif #ifdef INET { TCP_PROBE5(send, NULL, tp, ip, tp, nth); output_ret = ip_output(m, NULL, NULL, 0, NULL, inp); } #endif if (lgb != NULL) lgb->tlb_errno = output_ret; } /* * Create a new TCP control block, making an empty reassembly queue and hooking * it to the argument protocol control block. The `inp' parameter must have * come from the zone allocator set up by tcpcbstor declaration. */ struct tcpcb * tcp_newtcpcb(struct inpcb *inp) { struct tcpcb *tp = intotcpcb(inp); #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ /* * Historically allocation was done with M_ZERO. There is a lot of * code that rely on that. For now take safe approach and zero whole * tcpcb. This definitely can be optimized. */ bzero(&tp->t_start_zero, t_zero_size); /* Initialise cc_var struct for this tcpcb. */ tp->t_ccv.type = IPPROTO_TCP; tp->t_ccv.ccvc.tcp = tp; rw_rlock(&tcp_function_lock); tp->t_fb = V_tcp_func_set_ptr; refcount_acquire(&tp->t_fb->tfb_refcnt); rw_runlock(&tcp_function_lock); /* * Use the current system default CC algorithm. */ cc_attach(tp, CC_DEFAULT_ALGO()); if (CC_ALGO(tp)->cb_init != NULL) if (CC_ALGO(tp)->cb_init(&tp->t_ccv, NULL) > 0) { cc_detach(tp); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #ifdef TCP_HHOOK if (khelp_init_osd(HELPER_CLASS_TCP, &tp->t_osd)) { if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } #endif TAILQ_INIT(&tp->t_segq); STAILQ_INIT(&tp->t_inqueue); tp->t_maxseg = #ifdef INET6 isipv6 ? V_tcp_v6mssdflt : #endif /* INET6 */ V_tcp_mssdflt; /* All mbuf queue/ack compress flags should be off */ tcp_lro_features_off(tptoinpcb(tp)); callout_init_rw(&tp->t_callout, &inp->inp_lock, CALLOUT_RETURNUNLOCKED); for (int i = 0; i < TT_N; i++) tp->t_timers[i] = SBT_MAX; switch (V_tcp_do_rfc1323) { case 0: break; default: case 1: tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); break; case 2: tp->t_flags = TF_REQ_SCALE; break; case 3: tp->t_flags = TF_REQ_TSTMP; break; } if (V_tcp_do_sack) tp->t_flags |= TF_SACK_PERMIT; TAILQ_INIT(&tp->snd_holes); /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives * reasonable initial retransmit time. 
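 *
 * Worked example (illustrative): t_rttvar below is stored scaled by
 * 2^TCP_RTTVAR_SHIFT, so with srtt == TCPTV_SRTTBASE == 0 the quantity
 * srtt + 4 * rttvar unscales to
 *	(4 * ((tcp_rexmit_initial << TCP_RTTVAR_SHIFT) / 4))
 *	    >> TCP_RTTVAR_SHIFT == tcp_rexmit_initial,
 * i.e. the very first retransmit timeout works out to tcp_rexmit_initial.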
*/ tp->t_srtt = TCPTV_SRTTBASE; tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4; tp->t_rttmin = tcp_rexmit_min; tp->t_rxtcur = tcp_rexmit_initial; tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; tp->t_rcvtime = ticks; /* We always start with ticks granularity */ tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS; /* * IPv4 TTL initialization is necessary for an IPv6 socket as well, * because the socket may be bound to an IPv6 wildcard address, * which may match an IPv4-mapped IPv6 address. */ inp->inp_ip_ttl = V_ip_defttl; #ifdef TCPHPTS /* * If using hpts lets drop a random number in so * not all new connections fall on the same CPU. */ inp->inp_hpts_cpu = hpts_random_cpu(inp); #endif #ifdef TCPPCAP /* * Init the TCP PCAP queues. */ tcp_pcap_tcpcb_init(tp); #endif #ifdef TCP_BLACKBOX /* Initialize the per-TCPCB log data. */ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { refcount_release(&tp->t_fb->tfb_refcnt); return (NULL); } } #ifdef STATS if (V_tcp_perconn_stats_enable == 1) tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0); #endif if (V_tcp_do_lrd) tp->t_flags |= TF_LRD; return (tp); } /* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. */ struct tcpcb * tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tptosocket(tp); NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); /* Don't use tcp_output() here due to possible recursion. */ (void)tcp_output_nodrop(tp); TCPSTAT_INC(tcps_drops); } else TCPSTAT_INC(tcps_conndrops); if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; return (tcp_close(tp)); } void tcp_discardcb(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); #ifdef INET6 bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif INP_WLOCK_ASSERT(inp); tcp_timer_stop(tp); if (tp->t_fb->tfb_tcp_timer_stop_all) { tp->t_fb->tfb_tcp_timer_stop_all(tp); } /* free the reassembly queue, if any */ tcp_reass_flush(tp); #ifdef TCP_OFFLOAD /* Disconnect offload device, if any. */ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif tcp_free_sackholes(tp); #ifdef TCPPCAP /* Free the TCP PCAP queues. */ tcp_pcap_drain(&(tp->t_inpkts)); tcp_pcap_drain(&(tp->t_outpkts)); #endif /* Allow the CC algorithm to clean up after itself. */ if (CC_ALGO(tp)->cb_destroy != NULL) CC_ALGO(tp)->cb_destroy(&tp->t_ccv); CC_DATA(tp) = NULL; /* Detach from the CC algorithm */ cc_detach(tp); #ifdef TCP_HHOOK khelp_destroy_osd(&tp->t_osd); #endif #ifdef STATS stats_blob_destroy(tp->t_stats); #endif CC_ALGO(tp) = NULL; -#ifdef TCP_BLACKBOX - tcp_log_tcpcbfini(tp); -#endif TCPSTATES_DEC(tp->t_state); if (tp->t_fb->tfb_tcp_fb_fini) (*tp->t_fb->tfb_tcp_fb_fini)(tp, 1); +#ifdef TCP_BLACKBOX + tcp_log_tcpcbfini(tp); +#endif MPASS(STAILQ_EMPTY(&tp->t_inqueue)); /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. * 'Enough' is arbitrarily defined as 4 rtt samples. * 4 samples is enough for the srtt filter to converge * to within enough % of the correct value; fewer samples * and we could save a bogus rtt. The danger is not high * as tcp quickly recovers from everything. * XXX: Works very well but needs some more statistics! 
* * XXXRRS: Updating must be after the stack fini() since * that may be converting some internal representation of * say srtt etc into the general one used by other stacks. * Lets also at least protect against the so being NULL * as RW stated below. */ if ((tp->t_rttupdated >= 4) && (so != NULL)) { struct hc_metrics_lite metrics; uint32_t ssthresh; bzero(&metrics, sizeof(metrics)); /* * Update the ssthresh always when the conditions below * are satisfied. This gives us better new start value * for the congestion avoidance for new connections. * ssthresh is only set if packet loss occurred on a session. * * XXXRW: 'so' may be NULL here, and/or socket buffer may be * being torn down. Ideally this code would not use 'so'. */ ssthresh = tp->snd_ssthresh; if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; if (ssthresh < 2) ssthresh = 2; ssthresh *= (tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : #endif sizeof (struct tcpiphdr) #ifdef INET6 ) #endif ); } else ssthresh = 0; metrics.rmx_ssthresh = ssthresh; metrics.rmx_rtt = tp->t_srtt; metrics.rmx_rttvar = tp->t_rttvar; metrics.rmx_cwnd = tp->snd_cwnd; metrics.rmx_sendpipe = 0; metrics.rmx_recvpipe = 0; tcp_hc_update(&inp->inp_inc, &metrics); } refcount_release(&tp->t_fb->tfb_refcnt); } /* * Attempt to close a TCP control block, marking it as dropped, and freeing * the socket if we hold the only reference. */ struct tcpcb * tcp_close(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); struct socket *so = tptosocket(tp); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); #endif /* * This releases the TFO pending counter resource for TFO listen * sockets as well as passively-created TFO sockets that transition * from SYN_RECEIVED to CLOSED. */ if (tp->t_tfo_pending) { tcp_fastopen_decrement_counter(tp->t_tfo_pending); tp->t_tfo_pending = NULL; } #ifdef TCPHPTS tcp_hpts_remove(inp); #endif in_pcbdrop(inp); TCPSTAT_INC(tcps_closed); if (tp->t_state != TCPS_CLOSED) tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); soisdisconnected(so); if (inp->inp_flags & INP_SOCKREF) { inp->inp_flags &= ~INP_SOCKREF; INP_WUNLOCK(inp); sorele(so); return (NULL); } return (tp); } /* * Notify a tcp user of an asynchronous error; * store error as soft error, but wake up user * (for now, won't do anything until can select for soft error). * * Do not wake up user since there currently is no mechanism for * reporting soft errors (yet - a kqueue filter may be added). */ static struct inpcb * tcp_notify(struct inpcb *inp, int error) { struct tcpcb *tp; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_notify: tp == NULL")); /* * Ignore some errors if we are hooked up. * If connection hasn't completed, has retransmitted several times, * and receives a second error, give up now. This is better * than waiting a long time to establish a connection that * can never complete. 
*/ if (tp->t_state == TCPS_ESTABLISHED && (error == EHOSTUNREACH || error == ENETUNREACH || error == EHOSTDOWN)) { if (inp->inp_route.ro_nh) { NH_FREE(inp->inp_route.ro_nh); inp->inp_route.ro_nh = (struct nhop_object *)NULL; } return (inp); } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 && tp->t_softerror) { tp = tcp_drop(tp, error); if (tp != NULL) return (inp); else return (NULL); } else { tp->t_softerror = error; return (inp); } #if 0 wakeup( &so->so_timeo); sorwakeup(so); sowwakeup(so); #endif } static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, INPLOOKUP_RLOCKPCB); struct xinpgen xig; struct inpcb *inp; int error; if (req->newptr != NULL) return (EPERM); if (req->oldptr == NULL) { int n; n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb); return (0); } if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); error = syncache_pcblist(req); if (error) return (error); while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xtcpcb xt; tcp_inptoxtp(inp, &xt); error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { INP_RUNLOCK(inp); break; } else continue; } } if (!error) { /* * Give the user an updated idea of our state. * If the generation differs from what we told * her before, she knows that something happened * while we were processing this request, and it * might be necessary to retry. 
*/ xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist, CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT, NULL, 0, tcp_pcblist, "S,xtcpcb", "List of active TCP connections"); #ifdef INET static int tcp_getcred(SYSCTL_HANDLER_ARGS) { struct xucred xuc; struct sockaddr_in addrs[2]; struct epoch_tracker et; struct inpcb *inp; int error; error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp_getcred, "S,xucred", "Get the xucred of a TCP connection"); #endif /* INET */ #ifdef INET6 static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; int error; #ifdef INET int mapped = 0; #endif error = priv_check(req->td, PRIV_NETINET_GETCRED); if (error) return (error); error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 || (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) { #ifdef INET if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr)) mapped = 1; else #endif return (EINVAL); } NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, *(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12], addrs[1].sin6_port, *(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12], addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); else #endif inp = in6_pcblookup(&V_tcbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); NET_EPOCH_EXIT(et); if (inp != NULL) { if (error == 0) error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) cru2x(inp->inp_cred, &xuc); INP_RUNLOCK(inp); } else error = ENOENT; if (error == 0) error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred)); return (error); } SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred, CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT, 0, 0, tcp6_getcred, "S,xucred", "Get the xucred of a TCP6 connection"); #endif /* INET6 */ #ifdef INET /* Path MTU to try next when a fragmentation-needed message is received. */ static inline int tcp_next_pmtu(const struct icmp *icp, const struct ip *ip) { int mtu = ntohs(icp->icmp_nextmtu); /* If no alternative MTU was proposed, try the next smaller one. 
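 *
 * Illustrative example: for a needfrag ICMP quoting a 1500-byte datagram
 * with icmp_nextmtu == 0, ip_next_mtu() typically proposes the next entry
 * in the standard MTU table below 1500 (1492), and the result is never
 * allowed to drop below V_tcp_minmss plus the TCP/IP header size.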
*/ if (!mtu) mtu = ip_next_mtu(ntohs(ip->ip_len), 1); if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr)) mtu = V_tcp_minmss + sizeof(struct tcpiphdr); return (mtu); } static void tcp_ctlinput_with_port(struct icmp *icp, uint16_t port) { struct ip *ip; struct tcphdr *th; struct inpcb *inp; struct tcpcb *tp; struct inpcb *(*notify)(struct inpcb *, int); struct in_conninfo inc; tcp_seq icmp_tcp_seq; int errno, mtu; errno = icmp_errmap(icp); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: if (V_icmp_may_rst && icp->icmp_type == ICMP_TIMXCEED) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } ip = &icp->icmp_ip; th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); icmp_tcp_seq = th->th_seq; inp = in_pcblookup(&V_tcbinfo, ip->ip_dst, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* * MTU discovery for offloaded connections. Let * the TOE driver verify seq# and process it. */ mtu = tcp_next_pmtu(icp, ip); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: we got a needfrag and * will potentially try a lower MTU. */ mtu = tcp_next_pmtu(icp, ip); /* * Only process the offered MTU if it * is smaller than the current one. */ if (mtu < tp->t_maxseg + sizeof(struct tcpiphdr)) { bzero(&inc, sizeof(inc)); inc.inc_faddr = ip->ip_dst; inc.inc_fibnum = inp->inp_inc.inc_fibnum; tcp_hc_updatemtu(&inc, mtu); inp = tcp_mtudisc(inp, mtu); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fport = th->th_dport; inc.inc_lport = th->th_sport; inc.inc_faddr = ip->ip_dst; inc.inc_laddr = ip->ip_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp_ctlinput(struct icmp *icmp) { tcp_ctlinput_with_port(icmp, htons(0)); } static void tcp_ctlinput_viaudp(udp_tun_icmp_param_t param) { /* Its a tunneled TCP over UDP icmp */ struct icmp *icmp = param.icmp; struct ip *outer_ip, *inner_ip; struct udphdr *udp; struct tcphdr *th, ttemp; int i_hlen, o_len; uint16_t port; outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip)); inner_ip = &icmp->icmp_ip; i_hlen = inner_ip->ip_hl << 2; o_len = ntohs(outer_ip->ip_len); if (o_len < (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) { /* Not enough data present */ return; } /* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */ udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; th = (struct tcphdr *)(udp + 1); memcpy(&ttemp, th, sizeof(struct tcphdr)); memcpy(udp, &ttemp, sizeof(struct tcphdr)); /* Now adjust down the size of the outer IP header */ o_len -= sizeof(struct udphdr); outer_ip->ip_len = htons(o_len); /* Now call in to the normal handling code */ tcp_ctlinput_with_port(icmp, port); } #endif /* INET */ #ifdef INET6 static inline int tcp6_next_pmtu(const struct icmp6_hdr *icmp6) { int mtu = ntohl(icmp6->icmp6_mtu); /* * If no alternative MTU was proposed, or the proposed MTU was too * small, set to the min. 
*/ if (mtu < IPV6_MMTU) mtu = IPV6_MMTU - 8; /* XXXNP: what is the adjustment for? */ return (mtu); } static void tcp6_ctlinput_with_port(struct ip6ctlparam *ip6cp, uint16_t port) { struct in6_addr *dst; struct inpcb *(*notify)(struct inpcb *, int); struct ip6_hdr *ip6; struct mbuf *m; struct inpcb *inp; struct tcpcb *tp; struct icmp6_hdr *icmp6; struct in_conninfo inc; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; } t_ports; tcp_seq icmp_tcp_seq; unsigned int mtu; unsigned int off; int errno; icmp6 = ip6cp->ip6c_icmp6; m = ip6cp->ip6c_m; ip6 = ip6cp->ip6c_ip6; off = ip6cp->ip6c_off; dst = &ip6cp->ip6c_finaldst->sin6_addr; errno = icmp6_errmap(icmp6); switch (errno) { case 0: return; case EMSGSIZE: notify = tcp_mtudisc_notify; break; case ECONNREFUSED: if (V_icmp_may_rst) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; case EHOSTUNREACH: /* * There are only four ICMPs that may reset connection: * - administratively prohibited * - port unreachable * - time exceeded in transit * - unknown next header */ if (V_icmp_may_rst && ((icmp6->icmp6_type == ICMP6_DST_UNREACH && (icmp6->icmp6_code == ICMP6_DST_UNREACH_ADMIN || icmp6->icmp6_code == ICMP6_DST_UNREACH_NOPORT)) || (icmp6->icmp6_type == ICMP6_TIME_EXCEEDED && icmp6->icmp6_code == ICMP6_TIME_EXCEED_TRANSIT) || (icmp6->icmp6_type == ICMP6_PARAM_PROB && icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER))) notify = tcp_drop_syn_sent; else notify = tcp_notify; break; default: notify = tcp_notify; } /* Check if we can safely get the ports from the tcp hdr */ if (m == NULL || (m->m_pkthdr.len < (int32_t) (off + sizeof(struct tcp_ports)))) { return; } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); off += sizeof(struct tcp_ports); if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) { goto out; } m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq); if (inp != NULL) { tp = intotcpcb(inp); #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE && errno == EMSGSIZE) { /* MTU discovery for offloaded connections. */ mtu = tcp6_next_pmtu(icmp6); tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu); goto out; } #endif if (tp->t_port != port) goto out; if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) && SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) { if (errno == EMSGSIZE) { /* * MTU discovery: * If we got a needfrag set the MTU * in the route to the suggested new * value (if given) and then notify. */ mtu = tcp6_next_pmtu(icmp6); bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc6_faddr = *dst; if (in6_setscope(&inc.inc6_faddr, m->m_pkthdr.rcvif, NULL)) goto out; /* * Only process the offered MTU if it * is smaller than the current one. 
*/ if (mtu < tp->t_maxseg + sizeof (struct tcphdr) + sizeof (struct ip6_hdr)) { tcp_hc_updatemtu(&inc, mtu); tcp_mtudisc(inp, mtu); ICMP6STAT_INC(icp6s_pmtuchg); } } else inp = (*notify)(inp, errno); } } else { bzero(&inc, sizeof(inc)); inc.inc_fibnum = M_GETFIB(m); inc.inc_flags |= INC_ISIPV6; inc.inc_fport = t_ports.th_dport; inc.inc_lport = t_ports.th_sport; inc.inc6_faddr = *dst; inc.inc6_laddr = ip6->ip6_src; syncache_unreach(&inc, icmp_tcp_seq, port); } out: if (inp != NULL) INP_WUNLOCK(inp); } static void tcp6_ctlinput(struct ip6ctlparam *ctl) { tcp6_ctlinput_with_port(ctl, htons(0)); } static void tcp6_ctlinput_viaudp(udp_tun_icmp_param_t param) { struct ip6ctlparam *ip6cp = param.ip6cp; struct mbuf *m; struct udphdr *udp; uint16_t port; m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL); if (m == NULL) { return; } udp = mtod(m, struct udphdr *); if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) { return; } port = udp->uh_dport; m_adj(m, sizeof(struct udphdr)); if ((m->m_flags & M_PKTHDR) == 0) { ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr); } /* Now call in to the normal handling code */ tcp6_ctlinput_with_port(ip6cp, port); } #endif /* INET6 */ static uint32_t tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len) { SIPHASH_CTX ctx; uint32_t hash[2]; KASSERT(len >= SIPHASH_KEY_LENGTH, ("%s: keylen %u too short ", __func__, len)); SipHash24_Init(&ctx); SipHash_SetKey(&ctx, (uint8_t *)key); SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t)); SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t)); switch (inc->inc_flags & INC_ISIPV6) { #ifdef INET case 0: SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr)); SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr)); break; #endif #ifdef INET6 case INC_ISIPV6: SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr)); SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr)); break; #endif } SipHash_Final((uint8_t *)hash, &ctx); return (hash[0] ^ hash[1]); } uint32_t tcp_new_ts_offset(struct in_conninfo *inc) { struct in_conninfo inc_store, *local_inc; if (!V_tcp_ts_offset_per_conn) { memcpy(&inc_store, inc, sizeof(struct in_conninfo)); inc_store.inc_lport = 0; inc_store.inc_fport = 0; local_inc = &inc_store; } else { local_inc = inc; } return (tcp_keyed_hash(local_inc, V_ts_offset_secret, sizeof(V_ts_offset_secret))); } /* * Following is where TCP initial sequence number generation occurs. * * There are two places where we must use initial sequence numbers: * 1. In SYN-ACK packets. * 2. In SYN packets. * * All ISNs for SYN-ACK packets are generated by the syncache. See * tcp_syncache.c for details. * * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling * depends on this property. In addition, these ISNs should be * unguessable so as to prevent connection hijacking. To satisfy * the requirements of this situation, the algorithm outlined in * RFC 1948 is used, with only small modifications. * * Implementation details: * * Time is based off the system timer, and is corrected so that it * increases by one megabyte per second. This allows for proper * recycling on high speed LANs while still leaving over an hour * before rollover. * * As reading the *exact* system time is too expensive to be done * whenever setting up a TCP connection, we increment the time * offset in two ways. First, a small random positive increment * is added to isn_offset for each connection that is set up. 
* Second, the function tcp_isn_tick fires once per clock tick * and increments isn_offset as necessary so that sequence numbers * are incremented at approximately ISN_BYTES_PER_SECOND. The * random positive increments serve only to ensure that the same * exact sequence number is never sent out twice (as could otherwise * happen when a port is recycled in less than the system tick * interval.) * * net.inet.tcp.isn_reseed_interval controls the number of seconds * between seeding of isn_secret. This is normally set to zero, * as reseeding should not be necessary. * * Locking of the global variables isn_secret, isn_last_reseed, isn_offset, * isn_offset_old, and isn_ctx is performed using the ISN lock. In * general, this means holding an exclusive (write) lock. */ #define ISN_BYTES_PER_SECOND 1048576 #define ISN_STATIC_INCREMENT 4096 #define ISN_RANDOM_INCREMENT (4096 - 1) #define ISN_SECRET_LENGTH SIPHASH_KEY_LENGTH VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]); VNET_DEFINE_STATIC(int, isn_last); VNET_DEFINE_STATIC(int, isn_last_reseed); VNET_DEFINE_STATIC(u_int32_t, isn_offset); VNET_DEFINE_STATIC(u_int32_t, isn_offset_old); #define V_isn_secret VNET(isn_secret) #define V_isn_last VNET(isn_last) #define V_isn_last_reseed VNET(isn_last_reseed) #define V_isn_offset VNET(isn_offset) #define V_isn_offset_old VNET(isn_offset_old) tcp_seq tcp_new_isn(struct in_conninfo *inc) { tcp_seq new_isn; u_int32_t projected_offset; ISN_LOCK(); /* Seed if this is the first use, reseed if requested. */ if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) && (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz) < (u_int)ticks))) { arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0); V_isn_last_reseed = ticks; } /* Compute the hash and return the ISN. */ new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret, sizeof(V_isn_secret)); V_isn_offset += ISN_STATIC_INCREMENT + (arc4random() & ISN_RANDOM_INCREMENT); if (ticks != V_isn_last) { projected_offset = V_isn_offset_old + ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last); if (SEQ_GT(projected_offset, V_isn_offset)) V_isn_offset = projected_offset; V_isn_offset_old = V_isn_offset; V_isn_last = ticks; } new_isn += V_isn_offset; ISN_UNLOCK(); return (new_isn); } /* * When a specific ICMP unreachable message is received and the * connection state is SYN-SENT, drop the connection. This behavior * is controlled by the icmp_may_rst sysctl. */ static struct inpcb * tcp_drop_syn_sent(struct inpcb *inp, int errno) { struct tcpcb *tp; NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); if (tp->t_state != TCPS_SYN_SENT) return (inp); if (IS_FASTOPEN(tp->t_flags)) tcp_fastopen_disable_path(tp); tp = tcp_drop(tp, errno); if (tp != NULL) return (inp); else return (NULL); } /* * When `need fragmentation' ICMP is received, update our idea of the MSS * based on the new value. Also nudge TCP to send something, since we * know the packet we just sent was dropped. * This duplicates some code in the tcp_mss() function in tcp_input.c. */ static struct inpcb * tcp_mtudisc_notify(struct inpcb *inp, int error) { return (tcp_mtudisc(inp, -1)); } static struct inpcb * tcp_mtudisc(struct inpcb *inp, int mtuoffer) { struct tcpcb *tp; struct socket *so; INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. 
*/ if (so->so_snd.sb_hiwat < tp->t_maxseg) tp->t_maxseg = so->so_snd.sb_hiwat; SOCKBUF_UNLOCK(&so->so_snd); TCPSTAT_INC(tcps_mturesent); tp->t_rtttime = 0; tp->snd_nxt = tp->snd_una; tcp_free_sackholes(tp); tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_SACK_PERMIT) EXIT_FASTRECOVERY(tp->t_flags); if (tp->t_fb->tfb_tcp_mtu_chg != NULL) { /* * Conceptually the snd_nxt setting * and freeing sack holes should * be done by the default stacks * own tfb_tcp_mtu_chg(). */ tp->t_fb->tfb_tcp_mtu_chg(tp); } if (tcp_output(tp) < 0) return (NULL); else return (inp); } #ifdef INET /* * Look-up the routing entry to the peer of this inpcb. If no route * is found and it cannot be allocated, then return 0. This routine * is called by TCP routines that access the rmx structure and by * tcp_mss_update to get the peer/interface MTU. */ uint32_t tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); if (inc->inc_faddr.s_addr != INADDR_ANY) { nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO4 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } #endif /* INET */ #ifdef INET6 uint32_t tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap) { struct nhop_object *nh; struct in6_addr dst6; uint32_t scopeid; struct ifnet *ifp; uint32_t maxmtu = 0; KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); if (inc->inc_flags & INC_IPV6MINMTU) return (IPV6_MMTU); if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid); nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0); if (nh == NULL) return (0); ifp = nh->nh_ifp; maxmtu = nh->nh_mtu; /* Report additional interface capabilities. */ if (cap != NULL) { if (ifp->if_capenable & IFCAP_TSO6 && ifp->if_hwassist & CSUM_TSO) { cap->ifcap |= CSUM_TSO; cap->tsomax = ifp->if_hw_tsomax; cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount; cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize; } } } return (maxmtu); } /* * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack. * * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag. * The right place to do that is ip6_setpktopt() that has just been * executed. By the way it just filled ip6po_minmtu for us. */ void tcp6_use_min_mtu(struct tcpcb *tp) { struct inpcb *inp = tptoinpcb(tp); INP_WLOCK_ASSERT(inp); /* * In case of the IPV6_USE_MIN_MTU socket * option, the INC_IPV6MINMTU flag to announce * a corresponding MSS during the initial * handshake. If the TCP connection is not in * the front states, just reduce the MSS being * used. This avoids the sending of TCP * segments which will be fragmented at the * IPv6 layer. */ inp->inp_inc.inc_flags |= INC_IPV6MINMTU; if ((tp->t_state >= TCPS_SYN_SENT) && (inp->inp_inc.inc_flags & INC_ISIPV6)) { struct ip6_pktopts *opt; opt = inp->in6p_outputopts; if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL && tp->t_maxseg > TCP6_MSS) tp->t_maxseg = TCP6_MSS; } } #endif /* INET6 */ /* * Calculate effective SMSS per RFC5681 definition for a given TCP * connection at its current state, taking into account SACK and etc. 
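 *
 * Worked example (illustrative): for an established connection that
 * negotiated timestamps but currently has no SACK blocks to echo,
 * optlen is TCPOLEN_TSTAMP_APPA (12), so with t_maxseg == 1460 the
 * effective SMSS reported here is 1448.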
*/ u_int tcp_maxseg(const struct tcpcb *tp) { u_int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We might make mistakes with padding here in some edge cases, * but this is harmless, since result of tcp_maxseg() is used * only in cwnd and ssthresh estimations. */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) { optlen += TCPOLEN_SACKHDR; optlen += tp->rcv_numsacks * TCPOLEN_SACK; optlen = PADTCPOLEN(optlen); } } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PADTCPOLEN(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PADTCPOLEN(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PADTCPOLEN(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } u_int tcp_fixed_maxseg(const struct tcpcb *tp) { int optlen; if (tp->t_flags & TF_NOOPT) return (tp->t_maxseg); /* * Here we have a simplified code from tcp_addoptions(), * without a proper loop, and having most of paddings hardcoded. * We only consider fixed options that we would send every * time I.e. SACK is not considered. This is important * for cc modules to figure out what the modulo of the * cwnd should be. */ #define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) if (TCPS_HAVEESTABLISHED(tp->t_state)) { if (tp->t_flags & TF_RCVD_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = 0; #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif } else { if (tp->t_flags & TF_REQ_TSTMP) optlen = TCPOLEN_TSTAMP_APPA; else optlen = PAD(TCPOLEN_MAXSEG); if (tp->t_flags & TF_REQ_SCALE) optlen += PAD(TCPOLEN_WINDOW); #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (tp->t_flags & TF_SIGNATURE) optlen += PAD(TCPOLEN_SIGNATURE); #endif if (tp->t_flags & TF_SACK_PERMIT) optlen += PAD(TCPOLEN_SACK_PERMITTED); } #undef PAD optlen = min(optlen, TCP_MAXOLEN); return (tp->t_maxseg - optlen); } static int sysctl_drop(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } if (inp != NULL) { if (!SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); if (tp != NULL) INP_WUNLOCK(inp); } else INP_WUNLOCK(inp); } else error = ESRCH; NET_EPOCH_EXIT(et); return (error); } SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "", "Drop TCP connection"); static int tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS) { return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo, &tcp_ctloutput_set)); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "", "Set socket option for TCP endpoint"); #ifdef KERN_TLS static int sysctl_switch_tls(SYSCTL_HANDLER_ARGS) { /* addrs[0] is a foreign socket, addrs[1] is a local one. 
*/ struct sockaddr_storage addrs[2]; struct inpcb *inp; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif struct epoch_tracker et; #ifdef INET6 struct sockaddr_in6 *fin6, *lin6; #endif int error; inp = NULL; #ifdef INET6 fin6 = lin6 = NULL; #endif error = 0; if (req->oldptr != NULL || req->oldlen != 0) return (EINVAL); if (req->newptr == NULL) return (EPERM); if (req->newlen < sizeof(addrs)) return (ENOMEM); error = SYSCTL_IN(req, &addrs, sizeof(addrs)); if (error) return (error); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: fin6 = (struct sockaddr_in6 *)&addrs[0]; lin6 = (struct sockaddr_in6 *)&addrs[1]; if (fin6->sin6_len != sizeof(struct sockaddr_in6) || lin6->sin6_len != sizeof(struct sockaddr_in6)) return (EINVAL); if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) { if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr)) return (EINVAL); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]); in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]); #ifdef INET fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; #endif break; } error = sa6_embedscope(fin6, V_ip6_use_defzone); if (error) return (error); error = sa6_embedscope(lin6, V_ip6_use_defzone); if (error) return (error); break; #endif #ifdef INET case AF_INET: fin = (struct sockaddr_in *)&addrs[0]; lin = (struct sockaddr_in *)&addrs[1]; if (fin->sin_len != sizeof(struct sockaddr_in) || lin->sin_len != sizeof(struct sockaddr_in)) return (EINVAL); break; #endif default: return (EINVAL); } NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr, fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif #ifdef INET case AF_INET: inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port, lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL); break; #endif } NET_EPOCH_EXIT(et); if (inp != NULL) { struct socket *so; so = inp->inp_socket; soref(so); error = ktls_set_tx_mode(so, arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET); INP_WUNLOCK(inp); sorele(so); } else error = ESRCH; return (error); } SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "", "Switch TCP connection to SW TLS"); SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls, CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP | CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "", "Switch TCP connection to ifnet TLS"); #endif /* * Generate a standardized TCP log line for use throughout the * tcp subsystem. Memory allocation is done with M_NOWAIT to * allow use in the interrupt context. * * NB: The caller MUST free(s, M_TCPLOG) the returned string. * NB: The function may return NULL if memory allocation failed. * * Due to header inclusion and ordering limitations the struct ip * and ip6_hdr pointers have to be passed as void pointers. */ char * tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? */ if (V_tcp_log_in_vain == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } char * tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { /* Is logging enabled? 
*/ if (tcp_log_debug == 0) return (NULL); return (tcp_log_addr(inc, th, ip4hdr, ip6hdr)); } static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr, const void *ip6hdr) { char *s, *sp; size_t size; #ifdef INET const struct ip *ip = (const struct ip *)ip4hdr; #endif #ifdef INET6 const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr; #endif /* INET6 */ /* * The log line looks like this: * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2" */ size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") + sizeof(PRINT_TH_FLAGS) + 1 + #ifdef INET6 2 * INET6_ADDRSTRLEN; #else 2 * INET_ADDRSTRLEN; #endif /* INET6 */ s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT); if (s == NULL) return (NULL); strcat(s, "TCP: ["); sp = s + strlen(s); if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) { inet_ntoa_r(inc->inc_faddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); inet_ntoa_r(inc->inc_laddr, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); #ifdef INET6 } else if (inc) { ip6_sprintf(sp, &inc->inc6_faddr); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(inc->inc_fport)); sp = s + strlen(s); ip6_sprintf(sp, &inc->inc6_laddr); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(inc->inc_lport)); } else if (ip6 && th) { ip6_sprintf(sp, &ip6->ip6_src); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); ip6_sprintf(sp, &ip6->ip6_dst); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET6 */ #ifdef INET } else if (ip && th) { inet_ntoa_r(ip->ip_src, sp); sp = s + strlen(s); sprintf(sp, "]:%i to [", ntohs(th->th_sport)); sp = s + strlen(s); inet_ntoa_r(ip->ip_dst, sp); sp = s + strlen(s); sprintf(sp, "]:%i", ntohs(th->th_dport)); #endif /* INET */ } else { free(s, M_TCPLOG); return (NULL); } sp = s + strlen(s); if (th) sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS); if (*(s + size - 1) != '\0') panic("%s: string too long", __func__); return (s); } /* * A subroutine which makes it easy to track TCP state changes with DTrace. * This function shouldn't be called for t_state initializations that don't * correspond to actual TCP state transitions. */ void tcp_state_change(struct tcpcb *tp, int newstate) { #if defined(KDTRACE_HOOKS) int pstate = tp->t_state; #endif TCPSTATES_DEC(tp->t_state); TCPSTATES_INC(newstate); tp->t_state = newstate; TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate); } /* * Create an external-format (``xtcpcb'') structure using the information in * the kernel-format tcpcb structure pointed to by tp. This is done to * reduce the spew of irrelevant information over this interface, to isolate * user code from changes in the kernel structure, and potentially to provide * information-hiding if we decide that some of this information should be * hidden from users. 
 */
void
tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
{
	struct tcpcb *tp = intotcpcb(inp);
	sbintime_t now;

	bzero(xt, sizeof(*xt));
	xt->t_state = tp->t_state;
	xt->t_logstate = tcp_get_bblog_state(tp);
	xt->t_flags = tp->t_flags;
	xt->t_sndzerowin = tp->t_sndzerowin;
	xt->t_sndrexmitpack = tp->t_sndrexmitpack;
	xt->t_rcvoopack = tp->t_rcvoopack;
	xt->t_rcv_wnd = tp->rcv_wnd;
	xt->t_snd_wnd = tp->snd_wnd;
	xt->t_snd_cwnd = tp->snd_cwnd;
	xt->t_snd_ssthresh = tp->snd_ssthresh;
	xt->t_dsack_bytes = tp->t_dsack_bytes;
	xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
	xt->t_dsack_pack = tp->t_dsack_pack;
	xt->t_maxseg = tp->t_maxseg;
	xt->xt_ecn = ((tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0) +
		     ((tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0);

	now = getsbinuptime();
#define	COPYTIMER(which,where)	do {					\
	if (tp->t_timers[which] != SBT_MAX)				\
		xt->where = (tp->t_timers[which] - now) / SBT_1MS;	\
	else								\
		xt->where = 0;						\
} while (0)
	COPYTIMER(TT_DELACK, tt_delack);
	COPYTIMER(TT_REXMT, tt_rexmt);
	COPYTIMER(TT_PERSIST, tt_persist);
	COPYTIMER(TT_KEEP, tt_keep);
	COPYTIMER(TT_2MSL, tt_2msl);
#undef COPYTIMER
	xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;

	xt->xt_encaps_port = tp->t_port;
	bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
	    TCP_FUNCTION_NAME_LEN_MAX);
	bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX);
#ifdef TCP_BLACKBOX
	(void)tcp_log_get_id(tp, xt->xt_logid);
#endif

	xt->xt_len = sizeof(struct xtcpcb);
	in_pcbtoxinpcb(inp, &xt->xt_inp);
}

void
tcp_log_end_status(struct tcpcb *tp, uint8_t status)
{
	uint32_t bit, i;

	if ((tp == NULL) ||
	    (status > TCP_EI_STATUS_MAX_VALUE) ||
	    (status == 0)) {
		/* Invalid */
		return;
	}
	if (status > (sizeof(uint32_t) * 8)) {
		/* Should this be a KASSERT? */
		return;
	}
	bit = 1U << (status - 1);
	if (bit & tp->t_end_info_status) {
		/* already logged */
		return;
	}
	for (i = 0; i < TCP_END_BYTE_INFO; i++) {
		if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) {
			tp->t_end_info_bytes[i] = status;
			tp->t_end_info_status |= bit;
			break;
		}
	}
}

int
tcp_can_enable_pacing(void)
{
	if ((tcp_pacing_limit == -1) ||
	    (tcp_pacing_limit > number_of_tcp_connections_pacing)) {
		atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
		shadow_num_connections = number_of_tcp_connections_pacing;
		return (1);
	} else {
		counter_u64_add(tcp_pacing_failures, 1);
		return (0);
	}
}

static uint8_t tcp_pacing_warning = 0;

void
tcp_decrement_paced_conn(void)
{
	uint32_t ret;

	ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
	shadow_num_connections = number_of_tcp_connections_pacing;
	KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
	if (ret == 0) {
		if (tcp_pacing_limit != -1) {
			printf("Warning all pacing is now disabled, count decrements invalidly!\n");
			tcp_pacing_limit = 0;
		} else if (tcp_pacing_warning == 0) {
			printf("Warning pacing count is invalid, invalid decrement\n");
			tcp_pacing_warning = 1;
		}
	}
}

static void
tcp_default_switch_failed(struct tcpcb *tp)
{
	/*
	 * If a switch fails we only need to care about two things:
	 * a) the inp_flags2, and
	 * b) the timer granularity.
	 * Timeouts, at least for now, don't use the old callout system
	 * in the other stacks so those are hopefully safe.
	 */
	tcp_lro_features_off(tptoinpcb(tp));
	tcp_change_time_units(tp, TCP_TMR_GRANULARITY_TICKS);
}

#ifdef TCP_ACCOUNTING
int
tcp_do_ack_accounting(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to,
    uint32_t tiwin, int mss)
{
	if (SEQ_LT(th->th_ack, tp->snd_una)) {
		/* Do we have a SACK?
*/ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_BEHIND]++; } return (ACK_BEHIND); } } else if (th->th_ack == tp->snd_una) { /* Do we have a SACK? */ if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_SACK]++; } return (ACK_SACK); } else if (tiwin != tp->snd_wnd) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_RWND]++; } return (ACK_RWND); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_DUPACK]++; } return (ACK_DUPACK); } } else { if (!SEQ_GT(th->th_ack, tp->snd_max)) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((th->th_ack - tp->snd_una) + mss - 1)/mss); } } if (to->to_flags & TOF_SACK) { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK_SACK]++; } return (ACK_CUMACK_SACK); } else { if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { tp->tcp_cnt_counters[ACK_CUMACK]++; } return (ACK_CUMACK); } } } #endif void tcp_change_time_units(struct tcpcb *tp, int granularity) { if (tp->t_tmr_granularity == granularity) { /* We are there */ return; } if (granularity == TCP_TMR_GRANULARITY_USEC) { KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_TICKS), ("Granularity is not TICKS its %u in tp:%p", tp->t_tmr_granularity, tp)); tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); if (tp->t_srtt > 1) { uint32_t val, frac; val = tp->t_srtt >> TCP_RTT_SHIFT; frac = tp->t_srtt & 0x1f; tp->t_srtt = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any) * but its in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_srtt += frac; } } if (tp->t_rttvar) { uint32_t val, frac; val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; frac = tp->t_rttvar & 0x1f; tp->t_rttvar = TICKS_2_USEC(val); /* * frac is the fractional part of the srtt (if any) * but its in ticks and every bit represents * 1/32nd of a hz. */ if (frac) { if (hz == 1000) { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); } else { frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); } tp->t_rttvar += frac; } } tp->t_tmr_granularity = TCP_TMR_GRANULARITY_USEC; } else if (granularity == TCP_TMR_GRANULARITY_TICKS) { /* Convert back to ticks, with */ KASSERT((tp->t_tmr_granularity == TCP_TMR_GRANULARITY_USEC), ("Granularity is not USEC its %u in tp:%p", tp->t_tmr_granularity, tp)); if (tp->t_srtt > 1) { uint32_t val, frac; val = USEC_2_TICKS(tp->t_srtt); frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); tp->t_srtt = val << TCP_RTT_SHIFT; /* * frac is the fractional part here is left * over from converting to hz and shifting. * We need to convert this to the 5 bit * remainder. 
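	 * For example, with the stock hz of 1000 the leftover here is a
	 * microsecond remainder in 0..999; a leftover of 500 usec (half a
	 * tick) becomes 500 * TCP_RTT_SCALE / HPTS_USEC_IN_MSEC =
	 * 500 * 32 / 1000 = 16, i.e. 16/32 of a tick.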
			 */
			if (frac) {
				if (hz == 1000) {
					frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) /
						(uint64_t)HPTS_USEC_IN_MSEC);
				} else {
					frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /
						(uint64_t)HPTS_USEC_IN_SEC);
				}
				tp->t_srtt += frac;
			}
		}
		if (tp->t_rttvar) {
			uint32_t val, frac;

			val = USEC_2_TICKS(tp->t_rttvar);
			frac = tp->t_rttvar % (HPTS_USEC_IN_SEC / hz);
			tp->t_rttvar = val << TCP_RTTVAR_SHIFT;
			/*
			 * frac is the fractional part left over from
			 * converting to hz and shifting.  We need to
			 * convert this to the 5 bit remainder.
			 */
			if (frac) {
				if (hz == 1000) {
					frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) /
						(uint64_t)HPTS_USEC_IN_MSEC);
				} else {
					frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /
						(uint64_t)HPTS_USEC_IN_SEC);
				}
				tp->t_rttvar += frac;
			}
		}
		tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow);
		tp->t_tmr_granularity = TCP_TMR_GRANULARITY_TICKS;
	}
#ifdef INVARIANTS
	else {
		panic("Unknown granularity:%d tp:%p", granularity, tp);
	}
#endif
}

void
tcp_handle_orphaned_packets(struct tcpcb *tp)
{
	struct mbuf *save, *m, *prev;
	/*
	 * Called when a stack switch is occurring from the fini()
	 * of the old stack.  We assume the init() of the new stack
	 * has already been run and has set the inp_flags2 to what
	 * it supports.  This function then deals with any differences,
	 * i.e. cleans up packets that may be queued but that the new
	 * stack does not support.
	 */
	if (tptoinpcb(tp)->inp_flags2 & INP_MBUF_L_ACKS)
		return;
	if ((tptoinpcb(tp)->inp_flags2 & INP_SUPPORTS_MBUFQ) == 0 &&
	    !STAILQ_EMPTY(&tp->t_inqueue)) {
		/*
		 * It is unsafe to process the packets since a
		 * reset may be lurking in them (it's rare but it
		 * can occur).  If we were to find a RST, then we
		 * would end up dropping the connection and the
		 * INP lock, so when we return the caller (tcp_usrreq)
		 * will blow up when it tries to unlock the inp.
		 * This new stack does not do any fancy LRO features
		 * so all we can do is toss the packets.
		 */
		m = STAILQ_FIRST(&tp->t_inqueue);
		STAILQ_INIT(&tp->t_inqueue);
		STAILQ_FOREACH_FROM_SAFE(m, &tp->t_inqueue, m_stailqpkt, save)
			m_freem(m);
	} else {
		/*
		 * Here we have a stack that does mbuf queuing but
		 * does not support compressed ack's. We must
		 * walk all the mbufs and discard any compressed acks.
*/ STAILQ_FOREACH_SAFE(m, &tp->t_inqueue, m_stailqpkt, save) { if (m->m_flags & M_ACKCMP) { if (m == STAILQ_FIRST(&tp->t_inqueue)) STAILQ_REMOVE_HEAD(&tp->t_inqueue, m_stailqpkt); else STAILQ_REMOVE_AFTER(&tp->t_inqueue, prev, m_stailqpkt); m_freem(m); } else prev = m; } } } #ifdef TCP_REQUEST_TRK uint32_t tcp_estimate_tls_overhead(struct socket *so, uint64_t tls_usr_bytes) { #ifdef KERN_TLS struct ktls_session *tls; uint32_t rec_oh, records; tls = so->so_snd.sb_tls_info; if (tls == NULL) return (0); rec_oh = tls->params.tls_hlen + tls->params.tls_tlen; records = ((tls_usr_bytes + tls->params.max_frame_len - 1)/tls->params.max_frame_len); return (records * rec_oh); #else return (0); #endif } extern uint32_t tcp_stale_entry_time; uint32_t tcp_stale_entry_time = 250000; SYSCTL_UINT(_net_inet_tcp, OID_AUTO, usrlog_stale, CTLFLAG_RW, &tcp_stale_entry_time, 250000, "Time that a http entry without a sendfile ages out"); void tcp_http_log_req_info(struct tcpcb *tp, struct http_sendfile_track *http, uint16_t slot, uint8_t val, uint64_t offset, uint64_t nbytes) { if (tcp_bblogging_on(tp)) { union tcp_log_stackspecific log; struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); #ifdef TCPHPTS log.u_bbr.inhpts = tcp_in_hpts(tptoinpcb(tp)); #endif log.u_bbr.flex8 = val; log.u_bbr.rttProp = http->timestamp; log.u_bbr.delRate = http->start; log.u_bbr.cur_del_rate = http->end; log.u_bbr.flex1 = http->start_seq; log.u_bbr.flex2 = http->end_seq; log.u_bbr.flex3 = http->flags; log.u_bbr.flex4 = ((http->localtime >> 32) & 0x00000000ffffffff); log.u_bbr.flex5 = (http->localtime & 0x00000000ffffffff); log.u_bbr.flex7 = slot; log.u_bbr.bw_inuse = offset; /* nbytes = flex6 | epoch */ log.u_bbr.flex6 = ((nbytes >> 32) & 0x00000000ffffffff); log.u_bbr.epoch = (nbytes & 0x00000000ffffffff); /* cspr = lt_epoch | pkts_out */ log.u_bbr.lt_epoch = ((http->cspr >> 32) & 0x00000000ffffffff); log.u_bbr.pkts_out |= (http->cspr & 0x00000000ffffffff); log.u_bbr.applimited = tp->t_http_closed; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_http_open; log.u_bbr.applimited <<= 8; log.u_bbr.applimited |= tp->t_http_req; log.u_bbr.timeStamp = tcp_get_usecs(&tv); TCP_LOG_EVENTP(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_HTTP_T, 0, 0, &log, false, &tv); } } void tcp_http_free_a_slot(struct tcpcb *tp, struct http_sendfile_track *ent) { if (tp->t_http_req > 0) tp->t_http_req--; if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { if (tp->t_http_open > 0) tp->t_http_open--; } else { if (tp->t_http_closed > 0) tp->t_http_closed--; } ent->flags = TCP_HTTP_TRACK_FLG_EMPTY; } static void tcp_http_check_for_stale_entries(struct tcpcb *tp, uint64_t ts, int rm_oldest) { struct http_sendfile_track *ent; uint64_t time_delta, oldest_delta; int i, oldest, oldest_set = 0, cnt_rm = 0; for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { ent = &tp->t_http_info[i]; if (ent->flags != TCP_HTTP_TRACK_FLG_USED) { /* * We only care about closed end ranges * that are allocated and have no sendfile * ever touching them. They would be in * state USED. */ continue; } if (ts >= ent->localtime) time_delta = ts - ent->localtime; else time_delta = 0; if (time_delta && ((oldest_delta < time_delta) || (oldest_set == 0))) { oldest_set = 1; oldest = i; oldest_delta = time_delta; } if (tcp_stale_entry_time && (time_delta >= tcp_stale_entry_time)) { /* * No sendfile in a our time-limit * time to purge it. 
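	 * The limit is tcp_stale_entry_time (the net.inet.tcp.usrlog_stale
	 * sysctl declared above; the variable is initialized to 250000),
	 * and setting it to zero disables this purge entirely.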
*/ cnt_rm++; tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE, time_delta, 0); tcp_http_free_a_slot(tp, ent); } } if ((cnt_rm == 0) && rm_oldest && oldest_set) { ent = &tp->t_http_info[oldest]; tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_STALE, oldest_delta, 1); tcp_http_free_a_slot(tp, ent); } } int tcp_http_check_for_comp(struct tcpcb *tp, tcp_seq ack_point) { int i, ret=0; struct http_sendfile_track *ent; /* Clean up any old closed end requests that are now completed */ if (tp->t_http_req == 0) return(0); if (tp->t_http_closed == 0) return(0); for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { ent = &tp->t_http_info[i]; /* Skip empty ones */ if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) continue; /* Skip open ones */ if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) continue; if (SEQ_GEQ(ack_point, ent->end_seq)) { /* We are past it -- free it */ tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_FREED, 0, 0); tcp_http_free_a_slot(tp, ent); ret++; } } return (ret); } int tcp_http_is_entry_comp(struct tcpcb *tp, struct http_sendfile_track *ent, tcp_seq ack_point) { if (tp->t_http_req == 0) return(-1); if (tp->t_http_closed == 0) return(-1); if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) return(-1); if (SEQ_GEQ(ack_point, ent->end_seq)) { return (1); } return (0); } struct http_sendfile_track * tcp_http_find_a_req_that_is_completed_by(struct tcpcb *tp, tcp_seq th_ack, int *ip) { /* * Given an ack point (th_ack) walk through our entries and * return the first one found that th_ack goes past the * end_seq. */ struct http_sendfile_track *ent; int i; if (tp->t_http_req == 0) { /* none open */ return (NULL); } for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { ent = &tp->t_http_info[i]; if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) continue; if ((ent->flags & TCP_HTTP_TRACK_FLG_OPEN) == 0) { if (SEQ_GEQ(th_ack, ent->end_seq)) { *ip = i; return (ent); } } } return (NULL); } struct http_sendfile_track * tcp_http_find_req_for_seq(struct tcpcb *tp, tcp_seq seq) { struct http_sendfile_track *ent; int i; if (tp->t_http_req == 0) { /* none open */ return (NULL); } for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { ent = &tp->t_http_info[i]; tcp_http_log_req_info(tp, ent, i, TCP_HTTP_REQ_LOG_SEARCH, (uint64_t)seq, 0); if (ent->flags == TCP_HTTP_TRACK_FLG_EMPTY) { continue; } if (ent->flags & TCP_HTTP_TRACK_FLG_OPEN) { /* * An open end request only needs to * match the beginning seq or be * all we have (once we keep going on * a open end request we may have a seq * wrap). */ if ((SEQ_GEQ(seq, ent->start_seq)) || (tp->t_http_closed == 0)) return (ent); } else { /* * For this one we need to * be a bit more careful if its * completed at least. */ if ((SEQ_GEQ(seq, ent->start_seq)) && (SEQ_LT(seq, ent->end_seq))) { return (ent); } } } return (NULL); } /* Should this be in its own file tcp_http.c ? 
*/ struct http_sendfile_track * tcp_http_alloc_req_full(struct tcpcb *tp, struct http_req *req, uint64_t ts, int rec_dups) { struct http_sendfile_track *fil; int i, allocated; /* In case the stack does not check for completions do so now */ tcp_http_check_for_comp(tp, tp->snd_una); /* Check for stale entries */ if (tp->t_http_req) tcp_http_check_for_stale_entries(tp, ts, (tp->t_http_req >= MAX_TCP_HTTP_REQ)); /* Check to see if this is a duplicate of one not started */ if (tp->t_http_req) { for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) { fil = &tp->t_http_info[i]; if (fil->flags != TCP_HTTP_TRACK_FLG_USED) continue; if ((fil->timestamp == req->timestamp) && (fil->start == req->start) && ((fil->flags & TCP_HTTP_TRACK_FLG_OPEN) || (fil->end == req->end))) { /* * We already have this request * and it has not been started with sendfile. * This probably means the user was returned * a 4xx of some sort and its going to age * out, lets not duplicate it. */ return(fil); } } } /* Ok if there is no room at the inn we are in trouble */ if (tp->t_http_req >= MAX_TCP_HTTP_REQ) { tcp_trace_point(tp, TCP_TP_HTTP_LOG_FAIL); for(i = 0; i < MAX_TCP_HTTP_REQ; i++) { tcp_http_log_req_info(tp, &tp->t_http_info[i], i, TCP_HTTP_REQ_LOG_ALLOCFAIL, 0, 0); } return (NULL); } for(i = 0, allocated = 0; i < MAX_TCP_HTTP_REQ; i++) { fil = &tp->t_http_info[i]; if (fil->flags == TCP_HTTP_TRACK_FLG_EMPTY) { allocated = 1; fil->flags = TCP_HTTP_TRACK_FLG_USED; fil->timestamp = req->timestamp; fil->localtime = ts; fil->start = req->start; if (req->flags & TCP_LOG_HTTPD_RANGE_END) { fil->end = req->end; } else { fil->end = 0; fil->flags |= TCP_HTTP_TRACK_FLG_OPEN; } /* * We can set the min boundaries to the TCP Sequence space, * but it might be found to be further up when sendfile * actually runs on this range (if it ever does). */ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; fil->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); if (tptosocket(tp)->so_snd.sb_tls_info) { /* * This session is doing TLS. Take a swag guess * at the overhead. */ fil->end_seq += tcp_estimate_tls_overhead( tptosocket(tp), (fil->end - fil->start)); } tp->t_http_req++; if (fil->flags & TCP_HTTP_TRACK_FLG_OPEN) tp->t_http_open++; else tp->t_http_closed++; tcp_http_log_req_info(tp, fil, i, TCP_HTTP_REQ_LOG_NEW, 0, 0); break; } else fil = NULL; } return (fil); } void tcp_http_alloc_req(struct tcpcb *tp, union tcp_log_userdata *user, uint64_t ts) { (void)tcp_http_alloc_req_full(tp, &user->http_req, ts, 1); } #endif void tcp_log_socket_option(struct tcpcb *tp, uint32_t option_num, uint32_t option_val, int err) { if (tcp_bblogging_on(tp)) { struct tcp_log_buffer *l; l = tcp_log_event(tp, NULL, &tptosocket(tp)->so_rcv, &tptosocket(tp)->so_snd, TCP_LOG_SOCKET_OPT, err, 0, NULL, 1, NULL, NULL, 0, NULL); if (l) { l->tlb_flex1 = option_num; l->tlb_flex2 = option_val; } } }
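/*
 * Usage sketch (not part of the kernel build): the net.inet.tcp.drop
 * handler above expects a set-only write of two struct sockaddr_storage
 * entries, addrs[0] holding the foreign endpoint and addrs[1] the local
 * one.  A minimal userland caller in the spirit of tcpdrop(8) could look
 * like the following; the helper name drop_tcp_connection() is
 * hypothetical, and only IPv4 endpoints with minimal error handling are
 * shown.
 *
 *	#include <sys/types.h>
 *	#include <sys/socket.h>
 *	#include <sys/sysctl.h>
 *	#include <netinet/in.h>
 *	#include <arpa/inet.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static int
 *	drop_tcp_connection(const char *faddr, uint16_t fport,
 *	    const char *laddr, uint16_t lport)
 *	{
 *		struct sockaddr_storage addrs[2];
 *		struct sockaddr_in *fin = (struct sockaddr_in *)&addrs[0];
 *		struct sockaddr_in *lin = (struct sockaddr_in *)&addrs[1];
 *
 *		memset(addrs, 0, sizeof(addrs));
 *		fin->sin_family = lin->sin_family = AF_INET;
 *		fin->sin_len = lin->sin_len = sizeof(struct sockaddr_in);
 *		fin->sin_port = htons(fport);
 *		lin->sin_port = htons(lport);
 *		if (inet_pton(AF_INET, faddr, &fin->sin_addr) != 1 ||
 *		    inet_pton(AF_INET, laddr, &lin->sin_addr) != 1)
 *			return (-1);
 *		return (sysctlbyname("net.inet.tcp.drop", NULL, NULL,
 *		    addrs, sizeof(addrs)));
 *	}
 */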