Index: head/sys/netinet/tcp_log_buf.c =================================================================== --- head/sys/netinet/tcp_log_buf.c (revision 356413) +++ head/sys/netinet/tcp_log_buf.c (revision 356414) @@ -1,2438 +1,2639 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include #include #include #include #include #include #include -#include +#include /* Must come after qmath.h and tree.h */ #include #include #include #include #include #include #include #include #include #include /* Default expiry time */ #define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S) /* Max interval at which to run the expiry timer */ #define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S) bool tcp_log_verbose; static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone; static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT; static uint32_t tcp_log_version = TCP_LOG_BUF_VER; RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket); static struct tcp_log_id_tree tcp_log_id_head; static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head = STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head); static struct mtx tcp_log_expireq_mtx; static struct callout tcp_log_expireq_callout; static u_long tcp_log_auto_ratio = 0; static volatile u_long tcp_log_auto_ratio_cur = 0; static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL; static bool tcp_log_auto_all = false; +static uint32_t tcp_disable_all_bb_logs = 0; RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose, 0, "Force verbose logging for TCP traces"); SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, CTLFLAG_RW, &tcp_log_session_limit, 0, "Maximum number of events maintained for each TCP session"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW, &tcp_log_zone, "Maximum number of events maintained for all TCP sessions"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD, &tcp_log_zone, "Current number of events maintained for all TCP sessions"); 
SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW, &tcp_log_bucket_zone, "Maximum number of log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD, &tcp_log_bucket_zone, "Current number of log IDs"); SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW, &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs"); SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD, &tcp_log_node_zone, "Current number of tcpcbs with log IDs"); SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version, 0, "Version of log formats exported"); +SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, disable_all, CTLFLAG_RW, + &tcp_disable_all_bb_logs, TCP_LOG_STATE_HEAD_AUTO, + "Disable all BB logging for all connections"); + SYSCTL_ULONG(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW, &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions"); /* tcp_log_auto_mode is statically initialized to TCP_LOG_STATE_TAIL, so that is the default the description advertises. NOTE(review): the integer argument after the pointer is presumably unused when a pointer is supplied -- confirm against sysctl(9). */ SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW, &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO, "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)"); SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW, &tcp_log_auto_all, false, "Auto-select from all sessions (rather than just those with IDs)"); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_t tcp_log_queued; counter_u64_t tcp_log_que_fail1; counter_u64_t tcp_log_que_fail2; counter_u64_t tcp_log_que_fail3; counter_u64_t tcp_log_que_fail4; counter_u64_t tcp_log_que_fail5; counter_u64_t tcp_log_que_copyout; counter_u64_t tcp_log_que_read; counter_u64_t tcp_log_que_freed; SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD, &tcp_log_queued, "Number of entries queued"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD, &tcp_log_que_fail1, "Number of entries queued but fail 1"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD, &tcp_log_que_fail2, "Number of entries queued but fail 2"); 
SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD, &tcp_log_que_fail3, "Number of entries queued but fail 3"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD, &tcp_log_que_fail4, "Number of entries queued but fail 4"); /* Each failure counter carries its own number in the description so the sysctl output is unambiguous. */ SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD, &tcp_log_que_fail5, "Number of entries queued but fail 5"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD, &tcp_log_que_copyout, "Number of entries copied out"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD, &tcp_log_que_read, "Number of entries read from the queue"); SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD, &tcp_log_que_freed, "Number of entries freed after reading"); #endif #ifdef INVARIANTS #define TCPLOG_DEBUG_RINGBUF #endif +/* Number of requests to consider a PCB ID "active". */ +#define ACTIVE_REQUEST_COUNT 10 +/* Statistic tracking for "active" PCB IDs. */ +static counter_u64_t tcp_log_pcb_ids_cur; +static counter_u64_t tcp_log_pcb_ids_tot; + +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_cur, CTLFLAG_RD, + &tcp_log_pcb_ids_cur, "Number of pcb IDs allocated in the system"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, pcb_ids_tot, CTLFLAG_RD, + &tcp_log_pcb_ids_tot, "Total number of pcb IDs that have been allocated"); + struct tcp_log_mem { STAILQ_ENTRY(tcp_log_mem) tlm_queue; struct tcp_log_buffer tlm_buf; struct tcp_log_verbose tlm_v; #ifdef TCPLOG_DEBUG_RINGBUF volatile int tlm_refcnt; #endif }; /* 60 bytes for the header, + 16 bytes for padding */ static uint8_t zerobuf[76]; /* * Lock order: * 1. TCPID_TREE * 2. TCPID_BUCKET * 3. INP * * Rules: * A. You need a lock on the Tree to add/remove buckets. * B. You need a lock on the bucket to add/remove nodes from the bucket. * C. To change information in a node, you need the INP lock if the tln_closed * field is false. Otherwise, you need the bucket lock. 
(Note that the * tln_closed field can change at any point, so you need to recheck the * entry after acquiring the INP lock.) * D. To remove a node from the bucket, you must have that entry locked, * according to the criteria of Rule C. Also, the node must not be on * the expiry queue. * E. The exception to C is the expiry queue fields, which are locked by * the TCPLOG_EXPIREQ lock. * * Buckets have a reference count. Each node is a reference. Further, * other callers may add reference counts to keep a bucket from disappearing. * You can add a reference as long as you own a lock sufficient to keep the * bucket from disappearing. For example, a common use is: * a. Have a locked INP, but need to lock the TCPID_BUCKET. * b. Add a refcount on the bucket. (Safe because the INP lock prevents * the TCPID_BUCKET from going away.) * c. Drop the INP lock. * d. Acquire a lock on the TCPID_BUCKET. * e. Acquire a lock on the INP. * f. Drop the refcount on the bucket. * (At this point, the bucket may disappear.) * * Expire queue lock: * You can acquire this with either the bucket or INP lock. Don't reverse it. * When the expire code has committed to freeing a node, it resets the expiry * time to SBT_MAX. That is the signal to everyone else that they should * leave that node alone. 
*/ static struct rwlock tcp_id_tree_lock; #define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock) #define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock) #define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock) #define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock) #define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock) #define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED) #define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED) #define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED) #define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF) #define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx)) #define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED) #define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED) #define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt)) #define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt)) #define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx) #define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx) SLIST_HEAD(tcp_log_id_head, tcp_log_id_node); struct tcp_log_id_bucket { /* * tlb_id must be first. This lets us use strcmp on * (struct tcp_log_id_bucket *) and (char *) interchangeably. */ char tlb_id[TCP_LOG_ID_LEN]; + char tlb_tag[TCP_LOG_TAG_LEN]; RB_ENTRY(tcp_log_id_bucket) tlb_rb; struct tcp_log_id_head tlb_head; struct mtx tlb_mtx; volatile u_int tlb_refcnt; + volatile u_int tlb_reqcnt; + uint32_t tlb_loglimit; + uint8_t tlb_logstate; }; struct tcp_log_id_node { SLIST_ENTRY(tcp_log_id_node) tln_list; STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */ sbintime_t tln_expiretime; /* Locked by the expireq lock */ /* * If INP is NULL, that means the connection has closed. 
We've * saved the connection endpoint information and the log entries * in the tln_ie and tln_entries members. We've also saved a pointer * to the enclosing bucket here. If INP is not NULL, the information is * in the PCB and not here. */ struct inpcb *tln_inp; struct tcpcb *tln_tp; struct tcp_log_id_bucket *tln_bucket; struct in_endpoints tln_ie; struct tcp_log_stailq tln_entries; int tln_count; volatile int tln_closed; uint8_t tln_af; }; enum tree_lock_state { TREE_UNLOCKED = 0, TREE_RLOCKED, TREE_WLOCKED, }; /* Do we want to select this session for auto-logging? */ static __inline bool tcp_log_selectauto(void) { /* * If we are doing auto-capturing, figure out whether we will capture * this session. */ if (tcp_log_auto_ratio && + (tcp_disable_all_bb_logs == 0) && (atomic_fetchadd_long(&tcp_log_auto_ratio_cur, 1) % tcp_log_auto_ratio) == 0) return (true); return (false); } static __inline int tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b) { KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL")); KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL")); return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN); } RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) static __inline void tcp_log_id_validate_tree_lock(int tree_locked) { #ifdef INVARIANTS switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WLOCK_ASSERT(); break; case TREE_RLOCKED: TCPID_TREE_RLOCK_ASSERT(); break; case TREE_UNLOCKED: TCPID_TREE_UNLOCK_ASSERT(); break; default: kassert_panic("%s:%d: unknown tree lock state", __func__, __LINE__); } #endif } static __inline void tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb) { TCPID_TREE_WLOCK_ASSERT(); KASSERT(SLIST_EMPTY(&tlb->tlb_head), ("%s: Attempt to remove non-empty bucket", __func__)); if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) { #ifdef INVARIANTS kassert_panic("%s:%d: error removing element from tree", __func__, __LINE__); #endif } 
TCPID_BUCKET_LOCK_DESTROY(tlb); + counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); uma_zfree(tcp_log_bucket_zone, tlb); } /* * Call with a referenced and locked bucket. * Will return true if the bucket was freed; otherwise, false. * tlb: The bucket to unreference. * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * inp: If not NULL and the function needs to drop the inp lock to relock the * tree, it will do so. (The caller must ensure inp will not become invalid, * probably by holding a reference to it.) */ static bool tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked, struct inpcb *inp) { KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); tcp_log_id_validate_tree_lock(*tree_locked); /* * Did we hold the last reference on the tlb? If so, we may need * to free it. (Note that we can realistically only execute the * loop twice: once without a write lock and once with a write * lock.) */ while (TCPID_BUCKET_UNREF(tlb)) { /* * We need a write lock on the tree to free this. * If we can upgrade the tree lock, this is "easy". If we * can't upgrade the tree lock, we need to do this the * "hard" way: unwind all our locks and relock everything. * In the meantime, anything could have changed. We even * need to validate that we still need to free the bucket. */ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE()) *tree_locked = TREE_WLOCKED; else if (*tree_locked != TREE_WLOCKED) { TCPID_BUCKET_REF(tlb); if (inp != NULL) INP_WUNLOCK(inp); TCPID_BUCKET_UNLOCK(tlb); if (*tree_locked == TREE_RLOCKED) TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); *tree_locked = TREE_WLOCKED; TCPID_BUCKET_LOCK(tlb); if (inp != NULL) INP_WLOCK(inp); continue; } /* * We have an empty bucket and a write lock on the tree. * Remove the empty bucket. 
*/ tcp_log_remove_bucket(tlb); return (true); } return (false); } /* * Call with a locked bucket. This function will release the lock on the * bucket before returning. * * The caller is responsible for freeing the tp->t_lin/tln node! * * Note: one of tp or both tlb and tln must be supplied. * * inp: A pointer to the inp. If the function needs to drop the inp lock to * acquire the tree write lock, it will do so. (The caller must ensure inp * will not become invalid, probably by holding a reference to it.) * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored) * tlb: A pointer to the bucket. (optional; ignored if tp is specified) * tln: A pointer to the node. (optional; ignored if tp is specified) * tree_locked: A pointer to the state of the tree lock. If the tree lock * state changes, the function will update it. * * Returns true if the tree lock state changed while the bucket was being * unreferenced; otherwise, false. The INP lock (when inp is supplied) is * only dropped and reacquired on a path that also changes the tree lock * state, so a caller holding the INP lock must revalidate the INP after a * true return. Note that a successful read-to-write upgrade of the tree * lock also yields true even though the INP lock was never dropped. */ static bool tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp, struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln, int *tree_locked) { int orig_tree_locked; KASSERT(tp != NULL || (tlb != NULL && tln != NULL), ("%s: called with tp=%p, tlb=%p, tln=%p", __func__, tp, tlb, tln)); KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", __func__)); if (tp != NULL) { tlb = tp->t_lib; tln = tp->t_lin; KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__)); KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__)); } tcp_log_id_validate_tree_lock(*tree_locked); TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Remove the node, clear the log bucket and node from the TCPCB, and * decrement the bucket refcount. In the process, if this is the * last reference, the bucket will be freed. 
*/ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list); if (tp != NULL) { tp->t_lib = NULL; tp->t_lin = NULL; } orig_tree_locked = *tree_locked; if (!tcp_log_unref_bucket(tlb, tree_locked, inp)) TCPID_BUCKET_UNLOCK(tlb); return (*tree_locked != orig_tree_locked); } /* * Revalidate the INP after its lock has been reacquired. If the INP is in * TIMEWAIT or has been dropped, set rv to ECONNRESET, run 'cleanup' and jump * to the 'done' label; otherwise reload tp from the INP. The expanding * function must provide the locals inp, tp and rv and a 'done' label. */ #define RECHECK_INP_CLEAN(cleanup) do { \ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ rv = ECONNRESET; \ cleanup; \ goto done; \ } \ tp = intotcpcb(inp); \ } while (0) /* RECHECK_INP_CLEAN() with no extra cleanup step. */ #define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */) /* * Hook run when a TCPCB is attached to a log ID bucket. Requires the INP * write lock. When built with STATS, V_tcp_perconn_stats_enable is 2 and * the connection has no t_stats yet, hands the log ID to * tcp_stats_sample_rollthedice(); otherwise does nothing. */ static void tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); #ifdef STATS if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); #endif } /* Atomically add one to the bucket's request counter (tlb_reqcnt). */ +static void +tcp_log_increment_reqcnt(struct tcp_log_id_bucket *tlb) +{ + + atomic_fetchadd_int(&tlb->tlb_reqcnt, 1); +} + /* + * Associate the specified tag with a particular TCP log ID. + * Called with INPCB locked. Returns with it unlocked. + * A temporary bucket reference is held while the INPCB lock is dropped and + * the bucket lock is taken. The tag is copied with strlcpy() bounded by + * TCP_LOG_TAG_LEN. + * Returns 0 on success or EOPNOTSUPP if the connection has no TCP log ID. + */ +int +tcp_log_set_tag(struct tcpcb *tp, char *tag) +{ + struct tcp_log_id_bucket *tlb; + int tree_locked; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tree_locked = TREE_UNLOCKED; + tlb = tp->t_lib; + if (tlb == NULL) { + INP_WUNLOCK(tp->t_inpcb); + return (EOPNOTSUPP); + } + + TCPID_BUCKET_REF(tlb); + INP_WUNLOCK(tp->t_inpcb); + TCPID_BUCKET_LOCK(tlb); + strlcpy(tlb->tlb_tag, tag, TCP_LOG_TAG_LEN); + if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) + TCPID_BUCKET_UNLOCK(tlb); + + if (tree_locked == TREE_WLOCKED) { + TCPID_TREE_WLOCK_ASSERT(); + TCPID_TREE_WUNLOCK(); + } else if (tree_locked == TREE_RLOCKED) { + TCPID_TREE_RLOCK_ASSERT(); + TCPID_TREE_RUNLOCK(); + } else + TCPID_TREE_UNLOCK_ASSERT(); + + return (0); +} + +/* * Set the TCP log ID for a TCPCB. * Called with INPCB locked. Returns with it unlocked. 
*/ int tcp_log_set_id(struct tcpcb *tp, char *id) { struct tcp_log_id_bucket *tlb, *tmp_tlb; struct tcp_log_id_node *tln; struct inpcb *inp; int tree_locked, rv; bool bucket_locked; tlb = NULL; tln = NULL; inp = tp->t_inpcb; tree_locked = TREE_UNLOCKED; bucket_locked = false; restart: INP_WLOCK_ASSERT(inp); /* See if the ID is unchanged. */ if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) || (tp->t_lib == NULL && *id == 0)) { + if (tp->t_lib != NULL) { + tcp_log_increment_reqcnt(tp->t_lib); + if ((tp->t_lib->tlb_logstate) && + (tp->t_log_state_set == 0)) { + /* Clone in any logging */ + + tp->t_logstate = tp->t_lib->tlb_logstate; + } + if ((tp->t_lib->tlb_loglimit) && + (tp->t_log_state_set == 0)) { + /* We also have a limit set */ + + tp->t_loglimit = tp->t_lib->tlb_loglimit; + } + } rv = 0; goto done; } /* * If the TCPCB had a previous ID, we need to extricate it from * the previous list. * * Drop the TCPCB lock and lock the tree and the bucket. * Because this is called in the socket context, we (theoretically) * don't need to worry about the INPCB completely going away * while we are gone. */ if (tp->t_lib != NULL) { tlb = tp->t_lib; TCPID_BUCKET_REF(tlb); INP_WUNLOCK(inp); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } TCPID_BUCKET_LOCK(tlb); bucket_locked = true; INP_WLOCK(inp); /* * Unreference the bucket. If our bucket went away, it is no * longer locked or valid. */ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) { bucket_locked = false; tlb = NULL; } /* Validate the INP. */ RECHECK_INP(); /* * Evaluate whether the bucket changed while we were unlocked. * * Possible scenarios here: * 1. Bucket is unchanged and the same one we started with. * 2. The TCPCB no longer has a bucket and our bucket was * freed. * 3. The TCPCB has a new bucket, whether ours was freed. * 4. The TCPCB no longer has a bucket and our bucket was * not freed. * * In cases 2-4, we will start over. 
In case 1, we will * proceed here to remove the bucket. */ if (tlb == NULL || tp->t_lib != tlb) { KASSERT(bucket_locked || tlb == NULL, ("%s: bucket_locked (%d) and tlb (%p) are " "inconsistent", __func__, bucket_locked, tlb)); if (bucket_locked) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; } goto restart; } /* * Store the (struct tcp_log_id_node) for reuse. Then, remove * it from the bucket. In the process, we may end up relocking. * If so, we need to validate that the INP is still valid, and * the TCPCB entries match what we expect. * * We will clear tlb and change the bucket_locked state just * before calling tcp_log_remove_id_node(), since that function * will unlock the bucket. */ if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); tln = tp->t_lin; tlb = NULL; bucket_locked = false; if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) { RECHECK_INP(); /* * If the TCPCB moved to a new bucket while we had * dropped the lock, restart. */ if (tp->t_lib != NULL || tp->t_lin != NULL) goto restart; } /* * Yay! We successfully removed the TCPCB from its old * bucket. Phew! * * On to bigger and better things... */ } /* At this point, the TCPCB should not be in any bucket. */ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__)); /* * If the new ID is not empty, we need to now assign this TCPCB to a * new bucket. */ if (*id) { /* Get a new tln, if we don't already have one to reuse. */ if (tln == NULL) { tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO); if (tln == NULL) { rv = ENOBUFS; goto done; } tln->tln_inp = inp; tln->tln_tp = tp; } /* * Drop the INP lock for a bit. We don't need it, and dropping * it prevents lock order reversals. */ INP_WUNLOCK(inp); /* Make sure we have at least a read lock on the tree. 
*/ tcp_log_id_validate_tree_lock(tree_locked); if (tree_locked == TREE_UNLOCKED) { TCPID_TREE_RLOCK(); tree_locked = TREE_RLOCKED; } refind: /* * Remember that we constructed (struct tcp_log_id_bucket) with * tlb_id as its first member, so we can safely cast the id to it * for the purposes of finding. */ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL", __func__, __LINE__)); tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head, (struct tcp_log_id_bucket *) id); /* * If we didn't find a matching bucket, we need to add a new * one. This requires a write lock. But, of course, we will * need to recheck some things when we re-acquire the lock. */ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) { tree_locked = TREE_WLOCKED; if (!TCPID_TREE_UPGRADE()) { TCPID_TREE_RUNLOCK(); TCPID_TREE_WLOCK(); /* * The tree may have changed while we were * unlocked. */ goto refind; } } /* If we need to add a new bucket, do it now. */ if (tmp_tlb == NULL) { /* Allocate new bucket. */ tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT); if (tlb == NULL) { rv = ENOBUFS; goto done_noinp; } + counter_u64_add(tcp_log_pcb_ids_cur, 1); + counter_u64_add(tcp_log_pcb_ids_tot, 1); + if ((tcp_log_auto_all == false) && + tcp_log_auto_mode && + tcp_log_selectauto()) { + /* Save off the log state */ + tlb->tlb_logstate = tcp_log_auto_mode; + } else + tlb->tlb_logstate = TCP_LOG_STATE_OFF; + tlb->tlb_loglimit = 0; + tlb->tlb_tag[0] = '\0'; /* Default to an empty tag. */ + /* * Copy the ID to the bucket. * NB: Don't use strlcpy() unless you are sure * we've always validated NULL termination. * * TODO: When I'm done writing this, see if we * have correctly validated NULL termination and * can use strlcpy(). :-) */ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1); tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0'; /* * Take the refcount for the first node and go ahead * and lock this. Note that we zero the tlb_mtx * structure, since 0xdeadc0de flips the right bits * for the code to think that this mutex has already * been initialized. 
:-( */ SLIST_INIT(&tlb->tlb_head); refcount_init(&tlb->tlb_refcnt, 1); + tlb->tlb_reqcnt = 1; memset(&tlb->tlb_mtx, 0, sizeof(struct mtx)); TCPID_BUCKET_LOCK_INIT(tlb); TCPID_BUCKET_LOCK(tlb); bucket_locked = true; #define FREE_NEW_TLB() do { \ TCPID_BUCKET_LOCK_DESTROY(tlb); \ uma_zfree(tcp_log_bucket_zone, tlb); \ + counter_u64_add(tcp_log_pcb_ids_cur, (int64_t)-1); \ + counter_u64_add(tcp_log_pcb_ids_tot, (int64_t)-1); \ bucket_locked = false; \ tlb = NULL; \ } while (0) /* * Relock the INP and make sure we are still * unassigned. */ INP_WLOCK(inp); RECHECK_INP_CLEAN(FREE_NEW_TLB()); if (tp->t_lib != NULL) { FREE_NEW_TLB(); goto restart; } /* Add the new bucket to the tree. */ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head, tlb); KASSERT(tmp_tlb == NULL, ("%s: Unexpected conflicting bucket (%p) while " "adding new bucket (%p)", __func__, tmp_tlb, tlb)); /* * If we found a conflicting bucket, free the new * one we made and fall through to use the existing * bucket. */ if (tmp_tlb != NULL) { FREE_NEW_TLB(); INP_WUNLOCK(inp); } #undef FREE_NEW_TLB } /* If we found an existing bucket, use it. */ if (tmp_tlb != NULL) { tlb = tmp_tlb; TCPID_BUCKET_LOCK(tlb); bucket_locked = true; /* * Relock the INP and make sure we are still * unassigned. */ INP_UNLOCK_ASSERT(inp); INP_WLOCK(inp); RECHECK_INP(); if (tp->t_lib != NULL) { TCPID_BUCKET_UNLOCK(tlb); bucket_locked = false; tlb = NULL; goto restart; } /* Take a reference on the bucket. */ TCPID_BUCKET_REF(tlb); + + /* Record the request. */ + tcp_log_increment_reqcnt(tlb); } tcp_log_grow_tlb(tlb->tlb_id, tp); /* Add the new node to the list. */ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list); tp->t_lib = tlb; tp->t_lin = tln; + if (tp->t_lib->tlb_logstate) { + /* Clone in any logging */ + + tp->t_logstate = tp->t_lib->tlb_logstate; + } + if (tp->t_lib->tlb_loglimit) { + /* The loglimit too */ + + tp->t_loglimit = tp->t_lib->tlb_loglimit; + } tln = NULL; } rv = 0; done: /* Unlock things, as needed, and return. 
*/ INP_WUNLOCK(inp); done_noinp: INP_UNLOCK_ASSERT(inp); if (bucket_locked) { TCPID_BUCKET_LOCK_ASSERT(tlb); TCPID_BUCKET_UNLOCK(tlb); } else if (tlb != NULL) TCPID_BUCKET_UNLOCK_ASSERT(tlb); if (tree_locked == TREE_WLOCKED) { TCPID_TREE_WLOCK_ASSERT(); TCPID_TREE_WUNLOCK(); } else if (tree_locked == TREE_RLOCKED) { TCPID_TREE_RLOCK_ASSERT(); TCPID_TREE_RUNLOCK(); } else TCPID_TREE_UNLOCK_ASSERT(); if (tln != NULL) uma_zfree(tcp_log_node_zone, tln); return (rv); } /* * Get the TCP log ID for a TCPCB. * Called with INPCB locked. * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long. * Returns number of bytes copied. */ size_t tcp_log_get_id(struct tcpcb *tp, char *buf) { size_t len; INP_LOCK_ASSERT(tp->t_inpcb); if (tp->t_lib != NULL) { len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); KASSERT(len < TCP_LOG_ID_LEN, ("%s:%d: tp->t_lib->tlb_id too long (%zu)", __func__, __LINE__, len)); } else { *buf = '\0'; len = 0; } return (len); } /* + * Get the tag associated with the TCPCB's log ID. + * Called with INPCB locked. Returns with it unlocked. + * 'buf' must point to a buffer that is at least TCP_LOG_TAG_LEN bytes long. + * Returns number of bytes copied. 
+ */ +size_t +tcp_log_get_tag(struct tcpcb *tp, char *buf) +{ + struct tcp_log_id_bucket *tlb; + size_t len; + int tree_locked; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tree_locked = TREE_UNLOCKED; + tlb = tp->t_lib; + + if (tlb != NULL) { + TCPID_BUCKET_REF(tlb); + INP_WUNLOCK(tp->t_inpcb); + TCPID_BUCKET_LOCK(tlb); + len = strlcpy(buf, tlb->tlb_tag, TCP_LOG_TAG_LEN); + KASSERT(len < TCP_LOG_TAG_LEN, + ("%s:%d: tp->t_lib->tlb_tag too long (%zu)", + __func__, __LINE__, len)); + if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) + TCPID_BUCKET_UNLOCK(tlb); + + if (tree_locked == TREE_WLOCKED) { + TCPID_TREE_WLOCK_ASSERT(); + TCPID_TREE_WUNLOCK(); + } else if (tree_locked == TREE_RLOCKED) { + TCPID_TREE_RLOCK_ASSERT(); + TCPID_TREE_RUNLOCK(); + } else + TCPID_TREE_UNLOCK_ASSERT(); + } else { + INP_WUNLOCK(tp->t_inpcb); + *buf = '\0'; + len = 0; + } + + return (len); +} + +/* * Get number of connections with the same log ID. * Log ID is taken from given TCPCB. * Called with INPCB locked. */ u_int tcp_log_get_id_cnt(struct tcpcb *tp) { INP_WLOCK_ASSERT(tp->t_inpcb); return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Functions/macros to increment/decrement reference count for a log * entry. This should catch when we do a double-free/double-remove or * a double-add. 
*/ static inline void _tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1); if (refcnt != 0) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_add(l) \ _tcp_log_entry_refcnt_add((l), __func__, __LINE__) static inline void _tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func, int line) { int refcnt; refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1); if (refcnt != 1) panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)", func, line, log_entry, refcnt); } #define tcp_log_entry_refcnt_rem(l) \ _tcp_log_entry_refcnt_rem((l), __func__, __LINE__) #else /* !TCPLOG_DEBUG_RINGBUF */ #define tcp_log_entry_refcnt_add(l) #define tcp_log_entry_refcnt_rem(l) #endif /* * Cleanup after removing a log entry, but only decrement the count if we * are running INVARIANTS. */ static inline void tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused) { uma_zfree(tcp_log_zone, log_entry); #ifdef INVARIANTS (*count)--; KASSERT(*count >= 0, ("%s: count unexpectedly negative", __func__)); #endif } static void tcp_log_free_entries(struct tcp_log_stailq *head, int *count) { struct tcp_log_mem *log_entry; /* Free the entries. */ while ((log_entry = STAILQ_FIRST(head)) != NULL) { STAILQ_REMOVE_HEAD(head, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_free_log_common(log_entry, count); } } /* Cleanup after removing a log entry. */ static inline void tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry) { uma_zfree(tcp_log_zone, log_entry); tp->t_lognum--; KASSERT(tp->t_lognum >= 0, ("%s: tp->t_lognum unexpectedly negative", __func__)); } /* Remove a log entry from the head of a list. 
*/ static inline void tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry) { KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs), ("%s: attempt to remove non-HEAD log entry", __func__)); STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); tcp_log_entry_refcnt_rem(log_entry); tcp_log_remove_log_cleanup(tp, log_entry); } #ifdef TCPLOG_DEBUG_RINGBUF /* * Initialize the log entry's reference count, which we want to * survive allocations. */ static int tcp_log_zone_init(void *mem, int size, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; tlm->tlm_refcnt = 0; return (0); } /* * Double check that the refcnt is zero on allocation and return. */ static int tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); return (0); } static void tcp_log_zone_dtor(void *mem, int size, void *args __unused) { struct tcp_log_mem *tlm; KASSERT(size >= sizeof(struct tcp_log_mem), ("%s: unexpectedly short (%d) allocation", __func__, size)); tlm = (struct tcp_log_mem *)mem; if (tlm->tlm_refcnt != 0) panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", __func__, __LINE__, tlm, tlm->tlm_refcnt); } #endif /* TCPLOG_DEBUG_RINGBUF */ /* Do global initialization. 
*/
/*
 * Creates the three UMA zones (log entries, ID buckets, ID nodes), caps the
 * log-entry zone at TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT, allocates the
 * counters, and initializes the ID tree lock, the expire-queue mutex and
 * the expire callout.
 */
void
tcp_log_init(void)
{

	tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
#ifdef TCPLOG_DEBUG_RINGBUF
	    tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
#else
	    NULL, NULL, NULL,
#endif
	    NULL, UMA_ALIGN_PTR, 0);
	(void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
	tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket",
	    sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
	tcp_log_node_zone = uma_zcreate("tcp_log_node",
	    sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, 0);
#ifdef TCPLOG_DEBUG_COUNTERS
	tcp_log_queued = counter_u64_alloc(M_WAITOK);
	tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
	tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
	tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
	tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
	tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
	tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
	tcp_log_que_read = counter_u64_alloc(M_WAITOK);
	tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
#endif
+	tcp_log_pcb_ids_cur = counter_u64_alloc(M_WAITOK);
+	tcp_log_pcb_ids_tot = counter_u64_alloc(M_WAITOK);

	rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
	mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
	callout_init(&tcp_log_expireq_callout, 1);
}

/*
 * Do per-TCPCB initialization.
 *
 * Initializes the (empty) log queue, and, in the patched version, seeds the
 * per-connection entry limit from the global session limit.  The
 * connection is put into the automatic logging state (and flagged
 * TF2_LOG_AUTO) only when the auto-selection conditions below hold.
 */
void
tcp_log_tcpcbinit(struct tcpcb *tp)
{

	/* A new TCPCB should start out zero-initialized. */
	STAILQ_INIT(&tp->t_logs);

	/*
	 * If we are doing auto-capturing, figure out whether we will capture
	 * this session.
	 */
-	if (tcp_log_selectauto()) {
+	tp->t_loglimit = tcp_log_session_limit;
+	if ((tcp_log_auto_all == true) &&
+	    tcp_log_auto_mode &&
+	    tcp_log_selectauto()) {
		tp->t_logstate = tcp_log_auto_mode;
		tp->t_flags2 |= TF2_LOG_AUTO;
	}
}

/*
 * Remove entries.
 *
 * Callout handler for the expire queue.  Walks the queue from the head,
 * freeing every node whose expiry time is within one second from now, and
 * reschedules itself if nodes remain.  The expire-queue lock is dropped
 * while an individual node is torn down and retaken afterwards.
 */
static void
tcp_log_expire(void *unused __unused)
{
	struct tcp_log_id_bucket *tlb;
	struct tcp_log_id_node *tln;
	sbintime_t expiry_limit;
	int tree_locked;

	TCPLOG_EXPIREQ_LOCK();
	if (callout_pending(&tcp_log_expireq_callout)) {
		/* Callout was reset. */
		TCPLOG_EXPIREQ_UNLOCK();
		return;
	}

	/*
	 * Process entries until we reach one that expires too far in the
	 * future. Look one second in the future.
	 */
	expiry_limit = getsbinuptime() + SBT_1S;
	tree_locked = TREE_UNLOCKED;

	while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
	    tln->tln_expiretime <= expiry_limit) {
		if (!callout_active(&tcp_log_expireq_callout)) {
			/*
			 * Callout was stopped. I guess we should
			 * just quit at this point.
			 */
			TCPLOG_EXPIREQ_UNLOCK();
			return;
		}

		/*
		 * Remove the node from the head of the list and unlock
		 * the list. Change the expiry time to SBT_MAX as a signal
		 * to other threads that we now own this.
		 */
		STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
		tln->tln_expiretime = SBT_MAX;
		TCPLOG_EXPIREQ_UNLOCK();

		/*
		 * Remove the node from the bucket.
		 */
		tlb = tln->tln_bucket;
		TCPID_BUCKET_LOCK(tlb);
		if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
			/* Release whichever tree lock the helper left held. */
			tcp_log_id_validate_tree_lock(tree_locked);
			if (tree_locked == TREE_WLOCKED)
				TCPID_TREE_WUNLOCK();
			else
				TCPID_TREE_RUNLOCK();
			tree_locked = TREE_UNLOCKED;
		}

		/* Drop the INP reference. */
		INP_WLOCK(tln->tln_inp);
		if (!in_pcbrele_wlocked(tln->tln_inp))
			INP_WUNLOCK(tln->tln_inp);

		/* Free the log records. */
		tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);

		/* Free the node. */
		uma_zfree(tcp_log_node_zone, tln);

		/* Relock the expiry queue. */
		TCPLOG_EXPIREQ_LOCK();
	}

	/*
	 * We've expired all the entries we can. Do we need to reschedule
	 * ourselves?
	 */
	callout_deactivate(&tcp_log_expireq_callout);
	if (tln != NULL) {
		/*
		 * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
		 * set the next callout to that. (This helps ensure we generally
		 * run the callout no more often than desired.)
		 */
		expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
		if (expiry_limit < tln->tln_expiretime)
			expiry_limit = tln->tln_expiretime;
		callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
		    SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
	}

	/* We're done. */
	TCPLOG_EXPIREQ_UNLOCK();
	return;
}

/*
 * Move log data from the TCPCB to a new node. This will reset the TCPCB log
 * entries and log count; however, it will not touch other things from the
 * TCPCB (e.g. t_lin, t_lib).
 *
 * NOTE: Must hold a lock on the INP.
 */
static void
tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
{

	INP_WLOCK_ASSERT(tp->t_inpcb);

	/* Record the endpoint info and address family with the node. */
	tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie;
	if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6)
		tln->tln_af = AF_INET6;
	else
		tln->tln_af = AF_INET;
	/* Hand the queue head, count and bucket pointer over to the node. */
	tln->tln_entries = tp->t_logs;
	tln->tln_count = tp->t_lognum;
	tln->tln_bucket = tp->t_lib;

	/* Clear information from the PCB. */
	STAILQ_INIT(&tp->t_logs);
	tp->t_lognum = 0;
}

/*
 * Do per-TCPCB cleanup.
 *
 * Called with the INP write-locked.  Attempts a final dump for the
 * auto-dump states, then either parks the logs on the connection's ID node
 * (queued for expiry) or frees them, and finally turns logging off.
 */
void
tcp_log_tcpcbfini(struct tcpcb *tp)
{
	struct tcp_log_id_node *tln, *tln_first;
	struct tcp_log_mem *log_entry;
	sbintime_t callouttime;

	INP_WLOCK_ASSERT(tp->t_inpcb);

+	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_CONNEND, 0, 0, NULL, false);
+
	/*
	 * If we were gathering packets to be automatically dumped, try to do
	 * it now. If this succeeds, the log information in the TCPCB will be
	 * cleared. Otherwise, we'll handle the log information as we do
	 * for other states.
	 */
	switch(tp->t_logstate) {
	case TCP_LOG_STATE_HEAD_AUTO:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
		    M_NOWAIT, false);
		break;
	case TCP_LOG_STATE_TAIL_AUTO:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
		    M_NOWAIT, false);
		break;
	case TCP_LOG_STATE_CONTINUAL:
		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
		    M_NOWAIT, false);
		break;
	}

	/*
	 * There are two ways we could keep logs: per-socket or per-ID. If
	 * we are tracking logs with an ID, then the logs survive the
	 * destruction of the TCPCB.
	 *
	 * If the TCPCB is associated with an ID node, move the logs from the
	 * TCPCB to the ID node. In theory, this is safe, for reasons which I
	 * will now explain for my own benefit when I next need to figure out
	 * this code. :-)
	 *
	 * We own the INP lock. Therefore, no one else can change the contents
	 * of this node (Rule C). Further, no one can remove this node from
	 * the bucket while we hold the lock (Rule D). Basically, no one can
	 * mess with this node. That leaves two states in which we could be:
	 *
	 * 1. Another thread is currently waiting to acquire the INP lock, with
	 *    plans to do something with this node. When we drop the INP lock,
	 *    they will have a chance to do that. They will recheck the
	 *    tln_closed field (see note to Rule C) and then acquire the
	 *    bucket lock before proceeding further.
	 *
	 * 2. Another thread will try to acquire a lock at some point in the
	 *    future. If they try to acquire a lock before we set the
	 *    tln_closed field, they will follow state #1. If they try to
	 *    acquire a lock after we set the tln_closed field, they will be
	 *    able to make changes to the node, at will, following Rule C.
	 *
	 * Therefore, we currently own this node and can make any changes
	 * we want. But, as soon as we set the tln_closed field to true, we
	 * have effectively dropped our lock on the node. (For this reason, we
	 * also need to make sure our writes are ordered correctly. An atomic
	 * operation with "release" semantics should be sufficient.)
	 */

	if (tp->t_lin != NULL) {
		/* Copy the relevant information to the log entry. */
		tln = tp->t_lin;
		KASSERT(tln->tln_inp == tp->t_inpcb,
		    ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)",
		    __func__, tln->tln_inp, tp->t_inpcb));
		tcp_log_move_tp_to_node(tp, tln);

		/* Clear information from the PCB. */
		tp->t_lin = NULL;
		tp->t_lib = NULL;

		/*
		 * Take a reference on the INP. This ensures that the INP
		 * remains valid while the node is on the expiry queue. This
		 * ensures the INP is valid for other threads that may be
		 * racing to lock this node when we move it to the expire
		 * queue.
		 */
		in_pcbref(tp->t_inpcb);

		/*
		 * Store the entry on the expiry list. The exact behavior
		 * depends on whether we have entries to keep. If so, we
		 * put the entry at the tail of the list and expire in
		 * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
		 * the entry at the head of the list. (Handling the cleanup
		 * via the expiry timer lets us avoid locking messiness here.)
		 */
		tln->tln_expiretime = getsbinuptime();
		TCPLOG_EXPIREQ_LOCK();
		if (tln->tln_count) {
			tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
			if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
			    !callout_active(&tcp_log_expireq_callout)) {
				/*
				 * We are adding the first entry and a callout
				 * is not currently scheduled; therefore, we
				 * need to schedule one.
				 */
				callout_reset_sbt(&tcp_log_expireq_callout,
				    tln->tln_expiretime, SBT_1S, tcp_log_expire,
				    NULL, C_ABSOLUTE);
			}

			STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
			    tln_expireq);
		} else {
			callouttime = tln->tln_expiretime +
			    TCP_LOG_EXPIRE_INTVL;
			tln_first = STAILQ_FIRST(&tcp_log_expireq_head);

			if ((tln_first == NULL ||
			    callouttime < tln_first->tln_expiretime) &&
			    (callout_pending(&tcp_log_expireq_callout) ||
			    !callout_active(&tcp_log_expireq_callout))) {
				/*
				 * The list is empty, or we want to run the
				 * expire code before the first entry's timer
				 * fires. Also, we are in a case where a callout
				 * is not actively running. We want to reset
				 * the callout to occur sooner.
				 */
				callout_reset_sbt(&tcp_log_expireq_callout,
				    callouttime, SBT_1S, tcp_log_expire, NULL,
				    C_ABSOLUTE);
			}

			/*
			 * Insert to the head, or just after the head, as
			 * appropriate. (This might result in small
			 * mis-orderings as a bunch of "expire now" entries
			 * gather at the start of the list, but that should
			 * not produce big problems, since the expire timer
			 * will walk through all of them.)
			 */
			if (tln_first == NULL ||
			    tln->tln_expiretime < tln_first->tln_expiretime)
				STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
				    tln_expireq);
			else
				STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
				    tln_first, tln, tln_expireq);
		}
		TCPLOG_EXPIREQ_UNLOCK();

		/*
		 * We are done messing with the tln. After this point, we
		 * can't touch it. (Note that the "release" semantics should
		 * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
		 * Therefore, they should be unnecessary here. However, it
		 * seems like a good idea to include them anyway, since we
		 * really are releasing a lock here.)
		 */
		atomic_store_rel_int(&tln->tln_closed, 1);
	} else {
		/* Remove log entries. */
		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
			tcp_log_remove_log_head(tp, log_entry);
		KASSERT(tp->t_lognum == 0,
		    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
			__func__, tp->t_lognum));
	}

	/*
	 * Change the log state to off (just in case anything tries to sneak
	 * in a last-minute log).
	 */
	tp->t_logstate = TCP_LOG_STATE_OFF;
}

/*
 * Free every log entry queued on the TCPCB and switch its logging off.
 * Requires the INP write lock.  NOTE(review): when the queue is already
 * empty this returns before t_logstate is changed, so the state is left
 * as it was in that case.
 */
+static void
+tcp_log_purge_tp_logbuf(struct tcpcb *tp)
+{
+	struct tcp_log_mem *log_entry;
+	struct inpcb *inp;
+
+	inp = tp->t_inpcb;
+	INP_WLOCK_ASSERT(inp);
+	if (tp->t_lognum == 0)
+		return;
+
+	while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
+		tcp_log_remove_log_head(tp, log_entry);
+	KASSERT(tp->t_lognum == 0,
+	    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
+		__func__, tp->t_lognum));
+	tp->t_logstate = TCP_LOG_STATE_OFF;
+}
+
/*
 * This logs an event for a TCP socket. Normally, this is called via
 * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE.
See the documentation for
 * TCP_LOG_EVENT().
 *
 * Returns a pointer to the buffer of the entry that was queued, or NULL
 * when no entry was recorded (logging globally disabled, no buffer could
 * be obtained, or the current state stops logging once the limit is hit).
 * Must be called with the INP write-locked.
 */
struct tcp_log_buffer *
tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
    struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
    union tcp_log_stackspecific *stackinfo, int th_hostorder,
    const char *output_caller, const char *func, int line, const struct timeval *itv)
{
	struct tcp_log_mem *log_entry;
	struct tcp_log_buffer *log_buf;
	int attempt_count = 0;
	struct tcp_log_verbose *log_verbose;
	uint32_t logsn;

	KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
	    ("%s called with inconsistent func (%p) and line (%d) arguments",
		__func__, func, line));

	INP_WLOCK_ASSERT(tp->t_inpcb);
-
+	if (tcp_disable_all_bb_logs) {
+		/*
+		 * The global logging shutdown switch has been thrown.
+		 * Call the purge function, which frees any queued log
+		 * entries and turns off logging for this connection.
+		 */
+		tcp_log_purge_tp_logbuf(tp);
+		return (NULL);
+	}
	KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD ||
	    tp->t_logstate == TCP_LOG_STATE_TAIL ||
	    tp->t_logstate == TCP_LOG_STATE_CONTINUAL ||
	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
	    tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO,
	    ("%s called with unexpected tp->t_logstate (%d)", __func__,
		tp->t_logstate));

	/*
	 * Get the serial number. We do this early so it will
	 * increment even if we end up skipping the log entry for some
	 * reason.
	 */
	logsn = tp->t_logsn++;

	/*
	 * Can we get a new log entry? If so, increment the lognum counter
	 * here.
	 */
retry:
-	if (tp->t_lognum < tcp_log_session_limit) {
+	if (tp->t_lognum < tp->t_loglimit) {
		if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
			tp->t_lognum++;
	} else
		log_entry = NULL;

	/* Do we need to try to reuse? */
	if (log_entry == NULL) {
		/*
		 * Sacrifice auto-logged sessions without a log ID if
		 * tcp_log_auto_all is false. (If they don't have a log
		 * ID by now, it is probable that either they won't get one
		 * or we are resource-constrained.)
		 */
		if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
		    !tcp_log_auto_all) {
			if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
#ifdef INVARIANTS
				panic("%s:%d: tcp_log_state_change() failed "
				    "to set tp %p to TCP_LOG_STATE_CLEAR",
				    __func__, __LINE__, tp);
#endif
				tp->t_logstate = TCP_LOG_STATE_OFF;
			}
			return (NULL);
		}
		/*
		 * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
		 * the buffers. If successful, deactivate tracing. Otherwise,
		 * leave it active so we will retry.
		 */
		if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
		    M_NOWAIT, false)) {
			tp->t_logstate = TCP_LOG_STATE_OFF;
			return(NULL);
		} else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) &&
		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
		    M_NOWAIT, false)) {
			/*
			 * The dump emptied the queue; retry the allocation
			 * once before giving up on this event.
			 */
			if (attempt_count == 0) {
				attempt_count++;
				goto retry;
			}
#ifdef TCPLOG_DEBUG_COUNTERS
			counter_u64_add(tcp_log_que_fail4, 1);
#endif
			return(NULL);
		} else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO)
			return(NULL);

		/* If in HEAD state, just deactivate the tracing and return. */
		if (tp->t_logstate == TCP_LOG_STATE_HEAD) {
			tp->t_logstate = TCP_LOG_STATE_OFF;
			return(NULL);
		}

		/*
		 * Get a buffer to reuse. If that fails, just give up.
		 * (We can't log anything without a buffer in which to
		 * put it.)
		 *
		 * Note that we don't change the t_lognum counter
		 * here. Because we are re-using the buffer, the total
		 * number won't change.
		 */
		if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
			return(NULL);
		STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
		tcp_log_entry_refcnt_rem(log_entry);
	}

	KASSERT(log_entry != NULL,
	    ("%s: log_entry unexpectedly NULL", __func__));

	/* Extract the log buffer and verbose buffer pointers. */
	log_buf = &log_entry->tlm_buf;
	log_verbose = &log_entry->tlm_v;

	/* Basic entries. */
	if (itv == NULL)
		getmicrouptime(&log_buf->tlb_tv);
	else
		memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
	log_buf->tlb_ticks = ticks;
	log_buf->tlb_sn = logsn;
	log_buf->tlb_stackid = tp->t_fb->tfb_id;
	log_buf->tlb_eventid = eventid;
	log_buf->tlb_eventflags = 0;
	log_buf->tlb_errno = errornum;

	/* Socket buffers */
	if (rxbuf != NULL) {
		log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
		log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
		log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
		log_buf->tlb_rxbuf.tls_sb_spare = 0;
	}
	if (txbuf != NULL) {
		log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
		log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
		log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
		log_buf->tlb_txbuf.tls_sb_spare = 0;
	}
	/*
	 * Copy values from tp to the log entry.  COPY_STAT(f) copies tp->f
	 * and COPY_STAT_T(f) copies tp->t_f into log_buf->tlb_f.
	 */
#define	COPY_STAT(f)	log_buf->tlb_ ## f = tp->f
#define	COPY_STAT_T(f)	log_buf->tlb_ ## f = tp->t_ ## f
	COPY_STAT_T(state);
	COPY_STAT_T(starttime);
	COPY_STAT(iss);
	COPY_STAT_T(flags);
	COPY_STAT(snd_una);
	COPY_STAT(snd_max);
	COPY_STAT(snd_cwnd);
	COPY_STAT(snd_nxt);
	COPY_STAT(snd_recover);
	COPY_STAT(snd_wnd);
	COPY_STAT(snd_ssthresh);
	COPY_STAT_T(srtt);
	COPY_STAT_T(rttvar);
	COPY_STAT(rcv_up);
	COPY_STAT(rcv_adv);
	COPY_STAT(rcv_nxt);
	COPY_STAT(sack_newdata);
	COPY_STAT(rcv_wnd);
	COPY_STAT_T(dupacks);
	COPY_STAT_T(segqlen);
	COPY_STAT(snd_numholes);
	COPY_STAT(snd_scale);
	COPY_STAT(rcv_scale);
#undef COPY_STAT
#undef COPY_STAT_T
	log_buf->tlb_flex1 = 0;
	log_buf->tlb_flex2 = 0;
	/* Copy stack-specific info. */
	if (stackinfo != NULL) {
		memcpy(&log_buf->tlb_stackinfo, stackinfo,
		    sizeof(log_buf->tlb_stackinfo));
		log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
	}

	/* The packet */
	log_buf->tlb_len = len;
	if (th) {
		int optlen;

		log_buf->tlb_eventflags |= TLB_FLAG_HDR;
		log_buf->tlb_th = *th;
		if (th_hostorder)
			tcp_fields_to_net(&log_buf->tlb_th);
		/* Options are whatever follows the fixed header (th_off is in 32-bit words). */
		optlen = (th->th_off << 2) - sizeof (struct tcphdr);
		if (optlen > 0)
			memcpy(log_buf->tlb_opts, th + 1, optlen);
	}

	/* Verbose information */
	if (func != NULL) {
		log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
		if (output_caller != NULL)
			strlcpy(log_verbose->tlv_snd_frm, output_caller,
			    TCP_FUNC_LEN);
		else
			*log_verbose->tlv_snd_frm = 0;
		strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
		log_verbose->tlv_trace_line = line;
	}

	/* Insert the new log at the tail. */
	STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
	tcp_log_entry_refcnt_add(log_entry);
	return (log_buf);
}

/*
 * Change the logging state for a TCPCB.  Returns 0 on success or an
 * error code on failure.
 *
 * TCP_LOG_STATE_CLEAR frees all queued entries and then turns logging off.
 * An unrecognized state returns EINVAL and leaves the TCPCB untouched.  On
 * success the TF2_LOG_AUTO flag is always cleared.
 */
int
tcp_log_state_change(struct tcpcb *tp, int state)
{
	struct tcp_log_mem *log_entry;

	INP_WLOCK_ASSERT(tp->t_inpcb);
	switch(state) {
	case TCP_LOG_STATE_CLEAR:
		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
			tcp_log_remove_log_head(tp, log_entry);
		/* Fall through */

	case TCP_LOG_STATE_OFF:
		tp->t_logstate = TCP_LOG_STATE_OFF;
		break;

	case TCP_LOG_STATE_TAIL:
	case TCP_LOG_STATE_HEAD:
	case TCP_LOG_STATE_CONTINUAL:
	case TCP_LOG_STATE_HEAD_AUTO:
	case TCP_LOG_STATE_TAIL_AUTO:
		tp->t_logstate = state;
		break;

	default:
		return (EINVAL);
	}
-
+	if (tcp_disable_all_bb_logs) {
+		/* We are prohibited from doing any logs */
+		tp->t_logstate = TCP_LOG_STATE_OFF;
+	}
	tp->t_flags2 &= ~(TF2_LOG_AUTO);

	return (0);
}

/* If tcp_drain() is called, flush half the log entries.
*/ void tcp_log_drain(struct tcpcb *tp) { struct tcp_log_mem *log_entry, *next; int target, skip; INP_WLOCK_ASSERT(tp->t_inpcb); if ((target = tp->t_lognum / 2) == 0) return; /* * If we are logging the "head" packets, we want to discard * from the tail of the queue. Otherwise, we want to discard * from the head. */ if (tp->t_logstate == TCP_LOG_STATE_HEAD || tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) { skip = tp->t_lognum - target; STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue) if (!--skip) break; KASSERT(log_entry != NULL, ("%s: skipped through all entries!", __func__)); if (log_entry == NULL) return; while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) { STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue); tcp_log_entry_refcnt_rem(next); tcp_log_remove_log_cleanup(tp, next); #ifdef INVARIANTS target--; #endif } KASSERT(target == 0, ("%s: After removing from tail, target was %d", __func__, target)); } else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) { (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", M_NOWAIT, false); } else { while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL && target--) tcp_log_remove_log_head(tp, log_entry); KASSERT(target <= 0, ("%s: After removing from head, target was %d", __func__, target)); KASSERT(tp->t_lognum > 0, ("%s: After removing from head, tp->t_lognum was %d", __func__, target)); KASSERT(log_entry != NULL, ("%s: After removing from head, the tailq was empty", __func__)); } } static inline int tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len) { if (sopt->sopt_td != NULL) return (copyout(src, dst, len)); bcopy(src, dst, len); return (0); } static int tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp, struct tcp_log_buffer **end, int count) { struct tcp_log_buffer *out_entry; struct tcp_log_mem *log_entry; size_t entrysize; int error; #ifdef INVARIANTS int orig_count = count; #endif /* Copy the data out. 
*/ error = 0; out_entry = (struct tcp_log_buffer *) sopt->sopt_val; STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) { count--; KASSERT(count >= 0, ("%s:%d: Exceeded expected count (%d) processing list %p", __func__, __LINE__, orig_count, log_tailqp)); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_copyout, 1); #endif /* * Skip copying out the header if it isn't present. * Instead, copy out zeros (to ensure we don't leak info). * TODO: Make sure we truly do zero everything we don't * explicitly set. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR) entrysize = sizeof(struct tcp_log_buffer); else entrysize = offsetof(struct tcp_log_buffer, tlb_th); error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry, entrysize); if (error) break; if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) { error = tcp_log_copyout(sopt, zerobuf, ((uint8_t *)out_entry) + entrysize, sizeof(struct tcp_log_buffer) - entrysize); } /* * Copy out the verbose bit, if needed. Either way, * increment the output pointer the correct amount. */ if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) { error = tcp_log_copyout(sopt, &log_entry->tlm_v, out_entry->tlb_verbose, sizeof(struct tcp_log_verbose)); if (error) break; out_entry = (struct tcp_log_buffer *) (((uint8_t *) (out_entry + 1)) + sizeof(struct tcp_log_verbose)); } else out_entry++; } *end = out_entry; KASSERT(error || count == 0, ("%s:%d: Less than expected count (%d) processing list %p" " (%d remain)", __func__, __LINE__, orig_count, log_tailqp, count)); return (error); } /* * Copy out the buffer. Note that we do incremental copying, so * sooptcopyout() won't work. However, the goal is to produce the same * end result as if we copied in the entire user buffer, updated it, * and then used sooptcopyout() to copy it out. * * NOTE: This should be called with a write lock on the PCB; however, * the function will drop it after it extracts the data from the TCPCB. 
*/ int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp) { struct tcp_log_stailq log_tailq; struct tcp_log_mem *log_entry, *log_next; struct tcp_log_buffer *out_entry; struct inpcb *inp; size_t outsize, entrysize; int error, outnum; INP_WLOCK_ASSERT(tp->t_inpcb); inp = tp->t_inpcb; /* * Determine which log entries will fit in the buffer. As an * optimization, skip this if all the entries will clearly fit * in the buffer. (However, get an exact size if we are using * INVARIANTS.) */ #ifndef INVARIANTS if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)) >= tp->t_lognum) { log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue); log_next = NULL; outsize = 0; outnum = tp->t_lognum; } else { #endif outsize = outnum = 0; log_entry = NULL; STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) { entrysize = sizeof(struct tcp_log_buffer); if (log_next->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) entrysize += sizeof(struct tcp_log_verbose); if ((sopt->sopt_valsize - outsize) < entrysize) break; outsize += entrysize; outnum++; log_entry = log_next; } KASSERT(outsize <= sopt->sopt_valsize, ("%s: calculated output size (%zu) greater than available" "space (%zu)", __func__, outsize, sopt->sopt_valsize)); #ifndef INVARIANTS } #endif /* * Copy traditional sooptcopyout() behavior: if sopt->sopt_val * is NULL, silently skip the copy. However, in this case, we * will leave the list alone and return. Functionally, this * gives userspace a way to poll for an approximate buffer * size they will need to get the log entries. */ if (sopt->sopt_val == NULL) { INP_WUNLOCK(inp); if (outsize == 0) { outsize = outnum * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); } if (sopt->sopt_valsize > outsize) sopt->sopt_valsize = outsize; return (0); } /* * Break apart the list. We'll save the ones we want to copy * out locally and remove them from the TCPCB list. We can * then drop the INPCB lock while we do the copyout. 
* * There are roughly three cases: * 1. There was nothing to copy out. That's easy: drop the * lock and return. * 2. We are copying out the entire list. Again, that's easy: * move the whole list. * 3. We are copying out a partial list. That's harder. We * need to update the list book-keeping entries. */ if (log_entry != NULL && log_next == NULL) { /* Move entire list. */ KASSERT(outnum == tp->t_lognum, ("%s:%d: outnum (%d) should match tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); log_tailq = tp->t_logs; tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); } else if (log_entry != NULL) { /* Move partial list. */ KASSERT(outnum < tp->t_lognum, ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)", __func__, __LINE__, outnum, tp->t_lognum)); STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs); STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue); KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL, ("%s:%d: tp->t_logs is unexpectedly shorter than expected" "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)", __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum)); STAILQ_NEXT(log_entry, tlm_queue) = NULL; log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue); tp->t_lognum -= outnum; } else STAILQ_INIT(&log_tailq); /* Drop the PCB lock. */ INP_WUNLOCK(inp); /* Copy the data out. */ error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum); if (error) { /* Restore list */ INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) { tp = intotcpcb(inp); /* Merge the two lists. */ STAILQ_CONCAT(&log_tailq, &tp->t_logs); tp->t_logs = log_tailq; tp->t_lognum += outnum; } INP_WUNLOCK(inp); } else { /* Sanity check entries */ KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) == outsize, ("%s: Actual output size (%zu) != " "calculated output size (%zu)", __func__, (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val), outsize)); /* Free the entries we just copied out. 
*/ STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) { tcp_log_entry_refcnt_rem(log_entry); uma_zfree(tcp_log_zone, log_entry); } } sopt->sopt_valsize = (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val); return (error); } static void tcp_log_free_queue(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; KASSERT(param != NULL, ("%s: called with NULL param", __func__)); if (param == NULL) return; entry = (struct tcp_log_dev_log_queue *)param; /* Free the entries. */ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); /* Free the buffer, if it is allocated. */ if (entry->tldl_common.tldq_buf != NULL) free(entry->tldl_common.tldq_buf, M_TCPLOGDEV); /* Free the queue entry. */ free(entry, M_TCPLOGDEV); } static struct tcp_log_common_header * tcp_log_expandlogbuf(struct tcp_log_dev_queue *param) { struct tcp_log_dev_log_queue *entry; struct tcp_log_header *hdr; uint8_t *end; struct sockopt sopt; int error; entry = (struct tcp_log_dev_log_queue *)param; /* Take a worst-case guess at space needs. */ sopt.sopt_valsize = sizeof(struct tcp_log_header) + entry->tldl_count * (sizeof(struct tcp_log_buffer) + sizeof(struct tcp_log_verbose)); hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT); if (hdr == NULL) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail5, entry->tldl_count); #endif return (NULL); } sopt.sopt_val = hdr + 1; sopt.sopt_valsize -= sizeof(struct tcp_log_header); sopt.sopt_td = NULL; error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries, (struct tcp_log_buffer **)&end, entry->tldl_count); if (error) { free(hdr, M_TCPLOGDEV); return (NULL); } /* Free the entries. 
*/ tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); entry->tldl_count = 0; memset(hdr, 0, sizeof(struct tcp_log_header)); hdr->tlh_version = TCP_LOG_BUF_VER; hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR; hdr->tlh_length = end - (uint8_t *)hdr; hdr->tlh_ie = entry->tldl_ie; hdr->tlh_af = entry->tldl_af; getboottime(&hdr->tlh_offset); strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN); + strlcpy(hdr->tlh_tag, entry->tldl_tag, TCP_LOG_TAG_LEN); strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN); return ((struct tcp_log_common_header *)hdr); } /* * Queue the tcpcb's log buffer for transmission via the log buffer facility. * * NOTE: This should be called with a write lock on the PCB. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the INP lock if it needs to do so. * * If force is false, this will only dump auto-logged sessions if * tcp_log_auto_all is true or if there is a log ID defined for the session. */ int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force) { struct tcp_log_dev_log_queue *entry; struct inpcb *inp; #ifdef TCPLOG_DEBUG_COUNTERS int num_entries; #endif inp = tp->t_inpcb; INP_WLOCK_ASSERT(inp); /* If there are no log entries, there is nothing to do. */ if (tp->t_lognum == 0) return (0); /* Check for a log ID. */ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && !tcp_log_auto_all && !force) { struct tcp_log_mem *log_entry; /* * We needed a log ID and none was found. Free the log entries * and return success. Also, cancel further logging. If the * session doesn't have a log ID by now, we'll assume it isn't * going to get one. */ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) tcp_log_remove_log_head(tp, log_entry); KASSERT(tp->t_lognum == 0, ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", __func__, tp->t_lognum)); tp->t_logstate = TCP_LOG_STATE_OFF; return (0); } /* * Allocate memory. 
If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) { #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail3, 1); #endif return (ENOBUFS); } if (entry == NULL) { INP_WUNLOCK(inp); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); INP_WLOCK(inp); /* * Note that this check is slightly overly-restrictive in * that the TCB can survive either of these events. * However, there is currently not a good way to ensure * that is the case. So, if we hit this M_WAIT path, we * may end up dropping some entries. That seems like a * small price to pay for safety. */ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { free(entry, M_TCPLOGDEV); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail2, 1); #endif return (ECONNRESET); } tp = intotcpcb(inp); if (tp->t_lognum == 0) { free(entry, M_TCPLOGDEV); return (0); } } /* Fill in the unique parts of the queue entry. */ - if (tp->t_lib != NULL) + if (tp->t_lib != NULL) { strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); - else + strlcpy(entry->tldl_tag, tp->t_lib->tlb_tag, TCP_LOG_TAG_LEN); + } else { strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN); + strlcpy(entry->tldl_tag, "UNKNOWN", TCP_LOG_TAG_LEN); + } if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = inp->inp_inc.inc_ie; if (inp->inp_inc.inc_flags & INC_ISIPV6) entry->tldl_af = AF_INET6; else entry->tldl_af = AF_INET; entry->tldl_entries = tp->t_logs; entry->tldl_count = tp->t_lognum; /* Fill in the common parts of the queue entry. */ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Clear the log data from the TCPCB. 
*/ #ifdef TCPLOG_DEBUG_COUNTERS num_entries = tp->t_lognum; #endif tp->t_lognum = 0; STAILQ_INIT(&tp->t_logs); /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) { tcp_log_free_queue((struct tcp_log_dev_queue *)entry); #ifdef TCPLOG_DEBUG_COUNTERS counter_u64_add(tcp_log_que_fail1, num_entries); } else { counter_u64_add(tcp_log_queued, num_entries); #endif } return (0); } /* * Queue the log_id_node's log buffers for transmission via the log buffer * facility. * * NOTE: This should be called with the bucket locked and referenced. * * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop * and reacquire the bucket lock if it needs to do so. (The caller must * ensure that the tln is no longer on any lists so no one else will mess * with this while the lock is dropped!) */ static int tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how) { struct tcp_log_dev_log_queue *entry; struct tcp_log_id_bucket *tlb; tlb = tln->tln_bucket; TCPID_BUCKET_LOCK_ASSERT(tlb); KASSERT(tlb->tlb_refcnt > 0, ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)", __func__, __LINE__, tln, tlb)); KASSERT(tln->tln_closed, ("%s:%d: Called for node with tln_closed==false (tln=%p)", __func__, __LINE__, tln)); /* If there are no log entries, there is nothing to do. */ if (tln->tln_count == 0) return (0); /* * Allocate memory. If we must wait, we'll need to drop the locks * and reacquire them (and do all the related business that goes * along with that). */ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_NOWAIT); if (entry == NULL && (how & M_NOWAIT)) return (ENOBUFS); if (entry == NULL) { TCPID_BUCKET_UNLOCK(tlb); entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, M_WAITOK); TCPID_BUCKET_LOCK(tlb); } /* Fill in the common parts of the queue entry.. 
*/ entry->tldl_common.tldq_buf = NULL; entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; entry->tldl_common.tldq_dtor = tcp_log_free_queue; /* Fill in the unique parts of the queue entry. */ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN); + strlcpy(entry->tldl_tag, tlb->tlb_tag, TCP_LOG_TAG_LEN); if (reason != NULL) strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); else strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); entry->tldl_ie = tln->tln_ie; entry->tldl_entries = tln->tln_entries; entry->tldl_count = tln->tln_count; entry->tldl_af = tln->tln_af; /* Add the entry. If no one is listening, free the entry. */ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) tcp_log_free_queue((struct tcp_log_dev_queue *)entry); return (0); } /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked bucket; however, the function * will drop the lock. */ #define LOCAL_SAVE 10 static void tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason) { struct tcp_log_id_node local_entries[LOCAL_SAVE]; struct inpcb *inp; struct tcpcb *tp; struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln; int i, num_local_entries, tree_locked; bool expireq_locked; TCPID_BUCKET_LOCK_ASSERT(tlb); /* * Take a reference on the bucket to keep it from disappearing until * we are done. */ TCPID_BUCKET_REF(tlb); /* * We'll try to create these without dropping locks. However, we * might very well need to drop locks to get memory. If that's the * case, we'll save up to 10 on the stack, and sacrifice the rest. * (Otherwise, we need to worry about finding our place again in a * potentially changed list. It just doesn't seem worth the trouble * to do that. 
*/ expireq_locked = false; num_local_entries = 0; prev_tln = NULL; tree_locked = TREE_UNLOCKED; SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) { /* * If this isn't associated with a TCPCB, we can pull it off * the list now. We need to be careful that the expire timer * hasn't already taken ownership (tln_expiretime == SBT_MAX). * If so, we let the expire timer code free the data. */ if (cur_tln->tln_closed) { no_inp: /* * Get the expireq lock so we can get a consistent * read of tln_expiretime and so we can remove this * from the expireq. */ if (!expireq_locked) { TCPLOG_EXPIREQ_LOCK(); expireq_locked = true; } /* * We ignore entries with tln_expiretime == SBT_MAX. * The expire timer code already owns those. */ KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0, ("%s:%d: node on the expire queue without positive " "expire time", __func__, __LINE__)); if (cur_tln->tln_expiretime == SBT_MAX) { prev_tln = cur_tln; continue; } /* Remove the entry from the expireq. */ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln, tcp_log_id_node, tln_expireq); /* Remove the entry from the bucket. */ if (prev_tln != NULL) SLIST_REMOVE_AFTER(prev_tln, tln_list); else SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list); /* * Drop the INP and bucket reference counts. Due to * lock-ordering rules, we need to drop the expire * queue lock. */ TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; /* Drop the INP reference. */ INP_WLOCK(cur_tln->tln_inp); if (!in_pcbrele_wlocked(cur_tln->tln_inp)) INP_WUNLOCK(cur_tln->tln_inp); if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { #ifdef INVARIANTS panic("%s: Bucket refcount unexpectedly 0.", __func__); #endif /* * Recover as best we can: free the entry we * own. */ tcp_log_free_entries(&cur_tln->tln_entries, &cur_tln->tln_count); uma_zfree(tcp_log_node_zone, cur_tln); goto done; } if (tcp_log_dump_node_logbuf(cur_tln, reason, M_NOWAIT)) { /* * If we have sapce, save the entries locally. * Otherwise, free them. 
*/ if (num_local_entries < LOCAL_SAVE) { local_entries[num_local_entries] = *cur_tln; num_local_entries++; } else { tcp_log_free_entries( &cur_tln->tln_entries, &cur_tln->tln_count); } } /* No matter what, we are done with the node now. */ uma_zfree(tcp_log_node_zone, cur_tln); /* * Because we removed this entry from the list, prev_tln * (which tracks the previous entry still on the tlb * list) remains unchanged. */ continue; } /* * If we get to this point, the session data is still held in * the TCPCB. So, we need to pull the data out of that. * * We will need to drop the expireq lock so we can lock the INP. * We can then try to extract the data the "easy" way. If that * fails, we'll save the log entries for later. */ if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* Lock the INP and then re-check the state. */ inp = cur_tln->tln_inp; INP_WLOCK(inp); /* * If we caught this while it was transitioning, the data * might have moved from the TCPCB to the tln (signified by * setting tln_closed to true. If so, treat this like an * inactive connection. */ if (cur_tln->tln_closed) { /* * It looks like we may have caught this connection * while it was transitioning from active to inactive. * Treat this like an inactive connection. */ INP_WUNLOCK(inp); goto no_inp; } /* * Try to dump the data from the tp without dropping the lock. * If this fails, try to save off the data locally. */ tp = cur_tln->tln_tp; if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) && num_local_entries < LOCAL_SAVE) { tcp_log_move_tp_to_node(tp, &local_entries[num_local_entries]); local_entries[num_local_entries].tln_closed = 1; KASSERT(local_entries[num_local_entries].tln_bucket == tlb, ("%s: %d: bucket mismatch for node %p", __func__, __LINE__, cur_tln)); num_local_entries++; } INP_WUNLOCK(inp); /* * We are goint to leave the current tln on the list. It will * become the previous tln. */ prev_tln = cur_tln; } /* Drop our locks, if any. 
*/ KASSERT(tree_locked == TREE_UNLOCKED, ("%s: %d: tree unexpectedly locked", __func__, __LINE__)); switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); tree_locked = TREE_UNLOCKED; break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); tree_locked = TREE_UNLOCKED; break; } if (expireq_locked) { TCPLOG_EXPIREQ_UNLOCK(); expireq_locked = false; } /* * Try again for any saved entries. tcp_log_dump_node_logbuf() is * guaranteed to free the log entries within the node. And, since * the node itself is on our stack, we don't need to free it. */ for (i = 0; i < num_local_entries; i++) tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK); /* Drop our reference. */ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) TCPID_BUCKET_UNLOCK(tlb); done: /* Drop our locks, if any. */ switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } if (expireq_locked) TCPLOG_EXPIREQ_UNLOCK(); } #undef LOCAL_SAVE /* * Queue the log buffers for all sessions in a bucket for transmissions via * the log buffer facility. * * NOTE: This should be called with a locked INP; however, the function * will drop the lock. */ void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason) { struct tcp_log_id_bucket *tlb; int tree_locked; /* Figure out our bucket and lock it. */ INP_WLOCK_ASSERT(tp->t_inpcb); tlb = tp->t_lib; if (tlb == NULL) { /* * No bucket; treat this like a request to dump a single * session's traces. */ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true); INP_WUNLOCK(tp->t_inpcb); return; } TCPID_BUCKET_REF(tlb); INP_WUNLOCK(tp->t_inpcb); TCPID_BUCKET_LOCK(tlb); /* If we are the last reference, we have nothing more to do here. 
*/ tree_locked = TREE_UNLOCKED; if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { switch (tree_locked) { case TREE_WLOCKED: TCPID_TREE_WUNLOCK(); break; case TREE_RLOCKED: TCPID_TREE_RUNLOCK(); break; } return; } /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */ tcp_log_dumpbucketlogs(tlb, reason); } /* * Mark the end of a flow with the current stack. A stack can add * stack-specific info to this trace event by overriding this * function (see bbr_log_flowend() for example). */ void tcp_log_flowend(struct tcpcb *tp) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { struct socket *so = tp->t_inpcb->inp_socket; TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FLOWEND, 0, 0, NULL, false); } } Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h (revision 356413) +++ head/sys/netinet/tcp_log_buf.h (revision 356414) @@ -1,376 +1,381 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (c) 2016-2018 Netflix, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __tcp_log_buf_h__ #define __tcp_log_buf_h__ #define TCP_LOG_REASON_LEN 32 -#define TCP_LOG_BUF_VER (6) +#define TCP_LOG_TAG_LEN 32 +#define TCP_LOG_BUF_VER (7) /* * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires * 8-byte alignment to work properly on all platforms. Therefore, we will * enforce 8-byte alignment for all the structures that may appear by * themselves (instead of being embedded in another structure) in a data * stream. */ #define ALIGN_TCP_LOG __aligned(8) /* Information about the socketbuffer state. */ struct tcp_log_sockbuf { uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ uint32_t tls_sb_spare; /* spare */ }; /* Optional, verbose information that may be appended to an event log. */ struct tcp_log_verbose { #define TCP_FUNC_LEN 32 char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that generated trace */ uint32_t tlv_trace_line; /* Line number that generated trace */ uint8_t _pad[4]; } ALIGN_TCP_LOG; /* Internal RACK state variables. 
*/ struct tcp_log_rack { uint32_t tlr_rack_rtt; /* rc_rack_rtt */ uint8_t tlr_state; /* Internal RACK state */ uint8_t _pad[3]; /* Padding */ }; struct tcp_log_bbr { uint64_t cur_del_rate; uint64_t delRate; uint64_t rttProp; uint64_t bw_inuse; uint32_t inflight; uint32_t applimited; uint32_t delivered; uint32_t timeStamp; uint32_t epoch; uint32_t lt_epoch; uint32_t pkts_out; uint32_t flex1; uint32_t flex2; uint32_t flex3; uint32_t flex4; uint32_t flex5; uint32_t flex6; uint32_t lost; uint16_t pacing_gain; uint16_t cwnd_gain; uint16_t flex7; uint8_t bbr_state; uint8_t bbr_substate; uint8_t inhpts; uint8_t ininput; uint8_t use_lt_bw; uint8_t flex8; uint32_t pkt_epoch; }; /* Per-stack stack-specific info. */ union tcp_log_stackspecific { struct tcp_log_rack u_rack; struct tcp_log_bbr u_bbr; }; struct tcp_log_buffer { /* Event basics */ struct timeval tlb_tv; /* Timestamp of trace */ uint32_t tlb_ticks; /* Timestamp of trace */ uint32_t tlb_sn; /* Serial number */ uint8_t tlb_stackid; /* Stack ID */ uint8_t tlb_eventid; /* Event ID */ uint16_t tlb_eventflags; /* Flags for the record */ #define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ #define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ #define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ #define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ #define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ int tlb_errno; /* Event error (if any) */ /* Internal session state */ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ int tlb_state; /* TCPCB t_state */ uint32_t tlb_starttime; /* TCPCB t_starttime */ uint32_t tlb_iss; /* TCPCB iss */ uint32_t tlb_flags; /* TCPCB flags */ uint32_t tlb_snd_una; /* TCPCB snd_una */ uint32_t tlb_snd_max; /* TCPCB snd_max */ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ uint32_t tlb_snd_recover;/* TCPCB snd_recover */ uint32_t tlb_snd_wnd; /* 
/*
 * Event identifiers for log records (the eventid argument of the
 * TCP_LOG_EVENT macros).  Values are assigned sequentially starting at 1;
 * the number at the end of each comment is the resulting numeric value.
 * TCP_LOG_END is a terminator and must stay last.
 */
enum tcp_log_events {
	TCP_LOG_IN = 1,		/* Incoming packet 1 */
	TCP_LOG_OUT,		/* Transmit (without other event) 2 */
	TCP_LOG_RTO,		/* Retransmit timeout 3 */
	TCP_LOG_TF_ACK,		/* Transmit due to TF_ACK 4 */
	TCP_LOG_BAD_RETRAN,	/* Detected bad retransmission 5 */
	TCP_LOG_PRR,		/* Doing PRR 6 */
	TCP_LOG_REORDER,	/* Detected reorder 7 */
	TCP_LOG_HPTS,		/* Hpts sending a packet 8 */
	BBR_LOG_BBRUPD,		/* We updated BBR info 9 */
	BBR_LOG_BBRSND,		/* We did a slot calculation and sending is done 10 */
	BBR_LOG_ACKCLEAR,	/* An ack clears all outstanding 11 */
	BBR_LOG_INQUEUE,	/* The tcb had a packet input to it 12 */
	BBR_LOG_TIMERSTAR,	/* Start a timer 13 */
	BBR_LOG_TIMERCANC,	/* Cancel a timer 14 */
	BBR_LOG_ENTREC,		/* Entered recovery 15 */
	BBR_LOG_EXITREC,	/* Exited recovery 16 */
	BBR_LOG_CWND,		/* Cwnd change 17 */
	BBR_LOG_BWSAMP,		/* LT B/W sample has been made 18 */
	BBR_LOG_MSGSIZE,	/* We received an EMSGSIZE error 19 */
	BBR_LOG_BBRRTT,		/* BBR RTT is updated 20 */
	BBR_LOG_JUSTRET,	/* We just returned out of output 21 */
	BBR_LOG_STATE,		/* A BBR state change occurred 22 */
	BBR_LOG_PKT_EPOCH,	/* A BBR packet epoch occurred 23 */
	BBR_LOG_PERSIST,	/* BBR changed to/from a persists 24 */
	TCP_LOG_FLOWEND,	/* End of a flow 25 */
	BBR_LOG_RTO,		/* BBR's timeout includes BBR info 26 */
	BBR_LOG_DOSEG_DONE,	/* hpts do_segment completes 27 */
	/*
	 * NOTE(review): the comment below repeats BBR_LOG_DOSEG_DONE's text
	 * and looks like a copy/paste leftover; confirm the intended meaning.
	 */
	BBR_LOG_EXIT_GAIN,	/* hpts do_segment completes 28 */
	BBR_LOG_THRESH_CALC,	/* Doing threshold calculation 29 */
	BBR_LOG_EXTRACWNDGAIN,	/* Removed 30 */
	TCP_LOG_USERSEND,	/* User level sends data 31 */
	BBR_RSM_CLEARED,	/* RSM cleared of ACK flags 32 */
	BBR_LOG_STATE_TARGET,	/* Log of target at state 33 */
	BBR_LOG_TIME_EPOCH,	/* A time-based epoch occurred 34 */
	BBR_LOG_TO_PROCESS,	/* A to was processed 35 */
	BBR_LOG_BBRTSO,		/* TSO update 36 */
	BBR_LOG_HPTSDIAG,	/* Hpts diag insert 37 */
	BBR_LOG_LOWGAIN,	/* Low gain accounting 38 */
	BBR_LOG_PROGRESS,	/* Progress timer event 39 */
	TCP_LOG_SOCKET_OPT,	/* A socket option is set 40 */
	BBR_LOG_TIMERPREP,	/* A BBR var to debug out TLP issues 41 */
	BBR_LOG_ENOBUF_JMP,	/* We had an enobuf jump 42 */
	BBR_LOG_HPTSI_CALC,	/* calc the hptsi time 43 */
	BBR_LOG_RTT_SHRINKS,	/* We had a log reduction of rttProp 44 */
	BBR_LOG_BW_RED_EV,	/* B/W reduction events 45 */
	BBR_LOG_REDUCE,		/* old bbr log reduce for 4.1 and earlier 46 */
	TCP_LOG_RTT,		/* An rtt (in useconds) is being sampled and applied to the srtt algo 47 */
	BBR_LOG_SETTINGS_CHG,	/* Settings changed for loss response 48 */
	BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
	TCP_LOG_REASS,		/* Reassembly buffer logging 50 */
	TCP_HDWR_TLS,		/* TCP Hardware TLS logs 51 */
	BBR_LOG_HDWR_PACE,	/* TCP Hardware pacing log 52 */
	BBR_LOG_TSTMP_VAL,	/* Temp debug timestamp validation 53 */
	TCP_LOG_CONNEND,	/* End of connection 54 */
	TCP_LOG_LRO,		/* LRO entry 55 */
	TCP_SACK_FILTER_RES,	/* Results of SACK Filter 56 */
	TCP_SAD_DETECTION,	/* Sack Attack Detection 57 */
	TCP_LOG_END		/* End (keep at end) 58 */
};
/*
 * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
 * tries to record verbose information (tp->t_output_caller plus the
 * invoking function name and line number), and it takes an additional
 * trailing tv argument that is handed to tcp_log_event_().
 *
 * Every macro parameter is parenthesized in the expansion so that
 * arbitrary expressions (for example &tcb) can be passed.  tp is evaluated
 * more than once, so it must not have side effects.
 */
#define	TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	do {								\
		if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), (th), (rxbuf), (txbuf),	\
			    (eventid), (errornum), (len), (stackinfo),	\
			    (th_hostorder), (tp)->t_output_caller,	\
			    __func__, __LINE__, (tv));			\
	} while (0)

/*
 * TCP_LOG_EVENT: This is a macro so we can capture function/line
 * information when needed.
 *
 * Prototype:
 * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
 *     struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
 *     union tcp_log_stackspecific *stackinfo, int th_hostorder)
 *
 * tp is mandatory and must be write locked.
 * th is optional; if present, it will appear in the record.
 * rxbuf and txbuf are optional; if present, they will appear in the record.
 * eventid is mandatory.
 * errornum is mandatory (it indicates the success or failure of the
 *     operation associated with the event).
 * len indicates the length of the packet. If no packet, use 0.
 * stackinfo is optional; if present, it will appear in the record.
 * th_hostorder is passed through to tcp_log_event_() unchanged
 *     (presumably it says whether th is already in host byte order;
 *     TODO confirm against tcp_log_event_()).
 *
 * When TCP_LOG_FORCEVERBOSE is defined, every event is recorded through
 * TCP_LOG_EVENT_VERBOSE with a NULL tv, so the nine-argument call form
 * shown above works in both configurations.
 */
#ifdef TCP_LOG_FORCEVERBOSE
#define	TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum,	\
	    len, stackinfo, th_hostorder, NULL)
#else
#define	TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	do {								\
		if (tcp_log_verbose)					\
			TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf,	\
			    eventid, errornum, len, stackinfo,		\
			    th_hostorder, NULL);			\
		else if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), (th), (rxbuf), (txbuf),	\
			    (eventid), (errornum), (len), (stackinfo),	\
			    (th_hostorder), NULL, NULL, 0, NULL);	\
	} while (0)
#endif /* TCP_LOG_FORCEVERBOSE */

/*
 * TCP_LOG_EVENTP: Like TCP_LOG_EVENT, but it never records verbose
 * information and it forwards the caller-supplied tv to tcp_log_event_().
 */
#define	TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	do {								\
		if ((tp)->t_logstate != TCP_LOG_STATE_OFF)		\
			tcp_log_event_((tp), (th), (rxbuf), (txbuf),	\
			    (eventid), (errornum), (len), (stackinfo),	\
			    (th_hostorder), NULL, NULL, 0, (tv));	\
	} while (0)
*th, struct sockbuf *rxbuf, struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, union tcp_log_stackspecific *stackinfo, int th_hostorder, const char *output_caller, const char *func, int line, const struct timeval *tv) { return (NULL); } #endif /* TCP_BLACKBOX */ #endif /* _KERNEL */ #endif /* __tcp_log_buf_h__ */ Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h (revision 356413) +++ head/sys/netinet/tcp_var.h (revision 356414) @@ -1,1030 +1,1032 @@ /*- * SPDX-License-Identifier: BSD-3-Clause * * Copyright (c) 1982, 1986, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_var.h 8.4 (Berkeley) 5/24/95 * $FreeBSD$ */ #ifndef _NETINET_TCP_VAR_H_ #define _NETINET_TCP_VAR_H_ #include #include #ifdef _KERNEL #include #include #endif #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { TAILQ_ENTRY(tseg_qent) tqe_q; struct mbuf *tqe_m; /* mbuf contains packet */ struct mbuf *tqe_last; /* last mbuf in chain */ tcp_seq tqe_start; /* TCP Sequence number start */ int tqe_len; /* TCP segment data length */ uint32_t tqe_flags; /* The flags from the th->th_flags */ uint32_t tqe_mbuf_cnt; /* Count of mbuf overhead */ }; TAILQ_HEAD(tsegqe_head, tseg_qent); struct sackblk { tcp_seq start; /* start seq no. of sack block */ tcp_seq end; /* end seq no. */ }; struct sackhole { tcp_seq start; /* start seq no. of hole */ tcp_seq end; /* end seq no. */ tcp_seq rxmit; /* next seq. 
no in hole to be retransmitted */ TAILQ_ENTRY(sackhole) scblink; /* scoreboard linkage */ }; struct sackhint { struct sackhole *nexthole; int sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ int ispare; /* explicit pad for 64bit alignment */ int sacked_bytes; /* * Total sacked bytes reported by the * receiver via sack option */ uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ }; #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); /* * Tcp control block, one per tcp; fields: * Organized for 64 byte cacheline efficiency based * on common tcp_input/tcp_output processing. */ struct tcpcb { /* Cache line 1 */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ struct tcp_function_block *t_fb;/* TCP function call block */ void *t_fb_ptr; /* Pointer to t_fb specific data */ uint32_t t_maxseg:24, /* maximum segment size */ t_logstate:8; /* State of "black box" logging */ uint32_t t_port:16, /* Tunneling (over udp) port */ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ t_fin_is_rst: 1, /* Are fin's treated as resets */ - bits_spare : 3; + t_log_state_set: 1, + bits_spare : 2; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; * used to recognize retransmits */ tcp_seq snd_nxt; /* send next */ tcp_seq snd_up; /* send urgent pointer */ uint32_t snd_wnd; /* send window */ uint32_t snd_cwnd; /* congestion-controlled window */ uint32_t t_peakrate_thr; /* pre-calculated peak rate threshold */ /* Cache line 2 */ u_int32_t ts_offset; /* our timestamp offset */ u_int32_t rfbuf_ts; /* recv buffer autoscaling timestamp */ int rcv_numsacks; /* # distinct sack blks present */ u_int t_tsomax; /* TSO total burst length limit in bytes */ u_int t_tsomaxsegcount; /* TSO maximum segment count */ u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */ tcp_seq rcv_nxt; /* receive next */ 
tcp_seq rcv_adv; /* advertised window */ uint32_t rcv_wnd; /* receive window */ u_int t_flags2; /* More tcpcb flags storage */ int t_srtt; /* smoothed round-trip time */ int t_rttvar; /* variance in round-trip time */ u_int32_t ts_recent; /* timestamp echo data */ u_char snd_scale; /* window scaling for send window */ u_char rcv_scale; /* window scaling for recv window */ u_char snd_limited; /* segments limited transmitted */ u_char request_r_scale; /* pending window scaling */ tcp_seq last_ack_sent; u_int t_rcvtime; /* inactivity time */ /* Cache line 3 */ tcp_seq rcv_up; /* receive urgent pointer */ int t_segqlen; /* segment reassembly queue length */ uint32_t t_segqmbuflen; /* Count of bytes mbufs on all entries */ struct tsegqe_head t_segq; /* segment reassembly queue */ struct mbuf *t_in_pkt; struct mbuf *t_tail_pkt; struct tcp_timer *t_timers; /* All the TCP timers in one struct */ struct vnet *t_vnet; /* back pointer to parent vnet */ uint32_t snd_ssthresh; /* snd_cwnd size threshold for * for slow start exponential to * linear switch */ tcp_seq snd_wl1; /* window update seg seq number */ /* Cache line 4 */ tcp_seq snd_wl2; /* window update seg ack number */ tcp_seq irs; /* initial receive sequence number */ tcp_seq iss; /* initial send sequence number */ u_int t_acktime; u_int ts_recent_age; /* when last updated */ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ uint16_t cl4_spare; /* Spare to adjust CL 4 */ char t_oobflags; /* have some */ char t_iobc; /* input character */ int t_rxtcur; /* current retransmit value (ticks) */ int t_rxtshift; /* log(2) of rexmt exp. 
backoff */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ u_int t_starttime; /* time connection was established */ u_int t_pmtud_saved_maxseg; /* pre-blackhole MSS */ u_int t_rttmin; /* minimum rtt allowed */ u_int t_rttbest; /* best rtt we've seen */ int t_softerror; /* possible error not yet reported */ uint32_t max_sndwnd; /* largest window peer has offered */ /* Cache line 5 */ uint32_t snd_cwnd_prev; /* cwnd prior to retransmit */ uint32_t snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_recover_prev; /* snd_recover prior to retransmit */ int t_sndzerowin; /* zero-window updates sent */ u_long t_rttupdated; /* number of times rtt sampled */ int snd_numholes; /* number of holes seen by sender */ u_int t_badrxtwin; /* window for retransmit recovery */ TAILQ_HEAD(sackhole_head, sackhole) snd_holes; /* SACK scoreboard (sorted) */ tcp_seq snd_fack; /* last seq number(+1) sack'd by rcv'r*/ tcp_seq sack_newdata; /* New data xmitted in this recovery episode starts at this seq number */ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. 
of sack blocks */ struct sackhint sackhint; /* SACK scoreboard hint */ int t_rttlow; /* smallest observerved RTT */ int rfbuf_cnt; /* recv buffer autoscaling byte count */ struct toedev *tod; /* toedev handling this connection */ int t_sndrexmitpack; /* retransmit packets sent */ int t_rcvoopack; /* out-of-order packets received */ void *t_toe; /* TOE pcb pointer */ struct cc_algo *cc_algo; /* congestion control algorithm */ struct cc_var *ccv; /* congestion control specific vars */ struct osd *osd; /* storage for Khelp module data */ int t_bytes_acked; /* # bytes acked during current RTT */ u_int t_maxunacktime; u_int t_keepinit; /* time to establish connection */ u_int t_keepidle; /* time before keepalive probes begin */ u_int t_keepintvl; /* interval between keepalives */ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ + int t_loglimit; /* Maximum number of log entries */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; const char *t_output_caller; /* Function that called tcp_output */ struct statsblob *t_stats; /* Per-connection stats */ uint32_t t_logsn; /* Log "serial number" */ uint32_t gput_ts; /* Time goodput measurement started */ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ union { uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN]; uint64_t server; } t_tfo_cookie; /* TCP Fast Open cookie to send */ #ifdef TCPPCAP struct mbufq t_inpkts; /* List of saved input packets. */ struct mbufq t_outpkts; /* List of saved output packets. 
*/ #endif }; #endif /* _KERNEL || _WANT_TCPCB */ #ifdef _KERNEL struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; }; /* Minimum map entries limit value, if set */ #define TCP_MIN_MAP_ENTRIES_LIMIT 128 /* * TODO: We yet need to brave plowing in * to tcp_input() and the pru_usrreq() block. * Right now these go to the old standards which * are somewhat ok, but in the long term may * need to be changed. If we do tackle tcp_input() * then we need to get rid of the tcp_do_segment() * function below. */ /* Flags for tcp functions */ #define TCP_FUNC_BEING_REMOVED 0x01 /* Can no longer be referenced */ /* * If defining the optional tcp_timers, in the * tfb_tcp_timer_stop call you must use the * callout_async_drain() function with the * tcp_timer_discard callback. You should check * the return of callout_async_drain() and if 0 * increment tt_draincnt. Since the timer sub-system * does not know your callbacks you must provide a * stop_all function that loops through and calls * tcp_timer_stop() with each of your defined timers. * Adding a tfb_tcp_handoff_ok function allows the socket * option to change stacks to query you even if the * connection is in a later stage. You return 0 to * say you can take over and run your stack, you return * non-zero (an error number) to say no you can't. * If the function is undefined you can only change * in the early states (before connect or listen). * tfb_tcp_fb_fini is changed to add a flag to tell * the old stack if the tcb is being destroyed or * not. A one in the flag means the TCB is being * destroyed, a zero indicates its transitioning to * another stack (via socket option). 
*/ struct tcp_function_block { char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX]; int (*tfb_tcp_output)(struct tcpcb *); int (*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *); void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, int, struct timeval *); int (*tfb_tcp_ctloutput)(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); /* Optional memory allocation/free routine */ int (*tfb_tcp_fb_init)(struct tcpcb *); void (*tfb_tcp_fb_fini)(struct tcpcb *, int); /* Optional timers, must define all if you define one */ int (*tfb_tcp_timer_stop_all)(struct tcpcb *); void (*tfb_tcp_timer_activate)(struct tcpcb *, uint32_t, u_int); int (*tfb_tcp_timer_active)(struct tcpcb *, uint32_t); void (*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t); void (*tfb_tcp_rexmit_tmr)(struct tcpcb *); int (*tfb_tcp_handoff_ok)(struct tcpcb *); void (*tfb_tcp_mtu_chg)(struct tcpcb *); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; }; struct tcp_function { TAILQ_ENTRY(tcp_function) tf_next; char tf_name[TCP_FUNCTION_NAME_LEN_MAX]; struct tcp_function_block *tf_fb; }; TAILQ_HEAD(tcp_funchead, tcp_function); #endif /* _KERNEL */ /* * Flags and utility macros for the t_flags field. 
*/ #define TF_ACKNOW 0x00000001 /* ack peer immediately */ #define TF_DELACK 0x00000002 /* ack, but try to delay it */ #define TF_NODELAY 0x00000004 /* don't delay packets to coalesce */ #define TF_NOOPT 0x00000008 /* don't use tcp options */ #define TF_SENTFIN 0x00000010 /* have sent FIN */ #define TF_REQ_SCALE 0x00000020 /* have/will request window scaling */ #define TF_RCVD_SCALE 0x00000040 /* other side has requested scaling */ #define TF_REQ_TSTMP 0x00000080 /* have/will request timestamps */ #define TF_RCVD_TSTMP 0x00000100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x00000200 /* other side said I could SACK */ #define TF_NEEDSYN 0x00000400 /* send SYN (implicit state) */ #define TF_NEEDFIN 0x00000800 /* send FIN (implicit state) */ #define TF_NOPUSH 0x00001000 /* don't push */ #define TF_PREVVALID 0x00002000 /* saved values for bad rxmit valid */ #define TF_UNUSED1 0x00004000 /* unused */ #define TF_GPUTINPROG 0x00008000 /* Goodput measurement in progress */ #define TF_MORETOCOME 0x00010000 /* More data to be appended to sock */ #define TF_LQ_OVERFLOW 0x00020000 /* listen queue overflow */ #define TF_LASTIDLE 0x00040000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x00080000 /* sent a receiver win 0 in response */ #define TF_FASTRECOVERY 0x00100000 /* in NewReno Fast Recovery */ #define TF_WASFRECOVERY 0x00200000 /* was in NewReno Fast Recovery */ #define TF_SIGNATURE 0x00400000 /* require MD5 digests (RFC2385) */ #define TF_FORCEDATA 0x00800000 /* force out a byte */ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_UNUSED3 0x04000000 /* unused */ #define TF_UNUSED4 0x08000000 /* unused */ #define TF_UNUSED5 0x10000000 /* unused */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ #define 
IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY #define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY #define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY) #define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY #define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY #define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY)) #define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY) #define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY) #if defined(_KERNEL) && !defined(TCP_RFC7413) #define IS_FASTOPEN(t_flags) (false) #else #define IS_FASTOPEN(t_flags) (t_flags & TF_FASTOPEN) #endif #define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una) /* * Flags for the t_oobflags field. */ #define TCPOOB_HAVEDATA 0x01 #define TCPOOB_HADDATA 0x02 /* * Flags for the extended TCP flags field, t_flags2 */ #define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */ #define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */ #define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */ #define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */ #define TF2_DROP_AF_DATA 0x00000010 /* Drop after all data ack'd */ #define TF2_ECN_PERMIT 0x00000020 /* connection ECN-ready */ #define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */ #define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */ #define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. * It's basically used to reduce the number of parameters * to tcp_dooptions and tcp_addoptions. * The binary order of the to_flags is relevant for packing of the * options in tcp_addoptions. 
*/
struct tcpopt {
	u_int32_t	to_flags;	/* which options are present */
	/* Bit values for to_flags. */
#define	TOF_MSS		0x0001		/* maximum segment size */
#define	TOF_SCALE	0x0002		/* window scaling */
#define	TOF_SACKPERM	0x0004		/* SACK permitted */
#define	TOF_TS		0x0010		/* timestamp */
#define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
#define	TOF_SACK	0x0080		/* Peer sent SACK option */
#define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
#define	TOF_MAXOPT	0x0200
	u_int32_t	to_tsval;	/* new timestamp */
	u_int32_t	to_tsecr;	/* reflected timestamp */
	u_char		*to_sacks;	/* pointer to the first SACK blocks */
	u_char		*to_signature;	/* pointer to the TCP-MD5 signature */
	u_int8_t	*to_tfo_cookie; /* pointer to the TFO cookie */
	u_int16_t	to_mss;		/* maximum segment size */
	u_int8_t	to_wscale;	/* window scaling */
	u_int8_t	to_nsacks;	/* number of SACK blocks */
	u_int8_t	to_tfo_len;	/* TFO cookie length */
	u_int32_t	to_spare;	/* UTO */
};

/*
 * Flags for tcp_dooptions.
 */
#define	TO_SYN		0x01		/* parse SYN-only options */

/*
 * Path metrics, all stored as 32-bit values.  The layout is tied to
 * hc_metrics (declared elsewhere) and must be changed together with it.
 */
struct hc_metrics_lite {	/* must stay in sync with hc_metrics */
	uint32_t	rmx_mtu;	/* MTU for this path */
	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
	uint32_t	rmx_rtt;	/* estimated round trip time */
	uint32_t	rmx_rttvar;	/* estimated rtt variance */
	uint32_t	rmx_cwnd;	/* congestion window */
	uint32_t	rmx_sendpipe;	/* outbound delay-bandwidth product */
	uint32_t	rmx_recvpipe;	/* inbound delay-bandwidth product */
};

/*
 * Used by tcp_maxmtu() to communicate interface specific features
 * and limits at the time of connection setup.
*/
struct tcp_ifcap {
	int	ifcap;
	u_int	tsomax;
	u_int	tsomaxsegcount;
	u_int	tsomaxsegsize;
};

/*
 * Forward declaration for consumers that reach this point without the
 * _NETINET_IN_PCB_H_ include guard defined.
 */
#ifndef _NETINET_IN_PCB_H_
struct in_conninfo;
#endif /* _NETINET_IN_PCB_H_ */

/*
 * NOTE(review): presumably the reduced per-connection state kept while
 * a connection sits in TIME_WAIT -- confirm against the users of intotw().
 */
struct tcptw {
	struct inpcb	*tw_inpcb;	/* XXX back pointer to internet pcb */
	tcp_seq		snd_nxt;
	tcp_seq		rcv_nxt;
	tcp_seq		iss;
	tcp_seq		irs;
	u_short		last_win;	/* cached window value */
	short		tw_so_options;	/* copy of so_options */
	struct ucred	*tw_cred;	/* user credentials */
	u_int32_t	t_recent;
	u_int32_t	ts_offset;	/* our timestamp offset */
	u_int		t_starttime;
	int		tw_time;
	TAILQ_ENTRY(tcptw) tw_2msl;
	void		*tw_pspare;	/* TCP_SIGNATURE */
	u_int		*tw_spare;	/* TCP_SIGNATURE */
};

/*
 * Casts of the inpcb's inp_ppcb pointer to a tcpcb or a tcptw, and the
 * socket-to-tcpcb shorthand built on sotoinpcb().
 */
#define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
#define	intotw(ip)	((struct tcptw *)(ip)->inp_ppcb)
#define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))

/*
 * The smoothed round-trip time and estimated variance
 * are stored as fixed point numbers scaled by the values below.
 * For convenience, these scales are also used in smoothing the average
 * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
 * With these scales, srtt has 5 bits to the right of the binary point
 * and rttvar has 4 bits to the right of the binary point.
 * NOTE(review): this comment used to quote 3 and 2 fractional bits with
 * an "ALPHA" of 0.875 for srtt and 0.75 for rttvar; by the formula above
 * those ALPHA figures match scales of 8 and 4, not 32 and 16.  Confirm
 * the gains actually applied where the estimators are updated.
 */
#define	TCP_RTT_SCALE		32	/* multiplier for srtt; 5 bits frac. */
#define	TCP_RTT_SHIFT		5	/* shift for srtt; 5 bits frac. */
#define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 4 bits */
#define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 4 bits */
#define	TCP_DELTA_SHIFT		2	/* see tcp_input.c */

/*
 * The initial retransmission should happen at rtt + 4 * rttvar.
 * Because of the way we do the smoothing, srtt and rttvar
 * will each average +1/2 tick of bias.  When we compute
 * the retransmit timer, we want 1/2 tick of rounding and
 * 1 extra tick because of +-1/2 tick uncertainty in the
 * firing of the timer.  The bias will give us exactly the
 * 1.5 tick we need.
But, because the bias is * statistical, we have to test that we don't drop below * the minimum feasible timer (which is 2 ticks). * This version of the macro adapted from a paper by Lawrence * Brakmo and Larry Peterson which outlines a problem caused * by insufficient precision in the original implementation, * which results in inappropriately large RTO values for very * fast networks. */ #define TCP_REXMTVAL(tp) \ max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)) \ + (tp)->t_rttvar) >> TCP_DELTA_SHIFT) /* * TCP statistics. * Many of these should be kept per connection, * but that's inconvenient at the moment. */ struct tcpstat { uint64_t tcps_connattempt; /* connections initiated */ uint64_t tcps_accepts; /* connections accepted */ uint64_t tcps_connects; /* connections established */ uint64_t tcps_drops; /* connections dropped */ uint64_t tcps_conndrops; /* embryonic connections dropped */ uint64_t tcps_minmssdrops; /* average minmss too low drops */ uint64_t tcps_closed; /* conn. closed (includes drops) */ uint64_t tcps_segstimed; /* segs where we tried to get rtt */ uint64_t tcps_rttupdated; /* times we succeeded */ uint64_t tcps_delack; /* delayed acks sent */ uint64_t tcps_timeoutdrop; /* conn. 
dropped in rxmt timeout */ uint64_t tcps_rexmttimeo; /* retransmit timeouts */ uint64_t tcps_persisttimeo; /* persist timeouts */ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ uint64_t tcps_sndbyte; /* data bytes sent */ uint64_t tcps_sndrexmitpack; /* data packets retransmitted */ uint64_t tcps_sndrexmitbyte; /* data bytes retransmitted */ uint64_t tcps_sndrexmitbad; /* unnecessary packet retransmissions */ uint64_t tcps_sndacks; /* ack-only packets sent */ uint64_t tcps_sndprobe; /* window probes sent */ uint64_t tcps_sndurg; /* packets sent with URG only */ uint64_t tcps_sndwinup; /* window update-only packets sent */ uint64_t tcps_sndctrl; /* control (SYN|FIN|RST) packets sent */ uint64_t tcps_rcvtotal; /* total packets received */ uint64_t tcps_rcvpack; /* packets received in sequence */ uint64_t tcps_rcvbyte; /* bytes received in sequence */ uint64_t tcps_rcvbadsum; /* packets received with ccksum errs */ uint64_t tcps_rcvbadoff; /* packets received with bad offset */ uint64_t tcps_rcvreassfull; /* packets dropped for no reass space */ uint64_t tcps_rcvshort; /* packets received too short */ uint64_t tcps_rcvduppack; /* duplicate-only packets received */ uint64_t tcps_rcvdupbyte; /* duplicate-only bytes received */ uint64_t tcps_rcvpartduppack; /* packets with some duplicate data */ uint64_t tcps_rcvpartdupbyte; /* dup. bytes in part-dup. 
packets */ uint64_t tcps_rcvoopack; /* out-of-order packets received */ uint64_t tcps_rcvoobyte; /* out-of-order bytes received */ uint64_t tcps_rcvpackafterwin; /* packets with data after window */ uint64_t tcps_rcvbyteafterwin; /* bytes rcvd after window */ uint64_t tcps_rcvafterclose; /* packets rcvd after "close" */ uint64_t tcps_rcvwinprobe; /* rcvd window probe packets */ uint64_t tcps_rcvdupack; /* rcvd duplicate acks */ uint64_t tcps_rcvacktoomuch; /* rcvd acks for unsent data */ uint64_t tcps_rcvackpack; /* rcvd ack packets */ uint64_t tcps_rcvackbyte; /* bytes acked by rcvd acks */ uint64_t tcps_rcvwinupd; /* rcvd window update packets */ uint64_t tcps_pawsdrop; /* segments dropped due to PAWS */ uint64_t tcps_predack; /* times hdr predict ok for acks */ uint64_t tcps_preddat; /* times hdr predict ok for data pkts */ uint64_t tcps_pcbcachemiss; uint64_t tcps_cachedrtt; /* times cached RTT in route updated */ uint64_t tcps_cachedrttvar; /* times cached rttvar updated */ uint64_t tcps_cachedssthresh; /* times cached ssthresh updated */ uint64_t tcps_usedrtt; /* times RTT initialized from route */ uint64_t tcps_usedrttvar; /* times RTTVAR initialized from rt */ uint64_t tcps_usedssthresh; /* times ssthresh initialized from rt*/ uint64_t tcps_persistdrop; /* timeout in persist state */ uint64_t tcps_badsyn; /* bogus SYN, e.g. 
premature ACK */ uint64_t tcps_mturesent; /* resends due to MTU discovery */ uint64_t tcps_listendrop; /* listen queue overflows */ uint64_t tcps_badrst; /* ignored RSTs in the window */ uint64_t tcps_sc_added; /* entry added to syncache */ uint64_t tcps_sc_retransmitted; /* syncache entry was retransmitted */ uint64_t tcps_sc_dupsyn; /* duplicate SYN packet */ uint64_t tcps_sc_dropped; /* could not reply to packet */ uint64_t tcps_sc_completed; /* successful extraction of entry */ uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */ uint64_t tcps_sc_cacheoverflow; /* syncache cache limit hit */ uint64_t tcps_sc_reset; /* RST removed entry from syncache */ uint64_t tcps_sc_stale; /* timed out or listen socket gone */ uint64_t tcps_sc_aborted; /* syncache entry aborted */ uint64_t tcps_sc_badack; /* removed due to bad ACK */ uint64_t tcps_sc_unreach; /* ICMP unreachable received */ uint64_t tcps_sc_zonefail; /* zalloc() failed */ uint64_t tcps_sc_sendcookie; /* SYN cookie sent */ uint64_t tcps_sc_recvcookie; /* SYN cookie received */ uint64_t tcps_hc_added; /* entry added to hostcache */ uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */ uint64_t tcps_finwait2_drops; /* Drop FIN_WAIT_2 connection after time limit */ /* SACK related stats */ uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */ uint64_t tcps_sack_rexmits; /* SACK rexmit segments */ uint64_t tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ uint64_t tcps_sack_rcv_blocks; /* SACK blocks (options) received */ uint64_t tcps_sack_send_blocks; /* SACK blocks (options) sent */ uint64_t tcps_sack_sboverflow; /* times scoreboard overflowed */ /* ECN related stats */ uint64_t tcps_ecn_ce; /* ECN Congestion Experienced */ uint64_t tcps_ecn_ect0; /* ECN Capable Transport */ uint64_t tcps_ecn_ect1; /* ECN Capable Transport */ uint64_t tcps_ecn_shs; /* ECN successful handshakes */ uint64_t tcps_ecn_rcwnd; /* # times ECN reduced the cwnd */ /* TCP_SIGNATURE related stats 
*/ uint64_t tcps_sig_rcvgoodsig; /* Total matching signature received */ uint64_t tcps_sig_rcvbadsig; /* Total bad signature received */ uint64_t tcps_sig_err_buildsig; /* Failed to make signature */ uint64_t tcps_sig_err_sigopt; /* No signature expected by socket */ uint64_t tcps_sig_err_nosigopt; /* No signature provided by segment */ /* Path MTU Discovery Black Hole Detection related stats */ uint64_t tcps_pmtud_blackhole_activated; /* Black Hole Count */ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ uint64_t _pad[12]; /* 6 UTO, 6 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ #ifdef _KERNEL #define TI_UNLOCKED 1 #define TI_RLOCKED 2 #include VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat); /* tcp statistics */ /* * In-kernel consumers can use these accessor macros directly to update * stats. */ #define TCPSTAT_ADD(name, val) \ VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val)) #define TCPSTAT_INC(name) TCPSTAT_ADD(name, 1) /* * Kernel module consumers must use this accessor macro. */ void kmod_tcpstat_inc(int statnum); #define KMOD_TCPSTAT_INC(name) \ kmod_tcpstat_inc(offsetof(struct tcpstat, name) / sizeof(uint64_t)) /* * Running TCP connection count by state. */ VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]); #define V_tcps_states VNET(tcps_states) #define TCPSTATES_INC(state) counter_u64_add(V_tcps_states[state], 1) #define TCPSTATES_DEC(state) counter_u64_add(V_tcps_states[state], -1) /* * TCP specific helper hook point identifiers. 
*/
#define	HHOOK_TCP_EST_IN		0
#define	HHOOK_TCP_EST_OUT		1
#define	HHOOK_TCP_LAST			HHOOK_TCP_EST_OUT

/*
 * Bundle of per-segment values; the members mirror the parameters of
 * the hhook_run_tcp_est_*() functions, plus curack.
 */
struct tcp_hhook_data {
	struct tcpcb	*tp;
	struct tcphdr	*th;
	struct tcpopt	*to;
	uint32_t	len;
	int		tso;
	tcp_seq		curack;
};
#ifdef TCP_HHOOK
void hhook_run_tcp_est_out(struct tcpcb *tp,
	struct tcphdr *th, struct tcpopt *to,
	uint32_t len, int tso);
#endif
#endif

/*
 * TCB structure exported to user-land via sysctl(3).
 *
 * Fields prefixed with "xt_" are unique to the export structure, and fields
 * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'.
 *
 * Legend:
 * (s) - used by userland utilities in src
 * (p) - used by utilities in ports
 * (3) - is known to be used by third party software not in ports
 * (n) - no known usage
 *
 * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
 * included.  Not all of our clients do.
 */
#if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
struct xtcpcb {
	ksize_t	xt_len;		/* length of this structure */
	struct xinpcb	xt_inp;
	char		xt_stack[TCP_FUNCTION_NAME_LEN_MAX];	/* (s) */
	char		xt_logid[TCP_LOG_ID_LEN];	/* (s) */
	int64_t		spare64[8];
	int32_t		t_state;		/* (s,p) */
	uint32_t	t_flags;		/* (s,p) */
	int32_t		t_sndzerowin;		/* (s) */
	int32_t		t_sndrexmitpack;	/* (s) */
	int32_t		t_rcvoopack;		/* (s) */
	int32_t		t_rcvtime;		/* (s) */
	int32_t		tt_rexmt;		/* (s) */
	int32_t		tt_persist;		/* (s) */
	int32_t		tt_keep;		/* (s) */
	int32_t		tt_2msl;		/* (s) */
	int32_t		tt_delack;		/* (s) */
	int32_t		t_logstate;		/* (3) */
	int32_t		spare32[32];
} __aligned(8);

#ifdef _KERNEL
/* Takes a const inpcb and a writable xtcpcb. */
void	tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
#endif
#endif

/*
 * TCP function information (name-to-id mapping, aliases, and refcnt)
 * exported to user-land via sysctl(3).
*/
struct tcp_function_info {
	uint32_t	tfi_refcnt;
	uint8_t		tfi_id;
	char		tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
	char		tfi_alias[TCP_FUNCTION_NAME_LEN_MAX];
};

/*
 * Identifiers for TCP sysctl nodes
 * (the value 2 is not assigned in this list).
 */
#define	TCPCTL_DO_RFC1323	1	/* use RFC-1323 extensions */
#define	TCPCTL_MSSDFLT		3	/* MSS default */
#define TCPCTL_STATS		4	/* statistics */
#define	TCPCTL_RTTDFLT		5	/* default RTT estimate */
#define	TCPCTL_KEEPIDLE		6	/* keepalive idle timer */
#define	TCPCTL_KEEPINTVL	7	/* interval to send keepalives */
#define	TCPCTL_SENDSPACE	8	/* send buffer space */
#define	TCPCTL_RECVSPACE	9	/* receive buffer space */
#define	TCPCTL_KEEPINIT		10	/* timeout for establishing syn */
#define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
#define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
#define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
#define	TCPCTL_SACK		14	/* Selective Acknowledgement,rfc 2018 */
#define	TCPCTL_DROP		15	/* drop tcp connection */
#define	TCPCTL_STATES		16	/* connection counts by TCP state */

#ifdef _KERNEL
/* Emitted only when SYSCTL_DECL is already defined by an earlier include. */
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_tcp);
SYSCTL_DECL(_net_inet_tcp_sack);
MALLOC_DECLARE(M_TCPLOG);
#endif

extern	int tcp_log_in_vain;

/*
 * Global TCP tunables shared between different stacks.
 * Please keep the list sorted.
*/
VNET_DECLARE(int, drop_synfin);
VNET_DECLARE(int, path_mtu_discovery);
VNET_DECLARE(int, tcp_abc_l_var);
VNET_DECLARE(int, tcp_autorcvbuf_max);
VNET_DECLARE(int, tcp_autosndbuf_inc);
VNET_DECLARE(int, tcp_autosndbuf_max);
VNET_DECLARE(int, tcp_delack_enabled);
VNET_DECLARE(int, tcp_do_autorcvbuf);
VNET_DECLARE(int, tcp_do_autosndbuf);
VNET_DECLARE(int, tcp_do_ecn);
VNET_DECLARE(int, tcp_do_newcwv);
VNET_DECLARE(int, tcp_do_rfc1323);
VNET_DECLARE(int, tcp_do_rfc3042);
VNET_DECLARE(int, tcp_do_rfc3390);
VNET_DECLARE(int, tcp_do_rfc3465);
VNET_DECLARE(int, tcp_do_rfc6675_pipe);
VNET_DECLARE(int, tcp_do_sack);
VNET_DECLARE(int, tcp_do_tso);
VNET_DECLARE(int, tcp_ecn_maxretries);
VNET_DECLARE(int, tcp_initcwnd_segments);
VNET_DECLARE(int, tcp_insecure_rst);
VNET_DECLARE(int, tcp_insecure_syn);
VNET_DECLARE(uint32_t, tcp_map_entries_limit);
VNET_DECLARE(uint32_t, tcp_map_split_limit);
VNET_DECLARE(int, tcp_minmss);
VNET_DECLARE(int, tcp_mssdflt);
#ifdef STATS
VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
VNET_DECLARE(int, tcp_perconn_stats_enable);
#endif /* STATS */
VNET_DECLARE(int, tcp_recvspace);
VNET_DECLARE(int, tcp_sack_globalholes);
VNET_DECLARE(int, tcp_sack_globalmaxholes);
VNET_DECLARE(int, tcp_sack_maxholes);
VNET_DECLARE(int, tcp_sc_rst_sock_fail);
VNET_DECLARE(int, tcp_sendspace);
VNET_DECLARE(struct inpcbhead, tcb);
VNET_DECLARE(struct inpcbinfo, tcbinfo);

/*
 * V_ accessors for the per-VNET variables.
 * NOTE(review): tcp_ts_offset_per_conn, tcp_udp_tunneling_overhead and
 * tcp_udp_tunneling_port have accessors here but no VNET_DECLARE in the
 * list above; confirm they are declared elsewhere.
 */
#define	V_tcp_do_newcwv			VNET(tcp_do_newcwv)
#define	V_drop_synfin			VNET(drop_synfin)
#define	V_path_mtu_discovery		VNET(path_mtu_discovery)
#define	V_tcb				VNET(tcb)
#define	V_tcbinfo			VNET(tcbinfo)
#define	V_tcp_abc_l_var			VNET(tcp_abc_l_var)
#define	V_tcp_autorcvbuf_max		VNET(tcp_autorcvbuf_max)
#define	V_tcp_autosndbuf_inc		VNET(tcp_autosndbuf_inc)
#define	V_tcp_autosndbuf_max		VNET(tcp_autosndbuf_max)
#define	V_tcp_delack_enabled		VNET(tcp_delack_enabled)
#define	V_tcp_do_autorcvbuf		VNET(tcp_do_autorcvbuf)
#define	V_tcp_do_autosndbuf		VNET(tcp_do_autosndbuf)
#define	V_tcp_do_ecn			VNET(tcp_do_ecn)
#define	V_tcp_do_rfc1323		VNET(tcp_do_rfc1323)
#define	V_tcp_ts_offset_per_conn	VNET(tcp_ts_offset_per_conn)
#define	V_tcp_do_rfc3042		VNET(tcp_do_rfc3042)
#define	V_tcp_do_rfc3390		VNET(tcp_do_rfc3390)
#define	V_tcp_do_rfc3465		VNET(tcp_do_rfc3465)
#define	V_tcp_do_rfc6675_pipe		VNET(tcp_do_rfc6675_pipe)
#define	V_tcp_do_sack			VNET(tcp_do_sack)
#define	V_tcp_do_tso			VNET(tcp_do_tso)
#define	V_tcp_ecn_maxretries		VNET(tcp_ecn_maxretries)
#define	V_tcp_initcwnd_segments		VNET(tcp_initcwnd_segments)
#define	V_tcp_insecure_rst		VNET(tcp_insecure_rst)
#define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)
#define	V_tcp_map_entries_limit		VNET(tcp_map_entries_limit)
#define	V_tcp_map_split_limit		VNET(tcp_map_split_limit)
#define	V_tcp_minmss			VNET(tcp_minmss)
#define	V_tcp_mssdflt			VNET(tcp_mssdflt)
#ifdef STATS
#define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
#define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
#endif /* STATS */
#define	V_tcp_recvspace			VNET(tcp_recvspace)
#define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
#define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
#define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)
#define	V_tcp_sc_rst_sock_fail		VNET(tcp_sc_rst_sock_fail)
#define	V_tcp_sendspace			VNET(tcp_sendspace)
#define	V_tcp_udp_tunneling_overhead	VNET(tcp_udp_tunneling_overhead)
#define	V_tcp_udp_tunneling_port	VNET(tcp_udp_tunneling_port)

/* One helper-hook head per hook point, indexed 0..HHOOK_TCP_LAST. */
#ifdef TCP_HHOOK
VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
#define	V_tcp_hhh		VNET(tcp_hhh)
#endif

int	 tcp_addoptions(struct tcpopt *, u_char *);
int	 tcp_ccalgounload(struct cc_algo *unload_algo);
struct tcpcb *
	 tcp_close(struct tcpcb *);
void	 tcp_discardcb(struct tcpcb *);
void	 tcp_twstart(struct tcpcb *);
void	 tcp_twclose(struct tcptw *, int);
void	 tcp_ctlinput(int, struct sockaddr *, void *);
int	 tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *
	 tcp_drop(struct tcpcb *, int);
void	 tcp_drain(void);
void	 tcp_init(void);
void	 tcp_fini(void *);
char	*tcp_log_addrs(struct in_conninfo *, struct tcphdr *, void *,
	    const void *);
char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, void *,
	    const void *);
int	 tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
	    struct mbuf *);
void	 tcp_reass_global_init(void);
void	 tcp_reass_flush(struct tcpcb *);
void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
void	tcp_dropwithreset(struct mbuf *, struct tcphdr *,
		     struct tcpcb *, int, int);
void	tcp_pulloutofband(struct socket *,
		     struct tcphdr *, struct mbuf *, int);
void	tcp_xmit_timer(struct tcpcb *, int);
void	tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
void	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
			    uint16_t nsegs, uint16_t type);
void 	cc_conn_init(struct tcpcb *tp);
void 	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
void	cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
#ifdef TCP_HHOOK
void	hhook_run_tcp_est_in(struct tcpcb *tp,
			    struct tcphdr *th, struct tcpopt *to);
#endif

int	 tcp_input(struct mbuf **, int *, int);
int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
	    struct tcpcb *, int);
void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
			struct socket *, struct tcpcb *, int, int, uint8_t);

/* Registration, lookup and removal of tcp_function_block instances. */
int register_tcp_functions(struct tcp_function_block *blk, int wait);
int register_tcp_functions_as_names(struct tcp_function_block *blk,
    int wait, const char *names[], int *num_names);
int register_tcp_functions_as_name(struct tcp_function_block *blk,
    const char *name, int wait);
int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
    bool force);
struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
void tcp_switch_back_to_default(struct tcpcb *tp);
struct tcp_function_block *
find_and_ref_tcp_fb(struct tcp_function_block *fs);
int tcp_default_ctloutput(struct socket *so, struct sockopt *sopt,
    struct inpcb *inp, struct tcpcb *tp);

extern counter_u64_t tcp_inp_lro_direct_queue;
extern counter_u64_t tcp_inp_lro_wokeup_queue;
extern counter_u64_t tcp_inp_lro_compressed;
extern counter_u64_t tcp_inp_lro_single_push;
extern counter_u64_t tcp_inp_lro_locks_taken;
extern counter_u64_t tcp_inp_lro_sack_wake;

#ifdef NETFLIX_EXP_DETECTION
/* Various SACK attack thresholds */
extern int32_t tcp_force_detection;
extern int32_t tcp_sack_to_ack_thresh;
extern int32_t tcp_sack_to_move_thresh;
extern int32_t tcp_restoral_thresh;
extern int32_t tcp_sad_decay_val;
extern int32_t tcp_sad_pacing_interval;
extern int32_t tcp_sad_low_pps;
extern int32_t tcp_map_minimum;
extern int32_t tcp_attack_on_turns_on_logging;
#endif

uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
u_int	 tcp_maxseg(const struct tcpcb *);
void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
	    struct tcp_ifcap *);
void	 tcp_mss(struct tcpcb *, int);
int	 tcp_mssopt(struct in_conninfo *);
struct inpcb *
	 tcp_drop_syn_sent(struct inpcb *, int);
struct tcpcb *
	 tcp_newtcpcb(struct inpcb *);
int	 tcp_output(struct tcpcb *);
void	 tcp_state_change(struct tcpcb *, int);
void	 tcp_respond(struct tcpcb *, void *,
	    struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
void	 tcp_tw_init(void);
#ifdef VIMAGE
void	 tcp_tw_destroy(void);
#endif
void	 tcp_tw_zone_change(void);
int	 tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
	    struct mbuf *, int);
void	 tcp_setpersist(struct tcpcb *);
void	 tcp_slowtimo(void);
struct tcptemp *
	 tcpip_maketemplate(struct inpcb *);
void	 tcpip_fillheaders(struct inpcb *, void *, void *);
void	 tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
int	 tcp_timer_suspend(struct tcpcb *, uint32_t);
void	 tcp_timers_unsuspend(struct tcpcb *, uint32_t);
int	 tcp_timer_active(struct tcpcb *, uint32_t);
void	 tcp_timer_stop(struct tcpcb *, uint32_t);
void	 tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
int	 inp_to_cpuid(struct inpcb *inp);
/*
 * All tcp_hc_* functions are IPv4 and IPv6
(via in_conninfo) */ void tcp_hc_init(void); #ifdef VIMAGE void tcp_hc_destroy(void); #endif void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); uint32_t tcp_hc_getmtu(struct in_conninfo *); void tcp_hc_updatemtu(struct in_conninfo *, uint32_t); void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); extern struct pr_usrreqs tcp_usrreqs; uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp); int tcp_newreno(struct tcpcb *, struct tcphdr *); int tcp_compute_pipe(struct tcpcb *); uint32_t tcp_compute_initwnd(uint32_t); void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls); int tcp_stats_init(void); static inline void tcp_fields_to_host(struct tcphdr *th) { th->th_seq = ntohl(th->th_seq); th->th_ack = ntohl(th->th_ack); th->th_win = ntohs(th->th_win); th->th_urp = ntohs(th->th_urp); } static inline void tcp_fields_to_net(struct tcphdr *th) { th->th_seq = htonl(th->th_seq); th->th_ack = htonl(th->th_ack); th->th_win = htons(th->th_win); th->th_urp = htons(th->th_urp); } #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */