Index: head/etc/mtree/BSD.include.dist =================================================================== --- head/etc/mtree/BSD.include.dist +++ head/etc/mtree/BSD.include.dist @@ -158,6 +158,8 @@ .. speaker .. + tcp_log + .. usb .. vkbd Index: head/include/Makefile =================================================================== --- head/include/Makefile +++ head/include/Makefile @@ -47,7 +47,7 @@ dev/hwpmc dev/hyperv \ dev/ic dev/iicbus dev/io dev/lmc dev/mfi dev/mmc dev/nvme \ dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \ - dev/speaker dev/vkbd dev/wi \ + dev/speaker dev/tcp_log dev/vkbd dev/wi \ fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \ fs/procfs fs/smbfs fs/udf fs/unionfs \ geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \ Index: head/sys/conf/files =================================================================== --- head/sys/conf/files +++ head/sys/conf/files @@ -3161,6 +3161,7 @@ dev/syscons/syscons.c optional sc dev/syscons/sysmouse.c optional sc dev/syscons/warp/warp_saver.c optional warp_saver +dev/tcp_log/tcp_log_dev.c optional inet | inet6 dev/tdfx/tdfx_linux.c optional tdfx_linux tdfx compat_linux dev/tdfx/tdfx_pci.c optional tdfx pci dev/ti/if_ti.c optional ti pci @@ -4309,6 +4310,7 @@ netinet/tcp_fastopen.c optional inet tcp_rfc7413 | inet6 tcp_rfc7413 netinet/tcp_hostcache.c optional inet | inet6 netinet/tcp_input.c optional inet | inet6 +netinet/tcp_log_buf.c optional inet | inet6 netinet/tcp_lro.c optional inet | inet6 netinet/tcp_output.c optional inet | inet6 netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6 Index: head/sys/dev/tcp_log/tcp_log_dev.h =================================================================== --- head/sys/dev/tcp_log/tcp_log_dev.h +++ head/sys/dev/tcp_log/tcp_log_dev.h @@ -0,0 +1,88 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __tcp_log_dev_h__ +#define __tcp_log_dev_h__ + +/* + * This is the common header for data streamed from the log device. All + * blocks of data need to start with this header. 
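 *
 * As an illustrative sketch (not part of this change), a consumer reading
 * from the device could walk a buffer of records using only this header,
 * since tlch_length gives the total size of each record; handle_bbr_records()
 * below is a hypothetical consumer routine:
 *
 *	char *cur = buf, *end = buf + nread;
 *	struct tcp_log_common_header *hdr;
 *
 *	while (cur + sizeof(*hdr) <= end) {
 *		hdr = (struct tcp_log_common_header *)cur;
 *		if (hdr->tlch_type == TCP_LOG_DEV_TYPE_BBR)
 *			handle_bbr_records(hdr);
 *		cur += hdr->tlch_length;
 *	}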
+ */ +struct tcp_log_common_header { + uint32_t tlch_version; /* Version is specific to type. */ + uint32_t tlch_type; /* Type of entry(ies) that follow. */ + uint64_t tlch_length; /* Total length, including header. */ +} __packed; + +#define TCP_LOG_DEV_TYPE_BBR 1 /* black box recorder */ + +#ifdef _KERNEL +/* + * This is a queue entry. All queue entries need to start with this structure + * so the common code can cast them to this structure; however, other modules + * are free to include additional data after this structure. + * + * The elements are explained here: + * tldq_queue: used by the common code to maintain this entry's position in the + * queue. + * tldq_buf: should be NULL, or a pointer to a chunk of data. The data must be + * as long as the common header indicates. + * tldq_xform: If tldq_buf is NULL, the code will call this to create the + * the tldq_buf object. The function should *not* directly modify tldq_buf, + * but should return the buffer (which must meet the restrictions + * indicated for tldq_buf). + * tldq_dtor: This function is called to free the queue entry. If tldq_buf is + * not NULL, the dtor function must free that, too. + * tldq_refcnt: used by the common code to indicate how many readers still need + * this data. + */ +struct tcp_log_dev_queue { + STAILQ_ENTRY(tcp_log_dev_queue) tldq_queue; + struct tcp_log_common_header *tldq_buf; + struct tcp_log_common_header *(*tldq_xform)(struct tcp_log_dev_queue *entry); + void (*tldq_dtor)(struct tcp_log_dev_queue *entry); + volatile u_int tldq_refcnt; +}; + +STAILQ_HEAD(log_queueh, tcp_log_dev_queue); + +struct tcp_log_dev_info { + STAILQ_ENTRY(tcp_log_dev_info) tldi_list; + struct tcp_log_dev_queue *tldi_head; + struct tcp_log_common_header *tldi_cur; + off_t tldi_off; +}; +STAILQ_HEAD(log_infoh, tcp_log_dev_info); + + +MALLOC_DECLARE(M_TCPLOGDEV); +int tcp_log_dev_add_log(struct tcp_log_dev_queue *entry); +#endif /* _KERNEL */ +#endif /* !__tcp_log_dev_h__ */ Index: head/sys/dev/tcp_log/tcp_log_dev.c =================================================================== --- head/sys/dev/tcp_log/tcp_log_dev.c +++ head/sys/dev/tcp_log/tcp_log_dev.c @@ -0,0 +1,521 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016-2017 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef TCPLOG_DEBUG_COUNTERS +extern counter_u64_t tcp_log_que_read; +extern counter_u64_t tcp_log_que_freed; +#endif + +static struct cdev *tcp_log_dev; +static struct selinfo tcp_log_sel; + +static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head); +static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head); + +MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures"); + +static int tcp_log_dev_listeners = 0; + +static struct mtx tcp_log_dev_queue_lock; + +#define TCP_LOG_DEV_QUEUE_LOCK() mtx_lock(&tcp_log_dev_queue_lock) +#define TCP_LOG_DEV_QUEUE_UNLOCK() mtx_unlock(&tcp_log_dev_queue_lock) +#define TCP_LOG_DEV_QUEUE_LOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED) +#define TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED) +#define TCP_LOG_DEV_QUEUE_REF(tldq) refcount_acquire(&((tldq)->tldq_refcnt)) +#define TCP_LOG_DEV_QUEUE_UNREF(tldq) refcount_release(&((tldq)->tldq_refcnt)) + +static void tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry); +static void tcp_log_dev_clear_cdevpriv(void *data); +static int tcp_log_dev_open(struct cdev *dev __unused, int flags, + int devtype __unused, struct thread *td __unused); +static int tcp_log_dev_write(struct cdev *dev __unused, + struct uio *uio __unused, int flags __unused); +static int tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, + int flags __unused); +static int tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, + caddr_t data, int fflag __unused, struct thread *td __unused); +static int tcp_log_dev_poll(struct cdev *dev __unused, int events, + struct thread *td); + + +enum tcp_log_dev_queue_lock_state { + QUEUE_UNLOCKED = 0, + QUEUE_LOCKED, +}; + +static struct cdevsw tcp_log_cdevsw = { + .d_version = D_VERSION, + .d_read = tcp_log_dev_read, + .d_open = tcp_log_dev_open, + .d_write = tcp_log_dev_write, + .d_poll = tcp_log_dev_poll, + .d_ioctl = tcp_log_dev_ioctl, +#ifdef NOTYET + .d_mmap = tcp_log_dev_mmap, +#endif + .d_name = "tcp_log", +}; + +static __inline void +tcp_log_dev_queue_validate_lock(int lockstate) +{ + +#ifdef INVARIANTS + switch (lockstate) { + case QUEUE_LOCKED: + TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); + break; + case QUEUE_UNLOCKED: + TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT(); + break; + default: + kassert_panic("%s:%d: unknown queue lock state", __func__, + __LINE__); + } +#endif +} + +/* + * Clear the refcount. If appropriate, it will remove the entry from the + * queue and call the destructor. + * + * This must be called with the queue lock held. 
+ */ +static void +tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry) +{ + + KASSERT(entry != NULL, ("%s: called with NULL entry", __func__)); + + TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); + + if (TCP_LOG_DEV_QUEUE_UNREF(entry)) { +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_freed, 1); +#endif + /* Remove the entry from the queue and call the destructor. */ + STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue, + tldq_queue); + (*entry->tldq_dtor)(entry); + } +} + +static void +tcp_log_dev_clear_cdevpriv(void *data) +{ + struct tcp_log_dev_info *priv; + struct tcp_log_dev_queue *entry, *entry_tmp; + + priv = (struct tcp_log_dev_info *)data; + if (priv == NULL) + return; + + /* + * Lock the queue and drop our references. We hold references to all + * the entries starting with tldi_head (or, if tldi_head == NULL, all + * entries in the queue). + * + * Because we don't want anyone adding addition things to the queue + * while we are doing this, we lock the queue. + */ + TCP_LOG_DEV_QUEUE_LOCK(); + if (priv->tldi_head != NULL) { + entry = priv->tldi_head; + STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head, + tldq_queue, entry_tmp) { + tcp_log_dev_clear_refcount(entry); + } + } + tcp_log_dev_listeners--; + KASSERT(tcp_log_dev_listeners >= 0, + ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__)); + STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info, + tldi_list); + TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); + TCP_LOG_DEV_QUEUE_UNLOCK(); + free(priv, M_TCPLOGDEV); +} + +static int +tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused, + struct thread *td __unused) +{ + struct tcp_log_dev_info *priv; + struct tcp_log_dev_queue *entry; + int rv; + + /* + * Ideally, we shouldn't see these because of file system + * permissions. + */ + if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC)) + return (ENODEV); + + /* Allocate space to hold information about where we are. */ + priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV, + M_ZERO | M_WAITOK); + + /* Stash the private data away. */ + rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv); + if (!rv) { + /* + * Increase the listener count, add this reader to the list, and + * take references on all current queues. + */ + TCP_LOG_DEV_QUEUE_LOCK(); + tcp_log_dev_listeners++; + STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list); + priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head); + if (priv->tldi_head != NULL) + priv->tldi_cur = priv->tldi_head->tldq_buf; + STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue) + TCP_LOG_DEV_QUEUE_REF(entry); + TCP_LOG_DEV_QUEUE_UNLOCK(); + } else { + /* Free the entry. 
*/ + free(priv, M_TCPLOGDEV); + } + return (rv); +} + +static int +tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused, + int flags __unused) +{ + + return (ENODEV); +} + +static __inline void +tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate) +{ + struct tcp_log_dev_queue *entry; + + KASSERT(priv->tldi_head != NULL, + ("%s:%d: priv->tldi_head unexpectedly NULL", + __func__, __LINE__)); + KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur, + ("%s:%d: buffer mismatch (%p vs %p)", + __func__, __LINE__, priv->tldi_head->tldq_buf, + priv->tldi_cur)); + tcp_log_dev_queue_validate_lock(*lockstate); + + if (*lockstate == QUEUE_UNLOCKED) { + TCP_LOG_DEV_QUEUE_LOCK(); + *lockstate = QUEUE_LOCKED; + } + entry = priv->tldi_head; + priv->tldi_head = STAILQ_NEXT(entry, tldq_queue); + tcp_log_dev_clear_refcount(entry); + priv->tldi_cur = NULL; +} + +static int +tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags) +{ + struct tcp_log_common_header *buf; + struct tcp_log_dev_info *priv; + struct tcp_log_dev_queue *entry; + ssize_t len; + int lockstate, rv; + + /* Get our private info. */ + rv = devfs_get_cdevpriv((void **)&priv); + if (rv) + return (rv); + + lockstate = QUEUE_UNLOCKED; + + /* Do we need to get a new buffer? */ + while (priv->tldi_cur == NULL || + priv->tldi_cur->tlch_length <= priv->tldi_off) { + /* Did we somehow forget to rotate? */ + KASSERT(priv->tldi_cur == NULL, + ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__, + __LINE__)); + if (priv->tldi_cur != NULL) + tcp_log_dev_rotate_bufs(priv, &lockstate); + + /* + * Before we start looking at tldi_head, we need a lock on the + * queue to make sure tldi_head stays stable. + */ + if (lockstate == QUEUE_UNLOCKED) { + TCP_LOG_DEV_QUEUE_LOCK(); + lockstate = QUEUE_LOCKED; + } + + /* We need the next buffer. Do we have one? */ + if (priv->tldi_head == NULL && (flags & FNONBLOCK)) { + rv = EAGAIN; + goto done; + } + if (priv->tldi_head == NULL) { + /* Sleep and wait for more things we can read. */ + rv = mtx_sleep(&tcp_log_dev_listeners, + &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0); + if (rv) + goto done; + if (priv->tldi_head == NULL) + continue; + } + + /* + * We have an entry to read. We want to try to create a + * buffer, if one doesn't already exist. + */ + entry = priv->tldi_head; + if (entry->tldq_buf == NULL) { + TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); + buf = (*entry->tldq_xform)(entry); + if (buf == NULL) { + rv = EBUSY; + goto done; + } + entry->tldq_buf = buf; + } + + priv->tldi_cur = entry->tldq_buf; + priv->tldi_off = 0; + } + + /* Copy what we can from this buffer to the output buffer. */ + if (uio->uio_resid > 0) { + /* Drop locks so we can take page faults. */ + if (lockstate == QUEUE_LOCKED) + TCP_LOG_DEV_QUEUE_UNLOCK(); + lockstate = QUEUE_UNLOCKED; + + KASSERT(priv->tldi_cur != NULL, + ("%s: priv->tldi_cur is unexpectedly NULL", __func__)); + + /* Copy as much as we can to this uio. */ + len = priv->tldi_cur->tlch_length - priv->tldi_off; + if (len > uio->uio_resid) + len = uio->uio_resid; + rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off, + len, uio); + if (rv != 0) + goto done; + priv->tldi_off += len; +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_read, len); +#endif + } + /* Are we done with this buffer? If so, find the next one. 
*/ + if (priv->tldi_off >= priv->tldi_cur->tlch_length) { + KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length, + ("%s: offset (%ju) exceeds length (%ju)", __func__, + (uintmax_t)priv->tldi_off, + (uintmax_t)priv->tldi_cur->tlch_length)); + tcp_log_dev_rotate_bufs(priv, &lockstate); + } +done: + tcp_log_dev_queue_validate_lock(lockstate); + if (lockstate == QUEUE_LOCKED) + TCP_LOG_DEV_QUEUE_UNLOCK(); + return (rv); +} + +static int +tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data, + int fflag __unused, struct thread *td __unused) +{ + struct tcp_log_dev_info *priv; + int rv; + + /* Get our private info. */ + rv = devfs_get_cdevpriv((void **)&priv); + if (rv) + return (rv); + + /* + * Set things. Here, we are most concerned about the non-blocking I/O + * flag. + */ + rv = 0; + switch (cmd) { + case FIONBIO: + break; + case FIOASYNC: + if (*(int *)data != 0) + rv = EINVAL; + break; + default: + rv = ENOIOCTL; + } + return (rv); +} + +static int +tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td) +{ + struct tcp_log_dev_info *priv; + int revents; + + /* + * Get our private info. If this fails, claim that all events are + * ready. That should prod the user to do something that will + * make the error evident to them. + */ + if (devfs_get_cdevpriv((void **)&priv)) + return (events); + + revents = 0; + if (events & (POLLIN | POLLRDNORM)) { + /* + * We can (probably) read right now if we are partway through + * a buffer or if we are just about to start a buffer. + * Because we are going to read tldi_head, we should acquire + * a read lock on the queue. + */ + TCP_LOG_DEV_QUEUE_LOCK(); + if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) || + (priv->tldi_cur != NULL && + priv->tldi_off < priv->tldi_cur->tlch_length)) + revents = events & (POLLIN | POLLRDNORM); + else + selrecord(td, &tcp_log_sel); + TCP_LOG_DEV_QUEUE_UNLOCK(); + } else { + /* + * It only makes sense to poll for reading. So, again, prod the + * user to do something that will make the error of their ways + * apparent. + */ + revents = events; + } + return (revents); +} + +int +tcp_log_dev_add_log(struct tcp_log_dev_queue *entry) +{ + struct tcp_log_dev_info *priv; + int rv; + bool wakeup_needed; + + KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL, + ("%s: Called with both tldq_buf and tldq_xform set to NULL", + __func__)); + KASSERT(entry->tldq_dtor != NULL, + ("%s: Called with tldq_dtor set to NULL", __func__)); + + /* Get a lock on the queue. */ + TCP_LOG_DEV_QUEUE_LOCK(); + + /* If no one is listening, tell the caller to free the resources. */ + if (tcp_log_dev_listeners == 0) { + rv = ENXIO; + goto done; + } + + /* Add this to the end of the tailq. */ + STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue); + + /* Add references for all current listeners. */ + refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners); + + /* + * If any listener is currently stuck on NULL, that means they are + * waiting. Point their head to this new entry. + */ + wakeup_needed = false; + STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list) + if (priv->tldi_head == NULL) { + priv->tldi_head = entry; + wakeup_needed = true; + } + + if (wakeup_needed) { + selwakeup(&tcp_log_sel); + wakeup(&tcp_log_dev_listeners); + } + + rv = 0; + +done: + TCP_LOG_DEV_QUEUE_LOCK_ASSERT(); + TCP_LOG_DEV_QUEUE_UNLOCK(); + return (rv); +} + +static int +tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused) +{ + + /* TODO: Support intelligent unloading. 
*/ + switch (type) { + case MOD_LOAD: + if (bootverbose) + printf("tcp_log: tcp_log device\n"); + memset(&tcp_log_sel, 0, sizeof(tcp_log_sel)); + memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx)); + mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev", + "tcp_log device queues", MTX_DEF); + tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD, + &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400, + "tcp_log"); + break; + default: + return (EOPNOTSUPP); + } + + return (0); +} + +DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL); +MODULE_VERSION(tcp_log_dev, 1); Index: head/sys/kern/subr_witness.c =================================================================== --- head/sys/kern/subr_witness.c +++ head/sys/kern/subr_witness.c @@ -640,6 +640,14 @@ { "db->db_mtx", &lock_class_sx }, { NULL, NULL }, /* + * TCP log locks + */ + { "TCP ID tree", &lock_class_rw }, + { "tcp log id bucket", &lock_class_mtx_sleep }, + { "tcpinp", &lock_class_rw }, + { "TCP log expireq", &lock_class_mtx_sleep }, + { NULL, NULL }, + /* * spin locks */ #ifdef SMP Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h +++ head/sys/netinet/tcp.h @@ -168,6 +168,12 @@ #define TCP_NOOPT 8 /* don't use TCP options */ #define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */ #define TCP_INFO 32 /* retrieve tcp_info structure */ +#define TCP_LOG 34 /* configure event logging for connection */ +#define TCP_LOGBUF 35 /* retrieve event log for connection */ +#define TCP_LOGID 36 /* configure log ID to correlate connections */ +#define TCP_LOGDUMP 37 /* dump connection log events to device */ +#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to + device */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ @@ -188,6 +194,9 @@ #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 + +/* Maximum length of log ID. */ +#define TCP_LOG_ID_LEN 64 /* * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits Index: head/sys/netinet/tcp_input.c =================================================================== --- head/sys/netinet/tcp_input.c +++ head/sys/netinet/tcp_input.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -1592,6 +1593,8 @@ /* Save segment, if requested. */ tcp_pcap_add(th, m, &(tp->t_inpkts)); #endif + TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, + tlen, NULL, true); if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h +++ head/sys/netinet/tcp_log_buf.h @@ -0,0 +1,353 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016-2018 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __tcp_log_buf_h__ +#define __tcp_log_buf_h__ + +#define TCP_LOG_REASON_LEN 32 +#define TCP_LOG_BUF_VER (6) + +/* + * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires + * 8-byte alignment to work properly on all platforms. Therefore, we will + * enforce 8-byte alignment for all the structures that may appear by + * themselves (instead of being embedded in another structure) in a data + * stream. + */ +#define ALIGN_TCP_LOG __aligned(8) + +/* Information about the socketbuffer state. */ +struct tcp_log_sockbuf +{ + uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */ + uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */ + uint32_t tls_sb_spare; /* spare */ +}; + +/* Optional, verbose information that may be appended to an event log. */ +struct tcp_log_verbose +{ +#define TCP_FUNC_LEN 32 + char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */ + char tlv_trace_func[TCP_FUNC_LEN]; /* Function that + generated trace */ + uint32_t tlv_trace_line; /* Line number that generated trace */ + uint8_t _pad[4]; +} ALIGN_TCP_LOG; + +/* Internal RACK state variables. */ +struct tcp_log_rack +{ + uint32_t tlr_rack_rtt; /* rc_rack_rtt */ + uint8_t tlr_state; /* Internal RACK state */ + uint8_t _pad[3]; /* Padding */ +}; + +struct tcp_log_bbr { + uint64_t cur_del_rate; + uint64_t delRate; + uint64_t rttProp; + uint64_t bw_inuse; + uint32_t inflight; + uint32_t applimited; + uint32_t delivered; + uint32_t timeStamp; + uint32_t epoch; + uint32_t lt_epoch; + uint32_t pkts_out; + uint32_t flex1; + uint32_t flex2; + uint32_t flex3; + uint32_t flex4; + uint32_t flex5; + uint32_t flex6; + uint32_t lost; + uint16_t pacing_gain; + uint16_t cwnd_gain; + uint16_t flex7; + uint8_t bbr_state; + uint8_t bbr_substate; + uint8_t inpacer; + uint8_t ininput; + uint8_t use_lt_bw; + uint8_t flex8; + uint32_t pkt_epoch; +}; + +/* Per-stack stack-specific info. 
*/ +union tcp_log_stackspecific +{ + struct tcp_log_rack u_rack; + struct tcp_log_bbr u_bbr; +}; + +struct tcp_log_buffer +{ + /* Event basics */ + struct timeval tlb_tv; /* Timestamp of trace */ + uint32_t tlb_ticks; /* Timestamp of trace */ + uint32_t tlb_sn; /* Serial number */ + uint8_t tlb_stackid; /* Stack ID */ + uint8_t tlb_eventid; /* Event ID */ + uint16_t tlb_eventflags; /* Flags for the record */ +#define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */ +#define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */ +#define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */ +#define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */ +#define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */ + int tlb_errno; /* Event error (if any) */ + + /* Internal session state */ + struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */ + struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */ + + int tlb_state; /* TCPCB t_state */ + uint32_t tlb_starttime; /* TCPCB t_starttime */ + uint32_t tlb_iss; /* TCPCB iss */ + uint32_t tlb_flags; /* TCPCB flags */ + uint32_t tlb_snd_una; /* TCPCB snd_una */ + uint32_t tlb_snd_max; /* TCPCB snd_max */ + uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */ + uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */ + uint32_t tlb_snd_recover;/* TCPCB snd_recover */ + uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */ + uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */ + uint32_t tlb_srtt; /* TCPCB t_srtt */ + uint32_t tlb_rttvar; /* TCPCB t_rttvar */ + uint32_t tlb_rcv_up; /* TCPCB rcv_up */ + uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */ + uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */ + tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */ + uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */ + uint32_t tlb_dupacks; /* TCPCB t_dupacks */ + int tlb_segqlen; /* TCPCB segqlen */ + int tlb_snd_numholes; /* TCPCB snd_numholes */ + uint32_t tlb_flex1; /* Event specific information */ + uint32_t tlb_flex2; /* Event specific information */ + uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */ + tlb_rcv_scale:4; /* TCPCB rcv_scale */ + uint8_t _pad[3]; /* Padding */ + + /* Per-stack info */ + union tcp_log_stackspecific tlb_stackinfo; +#define tlb_rack tlb_stackinfo.u_rack + + /* The packet */ + uint32_t tlb_len; /* The packet's data length */ + struct tcphdr tlb_th; /* The TCP header */ + uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */ + + /* Verbose information (optional) */ + struct tcp_log_verbose tlb_verbose[0]; +} ALIGN_TCP_LOG; + +enum tcp_log_events { + TCP_LOG_IN = 1, /* Incoming packet 1 */ + TCP_LOG_OUT, /* Transmit (without other event) 2 */ + TCP_LOG_RTO, /* Retransmit timeout 3 */ + TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ + TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ + TCP_LOG_PRR, /* Doing PRR 6 */ + TCP_LOG_REORDER,/* Detected reorder 7 */ + TCP_LOG_PACER, /* Pacer sending a packet 8 */ + BBR_LOG_BBRUPD, /* We updated BBR info 9 */ + BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ + BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ + BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */ + BBR_LOG_TIMERSTAR, /* Start a timer 13 */ + BBR_LOG_TIMERCANC, /* Cancel a timer 14 */ + BBR_LOG_ENTREC, /* Entered recovery 15 */ + BBR_LOG_EXITREC, /* Exited recovery 16 */ + BBR_LOG_CWND, /* Cwnd change 17 */ + BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */ + BBR_LOG_MSGSIZE, /* We received a EMSGSIZE error 19 */ + BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */ + BBR_LOG_JUSTRET, /* We just returned out of output 21 */ + 
BBR_LOG_STATE, /* A BBR state change occured 22 */ + BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occured 23 */ + BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ + TCP_LOG_FLOWEND, /* End of a flow 25 */ + BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ + BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ + BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ + TCP_LOG_USERSEND, /* User level sends data 31 */ + UNUSED_32, /* Unused 32 */ + UNUSED_33, /* Unused 33 */ + BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ + BBR_LOG_TO_PROCESS, /* A to was processed 35 */ + BBR_LOG_BBRTSO, /* TSO update 36 */ + BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ + BBR_LOG_PROGRESS, /* Progress timer event 39 */ + TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ + BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ + BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ + BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ + BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ + BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ + TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ + BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ + TCP_LOG_END /* End (keep at end) 49 */ +}; + +enum tcp_log_states { + TCP_LOG_STATE_CLEAR = -1, /* Deactivate and clear tracing */ + TCP_LOG_STATE_OFF = 0, /* Pause */ + TCP_LOG_STATE_TAIL=1, /* Keep the trailing events */ + TCP_LOG_STATE_HEAD=2, /* Keep the leading events */ + TCP_LOG_STATE_HEAD_AUTO=3, /* Keep the leading events, and + automatically dump them to the + device */ + TCP_LOG_STATE_CONTINUAL=4, /* Continually dump the data when full */ + TCP_LOG_STATE_TAIL_AUTO=5, /* Keep the trailing events, and + automatically dump them when the + session ends */ +}; + +/* Use this if we don't know whether the operation succeeded. */ +#define ERRNO_UNK (-1) + +/* + * If the user included dev/tcp_log/tcp_log_dev.h, then include our private + * headers. Otherwise, there is no reason to pollute all the files with an + * additional include. + * + * This structure is aligned to an 8-byte boundary to match the alignment + * requirements of (struct tcp_log_buffer). + */ +#ifdef __tcp_log_dev_h__ +struct tcp_log_header { + struct tcp_log_common_header tlh_common; +#define tlh_version tlh_common.tlch_version +#define tlh_type tlh_common.tlch_type +#define tlh_length tlh_common.tlch_length + struct in_endpoints tlh_ie; + struct timeval tlh_offset; /* Uptime -> UTC offset */ + char tlh_id[TCP_LOG_ID_LEN]; + char tlh_reason[TCP_LOG_REASON_LEN]; + uint8_t tlh_af; + uint8_t _pad[7]; +} ALIGN_TCP_LOG; + +#ifdef _KERNEL +struct tcp_log_dev_log_queue { + struct tcp_log_dev_queue tldl_common; + char tldl_id[TCP_LOG_ID_LEN]; + char tldl_reason[TCP_LOG_REASON_LEN]; + struct in_endpoints tldl_ie; + struct tcp_log_stailq tldl_entries; + int tldl_count; + uint8_t tldl_af; +}; +#endif /* _KERNEL */ +#endif /* __tcp_log_dev_h__ */ + +#ifdef _KERNEL + +#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 +#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 + +/* + * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always + * tries to record verbose information. 
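 *
 * For reference, the tcp_input.c hunk in this change logs every received
 * segment through the non-verbose form, passing no stack-specific info and
 * indicating (via the final argument) that the header is already in host
 * byte order:
 *
 *	TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
 *	    tlen, NULL, true);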
+ */ +#define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ + do { \ + if (tp->t_logstate != TCP_LOG_STATE_OFF) \ + tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ + errornum, len, stackinfo, th_hostorder, \ + tp->t_output_caller, __func__, __LINE__, tv); \ + } while (0) + +/* + * TCP_LOG_EVENT: This is a macro so we can capture function/line + * information when needed. + * + * Prototype: + * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, + * struct sockbuf *txbuf, uint8_t eventid, int errornum, + * union tcp_log_stackspecific *stackinfo) + * + * tp is mandatory and must be write locked. + * th is optional; if present, it will appear in the record. + * rxbuf and txbuf are optional; if present, they will appear in the record. + * eventid is mandatory. + * errornum is mandatory (it indicates the success or failure of the + * operation associated with the event). + * len indicates the length of the packet. If no packet, use 0. + * stackinfo is optional; if present, it will appear in the record. + */ +#ifdef TCP_LOG_FORCEVERBOSE +#define TCP_LOG_EVENT TCP_LOG_EVENT_VERBOSE +#else +#define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \ + do { \ + if (tcp_log_verbose) \ + TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \ + eventid, errornum, len, stackinfo, \ + th_hostorder, NULL); \ + else if (tp->t_logstate != TCP_LOG_STATE_OFF) \ + tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ + errornum, len, stackinfo, th_hostorder, \ + NULL, NULL, 0, NULL); \ + } while (0) +#endif /* TCP_LOG_FORCEVERBOSE */ +#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \ + do { \ + if (tp->t_logstate != TCP_LOG_STATE_OFF) \ + tcp_log_event_(tp, th, rxbuf, txbuf, eventid, \ + errornum, len, stackinfo, th_hostorder, \ + NULL, NULL, 0, tv); \ + } while (0) + + +extern bool tcp_log_verbose; +void tcp_log_drain(struct tcpcb *tp); +int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force); +void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason); +struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, + struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, + union tcp_log_stackspecific *stackinfo, int th_hostorder, + const char *output_caller, const char *func, int line, const struct timeval *tv); +size_t tcp_log_get_id(struct tcpcb *tp, char *buf); +u_int tcp_log_get_id_cnt(struct tcpcb *tp); +int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp); +void tcp_log_init(void); +int tcp_log_set_id(struct tcpcb *tp, char *id); +int tcp_log_state_change(struct tcpcb *tp, int state); +void tcp_log_tcpcbinit(struct tcpcb *tp); +void tcp_log_tcpcbfini(struct tcpcb *tp); +void tcp_log_flowend(struct tcpcb *tp); + +#endif /* _KERNEL */ +#endif /* __tcp_log_buf_h__ */ Index: head/sys/netinet/tcp_log_buf.c =================================================================== --- head/sys/netinet/tcp_log_buf.c +++ head/sys/netinet/tcp_log_buf.c @@ -0,0 +1,2480 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016-2018 + * Netflix Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Default expiry time */ +#define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S) + +/* Max interval at which to run the expiry timer */ +#define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S) + +bool tcp_log_verbose; +static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone; +static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT; +static uint32_t tcp_log_version = TCP_LOG_BUF_VER; +RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket); +static struct tcp_log_id_tree tcp_log_id_head; +static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head = + STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head); +static struct mtx tcp_log_expireq_mtx; +static struct callout tcp_log_expireq_callout; +static uint64_t tcp_log_auto_ratio = 0; +static uint64_t tcp_log_auto_ratio_cur = 0; +static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL; +static bool tcp_log_auto_all = false; + +RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls"); + +SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose, + 0, "Force verbose logging for TCP traces"); + +SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit, + CTLFLAG_RW, &tcp_log_session_limit, 0, + "Maximum number of events maintained for each TCP session"); + +SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW, + &tcp_log_zone, "Maximum number of events maintained for all TCP sessions"); + +SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD, + &tcp_log_zone, "Current number of events maintained for all TCP sessions"); + +SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW, + &tcp_log_bucket_zone, "Maximum number of log IDs"); + +SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD, + &tcp_log_bucket_zone, "Current number of log IDs"); + +SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW, + &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs"); + +SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD, + &tcp_log_node_zone, "Current number of tcpcbs with log IDs"); + +SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, 
&tcp_log_version, + 0, "Version of log formats exported"); + +SYSCTL_U64(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW, + &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions"); + +SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW, + &tcp_log_auto_mode, TCP_LOG_STATE_HEAD_AUTO, + "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_HEAD_AUTO)"); + +SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW, + &tcp_log_auto_all, false, + "Auto-select from all sessions (rather than just those with IDs)"); + +#ifdef TCPLOG_DEBUG_COUNTERS +counter_u64_t tcp_log_queued; +counter_u64_t tcp_log_que_fail1; +counter_u64_t tcp_log_que_fail2; +counter_u64_t tcp_log_que_fail3; +counter_u64_t tcp_log_que_fail4; +counter_u64_t tcp_log_que_fail5; +counter_u64_t tcp_log_que_copyout; +counter_u64_t tcp_log_que_read; +counter_u64_t tcp_log_que_freed; + +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD, + &tcp_log_queued, "Number of entries queued"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD, + &tcp_log_que_fail1, "Number of entries queued but fail 1"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD, + &tcp_log_que_fail2, "Number of entries queued but fail 2"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD, + &tcp_log_que_fail3, "Number of entries queued but fail 3"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD, + &tcp_log_que_fail4, "Number of entries queued but fail 4"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD, + &tcp_log_que_fail5, "Number of entries queued but fail 4"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD, + &tcp_log_que_copyout, "Number of entries copied out"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD, + &tcp_log_que_read, "Number of entries read from the queue"); +SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD, + &tcp_log_que_freed, "Number of entries freed after reading"); +#endif + +#ifdef INVARIANTS +#define TCPLOG_DEBUG_RINGBUF +#endif + +struct tcp_log_mem +{ + STAILQ_ENTRY(tcp_log_mem) tlm_queue; + struct tcp_log_buffer tlm_buf; + struct tcp_log_verbose tlm_v; +#ifdef TCPLOG_DEBUG_RINGBUF + volatile int tlm_refcnt; +#endif +}; + +/* 60 bytes for the header, + 16 bytes for padding */ +static uint8_t zerobuf[76]; + +/* + * Lock order: + * 1. TCPID_TREE + * 2. TCPID_BUCKET + * 3. INP + * + * Rules: + * A. You need a lock on the Tree to add/remove buckets. + * B. You need a lock on the bucket to add/remove nodes from the bucket. + * C. To change information in a node, you need the INP lock if the tln_closed + * field is false. Otherwise, you need the bucket lock. (Note that the + * tln_closed field can change at any point, so you need to recheck the + * entry after acquiring the INP lock.) + * D. To remove a node from the bucket, you must have that entry locked, + * according to the criteria of Rule C. Also, the node must not be on + * the expiry queue. + * E. The exception to C is the expiry queue fields, which are locked by + * the TCPLOG_EXPIREQ lock. + * + * Buckets have a reference count. Each node is a reference. Further, + * other callers may add reference counts to keep a bucket from disappearing. + * You can add a reference as long as you own a lock sufficient to keep the + * bucket from disappearing. For example, a common use is: + * a. Have a locked INP, but need to lock the TCPID_BUCKET. + * b. Add a refcount on the bucket. 
(Safe because the INP lock prevents + * the TCPID_BUCKET from going away.) + * c. Drop the INP lock. + * d. Acquire a lock on the TCPID_BUCKET. + * e. Acquire a lock on the INP. + * f. Drop the refcount on the bucket. + * (At this point, the bucket may disappear.) + * + * Expire queue lock: + * You can acquire this with either the bucket or INP lock. Don't reverse it. + * When the expire code has committed to freeing a node, it resets the expiry + * time to SBT_MAX. That is the signal to everyone else that they should + * leave that node alone. + */ +static struct rwlock tcp_id_tree_lock; +#define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock) +#define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock) +#define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock) +#define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock) +#define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock) +#define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED) +#define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED) +#define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED) + +#define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF) +#define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx)) +#define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx)) +#define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx)) +#define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED) +#define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED) + +#define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt)) +#define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt)) + +#define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx) +#define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx) + +SLIST_HEAD(tcp_log_id_head, tcp_log_id_node); + +struct tcp_log_id_bucket +{ + /* + * tlb_id must be first. This lets us use strcmp on + * (struct tcp_log_id_bucket *) and (char *) interchangeably. + */ + char tlb_id[TCP_LOG_ID_LEN]; + RB_ENTRY(tcp_log_id_bucket) tlb_rb; + struct tcp_log_id_head tlb_head; + struct mtx tlb_mtx; + volatile u_int tlb_refcnt; +}; + +struct tcp_log_id_node +{ + SLIST_ENTRY(tcp_log_id_node) tln_list; + STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */ + sbintime_t tln_expiretime; /* Locked by the expireq lock */ + + /* + * If INP is NULL, that means the connection has closed. We've + * saved the connection endpoint information and the log entries + * in the tln_ie and tln_entries members. We've also saved a pointer + * to the enclosing bucket here. If INP is not NULL, the information is + * in the PCB and not here. + */ + struct inpcb *tln_inp; + struct tcpcb *tln_tp; + struct tcp_log_id_bucket *tln_bucket; + struct in_endpoints tln_ie; + struct tcp_log_stailq tln_entries; + int tln_count; + volatile int tln_closed; + uint8_t tln_af; +}; + +enum tree_lock_state { + TREE_UNLOCKED = 0, + TREE_RLOCKED, + TREE_WLOCKED, +}; + +/* Do we want to select this session for auto-logging? */ +static __inline bool +tcp_log_selectauto(void) +{ + + /* + * If we are doing auto-capturing, figure out whether we will capture + * this session. 
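 *
 * For example, with the (hypothetical) setting
 * net.inet.tcp.bb.log_auto_ratio=100, roughly one session in every 100 is
 * auto-selected; a ratio of 0 disables auto-capture.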
+ */ + if (tcp_log_auto_ratio && + (atomic_fetchadd_64(&tcp_log_auto_ratio_cur, 1) % + tcp_log_auto_ratio) == 0) + return (true); + return (false); +} + +static __inline int +tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b) +{ + KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL")); + KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL")); + return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN); +} + +RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp) + +static __inline void +tcp_log_id_validate_tree_lock(int tree_locked) +{ + +#ifdef INVARIANTS + switch (tree_locked) { + case TREE_WLOCKED: + TCPID_TREE_WLOCK_ASSERT(); + break; + case TREE_RLOCKED: + TCPID_TREE_RLOCK_ASSERT(); + break; + case TREE_UNLOCKED: + TCPID_TREE_UNLOCK_ASSERT(); + break; + default: + kassert_panic("%s:%d: unknown tree lock state", __func__, + __LINE__); + } +#endif +} + +static __inline void +tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb) +{ + + TCPID_TREE_WLOCK_ASSERT(); + KASSERT(SLIST_EMPTY(&tlb->tlb_head), + ("%s: Attempt to remove non-empty bucket", __func__)); + if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL) { +#ifdef INVARIANTS + kassert_panic("%s:%d: error removing element from tree", + __func__, __LINE__); +#endif + } + TCPID_BUCKET_LOCK_DESTROY(tlb); + uma_zfree(tcp_log_bucket_zone, tlb); +} + +/* + * Call with a referenced and locked bucket. + * Will return true if the bucket was freed; otherwise, false. + * tlb: The bucket to unreference. + * tree_locked: A pointer to the state of the tree lock. If the tree lock + * state changes, the function will update it. + * inp: If not NULL and the function needs to drop the inp lock to relock the + * tree, it will do so. (The caller must ensure inp will not become invalid, + * probably by holding a reference to it.) + */ +static bool +tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked, + struct inpcb *inp) +{ + + KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__)); + KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", + __func__)); + + tcp_log_id_validate_tree_lock(*tree_locked); + + /* + * Did we hold the last reference on the tlb? If so, we may need + * to free it. (Note that we can realistically only execute the + * loop twice: once without a write lock and once with a write + * lock.) + */ + while (TCPID_BUCKET_UNREF(tlb)) { + /* + * We need a write lock on the tree to free this. + * If we can upgrade the tree lock, this is "easy". If we + * can't upgrade the tree lock, we need to do this the + * "hard" way: unwind all our locks and relock everything. + * In the meantime, anything could have changed. We even + * need to validate that we still need to free the bucket. + */ + if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE()) + *tree_locked = TREE_WLOCKED; + else if (*tree_locked != TREE_WLOCKED) { + TCPID_BUCKET_REF(tlb); + if (inp != NULL) + INP_WUNLOCK(inp); + TCPID_BUCKET_UNLOCK(tlb); + if (*tree_locked == TREE_RLOCKED) + TCPID_TREE_RUNLOCK(); + TCPID_TREE_WLOCK(); + *tree_locked = TREE_WLOCKED; + TCPID_BUCKET_LOCK(tlb); + if (inp != NULL) + INP_WLOCK(inp); + continue; + } + + /* + * We have an empty bucket and a write lock on the tree. + * Remove the empty bucket. + */ + tcp_log_remove_bucket(tlb); + return (true); + } + return (false); +} + +/* + * Call with a locked bucket. This function will release the lock on the + * bucket before returning. 
+ * + * The caller is responsible for freeing the tp->t_lin/tln node! + * + * Note: one of tp or both tlb and tln must be supplied. + * + * inp: A pointer to the inp. If the function needs to drop the inp lock to + * acquire the tree write lock, it will do so. (The caller must ensure inp + * will not become invalid, probably by holding a reference to it.) + * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored) + * tlb: A pointer to the bucket. (optional; ignored if tp is specified) + * tln: A pointer to the node. (optional; ignored if tp is specified) + * tree_locked: A pointer to the state of the tree lock. If the tree lock + * state changes, the function will update it. + * + * Will return true if the INP lock was reacquired; otherwise, false. + */ +static bool +tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp, + struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln, + int *tree_locked) +{ + int orig_tree_locked; + + KASSERT(tp != NULL || (tlb != NULL && tln != NULL), + ("%s: called with tp=%p, tlb=%p, tln=%p", __func__, + tp, tlb, tln)); + KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked", + __func__)); + + if (tp != NULL) { + tlb = tp->t_lib; + tln = tp->t_lin; + KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__)); + KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__)); + } + + tcp_log_id_validate_tree_lock(*tree_locked); + TCPID_BUCKET_LOCK_ASSERT(tlb); + + /* + * Remove the node, clear the log bucket and node from the TCPCB, and + * decrement the bucket refcount. In the process, if this is the + * last reference, the bucket will be freed. + */ + SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list); + if (tp != NULL) { + tp->t_lib = NULL; + tp->t_lin = NULL; + } + orig_tree_locked = *tree_locked; + if (!tcp_log_unref_bucket(tlb, tree_locked, inp)) + TCPID_BUCKET_UNLOCK(tlb); + return (*tree_locked != orig_tree_locked); +} + +#define RECHECK_INP_CLEAN(cleanup) do { \ + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \ + rv = ECONNRESET; \ + cleanup; \ + goto done; \ + } \ + tp = intotcpcb(inp); \ +} while (0) + +#define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */) + +static void +tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + +#ifdef NETFLIX + if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL) + (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id)); +#endif +} + +/* + * Set the TCP log ID for a TCPCB. + * Called with INPCB locked. Returns with it unlocked. + */ +int +tcp_log_set_id(struct tcpcb *tp, char *id) +{ + struct tcp_log_id_bucket *tlb, *tmp_tlb; + struct tcp_log_id_node *tln; + struct inpcb *inp; + int tree_locked, rv; + bool bucket_locked; + + tlb = NULL; + tln = NULL; + inp = tp->t_inpcb; + tree_locked = TREE_UNLOCKED; + bucket_locked = false; + +restart: + INP_WLOCK_ASSERT(inp); + + /* See if the ID is unchanged. */ + if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) || + (tp->t_lib == NULL && *id == 0)) { + rv = 0; + goto done; + } + + /* + * If the TCPCB had a previous ID, we need to extricate it from + * the previous list. + * + * Drop the TCPCB lock and lock the tree and the bucket. + * Because this is called in the socket context, we (theoretically) + * don't need to worry about the INPCB completely going away + * while we are gone. 
+ */ + if (tp->t_lib != NULL) { + tlb = tp->t_lib; + TCPID_BUCKET_REF(tlb); + INP_WUNLOCK(inp); + + if (tree_locked == TREE_UNLOCKED) { + TCPID_TREE_RLOCK(); + tree_locked = TREE_RLOCKED; + } + TCPID_BUCKET_LOCK(tlb); + bucket_locked = true; + INP_WLOCK(inp); + + /* + * Unreference the bucket. If our bucket went away, it is no + * longer locked or valid. + */ + if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) { + bucket_locked = false; + tlb = NULL; + } + + /* Validate the INP. */ + RECHECK_INP(); + + /* + * Evaluate whether the bucket changed while we were unlocked. + * + * Possible scenarios here: + * 1. Bucket is unchanged and the same one we started with. + * 2. The TCPCB no longer has a bucket and our bucket was + * freed. + * 3. The TCPCB has a new bucket, whether ours was freed. + * 4. The TCPCB no longer has a bucket and our bucket was + * not freed. + * + * In cases 2-4, we will start over. In case 1, we will + * proceed here to remove the bucket. + */ + if (tlb == NULL || tp->t_lib != tlb) { + KASSERT(bucket_locked || tlb == NULL, + ("%s: bucket_locked (%d) and tlb (%p) are " + "inconsistent", __func__, bucket_locked, tlb)); + + if (bucket_locked) { + TCPID_BUCKET_UNLOCK(tlb); + bucket_locked = false; + tlb = NULL; + } + goto restart; + } + + /* + * Store the (struct tcp_log_id_node) for reuse. Then, remove + * it from the bucket. In the process, we may end up relocking. + * If so, we need to validate that the INP is still valid, and + * the TCPCB entries match we expect. + * + * We will clear tlb and change the bucket_locked state just + * before calling tcp_log_remove_id_node(), since that function + * will unlock the bucket. + */ + if (tln != NULL) + uma_zfree(tcp_log_node_zone, tln); + tln = tp->t_lin; + tlb = NULL; + bucket_locked = false; + if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) { + RECHECK_INP(); + + /* + * If the TCPCB moved to a new bucket while we had + * dropped the lock, restart. + */ + if (tp->t_lib != NULL || tp->t_lin != NULL) + goto restart; + } + + /* + * Yay! We successfully removed the TCPCB from its old + * bucket. Phew! + * + * On to bigger and better things... + */ + } + + /* At this point, the TCPCB should not be in any bucket. */ + KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__)); + + /* + * If the new ID is not empty, we need to now assign this TCPCB to a + * new bucket. + */ + if (*id) { + /* Get a new tln, if we don't already have one to reuse. */ + if (tln == NULL) { + tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO); + if (tln == NULL) { + rv = ENOBUFS; + goto done; + } + tln->tln_inp = inp; + tln->tln_tp = tp; + } + + /* + * Drop the INP lock for a bit. We don't need it, and dropping + * it prevents lock order reversals. + */ + INP_WUNLOCK(inp); + + /* Make sure we have at least a read lock on the tree. */ + tcp_log_id_validate_tree_lock(tree_locked); + if (tree_locked == TREE_UNLOCKED) { + TCPID_TREE_RLOCK(); + tree_locked = TREE_RLOCKED; + } + +refind: + /* + * Remember that we constructed (struct tcp_log_id_node) so + * we can safely cast the id to it for the purposes of finding. + */ + KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL", + __func__, __LINE__)); + tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head, + (struct tcp_log_id_bucket *) id); + + /* + * If we didn't find a matching bucket, we need to add a new + * one. This requires a write lock. But, of course, we will + * need to recheck some things when we re-acquire the lock. 
+ */ + if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) { + tree_locked = TREE_WLOCKED; + if (!TCPID_TREE_UPGRADE()) { + TCPID_TREE_RUNLOCK(); + TCPID_TREE_WLOCK(); + + /* + * The tree may have changed while we were + * unlocked. + */ + goto refind; + } + } + + /* If we need to add a new bucket, do it now. */ + if (tmp_tlb == NULL) { + /* Allocate new bucket. */ + tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT); + if (tlb == NULL) { + rv = ENOBUFS; + goto done_noinp; + } + + /* + * Copy the ID to the bucket. + * NB: Don't use strlcpy() unless you are sure + * we've always validated NULL termination. + * + * TODO: When I'm done writing this, see if we + * we have correctly validated NULL termination and + * can use strlcpy(). :-) + */ + strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1); + tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0'; + + /* + * Take the refcount for the first node and go ahead + * and lock this. Note that we zero the tlb_mtx + * structure, since 0xdeadc0de flips the right bits + * for the code to think that this mutex has already + * been initialized. :-( + */ + SLIST_INIT(&tlb->tlb_head); + refcount_init(&tlb->tlb_refcnt, 1); + memset(&tlb->tlb_mtx, 0, sizeof(struct mtx)); + TCPID_BUCKET_LOCK_INIT(tlb); + TCPID_BUCKET_LOCK(tlb); + bucket_locked = true; + +#define FREE_NEW_TLB() do { \ + TCPID_BUCKET_LOCK_DESTROY(tlb); \ + uma_zfree(tcp_log_bucket_zone, tlb); \ + bucket_locked = false; \ + tlb = NULL; \ +} while (0) + /* + * Relock the INP and make sure we are still + * unassigned. + */ + INP_WLOCK(inp); + RECHECK_INP_CLEAN(FREE_NEW_TLB()); + if (tp->t_lib != NULL) { + FREE_NEW_TLB(); + goto restart; + } + + /* Add the new bucket to the tree. */ + tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head, + tlb); + KASSERT(tmp_tlb == NULL, + ("%s: Unexpected conflicting bucket (%p) while " + "adding new bucket (%p)", __func__, tmp_tlb, tlb)); + + /* + * If we found a conflicting bucket, free the new + * one we made and fall through to use the existing + * bucket. + */ + if (tmp_tlb != NULL) { + FREE_NEW_TLB(); + INP_WUNLOCK(inp); + } +#undef FREE_NEW_TLB + } + + /* If we found an existing bucket, use it. */ + if (tmp_tlb != NULL) { + tlb = tmp_tlb; + TCPID_BUCKET_LOCK(tlb); + bucket_locked = true; + + /* + * Relock the INP and make sure we are still + * unassigned. + */ + INP_UNLOCK_ASSERT(inp); + INP_WLOCK(inp); + RECHECK_INP(); + if (tp->t_lib != NULL) { + TCPID_BUCKET_UNLOCK(tlb); + tlb = NULL; + goto restart; + } + + /* Take a reference on the bucket. */ + TCPID_BUCKET_REF(tlb); + } + + tcp_log_grow_tlb(tlb->tlb_id, tp); + + /* Add the new node to the list. */ + SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list); + tp->t_lib = tlb; + tp->t_lin = tln; + tln = NULL; + } + + rv = 0; + +done: + /* Unlock things, as needed, and return. */ + INP_WUNLOCK(inp); +done_noinp: + INP_UNLOCK_ASSERT(inp); + if (bucket_locked) { + TCPID_BUCKET_LOCK_ASSERT(tlb); + TCPID_BUCKET_UNLOCK(tlb); + } else if (tlb != NULL) + TCPID_BUCKET_UNLOCK_ASSERT(tlb); + if (tree_locked == TREE_WLOCKED) { + TCPID_TREE_WLOCK_ASSERT(); + TCPID_TREE_WUNLOCK(); + } else if (tree_locked == TREE_RLOCKED) { + TCPID_TREE_RLOCK_ASSERT(); + TCPID_TREE_RUNLOCK(); + } else + TCPID_TREE_UNLOCK_ASSERT(); + if (tln != NULL) + uma_zfree(tcp_log_node_zone, tln); + return (rv); +} + +/* + * Get the TCP log ID for a TCPCB. + * Called with INPCB locked. + * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long. + * Returns number of bytes copied. 
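+ * The copy is always NUL-terminated, and the returned length does not
+ * include the terminating NUL.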
+ */ +size_t +tcp_log_get_id(struct tcpcb *tp, char *buf) +{ + size_t len; + + INP_LOCK_ASSERT(tp->t_inpcb); + if (tp->t_lib != NULL) { + len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); + KASSERT(len < TCP_LOG_ID_LEN, + ("%s:%d: tp->t_lib->tlb_id too long (%zu)", + __func__, __LINE__, len)); + } else { + *buf = '\0'; + len = 0; + } + return (len); +} + +/* + * Get number of connections with the same log ID. + * Log ID is taken from given TCPCB. + * Called with INPCB locked. + */ +u_int +tcp_log_get_id_cnt(struct tcpcb *tp) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + return ((tp->t_lib == NULL) ? 0 : tp->t_lib->tlb_refcnt); +} + +#ifdef TCPLOG_DEBUG_RINGBUF +/* + * Functions/macros to increment/decrement reference count for a log + * entry. This should catch when we do a double-free/double-remove or + * a double-add. + */ +static inline void +_tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func, + int line) +{ + int refcnt; + + refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1); + if (refcnt != 0) + panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)", + func, line, log_entry, refcnt); +} +#define tcp_log_entry_refcnt_add(l) \ + _tcp_log_entry_refcnt_add((l), __func__, __LINE__) + +static inline void +_tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func, + int line) +{ + int refcnt; + + refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1); + if (refcnt != 1) + panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)", + func, line, log_entry, refcnt); +} +#define tcp_log_entry_refcnt_rem(l) \ + _tcp_log_entry_refcnt_rem((l), __func__, __LINE__) + +#else /* !TCPLOG_DEBUG_RINGBUF */ + +#define tcp_log_entry_refcnt_add(l) +#define tcp_log_entry_refcnt_rem(l) + +#endif + +/* + * Cleanup after removing a log entry, but only decrement the count if we + * are running INVARIANTS. + */ +static inline void +tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused) +{ + + uma_zfree(tcp_log_zone, log_entry); +#ifdef INVARIANTS + (*count)--; + KASSERT(*count >= 0, + ("%s: count unexpectedly negative", __func__)); +#endif +} + +static void +tcp_log_free_entries(struct tcp_log_stailq *head, int *count) +{ + struct tcp_log_mem *log_entry; + + /* Free the entries. */ + while ((log_entry = STAILQ_FIRST(head)) != NULL) { + STAILQ_REMOVE_HEAD(head, tlm_queue); + tcp_log_entry_refcnt_rem(log_entry); + tcp_log_free_log_common(log_entry, count); + } +} + +/* Cleanup after removing a log entry. */ +static inline void +tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry) +{ + uma_zfree(tcp_log_zone, log_entry); + tp->t_lognum--; + KASSERT(tp->t_lognum >= 0, + ("%s: tp->t_lognum unexpectedly negative", __func__)); +} + +/* Remove a log entry from the head of a list. */ +static inline void +tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry) +{ + + KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs), + ("%s: attempt to remove non-HEAD log entry", __func__)); + STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); + tcp_log_entry_refcnt_rem(log_entry); + tcp_log_remove_log_cleanup(tp, log_entry); +} + +#ifdef TCPLOG_DEBUG_RINGBUF +/* + * Initialize the log entry's reference count, which we want to + * survive allocations. 
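+ * (UMA runs the init function once when an item enters the zone, while the
+ * ctor/dtor run on every allocation and free, so a value set here persists
+ * across alloc/free cycles.)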
+ */ +static int +tcp_log_zone_init(void *mem, int size, int flags __unused) +{ + struct tcp_log_mem *tlm; + + KASSERT(size >= sizeof(struct tcp_log_mem), + ("%s: unexpectedly short (%d) allocation", __func__, size)); + tlm = (struct tcp_log_mem *)mem; + tlm->tlm_refcnt = 0; + return (0); +} + +/* + * Double check that the refcnt is zero on allocation and return. + */ +static int +tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused) +{ + struct tcp_log_mem *tlm; + + KASSERT(size >= sizeof(struct tcp_log_mem), + ("%s: unexpectedly short (%d) allocation", __func__, size)); + tlm = (struct tcp_log_mem *)mem; + if (tlm->tlm_refcnt != 0) + panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", + __func__, __LINE__, tlm, tlm->tlm_refcnt); + return (0); +} + +static void +tcp_log_zone_dtor(void *mem, int size, void *args __unused) +{ + struct tcp_log_mem *tlm; + + KASSERT(size >= sizeof(struct tcp_log_mem), + ("%s: unexpectedly short (%d) allocation", __func__, size)); + tlm = (struct tcp_log_mem *)mem; + if (tlm->tlm_refcnt != 0) + panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)", + __func__, __LINE__, tlm, tlm->tlm_refcnt); +} +#endif /* TCPLOG_DEBUG_RINGBUF */ + +/* Do global initialization. */ +void +tcp_log_init(void) +{ + + tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem), +#ifdef TCPLOG_DEBUG_RINGBUF + tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init, +#else + NULL, NULL, NULL, +#endif + NULL, UMA_ALIGN_PTR, 0); + (void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT); + tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket", + sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); + tcp_log_node_zone = uma_zcreate("tcp_log_node", + sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); +#ifdef TCPLOG_DEBUG_COUNTERS + tcp_log_queued = counter_u64_alloc(M_WAITOK); + tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK); + tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK); + tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK); + tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK); + tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK); + tcp_log_que_copyout = counter_u64_alloc(M_WAITOK); + tcp_log_que_read = counter_u64_alloc(M_WAITOK); + tcp_log_que_freed = counter_u64_alloc(M_WAITOK); +#endif + + rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW); + mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF); + callout_init(&tcp_log_expireq_callout, 1); +} + +/* Do per-TCPCB initialization. */ +void +tcp_log_tcpcbinit(struct tcpcb *tp) +{ + + /* A new TCPCB should start out zero-initialized. */ + STAILQ_INIT(&tp->t_logs); + + /* + * If we are doing auto-capturing, figure out whether we will capture + * this session. + */ + if (tcp_log_selectauto()) { + tp->t_logstate = tcp_log_auto_mode; + tp->t_flags2 |= TF2_LOG_AUTO; + } +} + + +/* Remove entries */ +static void +tcp_log_expire(void *unused __unused) +{ + struct tcp_log_id_bucket *tlb; + struct tcp_log_id_node *tln; + sbintime_t expiry_limit; + int tree_locked; + + TCPLOG_EXPIREQ_LOCK(); + if (callout_pending(&tcp_log_expireq_callout)) { + /* Callout was reset. */ + TCPLOG_EXPIREQ_UNLOCK(); + return; + } + + /* + * Process entries until we reach one that expires too far in the + * future. Look one second in the future. 
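+ * (Processing everything due within the next second batches near-term
+ * expirations instead of rescheduling the callout for each one.)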
+ */ + expiry_limit = getsbinuptime() + SBT_1S; + tree_locked = TREE_UNLOCKED; + + while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL && + tln->tln_expiretime <= expiry_limit) { + if (!callout_active(&tcp_log_expireq_callout)) { + /* + * Callout was stopped. I guess we should + * just quit at this point. + */ + TCPLOG_EXPIREQ_UNLOCK(); + return; + } + + /* + * Remove the node from the head of the list and unlock + * the list. Change the expiry time to SBT_MAX as a signal + * to other threads that we now own this. + */ + STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq); + tln->tln_expiretime = SBT_MAX; + TCPLOG_EXPIREQ_UNLOCK(); + + /* + * Remove the node from the bucket. + */ + tlb = tln->tln_bucket; + TCPID_BUCKET_LOCK(tlb); + if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) { + tcp_log_id_validate_tree_lock(tree_locked); + if (tree_locked == TREE_WLOCKED) + TCPID_TREE_WUNLOCK(); + else + TCPID_TREE_RUNLOCK(); + tree_locked = TREE_UNLOCKED; + } + + /* Drop the INP reference. */ + INP_WLOCK(tln->tln_inp); + if (!in_pcbrele_wlocked(tln->tln_inp)) + INP_WUNLOCK(tln->tln_inp); + + /* Free the log records. */ + tcp_log_free_entries(&tln->tln_entries, &tln->tln_count); + + /* Free the node. */ + uma_zfree(tcp_log_node_zone, tln); + + /* Relock the expiry queue. */ + TCPLOG_EXPIREQ_LOCK(); + } + + /* + * We've expired all the entries we can. Do we need to reschedule + * ourselves? + */ + callout_deactivate(&tcp_log_expireq_callout); + if (tln != NULL) { + /* + * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and + * set the next callout to that. (This helps ensure we generally + * run the callout no more often than desired.) + */ + expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL; + if (expiry_limit < tln->tln_expiretime) + expiry_limit = tln->tln_expiretime; + callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit, + SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE); + } + + /* We're done. */ + TCPLOG_EXPIREQ_UNLOCK(); + return; +} + +/* + * Move log data from the TCPCB to a new node. This will reset the TCPCB log + * entries and log count; however, it will not touch other things from the + * TCPCB (e.g. t_lin, t_lib). + * + * NOTE: Must hold a lock on the INP. + */ +static void +tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + + tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie; + if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6) + tln->tln_af = AF_INET6; + else + tln->tln_af = AF_INET; + tln->tln_entries = tp->t_logs; + tln->tln_count = tp->t_lognum; + tln->tln_bucket = tp->t_lib; + + /* Clear information from the PCB. */ + STAILQ_INIT(&tp->t_logs); + tp->t_lognum = 0; +} + +/* Do per-TCPCB cleanup */ +void +tcp_log_tcpcbfini(struct tcpcb *tp) +{ + struct tcp_log_id_node *tln, *tln_first; + struct tcp_log_mem *log_entry; + sbintime_t callouttime; + + INP_WLOCK_ASSERT(tp->t_inpcb); + + /* + * If we were gathering packets to be automatically dumped, try to do + * it now. If this succeeds, the log information in the TCPCB will be + * cleared. Otherwise, we'll handle the log information as we do + * for other states. 
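+ * (Any entries still present after the switch below are either moved to
+ * the log ID node or freed, depending on whether the connection has an ID.)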
+ */ + switch(tp->t_logstate) { + case TCP_LOG_STATE_HEAD_AUTO: + (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", + M_NOWAIT, false); + break; + case TCP_LOG_STATE_TAIL_AUTO: + (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail", + M_NOWAIT, false); + break; + case TCP_LOG_STATE_CONTINUAL: + (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", + M_NOWAIT, false); + break; + } + + /* + * There are two ways we could keep logs: per-socket or per-ID. If + * we are tracking logs with an ID, then the logs survive the + * destruction of the TCPCB. + * + * If the TCPCB is associated with an ID node, move the logs from the + * TCPCB to the ID node. In theory, this is safe, for reasons which I + * will now explain for my own benefit when I next need to figure out + * this code. :-) + * + * We own the INP lock. Therefore, no one else can change the contents + * of this node (Rule C). Further, no one can remove this node from + * the bucket while we hold the lock (Rule D). Basically, no one can + * mess with this node. That leaves two states in which we could be: + * + * 1. Another thread is currently waiting to acquire the INP lock, with + * plans to do something with this node. When we drop the INP lock, + * they will have a chance to do that. They will recheck the + * tln_closed field (see note to Rule C) and then acquire the + * bucket lock before proceeding further. + * + * 2. Another thread will try to acquire a lock at some point in the + * future. If they try to acquire a lock before we set the + * tln_closed field, they will follow state #1. If they try to + * acquire a lock after we set the tln_closed field, they will be + * able to make changes to the node, at will, following Rule C. + * + * Therefore, we currently own this node and can make any changes + * we want. But, as soon as we set the tln_closed field to true, we + * have effectively dropped our lock on the node. (For this reason, we + * also need to make sure our writes are ordered correctly. An atomic + * operation with "release" semantics should be sufficient.) + */ + + if (tp->t_lin != NULL) { + /* Copy the relevant information to the log entry. */ + tln = tp->t_lin; + KASSERT(tln->tln_inp == tp->t_inpcb, + ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)", + __func__, tln->tln_inp, tp->t_inpcb)); + tcp_log_move_tp_to_node(tp, tln); + + /* Clear information from the PCB. */ + tp->t_lin = NULL; + tp->t_lib = NULL; + + /* + * Take a reference on the INP. This ensures that the INP + * remains valid while the node is on the expiry queue. This + * ensures the INP is valid for other threads that may be + * racing to lock this node when we move it to the expire + * queue. + */ + in_pcbref(tp->t_inpcb); + + /* + * Store the entry on the expiry list. The exact behavior + * depends on whether we have entries to keep. If so, we + * put the entry at the tail of the list and expire in + * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put + * the entry at the head of the list. (Handling the cleanup + * via the expiry timer lets us avoid locking messy-ness here.) + */ + tln->tln_expiretime = getsbinuptime(); + TCPLOG_EXPIREQ_LOCK(); + if (tln->tln_count) { + tln->tln_expiretime += TCP_LOG_EXPIRE_TIME; + if (STAILQ_EMPTY(&tcp_log_expireq_head) && + !callout_active(&tcp_log_expireq_callout)) { + /* + * We are adding the first entry and a callout + * is not currently scheduled; therefore, we + * need to schedule one. 
+ */ + callout_reset_sbt(&tcp_log_expireq_callout, + tln->tln_expiretime, SBT_1S, tcp_log_expire, + NULL, C_ABSOLUTE); + } + STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln, + tln_expireq); + } else { + callouttime = tln->tln_expiretime + + TCP_LOG_EXPIRE_INTVL; + tln_first = STAILQ_FIRST(&tcp_log_expireq_head); + + if ((tln_first == NULL || + callouttime < tln_first->tln_expiretime) && + (callout_pending(&tcp_log_expireq_callout) || + !callout_active(&tcp_log_expireq_callout))) { + /* + * The list is empty, or we want to run the + * expire code before the first entry's timer + * fires. Also, we are in a case where a callout + * is not actively running. We want to reset + * the callout to occur sooner. + */ + callout_reset_sbt(&tcp_log_expireq_callout, + callouttime, SBT_1S, tcp_log_expire, NULL, + C_ABSOLUTE); + } + + /* + * Insert to the head, or just after the head, as + * appropriate. (This might result in small + * mis-orderings as a bunch of "expire now" entries + * gather at the start of the list, but that should + * not produce big problems, since the expire timer + * will walk through all of them.) + */ + if (tln_first == NULL || + tln->tln_expiretime < tln_first->tln_expiretime) + STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln, + tln_expireq); + else + STAILQ_INSERT_AFTER(&tcp_log_expireq_head, + tln_first, tln, tln_expireq); + } + TCPLOG_EXPIREQ_UNLOCK(); + + /* + * We are done messing with the tln. After this point, we + * can't touch it. (Note that the "release" semantics should + * be included with the TCPLOG_EXPIREQ_UNLOCK() call above. + * Therefore, they should be unnecessary here. However, it + * seems like a good idea to include them anyway, since we + * really are releasing a lock here.) + */ + atomic_store_rel_int(&tln->tln_closed, 1); + } else { + /* Remove log entries. */ + while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) + tcp_log_remove_log_head(tp, log_entry); + KASSERT(tp->t_lognum == 0, + ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", + __func__, tp->t_lognum)); + } + + /* + * Change the log state to off (just in case anything tries to sneak + * in a last-minute log). + */ + tp->t_logstate = TCP_LOG_STATE_OFF; +} + +/* + * This logs an event for a TCP socket. Normally, this is called via + * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for + * TCP_LOG_EVENT(). + */ + +struct tcp_log_buffer * +tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf, + struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len, + union tcp_log_stackspecific *stackinfo, int th_hostorder, + const char *output_caller, const char *func, int line, const struct timeval *itv) +{ + struct tcp_log_mem *log_entry; + struct tcp_log_buffer *log_buf; + int attempt_count = 0; + struct tcp_log_verbose *log_verbose; + uint32_t logsn; + + KASSERT((func == NULL && line == 0) || (func != NULL && line > 0), + ("%s called with inconsistent func (%p) and line (%d) arguments", + __func__, func, line)); + + INP_WLOCK_ASSERT(tp->t_inpcb); + + KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD || + tp->t_logstate == TCP_LOG_STATE_TAIL || + tp->t_logstate == TCP_LOG_STATE_CONTINUAL || + tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO || + tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO, + ("%s called with unexpected tp->t_logstate (%d)", __func__, + tp->t_logstate)); + + /* + * Get the serial number. We do this early so it will + * increment even if we end up skipping the log entry for some + * reason. 
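+ * (Because of this, gaps in the exported serial numbers indicate entries
+ * that were dropped or skipped.)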
+ */ + logsn = tp->t_logsn++; + + /* + * Can we get a new log entry? If so, increment the lognum counter + * here. + */ +retry: + if (tp->t_lognum < tcp_log_session_limit) { + if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL) + tp->t_lognum++; + } else + log_entry = NULL; + + /* Do we need to try to reuse? */ + if (log_entry == NULL) { + /* + * Sacrifice auto-logged sessions without a log ID if + * tcp_log_auto_all is false. (If they don't have a log + * ID by now, it is probable that either they won't get one + * or we are resource-constrained.) + */ + if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && + !tcp_log_auto_all) { + if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) { +#ifdef INVARIANTS + panic("%s:%d: tcp_log_state_change() failed " + "to set tp %p to TCP_LOG_STATE_CLEAR", + __func__, __LINE__, tp); +#endif + tp->t_logstate = TCP_LOG_STATE_OFF; + } + return (NULL); + } + /* + * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump + * the buffers. If successful, deactivate tracing. Otherwise, + * leave it active so we will retry. + */ + if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO && + !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head", + M_NOWAIT, false)) { + tp->t_logstate = TCP_LOG_STATE_OFF; + return(NULL); + } else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) && + !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", + M_NOWAIT, false)) { + if (attempt_count == 0) { + attempt_count++; + goto retry; + } +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_fail4, 1); +#endif + return(NULL); + } else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) + return(NULL); + + /* If in HEAD state, just deactivate the tracing and return. */ + if (tp->t_logstate == TCP_LOG_STATE_HEAD) { + tp->t_logstate = TCP_LOG_STATE_OFF; + return(NULL); + } + + /* + * Get a buffer to reuse. If that fails, just give up. + * (We can't log anything without a buffer in which to + * put it.) + * + * Note that we don't change the t_lognum counter + * here. Because we are re-using the buffer, the total + * number won't change. + */ + if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL) + return(NULL); + STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue); + tcp_log_entry_refcnt_rem(log_entry); + } + + KASSERT(log_entry != NULL, + ("%s: log_entry unexpectedly NULL", __func__)); + + /* Extract the log buffer and verbose buffer pointers. */ + log_buf = &log_entry->tlm_buf; + log_verbose = &log_entry->tlm_v; + + /* Basic entries. */ + if (itv == NULL) + getmicrouptime(&log_buf->tlb_tv); + else + memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval)); + log_buf->tlb_ticks = ticks; + log_buf->tlb_sn = logsn; + log_buf->tlb_stackid = tp->t_fb->tfb_id; + log_buf->tlb_eventid = eventid; + log_buf->tlb_eventflags = 0; + log_buf->tlb_errno = errornum; + + /* Socket buffers */ + if (rxbuf != NULL) { + log_buf->tlb_eventflags |= TLB_FLAG_RXBUF; + log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc; + log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc; + log_buf->tlb_rxbuf.tls_sb_spare = 0; + } + if (txbuf != NULL) { + log_buf->tlb_eventflags |= TLB_FLAG_TXBUF; + log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc; + log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc; + log_buf->tlb_txbuf.tls_sb_spare = 0; + } + /* Copy values from tp to the log entry. 
*/ +#define COPY_STAT(f) log_buf->tlb_ ## f = tp->f +#define COPY_STAT_T(f) log_buf->tlb_ ## f = tp->t_ ## f + COPY_STAT_T(state); + COPY_STAT_T(starttime); + COPY_STAT(iss); + COPY_STAT_T(flags); + COPY_STAT(snd_una); + COPY_STAT(snd_max); + COPY_STAT(snd_cwnd); + COPY_STAT(snd_nxt); + COPY_STAT(snd_recover); + COPY_STAT(snd_wnd); + COPY_STAT(snd_ssthresh); + COPY_STAT_T(srtt); + COPY_STAT_T(rttvar); + COPY_STAT(rcv_up); + COPY_STAT(rcv_adv); + COPY_STAT(rcv_nxt); + COPY_STAT(sack_newdata); + COPY_STAT(rcv_wnd); + COPY_STAT_T(dupacks); + COPY_STAT_T(segqlen); + COPY_STAT(snd_numholes); + COPY_STAT(snd_scale); + COPY_STAT(rcv_scale); +#undef COPY_STAT +#undef COPY_STAT_T + log_buf->tlb_flex1 = 0; + log_buf->tlb_flex2 = 0; + /* Copy stack-specific info. */ + if (stackinfo != NULL) { + memcpy(&log_buf->tlb_stackinfo, stackinfo, + sizeof(log_buf->tlb_stackinfo)); + log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO; + } + + /* The packet */ + log_buf->tlb_len = len; + if (th) { + int optlen; + + log_buf->tlb_eventflags |= TLB_FLAG_HDR; + log_buf->tlb_th = *th; + if (th_hostorder) + tcp_fields_to_net(&log_buf->tlb_th); + optlen = (th->th_off << 2) - sizeof (struct tcphdr); + if (optlen > 0) + memcpy(log_buf->tlb_opts, th + 1, optlen); + } + + /* Verbose information */ + if (func != NULL) { + log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE; + if (output_caller != NULL) + strlcpy(log_verbose->tlv_snd_frm, output_caller, + TCP_FUNC_LEN); + else + *log_verbose->tlv_snd_frm = 0; + strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN); + log_verbose->tlv_trace_line = line; + } + + /* Insert the new log at the tail. */ + STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue); + tcp_log_entry_refcnt_add(log_entry); + return (log_buf); +} + +/* + * Change the logging state for a TCPCB. Returns 0 on success or an + * error code on failure. + */ +int +tcp_log_state_change(struct tcpcb *tp, int state) +{ + struct tcp_log_mem *log_entry; + + INP_WLOCK_ASSERT(tp->t_inpcb); + switch(state) { + case TCP_LOG_STATE_CLEAR: + while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) + tcp_log_remove_log_head(tp, log_entry); + /* Fall through */ + + case TCP_LOG_STATE_OFF: + tp->t_logstate = TCP_LOG_STATE_OFF; + break; + + case TCP_LOG_STATE_TAIL: + case TCP_LOG_STATE_HEAD: + case TCP_LOG_STATE_CONTINUAL: + case TCP_LOG_STATE_HEAD_AUTO: + case TCP_LOG_STATE_TAIL_AUTO: + tp->t_logstate = state; + break; + + default: + return (EINVAL); + } + + tp->t_flags2 &= ~(TF2_LOG_AUTO); + + return (0); +} + +/* If tcp_drain() is called, flush half the log entries. */ +void +tcp_log_drain(struct tcpcb *tp) +{ + struct tcp_log_mem *log_entry, *next; + int target, skip; + + INP_WLOCK_ASSERT(tp->t_inpcb); + if ((target = tp->t_lognum / 2) == 0) + return; + + /* + * If we are logging the "head" packets, we want to discard + * from the tail of the queue. Otherwise, we want to discard + * from the head. 
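+ * (In the continual case we instead try to dump the entries to the log
+ * device rather than discarding them here.)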
+ */ + if (tp->t_logstate == TCP_LOG_STATE_HEAD || + tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) { + skip = tp->t_lognum - target; + STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue) + if (!--skip) + break; + KASSERT(log_entry != NULL, + ("%s: skipped through all entries!", __func__)); + if (log_entry == NULL) + return; + while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) { + STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue); + tcp_log_entry_refcnt_rem(next); + tcp_log_remove_log_cleanup(tp, next); +#ifdef INVARIANTS + target--; +#endif + } + KASSERT(target == 0, + ("%s: After removing from tail, target was %d", __func__, + target)); + } else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) { + (void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual", + M_NOWAIT, false); + } else { + while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL && + target--) + tcp_log_remove_log_head(tp, log_entry); + KASSERT(target <= 0, + ("%s: After removing from head, target was %d", __func__, + target)); + KASSERT(tp->t_lognum > 0, + ("%s: After removing from head, tp->t_lognum was %d", + __func__, target)); + KASSERT(log_entry != NULL, + ("%s: After removing from head, the tailq was empty", + __func__)); + } +} + +static inline int +tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len) +{ + + if (sopt->sopt_td != NULL) + return (copyout(src, dst, len)); + bcopy(src, dst, len); + return (0); +} + +static int +tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp, + struct tcp_log_buffer **end, int count) +{ + struct tcp_log_buffer *out_entry; + struct tcp_log_mem *log_entry; + size_t entrysize; + int error; +#ifdef INVARIANTS + int orig_count = count; +#endif + + /* Copy the data out. */ + error = 0; + out_entry = (struct tcp_log_buffer *) sopt->sopt_val; + STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) { + count--; + KASSERT(count >= 0, + ("%s:%d: Exceeded expected count (%d) processing list %p", + __func__, __LINE__, orig_count, log_tailqp)); + +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_copyout, 1); +#endif +#if 0 + struct tcp_log_buffer *lb = &log_entry->tlm_buf; + int i; + + printf("lb = %p:\n", lb); +#define PRINT(f) printf(#f " = %u\n", (unsigned int)lb->f) + printf("tlb_tv = {%lu, %lu}\n", lb->tlb_tv.tv_sec, lb->tlb_tv.tv_usec); + PRINT(tlb_ticks); + PRINT(tlb_sn); + PRINT(tlb_stackid); + PRINT(tlb_eventid); + PRINT(tlb_eventflags); + PRINT(tlb_errno); + PRINT(tlb_rxbuf.tls_sb_acc); + PRINT(tlb_rxbuf.tls_sb_ccc); + PRINT(tlb_rxbuf.tls_sb_spare); + PRINT(tlb_txbuf.tls_sb_acc); + PRINT(tlb_txbuf.tls_sb_ccc); + PRINT(tlb_txbuf.tls_sb_spare); + PRINT(tlb_state); + PRINT(tlb_flags); + PRINT(tlb_snd_una); + PRINT(tlb_snd_max); + PRINT(tlb_snd_cwnd); + PRINT(tlb_snd_nxt); + PRINT(tlb_snd_recover); + PRINT(tlb_snd_wnd); + PRINT(tlb_snd_ssthresh); + PRINT(tlb_srtt); + PRINT(tlb_rttvar); + PRINT(tlb_rcv_up); + PRINT(tlb_rcv_adv); + PRINT(tlb_rcv_nxt); + PRINT(tlb_sack_newdata); + PRINT(tlb_rcv_wnd); + PRINT(tlb_dupacks); + PRINT(tlb_segqlen); + PRINT(tlb_snd_numholes); + PRINT(tlb_snd_scale); + PRINT(tlb_rcv_scale); + PRINT(tlb_len); + printf("hex dump: "); + for (i = 0; i < sizeof(struct tcp_log_buffer); i++) + printf("%02x", *(((uint8_t *)lb) + i)); +#undef PRINT +#endif + /* + * Skip copying out the header if it isn't present. + * Instead, copy out zeros (to ensure we don't leak info). + * TODO: Make sure we truly do zero everything we don't + * explicitly set. 
+ */ + if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR) + entrysize = sizeof(struct tcp_log_buffer); + else + entrysize = offsetof(struct tcp_log_buffer, tlb_th); + error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry, + entrysize); + if (error) + break; + if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) { + error = tcp_log_copyout(sopt, zerobuf, + ((uint8_t *)out_entry) + entrysize, + sizeof(struct tcp_log_buffer) - entrysize); + } + + /* + * Copy out the verbose bit, if needed. Either way, + * increment the output pointer the correct amount. + */ + if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) { + error = tcp_log_copyout(sopt, &log_entry->tlm_v, + out_entry->tlb_verbose, + sizeof(struct tcp_log_verbose)); + if (error) + break; + out_entry = (struct tcp_log_buffer *) + (((uint8_t *) (out_entry + 1)) + + sizeof(struct tcp_log_verbose)); + } else + out_entry++; + } + *end = out_entry; + KASSERT(error || count == 0, + ("%s:%d: Less than expected count (%d) processing list %p" + " (%d remain)", __func__, __LINE__, orig_count, + log_tailqp, count)); + + return (error); +} + +/* + * Copy out the buffer. Note that we do incremental copying, so + * sooptcopyout() won't work. However, the goal is to produce the same + * end result as if we copied in the entire user buffer, updated it, + * and then used sooptcopyout() to copy it out. + * + * NOTE: This should be called with a write lock on the PCB; however, + * the function will drop it after it extracts the data from the TCPCB. + */ +int +tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp) +{ + struct tcp_log_stailq log_tailq; + struct tcp_log_mem *log_entry, *log_next; + struct tcp_log_buffer *out_entry; + struct inpcb *inp; + size_t outsize, entrysize; + int error, outnum; + + INP_WLOCK_ASSERT(tp->t_inpcb); + inp = tp->t_inpcb; + + /* + * Determine which log entries will fit in the buffer. As an + * optimization, skip this if all the entries will clearly fit + * in the buffer. (However, get an exact size if we are using + * INVARIANTS.) + */ +#ifndef INVARIANTS + if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) + + sizeof(struct tcp_log_verbose)) >= tp->t_lognum) { + log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue); + log_next = NULL; + outsize = 0; + outnum = tp->t_lognum; + } else { +#endif + outsize = outnum = 0; + log_entry = NULL; + STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) { + entrysize = sizeof(struct tcp_log_buffer); + if (log_next->tlm_buf.tlb_eventflags & + TLB_FLAG_VERBOSE) + entrysize += sizeof(struct tcp_log_verbose); + if ((sopt->sopt_valsize - outsize) < entrysize) + break; + outsize += entrysize; + outnum++; + log_entry = log_next; + } + KASSERT(outsize <= sopt->sopt_valsize, + ("%s: calculated output size (%zu) greater than available" + "space (%zu)", __func__, outsize, sopt->sopt_valsize)); +#ifndef INVARIANTS + } +#endif + + /* + * Copy traditional sooptcopyout() behavior: if sopt->sopt_val + * is NULL, silently skip the copy. However, in this case, we + * will leave the list alone and return. Functionally, this + * gives userspace a way to poll for an approximate buffer + * size they will need to get the log entries. + */ + if (sopt->sopt_val == NULL) { + INP_WUNLOCK(inp); + if (outsize == 0) { + outsize = outnum * (sizeof(struct tcp_log_buffer) + + sizeof(struct tcp_log_verbose)); + } + if (sopt->sopt_valsize > outsize) + sopt->sopt_valsize = outsize; + return (0); + } + + /* + * Break apart the list. 
We'll save the ones we want to copy + * out locally and remove them from the TCPCB list. We can + * then drop the INPCB lock while we do the copyout. + * + * There are roughly three cases: + * 1. There was nothing to copy out. That's easy: drop the + * lock and return. + * 2. We are copying out the entire list. Again, that's easy: + * move the whole list. + * 3. We are copying out a partial list. That's harder. We + * need to update the list book-keeping entries. + */ + if (log_entry != NULL && log_next == NULL) { + /* Move entire list. */ + KASSERT(outnum == tp->t_lognum, + ("%s:%d: outnum (%d) should match tp->t_lognum (%d)", + __func__, __LINE__, outnum, tp->t_lognum)); + log_tailq = tp->t_logs; + tp->t_lognum = 0; + STAILQ_INIT(&tp->t_logs); + } else if (log_entry != NULL) { + /* Move partial list. */ + KASSERT(outnum < tp->t_lognum, + ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)", + __func__, __LINE__, outnum, tp->t_lognum)); + STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs); + STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue); + KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL, + ("%s:%d: tp->t_logs is unexpectedly shorter than expected" + "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)", + __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum)); + STAILQ_NEXT(log_entry, tlm_queue) = NULL; + log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue); + tp->t_lognum -= outnum; + } else + STAILQ_INIT(&log_tailq); + + /* Drop the PCB lock. */ + INP_WUNLOCK(inp); + + /* Copy the data out. */ + error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum); + + if (error) { + /* Restore list */ + INP_WLOCK(inp); + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) { + tp = intotcpcb(inp); + + /* Merge the two lists. */ + STAILQ_CONCAT(&log_tailq, &tp->t_logs); + tp->t_logs = log_tailq; + tp->t_lognum += outnum; + } + INP_WUNLOCK(inp); + } else { + /* Sanity check entries */ + KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) == + outsize, ("%s: Actual output size (%zu) != " + "calculated output size (%zu)", __func__, + (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val), + outsize)); + + /* Free the entries we just copied out. */ + STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) { + tcp_log_entry_refcnt_rem(log_entry); + uma_zfree(tcp_log_zone, log_entry); + } + } + + sopt->sopt_valsize = (size_t)((caddr_t)out_entry - + (caddr_t)sopt->sopt_val); + return (error); +} + +static void +tcp_log_free_queue(struct tcp_log_dev_queue *param) +{ + struct tcp_log_dev_log_queue *entry; + + KASSERT(param != NULL, ("%s: called with NULL param", __func__)); + if (param == NULL) + return; + + entry = (struct tcp_log_dev_log_queue *)param; + + /* Free the entries. */ + tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); + + /* Free the buffer, if it is allocated. */ + if (entry->tldl_common.tldq_buf != NULL) + free(entry->tldl_common.tldq_buf, M_TCPLOGDEV); + + /* Free the queue entry. */ + free(entry, M_TCPLOGDEV); +} + +static struct tcp_log_common_header * +tcp_log_expandlogbuf(struct tcp_log_dev_queue *param) +{ + struct tcp_log_dev_log_queue *entry; + struct tcp_log_header *hdr; + uint8_t *end; + struct sockopt sopt; + int error; + + entry = (struct tcp_log_dev_log_queue *)param; + + /* Take a worst-case guess at space needs. 
*/ + sopt.sopt_valsize = sizeof(struct tcp_log_header) + + entry->tldl_count * (sizeof(struct tcp_log_buffer) + + sizeof(struct tcp_log_verbose)); + hdr = malloc(sopt.sopt_valsize, M_TCPLOGDEV, M_NOWAIT); + if (hdr == NULL) { +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_fail5, entry->tldl_count); +#endif + return (NULL); + } + sopt.sopt_val = hdr + 1; + sopt.sopt_valsize -= sizeof(struct tcp_log_header); + sopt.sopt_td = NULL; + + error = tcp_log_logs_to_buf(&sopt, &entry->tldl_entries, + (struct tcp_log_buffer **)&end, entry->tldl_count); + if (error) { + free(hdr, M_TCPLOGDEV); + return (NULL); + } + + /* Free the entries. */ + tcp_log_free_entries(&entry->tldl_entries, &entry->tldl_count); + entry->tldl_count = 0; + + memset(hdr, 0, sizeof(struct tcp_log_header)); + hdr->tlh_version = TCP_LOG_BUF_VER; + hdr->tlh_type = TCP_LOG_DEV_TYPE_BBR; + hdr->tlh_length = end - (uint8_t *)hdr; + hdr->tlh_ie = entry->tldl_ie; + hdr->tlh_af = entry->tldl_af; + getboottime(&hdr->tlh_offset); + strlcpy(hdr->tlh_id, entry->tldl_id, TCP_LOG_ID_LEN); + strlcpy(hdr->tlh_reason, entry->tldl_reason, TCP_LOG_REASON_LEN); + return ((struct tcp_log_common_header *)hdr); +} + +/* + * Queue the tcpcb's log buffer for transmission via the log buffer facility. + * + * NOTE: This should be called with a write lock on the PCB. + * + * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop + * and reacquire the INP lock if it needs to do so. + * + * If force is false, this will only dump auto-logged sessions if + * tcp_log_auto_all is true or if there is a log ID defined for the session. + */ +int +tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force) +{ + struct tcp_log_dev_log_queue *entry; + struct inpcb *inp; +#ifdef TCPLOG_DEBUG_COUNTERS + int num_entries; +#endif + + inp = tp->t_inpcb; + INP_WLOCK_ASSERT(inp); + + /* If there are no log entries, there is nothing to do. */ + if (tp->t_lognum == 0) + return (0); + + /* Check for a log ID. */ + if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) && + !tcp_log_auto_all && !force) { + struct tcp_log_mem *log_entry; + + /* + * We needed a log ID and none was found. Free the log entries + * and return success. Also, cancel further logging. If the + * session doesn't have a log ID by now, we'll assume it isn't + * going to get one. + */ + while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL) + tcp_log_remove_log_head(tp, log_entry); + KASSERT(tp->t_lognum == 0, + ("%s: After freeing entries, tp->t_lognum=%d (expected 0)", + __func__, tp->t_lognum)); + tp->t_logstate = TCP_LOG_STATE_OFF; + return (0); + } + + /* + * Allocate memory. If we must wait, we'll need to drop the locks + * and reacquire them (and do all the related business that goes + * along with that). + */ + entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, + M_NOWAIT); + if (entry == NULL && (how & M_NOWAIT)) { +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_fail3, 1); +#endif + return (ENOBUFS); + } + if (entry == NULL) { + INP_WUNLOCK(inp); + entry = malloc(sizeof(struct tcp_log_dev_log_queue), + M_TCPLOGDEV, M_WAITOK); + INP_WLOCK(inp); + /* + * Note that this check is slightly overly-restrictive in + * that the TCB can survive either of these events. + * However, there is currently not a good way to ensure + * that is the case. So, if we hit this M_WAIT path, we + * may end up dropping some entries. That seems like a + * small price to pay for safety. 
+ */ + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + free(entry, M_TCPLOGDEV); +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_fail2, 1); +#endif + return (ECONNRESET); + } + tp = intotcpcb(inp); + if (tp->t_lognum == 0) { + free(entry, M_TCPLOGDEV); + return (0); + } + } + + /* Fill in the unique parts of the queue entry. */ + if (tp->t_lib != NULL) + strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN); + else + strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN); + if (reason != NULL) + strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); + else + strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); + entry->tldl_ie = inp->inp_inc.inc_ie; + if (inp->inp_inc.inc_flags & INC_ISIPV6) + entry->tldl_af = AF_INET6; + else + entry->tldl_af = AF_INET; + entry->tldl_entries = tp->t_logs; + entry->tldl_count = tp->t_lognum; + + /* Fill in the common parts of the queue entry. */ + entry->tldl_common.tldq_buf = NULL; + entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; + entry->tldl_common.tldq_dtor = tcp_log_free_queue; + + /* Clear the log data from the TCPCB. */ +#ifdef TCPLOG_DEBUG_COUNTERS + num_entries = tp->t_lognum; +#endif + tp->t_lognum = 0; + STAILQ_INIT(&tp->t_logs); + + /* Add the entry. If no one is listening, free the entry. */ + if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) { + tcp_log_free_queue((struct tcp_log_dev_queue *)entry); +#ifdef TCPLOG_DEBUG_COUNTERS + counter_u64_add(tcp_log_que_fail1, num_entries); + } else { + counter_u64_add(tcp_log_queued, num_entries); +#endif + } + return (0); +} + +/* + * Queue the log_id_node's log buffers for transmission via the log buffer + * facility. + * + * NOTE: This should be called with the bucket locked and referenced. + * + * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop + * and reacquire the bucket lock if it needs to do so. (The caller must + * ensure that the tln is no longer on any lists so no one else will mess + * with this while the lock is dropped!) + */ +static int +tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how) +{ + struct tcp_log_dev_log_queue *entry; + struct tcp_log_id_bucket *tlb; + + tlb = tln->tln_bucket; + TCPID_BUCKET_LOCK_ASSERT(tlb); + KASSERT(tlb->tlb_refcnt > 0, + ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)", + __func__, __LINE__, tln, tlb)); + KASSERT(tln->tln_closed, + ("%s:%d: Called for node with tln_closed==false (tln=%p)", + __func__, __LINE__, tln)); + + /* If there are no log entries, there is nothing to do. */ + if (tln->tln_count == 0) + return (0); + + /* + * Allocate memory. If we must wait, we'll need to drop the locks + * and reacquire them (and do all the related business that goes + * along with that). + */ + entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV, + M_NOWAIT); + if (entry == NULL && (how & M_NOWAIT)) + return (ENOBUFS); + if (entry == NULL) { + TCPID_BUCKET_UNLOCK(tlb); + entry = malloc(sizeof(struct tcp_log_dev_log_queue), + M_TCPLOGDEV, M_WAITOK); + TCPID_BUCKET_LOCK(tlb); + } + + /* Fill in the common parts of the queue entry.. */ + entry->tldl_common.tldq_buf = NULL; + entry->tldl_common.tldq_xform = tcp_log_expandlogbuf; + entry->tldl_common.tldq_dtor = tcp_log_free_queue; + + /* Fill in the unique parts of the queue entry. 
*/ + strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN); + if (reason != NULL) + strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN); + else + strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_ID_LEN); + entry->tldl_ie = tln->tln_ie; + entry->tldl_entries = tln->tln_entries; + entry->tldl_count = tln->tln_count; + entry->tldl_af = tln->tln_af; + + /* Add the entry. If no one is listening, free the entry. */ + if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) + tcp_log_free_queue((struct tcp_log_dev_queue *)entry); + + return (0); +} + + +/* + * Queue the log buffers for all sessions in a bucket for transmissions via + * the log buffer facility. + * + * NOTE: This should be called with a locked bucket; however, the function + * will drop the lock. + */ +#define LOCAL_SAVE 10 +static void +tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason) +{ + struct tcp_log_id_node local_entries[LOCAL_SAVE]; + struct inpcb *inp; + struct tcpcb *tp; + struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln; + int i, num_local_entries, tree_locked; + bool expireq_locked; + + TCPID_BUCKET_LOCK_ASSERT(tlb); + + /* + * Take a reference on the bucket to keep it from disappearing until + * we are done. + */ + TCPID_BUCKET_REF(tlb); + + /* + * We'll try to create these without dropping locks. However, we + * might very well need to drop locks to get memory. If that's the + * case, we'll save up to 10 on the stack, and sacrifice the rest. + * (Otherwise, we need to worry about finding our place again in a + * potentially changed list. It just doesn't seem worth the trouble + * to do that. + */ + expireq_locked = false; + num_local_entries = 0; + prev_tln = NULL; + tree_locked = TREE_UNLOCKED; + SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) { + /* + * If this isn't associated with a TCPCB, we can pull it off + * the list now. We need to be careful that the expire timer + * hasn't already taken ownership (tln_expiretime == SBT_MAX). + * If so, we let the expire timer code free the data. + */ + if (cur_tln->tln_closed) { +no_inp: + /* + * Get the expireq lock so we can get a consistent + * read of tln_expiretime and so we can remove this + * from the expireq. + */ + if (!expireq_locked) { + TCPLOG_EXPIREQ_LOCK(); + expireq_locked = true; + } + + /* + * We ignore entries with tln_expiretime == SBT_MAX. + * The expire timer code already owns those. + */ + KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0, + ("%s:%d: node on the expire queue without positive " + "expire time", __func__, __LINE__)); + if (cur_tln->tln_expiretime == SBT_MAX) { + prev_tln = cur_tln; + continue; + } + + /* Remove the entry from the expireq. */ + STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln, + tcp_log_id_node, tln_expireq); + + /* Remove the entry from the bucket. */ + if (prev_tln != NULL) + SLIST_REMOVE_AFTER(prev_tln, tln_list); + else + SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list); + + /* + * Drop the INP and bucket reference counts. Due to + * lock-ordering rules, we need to drop the expire + * queue lock. + */ + TCPLOG_EXPIREQ_UNLOCK(); + expireq_locked = false; + + /* Drop the INP reference. */ + INP_WLOCK(cur_tln->tln_inp); + if (!in_pcbrele_wlocked(cur_tln->tln_inp)) + INP_WUNLOCK(cur_tln->tln_inp); + + if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { +#ifdef INVARIANTS + panic("%s: Bucket refcount unexpectedly 0.", + __func__); +#endif + /* + * Recover as best we can: free the entry we + * own. 
+ */ + tcp_log_free_entries(&cur_tln->tln_entries, + &cur_tln->tln_count); + uma_zfree(tcp_log_node_zone, cur_tln); + goto done; + } + + if (tcp_log_dump_node_logbuf(cur_tln, reason, + M_NOWAIT)) { + /* + * If we have sapce, save the entries locally. + * Otherwise, free them. + */ + if (num_local_entries < LOCAL_SAVE) { + local_entries[num_local_entries] = + *cur_tln; + num_local_entries++; + } else { + tcp_log_free_entries( + &cur_tln->tln_entries, + &cur_tln->tln_count); + } + } + + /* No matter what, we are done with the node now. */ + uma_zfree(tcp_log_node_zone, cur_tln); + + /* + * Because we removed this entry from the list, prev_tln + * (which tracks the previous entry still on the tlb + * list) remains unchanged. + */ + continue; + } + + /* + * If we get to this point, the session data is still held in + * the TCPCB. So, we need to pull the data out of that. + * + * We will need to drop the expireq lock so we can lock the INP. + * We can then try to extract the data the "easy" way. If that + * fails, we'll save the log entries for later. + */ + if (expireq_locked) { + TCPLOG_EXPIREQ_UNLOCK(); + expireq_locked = false; + } + + /* Lock the INP and then re-check the state. */ + inp = cur_tln->tln_inp; + INP_WLOCK(inp); + /* + * If we caught this while it was transitioning, the data + * might have moved from the TCPCB to the tln (signified by + * setting tln_closed to true. If so, treat this like an + * inactive connection. + */ + if (cur_tln->tln_closed) { + /* + * It looks like we may have caught this connection + * while it was transitioning from active to inactive. + * Treat this like an inactive connection. + */ + INP_WUNLOCK(inp); + goto no_inp; + } + + /* + * Try to dump the data from the tp without dropping the lock. + * If this fails, try to save off the data locally. + */ + tp = cur_tln->tln_tp; + if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) && + num_local_entries < LOCAL_SAVE) { + tcp_log_move_tp_to_node(tp, + &local_entries[num_local_entries]); + local_entries[num_local_entries].tln_closed = 1; + KASSERT(local_entries[num_local_entries].tln_bucket == + tlb, ("%s: %d: bucket mismatch for node %p", + __func__, __LINE__, cur_tln)); + num_local_entries++; + } + + INP_WUNLOCK(inp); + + /* + * We are goint to leave the current tln on the list. It will + * become the previous tln. + */ + prev_tln = cur_tln; + } + + /* Drop our locks, if any. */ + KASSERT(tree_locked == TREE_UNLOCKED, + ("%s: %d: tree unexpectedly locked", __func__, __LINE__)); + switch (tree_locked) { + case TREE_WLOCKED: + TCPID_TREE_WUNLOCK(); + tree_locked = TREE_UNLOCKED; + break; + case TREE_RLOCKED: + TCPID_TREE_RUNLOCK(); + tree_locked = TREE_UNLOCKED; + break; + } + if (expireq_locked) { + TCPLOG_EXPIREQ_UNLOCK(); + expireq_locked = false; + } + + /* + * Try again for any saved entries. tcp_log_dump_node_logbuf() is + * guaranteed to free the log entries within the node. And, since + * the node itself is on our stack, we don't need to free it. + */ + for (i = 0; i < num_local_entries; i++) + tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK); + + /* Drop our reference. */ + if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL)) + TCPID_BUCKET_UNLOCK(tlb); + +done: + /* Drop our locks, if any. 
*/ + switch (tree_locked) { + case TREE_WLOCKED: + TCPID_TREE_WUNLOCK(); + break; + case TREE_RLOCKED: + TCPID_TREE_RUNLOCK(); + break; + } + if (expireq_locked) + TCPLOG_EXPIREQ_UNLOCK(); +} +#undef LOCAL_SAVE + + +/* + * Queue the log buffers for all sessions in a bucket for transmissions via + * the log buffer facility. + * + * NOTE: This should be called with a locked INP; however, the function + * will drop the lock. + */ +void +tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason) +{ + struct tcp_log_id_bucket *tlb; + int tree_locked; + + /* Figure out our bucket and lock it. */ + INP_WLOCK_ASSERT(tp->t_inpcb); + tlb = tp->t_lib; + if (tlb == NULL) { + /* + * No bucket; treat this like a request to dump a single + * session's traces. + */ + (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true); + INP_WUNLOCK(tp->t_inpcb); + return; + } + TCPID_BUCKET_REF(tlb); + INP_WUNLOCK(tp->t_inpcb); + TCPID_BUCKET_LOCK(tlb); + + /* If we are the last reference, we have nothing more to do here. */ + tree_locked = TREE_UNLOCKED; + if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) { + switch (tree_locked) { + case TREE_WLOCKED: + TCPID_TREE_WUNLOCK(); + break; + case TREE_RLOCKED: + TCPID_TREE_RUNLOCK(); + break; + } + return; + } + + /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */ + tcp_log_dumpbucketlogs(tlb, reason); +} + +/* + * Mark the end of a flow with the current stack. A stack can add + * stack-specific info to this trace event by overriding this + * function (see bbr_log_flowend() for example). + */ +void +tcp_log_flowend(struct tcpcb *tp) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + struct socket *so = tp->t_inpcb->inp_socket; + TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, + TCP_LOG_FLOWEND, 0, 0, NULL, false); + } +} + Index: head/sys/netinet/tcp_output.c =================================================================== --- head/sys/netinet/tcp_output.c +++ head/sys/netinet/tcp_output.c @@ -74,6 +74,7 @@ #include #define TCPOUTFLAGS #include +#include #include #include #include @@ -1310,6 +1311,10 @@ } #endif + /* We're getting ready to send; log now. */ + TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, + len, NULL, false); + /* * Enable TSO and specify the size of the segments. * The TCP pseudo header checksum is always provided. @@ -1549,6 +1554,9 @@ } if (error) { + /* Record the error. */ + TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, + error, 0, NULL, false); /* * We know that the packet was lost, so back out the Index: head/sys/netinet/tcp_subr.c =================================================================== --- head/sys/netinet/tcp_subr.c +++ head/sys/netinet/tcp_subr.c @@ -98,6 +98,7 @@ #include #include #include +#include #include #include #ifdef INET6 @@ -426,6 +427,71 @@ "list available TCP Function sets"); /* + * Exports one (struct tcp_function_id) for each non-alias. + */ +static int +sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS) +{ + int error, cnt; + struct tcp_function *f; + struct tcp_function_id tfi; + + /* + * We don't allow writes. + */ + if (req->newptr != NULL) + return (EINVAL); + + /* + * Wire the old buffer so we can directly copy the functions to + * user space without dropping the lock. + */ + if (req->oldptr != NULL) { + error = sysctl_wire_old_buffer(req, 0); + if (error) + return (error); + } + + /* + * Walk the list, comparing the name of the function entry and + * function block to determine which is an alias. 
+	 * If exporting the list, copy out matching entries. Otherwise,
+	 * just record the total length.
+	 */
+	cnt = 0;
+	rw_rlock(&tcp_function_lock);
+	TAILQ_FOREACH(f, &t_functions, tf_next) {
+		if (strncmp(f->tf_name, f->tf_fb->tfb_tcp_block_name,
+		    TCP_FUNCTION_NAME_LEN_MAX))
+			continue;
+		if (req->oldptr != NULL) {
+			tfi.tfi_id = f->tf_fb->tfb_id;
+			(void)strncpy(tfi.tfi_name, f->tf_name,
+			    TCP_FUNCTION_NAME_LEN_MAX);
+			tfi.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
+			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
+			/*
+			 * Don't stop on error, as that is the
+			 * mechanism we use to accumulate length
+			 * information if the buffer was too short.
+			 */
+		} else
+			cnt++;
+	}
+	rw_runlock(&tcp_function_lock);
+	if (req->oldptr == NULL)
+		error = SYSCTL_OUT(req, NULL,
+		    (cnt + 1) * sizeof(struct tcp_function_id));
+
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids,
+    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
+    NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id",
+    "List TCP function block name-to-ID mappings");
+
+/*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
@@ -504,6 +570,8 @@
 	return (hashsize);
 }
 
+static volatile int next_tcp_stack_id = 1;
+
 /*
  * Register a TCP function block with the name provided in the names
  * array. (Note that this function does NOT automatically register
@@ -563,6 +631,7 @@
 
 	refcount_init(&blk->tfb_refcnt, 0);
 	blk->tfb_flags = 0;
+	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
 	for (i = 0; i < *num_names; i++) {
 		n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
 		if (n == NULL) {
@@ -779,6 +848,8 @@
 	/* Setup the tcp function block list */
 	init_tcp_functions();
 	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
+	/* Initialize the TCP logging data. */
+	tcp_log_init();
 
 	if (tcp_soreceive_stream) {
 #ifdef INET
@@ -1360,6 +1431,8 @@
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
+	/* Initialize the per-TCPCB log data. */
+	tcp_log_tcpcbinit(tp);
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		(*tp->t_fb->tfb_tcp_fb_init)(tp);
 	}
@@ -1577,6 +1650,7 @@
 	inp->inp_ppcb = NULL;
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on tcpcb, let's free it. */
+		tcp_log_tcpcbfini(tp);
 		TCPSTATES_DEC(tp->t_state);
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@@ -1607,6 +1681,7 @@
 	tp->t_timers->tt_draincnt--;
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on this tcpcb, let's free it. */
+		tcp_log_tcpcbfini(tp);
 		TCPSTATES_DEC(tp->t_state);
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
@@ -1700,6 +1775,7 @@
 		if ((tcpb = intotcpcb(inpb)) != NULL) {
 			tcp_reass_flush(tcpb);
 			tcp_clean_sackreport(tcpb);
+			tcp_log_drain(tcpb);
 #ifdef TCPPCAP
 			if (tcp_pcap_aggressive_free) {
 				/* Free the TCP PCAP queues. */
@@ -2856,6 +2932,7 @@
 		xt->t_state = TCPS_TIME_WAIT;
 	} else {
 		xt->t_state = tp->t_state;
+		xt->t_logstate = tp->t_logstate;
 		xt->t_flags = tp->t_flags;
 		xt->t_sndzerowin = tp->t_sndzerowin;
 		xt->t_sndrexmitpack = tp->t_sndrexmitpack;
@@ -2879,6 +2956,8 @@
 
 		bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
 		    TCP_FUNCTION_NAME_LEN_MAX);
+		bzero(xt->xt_logid, TCP_LOG_ID_LEN);
+		(void)tcp_log_get_id(tp, xt->xt_logid);
 	}
 
 	xt->xt_len = sizeof(struct xtcpcb);
Index: head/sys/netinet/tcp_timer.c
===================================================================
--- head/sys/netinet/tcp_timer.c
+++ head/sys/netinet/tcp_timer.c
@@ -68,6 +68,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -644,6 +645,7 @@
 	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
 	    ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
 	tcp_free_sackholes(tp);
+	TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
 	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
 		/* The stack has a timer action too. */
 		(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
Index: head/sys/netinet/tcp_usrreq.c
===================================================================
--- head/sys/netinet/tcp_usrreq.c
+++ head/sys/netinet/tcp_usrreq.c
@@ -90,6 +90,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -1026,6 +1027,11 @@
 			tp->t_flags &= ~TF_FORCEDATA;
 		}
 	}
+	TCP_LOG_EVENT(tp, NULL,
+	    &inp->inp_socket->so_rcv,
+	    &inp->inp_socket->so_snd,
+	    TCP_LOG_USERSEND, error,
+	    0, NULL, false);
 out:
 	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
 		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
@@ -1533,6 +1539,15 @@
 	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
 }
 
+/*
+ * If this assert becomes untrue, we need to change the size of the buf
+ * variable in tcp_default_ctloutput().
+ */
+#ifdef CTASSERT
+CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
+CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
+#endif
+
 int
 tcp_default_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp)
 {
@@ -1540,7 +1555,7 @@
 	u_int	ui;
 	struct	tcp_info ti;
 	struct cc_algo *algo;
-	char	*pbuf, buf[TCP_CA_NAME_MAX];
+	char	*pbuf, buf[TCP_LOG_ID_LEN];
 	size_t	len;
 
 	/*
@@ -1822,6 +1837,55 @@
 		goto unlock_and_done;
 	}
 
+	case TCP_LOG:
+		INP_WUNLOCK(inp);
+		error = sooptcopyin(sopt, &optval, sizeof optval,
+		    sizeof optval);
+		if (error)
+			return (error);
+
+		INP_WLOCK_RECHECK(inp);
+		error = tcp_log_state_change(tp, optval);
+		goto unlock_and_done;
+
+	case TCP_LOGBUF:
+		INP_WUNLOCK(inp);
+		error = EINVAL;
+		break;
+
+	case TCP_LOGID:
+		INP_WUNLOCK(inp);
+		error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
+		if (error)
+			break;
+		buf[sopt->sopt_valsize] = '\0';
+		INP_WLOCK_RECHECK(inp);
+		error = tcp_log_set_id(tp, buf);
+		/* tcp_log_set_id() unlocks the INP. */
+		break;
+
+	case TCP_LOGDUMP:
+	case TCP_LOGDUMPID:
+		INP_WUNLOCK(inp);
+		error =
+		    sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
+		if (error)
+			break;
+		buf[sopt->sopt_valsize] = '\0';
+		INP_WLOCK_RECHECK(inp);
+		if (sopt->sopt_name == TCP_LOGDUMP) {
+			error = tcp_log_dump_tp_logbuf(tp, buf,
+			    M_WAITOK, true);
+			INP_WUNLOCK(inp);
+		} else {
+			tcp_log_dump_tp_bucket_logbufs(tp, buf);
+			/*
+			 * tcp_log_dump_tp_bucket_logbufs() drops the
+			 * INP lock.
+			 */
+		}
+		break;
+
 	default:
 		INP_WUNLOCK(inp);
 		error = ENOPROTOOPT;
@@ -1906,6 +1970,25 @@
 		optval = tp->t_flags & TF_FASTOPEN;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &optval, sizeof optval);
+		break;
+	case TCP_LOG:
+		optval = tp->t_logstate;
+		INP_WUNLOCK(inp);
+		error = sooptcopyout(sopt, &optval, sizeof(optval));
+		break;
+	case TCP_LOGBUF:
+		/* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
+		error = tcp_log_getlogbuf(sopt, tp);
+		break;
+	case TCP_LOGID:
+		len = tcp_log_get_id(tp, buf);
+		INP_WUNLOCK(inp);
+		error = sooptcopyout(sopt, buf, len + 1);
+		break;
+	case TCP_LOGDUMP:
+	case TCP_LOGDUMPID:
+		INP_WUNLOCK(inp);
+		error = EINVAL;
 		break;
 	default:
 		INP_WUNLOCK(inp);
Index: head/sys/netinet/tcp_var.h
===================================================================
--- head/sys/netinet/tcp_var.h
+++ head/sys/netinet/tcp_var.h
@@ -79,6 +79,8 @@
 	uint64_t	_pad[1];	/* TBD */
 };
 
+STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
+
 /*
  * Tcp control block, one per tcp; fields:
  * Organized for 16 byte cacheline efficiency.
@@ -189,6 +191,13 @@
 	u_int	t_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 	u_int	t_flags2;		/* More tcpcb flags storage */
+	int	t_logstate;		/* State of "black box" logging */
+	struct tcp_log_stailq t_logs;	/* Log buffer */
+	int	t_lognum;		/* Number of log entries */
+	uint32_t t_logsn;		/* Log "serial number" */
+	struct tcp_log_id_node *t_lin;
+	struct tcp_log_id_bucket *t_lib;
+	const char *t_output_caller;	/* Function that called tcp_output */
 	struct tcp_function_block *t_fb;/* TCP function call block */
 	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
 	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
@@ -267,6 +276,7 @@
 	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	volatile uint32_t tfb_refcnt;
 	uint32_t tfb_flags;
+	uint8_t	tfb_id;
 };
 
 struct tcp_function {
@@ -339,11 +349,12 @@
 #define	TCPOOB_HADDATA	0x02
 
 /*
- * Flags for PLPMTU handling, t_flags2
+ * Flags for the extended TCP flags field, t_flags2
  */
 #define	TF2_PLPMTU_BLACKHOLE	0x00000001 /* Possible PLPMTUD Black Hole. */
 #define	TF2_PLPMTU_PMTUD	0x00000002 /* Allowed to attempt PLPMTUD. */
 #define	TF2_PLPMTU_MAXSEGSNT	0x00000004 /* Last seg sent was full seg. */
+#define	TF2_LOG_AUTO		0x00000008 /* Session is auto-logging. */
 
 /*
  * Structure to hold TCP options that are only used during segment
@@ -654,6 +665,7 @@
 	size_t		xt_len;		/* length of this structure */
 	struct xinpcb	xt_inp;
 	char		xt_stack[TCP_FUNCTION_NAME_LEN_MAX];	/* (s) */
+	char		xt_logid[TCP_LOG_ID_LEN];	/* (s) */
 	int64_t		spare64[8];
 	int32_t		t_state;	/* (s,p) */
 	uint32_t	t_flags;	/* (s,p) */
@@ -666,12 +678,22 @@
 	int32_t		tt_keep;	/* (s) */
 	int32_t		tt_2msl;	/* (s) */
 	int32_t		tt_delack;	/* (s) */
+	int32_t		t_logstate;	/* (3) */
 	int32_t		spare32[32];
 } __aligned(8);
+
 #ifdef _KERNEL
 void	tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
 #endif
 #endif
+
+/*
+ * TCP function name-to-id mapping exported to user-land via sysctl(3).
+ */
+struct tcp_function_id {
+	uint8_t		tfi_id;
+	char		tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
+};
 
 /*
  * Identifiers for TCP sysctl nodes
Index: head/usr.bin/netstat/inet.c
===================================================================
--- head/usr.bin/netstat/inet.c
+++ head/usr.bin/netstat/inet.c
@@ -321,7 +321,7 @@
 		    "Proto", "Recv-Q", "Send-Q",
 		    "Local Address", "Foreign Address");
 		if (!xflag && !Rflag)
-			xo_emit(" (state)");
+			xo_emit(" {T:/%-11.11s}", "(state)");
 	}
 	if (xflag) {
 		xo_emit(" {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} "
@@ -339,6 +339,8 @@
 			xo_emit(" {T:/%8.8s} {T:/%5.5s}",
 			    "flowid", "ftype");
 		}
+		if (Pflag)
+			xo_emit(" {T:/%s}", "Log ID");
 		xo_emit("\n");
 		first = 0;
 	}
@@ -478,9 +480,9 @@
 	}
 	if (istcp && !Lflag && !xflag && !Tflag && !Rflag) {
 		if (tp->t_state < 0 || tp->t_state >= TCP_NSTATES)
-			xo_emit("{:tcp-state/%d}", tp->t_state);
+			xo_emit("{:tcp-state/%-11d}", tp->t_state);
 		else {
-			xo_emit("{:tcp-state/%s}",
+			xo_emit("{:tcp-state/%-11s}",
 			    tcpstates[tp->t_state]);
 #if defined(TF_NEEDSYN) && defined(TF_NEEDFIN)
 			/* Show T/TCP `hidden state' */
@@ -495,6 +497,9 @@
 			    inp->inp_flowid,
 			    inp->inp_flowtype);
 		}
+		if (istcp && Pflag)
+			xo_emit(" {:log-id/%s}", tp->xt_logid[0] == '\0' ?
+			    "-" : tp->xt_logid);
 		xo_emit("\n");
 		xo_close_instance("socket");
 	}
Index: head/usr.bin/netstat/main.c
===================================================================
--- head/usr.bin/netstat/main.c
+++ head/usr.bin/netstat/main.c
@@ -214,6 +214,7 @@
 int	noutputs = 0;	/* how much outputs before we exit */
 int	numeric_addr;	/* show addresses numerically */
 int	numeric_port;	/* show ports numerically */
+int	Pflag;		/* show TCP log ID */
 static int pflag;	/* show given protocol */
 static int Qflag;	/* show netisr information */
 int	rflag;		/* show routing tables (or routing stats) */
@@ -247,7 +248,7 @@
 	if (argc < 0)
 		exit(EXIT_FAILURE);
 
-	while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:np:Qq:RrSTsuWw:xz"))
+	while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
 	    != -1)
 		switch(ch) {
 		case '4':
@@ -343,6 +344,9 @@
 			break;
 		case 'n':
 			numeric_addr = numeric_port = 1;
+			break;
+		case 'P':
+			Pflag = 1;
 			break;
 		case 'p':
 			if ((tp = name2protox(optarg)) == NULL) {
Index: head/usr.bin/netstat/netstat.h
===================================================================
--- head/usr.bin/netstat/netstat.h
+++ head/usr.bin/netstat/netstat.h
@@ -50,6 +50,7 @@
 extern int	noutputs;	/* how much outputs before we exit */
 extern int	numeric_addr;	/* show addresses numerically */
 extern int	numeric_port;	/* show ports numerically */
+extern int	Pflag;		/* show TCP log ID */
 extern int	rflag;		/* show routing tables (or routing stats) */
 extern int	Rflag;		/* show flowid / RSS information */
 extern int	sflag;		/* show protocol statistics */
Index: head/usr.bin/netstat/netstat.1
===================================================================
--- head/usr.bin/netstat/netstat.1
+++ head/usr.bin/netstat/netstat.1
@@ -39,7 +39,7 @@
 .Bl -tag -width "netstat"
 .It Nm
 .Op Fl -libxo
-.Op Fl 46AaLnRSTWx
+.Op Fl 46AaLnPRSTWx
 .Op Fl f Ar protocol_family | Fl p Ar protocol
 .Op Fl M Ar core
 .Op Fl N Ar system
@@ -181,6 +181,8 @@
 Do not resolve numeric addresses and port numbers to names.
 See
 .Sx GENERAL OPTIONS .
+.It Fl P
+Display the log ID for each socket.
+.It Fl R
 Display the flowid and flowtype for each socket.
 flowid is a 32 bit hardware specific identifier for each flow.
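
As a usage illustration only (not part of the change itself): a userspace program could drive the new per-connection logging controls roughly as sketched below. The TCP_LOG, TCP_LOGID, and TCP_LOGDUMP option names match the tcp_usrreq.c hunks above, but the numeric log-state value and the header that defines symbolic TCP_LOG_STATE_* names (netinet/tcp_log_buf.h) are outside this excerpt, so treat those details as assumptions.

/*
 * Hypothetical sketch of the new per-connection logging socket options.
 * The placeholder state value below should really be one of the
 * TCP_LOG_STATE_* constants from netinet/tcp_log_buf.h (not shown here).
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>

static int
enable_connection_logging(int fd, const char *id)
{
	int state = 1;	/* placeholder for a TCP_LOG_STATE_* value */

	/* Tag the connection; netstat -P and the log device report this ID. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_LOGID, id, strlen(id)) == -1)
		return (-1);
	/* Switch the per-connection black box recorder on. */
	return (setsockopt(fd, IPPROTO_TCP, TCP_LOG, &state, sizeof(state)));
}

static int
dump_connection_log(int fd, const char *reason)
{
	/* Push whatever has been recorded so far to the tcp_log device. */
	return (setsockopt(fd, IPPROTO_TCP, TCP_LOGDUMP, reason,
	    strlen(reason)));
}

With logging enabled and an ID set, netstat -P (added above) prints the log ID in an extra column for each TCP socket.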
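Similarly, the name-to-ID mappings exported by the new net.inet.tcp.function_ids node could be read back with sysctlbyname(3); the handler above deliberately over-reports the length by one entry when probed with a NULL old pointer. This is a sketch under assumptions: the local structure merely mirrors struct tcp_function_id from the tcp_var.h hunk, and the TCP_FUNCTION_NAME_LEN_MAX value is assumed from netinet/tcp.h.

/*
 * Hypothetical sketch: list TCP function block name-to-ID mappings.
 */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define	TCP_FUNCTION_NAME_LEN_MAX	32	/* assumed; see netinet/tcp.h */

struct tcp_function_id {
	uint8_t	tfi_id;
	char	tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
};

int
main(void)
{
	struct tcp_function_id *tfi;
	size_t i, len;

	/* First call sizes the buffer (the handler pads by one entry). */
	if (sysctlbyname("net.inet.tcp.function_ids", NULL, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");
	if ((tfi = malloc(len)) == NULL)
		err(1, "malloc");
	if (sysctlbyname("net.inet.tcp.function_ids", tfi, &len, NULL, 0) == -1)
		err(1, "sysctlbyname");
	for (i = 0; i < len / sizeof(*tfi); i++)
		printf("%u\t%s\n", tfi[i].tfi_id, tfi[i].tfi_name);
	free(tfi);
	return (0);
}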