D11085.id.diff
No OneTemporary
Actions

Size

126 KB

Referenced Files

None

Subscribers

None

D11085.id.diff
View Options

	Index: head/etc/mtree/BSD.include.dist
	===================================================================
	--- head/etc/mtree/BSD.include.dist
	+++ head/etc/mtree/BSD.include.dist
	@@ -158,6 +158,8 @@
	..
	speaker
	..
	+ tcp_log
	+ ..
	usb
	..
	vkbd
	Index: head/include/Makefile
	===================================================================
	--- head/include/Makefile
	+++ head/include/Makefile
	@@ -47,7 +47,7 @@
	dev/hwpmc dev/hyperv \
	dev/ic dev/iicbus dev/io dev/lmc dev/mfi dev/mmc dev/nvme \
	dev/ofw dev/pbio dev/pci ${_dev_powermac_nvram} dev/ppbus dev/smbus \
	- dev/speaker dev/vkbd dev/wi \
	+ dev/speaker dev/tcp_log dev/vkbd dev/wi \
	fs/devfs fs/fdescfs fs/msdosfs fs/nandfs fs/nfs fs/nullfs \
	fs/procfs fs/smbfs fs/udf fs/unionfs \
	geom/cache geom/concat geom/eli geom/gate geom/journal geom/label \
	Index: head/sys/conf/files
	===================================================================
	--- head/sys/conf/files
	+++ head/sys/conf/files
	@@ -3161,6 +3161,7 @@
	dev/syscons/syscons.c optional sc
	dev/syscons/sysmouse.c optional sc
	dev/syscons/warp/warp_saver.c optional warp_saver
	+dev/tcp_log/tcp_log_dev.c optional inet | inet6
	dev/tdfx/tdfx_linux.c optional tdfx_linux tdfx compat_linux
	dev/tdfx/tdfx_pci.c optional tdfx pci
	dev/ti/if_ti.c optional ti pci
	@@ -4309,6 +4310,7 @@
	netinet/tcp_fastopen.c optional inet tcp_rfc7413 | inet6 tcp_rfc7413
	netinet/tcp_hostcache.c optional inet | inet6
	netinet/tcp_input.c optional inet | inet6
	+netinet/tcp_log_buf.c optional inet | inet6
	netinet/tcp_lro.c optional inet | inet6
	netinet/tcp_output.c optional inet | inet6
	netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
	Index: head/sys/dev/tcp_log/tcp_log_dev.h
	===================================================================
	--- head/sys/dev/tcp_log/tcp_log_dev.h
	+++ head/sys/dev/tcp_log/tcp_log_dev.h
	@@ -0,0 +1,88 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	+ *
	+ * Copyright (c) 2016
	+ * Netflix Inc. All rights reserved.
	+ *
	+ * Redistribution and use in source and binary forms, with or without
	+ * modification, are permitted provided that the following conditions
	+ * are met:
	+ * 1. Redistributions of source code must retain the above copyright
	+ * notice, this list of conditions and the following disclaimer.
	+ * 2. Redistributions in binary form must reproduce the above copyright
	+ * notice, this list of conditions and the following disclaimer in the
	+ * documentation and/or other materials provided with the distribution.
	+ *
	+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	+ * SUCH DAMAGE.
	+ *
	+ * $FreeBSD$
	+ */
	+
	+#ifndef __tcp_log_dev_h__
	+#define __tcp_log_dev_h__
	+
	+/*
	+ * This is the common header for data streamed from the log device. All
	+ * blocks of data need to start with this header.
	+ */
	+struct tcp_log_common_header {
	+ uint32_t tlch_version; /* Version is specific to type. */
	+ uint32_t tlch_type; /* Type of entry(ies) that follow. */
	+ uint64_t tlch_length; /* Total length, including header. */
	+} __packed;
	+
	+#define TCP_LOG_DEV_TYPE_BBR 1 /* black box recorder */
	+
	+#ifdef _KERNEL
	+/*
	+ * This is a queue entry. All queue entries need to start with this structure
	+ * so the common code can cast them to this structure; however, other modules
	+ * are free to include additional data after this structure.
	+ *
	+ * The elements are explained here:
	+ * tldq_queue: used by the common code to maintain this entry's position in the
	+ * queue.
	+ * tldq_buf: should be NULL, or a pointer to a chunk of data. The data must be
	+ * as long as the common header indicates.
	+ * tldq_xform: If tldq_buf is NULL, the code will call this to create the
	+ * tldq_buf object. The function should not directly modify tldq_buf,
	+ * but should return the buffer (which must meet the restrictions
	+ * indicated for tldq_buf).
	+ * tldq_dtor: This function is called to free the queue entry. If tldq_buf is
	+ * not NULL, the dtor function must free that, too.
	+ * tldq_refcnt: used by the common code to indicate how many readers still need
	+ * this data.
	+ */
	+struct tcp_log_dev_queue {
	+ STAILQ_ENTRY(tcp_log_dev_queue) tldq_queue;
	+ struct tcp_log_common_header *tldq_buf;
	+ struct tcp_log_common_header *(*tldq_xform)(struct tcp_log_dev_queue *entry);
	+ void (*tldq_dtor)(struct tcp_log_dev_queue *entry);
	+ volatile u_int tldq_refcnt;
	+};
	+
	+STAILQ_HEAD(log_queueh, tcp_log_dev_queue);
	+
	+struct tcp_log_dev_info {
	+ STAILQ_ENTRY(tcp_log_dev_info) tldi_list;
	+ struct tcp_log_dev_queue *tldi_head;
	+ struct tcp_log_common_header *tldi_cur;
	+ off_t tldi_off;
	+};
	+STAILQ_HEAD(log_infoh, tcp_log_dev_info);
	+
	+
	+MALLOC_DECLARE(M_TCPLOGDEV);
	+int tcp_log_dev_add_log(struct tcp_log_dev_queue *entry);
	+#endif /* _KERNEL */
	+#endif /* !__tcp_log_dev_h__ */
	Index: head/sys/dev/tcp_log/tcp_log_dev.c
	===================================================================
	--- head/sys/dev/tcp_log/tcp_log_dev.c
	+++ head/sys/dev/tcp_log/tcp_log_dev.c
	@@ -0,0 +1,521 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	+ *
	+ * Copyright (c) 2016-2017
	+ * Netflix Inc. All rights reserved.
	+ *
	+ * Redistribution and use in source and binary forms, with or without
	+ * modification, are permitted provided that the following conditions
	+ * are met:
	+ * 1. Redistributions of source code must retain the above copyright
	+ * notice, this list of conditions and the following disclaimer.
	+ * 2. Redistributions in binary form must reproduce the above copyright
	+ * notice, this list of conditions and the following disclaimer in the
	+ * documentation and/or other materials provided with the distribution.
	+ *
	+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	+ * SUCH DAMAGE.
	+ *
	+ */
	+
	+#include <sys/cdefs.h>
	+__FBSDID("$FreeBSD$");
	+
	+#include <sys/param.h>
	+#include <sys/conf.h>
	+#include <sys/fcntl.h>
	+#include <sys/filio.h>
	+#include <sys/kernel.h>
	+#include <sys/lock.h>
	+#include <sys/malloc.h>
	+#include <sys/module.h>
	+#include <sys/poll.h>
	+#include <sys/queue.h>
	+#include <sys/refcount.h>
	+#include <sys/mutex.h>
	+#include <sys/selinfo.h>
	+#include <sys/socket.h>
	+#include <sys/socketvar.h>
	+#include <sys/sysctl.h>
	+#include <sys/tree.h>
	+#include <sys/uio.h>
	+#include <machine/atomic.h>
	+#include <sys/counter.h>
	+
	+#include <dev/tcp_log/tcp_log_dev.h>
	+
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+extern counter_u64_t tcp_log_que_read;
	+extern counter_u64_t tcp_log_que_freed;
	+#endif
	+
	+static struct cdev *tcp_log_dev;
	+static struct selinfo tcp_log_sel;
	+
	+static struct log_queueh tcp_log_dev_queue_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_queue_head);
	+static struct log_infoh tcp_log_dev_reader_head = STAILQ_HEAD_INITIALIZER(tcp_log_dev_reader_head);
	+
	+MALLOC_DEFINE(M_TCPLOGDEV, "tcp_log_dev", "TCP log device data structures");
	+
	+static int tcp_log_dev_listeners = 0;
	+
	+static struct mtx tcp_log_dev_queue_lock;
	+
	+#define TCP_LOG_DEV_QUEUE_LOCK() mtx_lock(&tcp_log_dev_queue_lock)
	+#define TCP_LOG_DEV_QUEUE_UNLOCK() mtx_unlock(&tcp_log_dev_queue_lock)
	+#define TCP_LOG_DEV_QUEUE_LOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_OWNED)
	+#define TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT() mtx_assert(&tcp_log_dev_queue_lock, MA_NOTOWNED)
	+#define TCP_LOG_DEV_QUEUE_REF(tldq) refcount_acquire(&((tldq)->tldq_refcnt))
	+#define TCP_LOG_DEV_QUEUE_UNREF(tldq) refcount_release(&((tldq)->tldq_refcnt))
	+
	+static void tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry);
	+static void tcp_log_dev_clear_cdevpriv(void *data);
	+static int tcp_log_dev_open(struct cdev *dev __unused, int flags,
	+ int devtype __unused, struct thread *td __unused);
	+static int tcp_log_dev_write(struct cdev *dev __unused,
	+ struct uio *uio __unused, int flags __unused);
	+static int tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio,
	+ int flags __unused);
	+static int tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd,
	+ caddr_t data, int fflag __unused, struct thread *td __unused);
	+static int tcp_log_dev_poll(struct cdev *dev __unused, int events,
	+ struct thread *td);
	+
	+
	+enum tcp_log_dev_queue_lock_state {
	+ QUEUE_UNLOCKED = 0,
	+ QUEUE_LOCKED,
	+};
	+
	+static struct cdevsw tcp_log_cdevsw = {
	+ .d_version = D_VERSION,
	+ .d_read = tcp_log_dev_read,
	+ .d_open = tcp_log_dev_open,
	+ .d_write = tcp_log_dev_write,
	+ .d_poll = tcp_log_dev_poll,
	+ .d_ioctl = tcp_log_dev_ioctl,
	+#ifdef NOTYET
	+ .d_mmap = tcp_log_dev_mmap,
	+#endif
	+ .d_name = "tcp_log",
	+};
	+
	+static __inline void
	+tcp_log_dev_queue_validate_lock(int lockstate)
	+{
	+
	+#ifdef INVARIANTS
	+ switch (lockstate) {
	+ case QUEUE_LOCKED:
	+ TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	+ break;
	+ case QUEUE_UNLOCKED:
	+ TCP_LOG_DEV_QUEUE_UNLOCK_ASSERT();
	+ break;
	+ default:
	+ kassert_panic("%s:%d: unknown queue lock state", __func__,
	+ __LINE__);
	+ }
	+#endif
	+}
	+
	+/*
	+ * Clear the refcount. If appropriate, it will remove the entry from the
	+ * queue and call the destructor.
	+ *
	+ * This must be called with the queue lock held.
	+ */
	+static void
	+tcp_log_dev_clear_refcount(struct tcp_log_dev_queue *entry)
	+{
	+
	+ KASSERT(entry != NULL, ("%s: called with NULL entry", __func__));
	+
	+ TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	+
	+ if (TCP_LOG_DEV_QUEUE_UNREF(entry)) {
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ counter_u64_add(tcp_log_que_freed, 1);
	+#endif
	+ /* Remove the entry from the queue and call the destructor. */
	+ STAILQ_REMOVE(&tcp_log_dev_queue_head, entry, tcp_log_dev_queue,
	+ tldq_queue);
	+ (*entry->tldq_dtor)(entry);
	+ }
	+}
	+
	+static void
	+tcp_log_dev_clear_cdevpriv(void *data)
	+{
	+ struct tcp_log_dev_info *priv;
	+ struct tcp_log_dev_queue *entry, *entry_tmp;
	+
	+ priv = (struct tcp_log_dev_info *)data;
	+ if (priv == NULL)
	+ return;
	+
	+ /*
	+ * Lock the queue and drop our references. We hold references to all
	+ * the entries starting with tldi_head (or, if tldi_head == NULL, all
	+ * entries in the queue).
	+ *
	+ * Because we don't want anyone adding additional things to the queue
	+ * while we are doing this, we lock the queue.
	+ */
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+ if (priv->tldi_head != NULL) {
	+ entry = priv->tldi_head;
	+ STAILQ_FOREACH_FROM_SAFE(entry, &tcp_log_dev_queue_head,
	+ tldq_queue, entry_tmp) {
	+ tcp_log_dev_clear_refcount(entry);
	+ }
	+ }
	+ tcp_log_dev_listeners--;
	+ KASSERT(tcp_log_dev_listeners >= 0,
	+ ("%s: tcp_log_dev_listeners is unexpectedly negative", __func__));
	+ STAILQ_REMOVE(&tcp_log_dev_reader_head, priv, tcp_log_dev_info,
	+ tldi_list);
	+ TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ free(priv, M_TCPLOGDEV);
	+}
	+
	+static int
	+tcp_log_dev_open(struct cdev *dev __unused, int flags, int devtype __unused,
	+ struct thread *td __unused)
	+{
	+ struct tcp_log_dev_info *priv;
	+ struct tcp_log_dev_queue *entry;
	+ int rv;
	+
	+ /*
	+ * Ideally, we shouldn't see these because of file system
	+ * permissions.
	+ */
	+ if (flags & (FWRITE | FEXEC | FAPPEND | O_TRUNC))
	+ return (ENODEV);
	+
	+ /* Allocate space to hold information about where we are. */
	+ priv = malloc(sizeof(struct tcp_log_dev_info), M_TCPLOGDEV,
	+ M_ZERO | M_WAITOK);
	+
	+ /* Stash the private data away. */
	+ rv = devfs_set_cdevpriv((void *)priv, tcp_log_dev_clear_cdevpriv);
	+ if (!rv) {
	+ /*
	+ * Increase the listener count, add this reader to the list, and
	+ * take references on all current queues.
	+ */
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+ tcp_log_dev_listeners++;
	+ STAILQ_INSERT_HEAD(&tcp_log_dev_reader_head, priv, tldi_list);
	+ priv->tldi_head = STAILQ_FIRST(&tcp_log_dev_queue_head);
	+ if (priv->tldi_head != NULL)
	+ priv->tldi_cur = priv->tldi_head->tldq_buf;
	+ STAILQ_FOREACH(entry, &tcp_log_dev_queue_head, tldq_queue)
	+ TCP_LOG_DEV_QUEUE_REF(entry);
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ } else {
	+ /* Free the entry. */
	+ free(priv, M_TCPLOGDEV);
	+ }
	+ return (rv);
	+}
	+
	+static int
	+tcp_log_dev_write(struct cdev *dev __unused, struct uio *uio __unused,
	+ int flags __unused)
	+{
	+
	+ return (ENODEV);
	+}
	+
	+static __inline void
	+tcp_log_dev_rotate_bufs(struct tcp_log_dev_info *priv, int *lockstate)
	+{
	+ struct tcp_log_dev_queue *entry;
	+
	+ KASSERT(priv->tldi_head != NULL,
	+ ("%s:%d: priv->tldi_head unexpectedly NULL",
	+ __func__, __LINE__));
	+ KASSERT(priv->tldi_head->tldq_buf == priv->tldi_cur,
	+ ("%s:%d: buffer mismatch (%p vs %p)",
	+ __func__, __LINE__, priv->tldi_head->tldq_buf,
	+ priv->tldi_cur));
	+ tcp_log_dev_queue_validate_lock(*lockstate);
	+
	+ if (*lockstate == QUEUE_UNLOCKED) {
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+ *lockstate = QUEUE_LOCKED;
	+ }
	+ entry = priv->tldi_head;
	+ priv->tldi_head = STAILQ_NEXT(entry, tldq_queue);
	+ tcp_log_dev_clear_refcount(entry);
	+ priv->tldi_cur = NULL;
	+}
	+
	+static int
	+tcp_log_dev_read(struct cdev *dev __unused, struct uio *uio, int flags)
	+{
	+ struct tcp_log_common_header *buf;
	+ struct tcp_log_dev_info *priv;
	+ struct tcp_log_dev_queue *entry;
	+ ssize_t len;
	+ int lockstate, rv;
	+
	+ /* Get our private info. */
	+ rv = devfs_get_cdevpriv((void **)&priv);
	+ if (rv)
	+ return (rv);
	+
	+ lockstate = QUEUE_UNLOCKED;
	+
	+ /* Do we need to get a new buffer? */
	+ while (priv->tldi_cur == NULL ||
	+ priv->tldi_cur->tlch_length <= priv->tldi_off) {
	+ /* Did we somehow forget to rotate? */
	+ KASSERT(priv->tldi_cur == NULL,
	+ ("%s:%d: tldi_cur is unexpectedly non-NULL", __func__,
	+ __LINE__));
	+ if (priv->tldi_cur != NULL)
	+ tcp_log_dev_rotate_bufs(priv, &lockstate);
	+
	+ /*
	+ * Before we start looking at tldi_head, we need a lock on the
	+ * queue to make sure tldi_head stays stable.
	+ */
	+ if (lockstate == QUEUE_UNLOCKED) {
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+ lockstate = QUEUE_LOCKED;
	+ }
	+
	+ /* We need the next buffer. Do we have one? */
	+ if (priv->tldi_head == NULL && (flags & FNONBLOCK)) {
	+ rv = EAGAIN;
	+ goto done;
	+ }
	+ if (priv->tldi_head == NULL) {
	+ /* Sleep and wait for more things we can read. */
	+ rv = mtx_sleep(&tcp_log_dev_listeners,
	+ &tcp_log_dev_queue_lock, PCATCH, "tcplogdev", 0);
	+ if (rv)
	+ goto done;
	+ if (priv->tldi_head == NULL)
	+ continue;
	+ }
	+
	+ /*
	+ * We have an entry to read. We want to try to create a
	+ * buffer, if one doesn't already exist.
	+ */
	+ entry = priv->tldi_head;
	+ if (entry->tldq_buf == NULL) {
	+ TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	+ buf = (*entry->tldq_xform)(entry);
	+ if (buf == NULL) {
	+ rv = EBUSY;
	+ goto done;
	+ }
	+ entry->tldq_buf = buf;
	+ }
	+
	+ priv->tldi_cur = entry->tldq_buf;
	+ priv->tldi_off = 0;
	+ }
	+
	+ /* Copy what we can from this buffer to the output buffer. */
	+ if (uio->uio_resid > 0) {
	+ /* Drop locks so we can take page faults. */
	+ if (lockstate == QUEUE_LOCKED)
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ lockstate = QUEUE_UNLOCKED;
	+
	+ KASSERT(priv->tldi_cur != NULL,
	+ ("%s: priv->tldi_cur is unexpectedly NULL", __func__));
	+
	+ /* Copy as much as we can to this uio. */
	+ len = priv->tldi_cur->tlch_length - priv->tldi_off;
	+ if (len > uio->uio_resid)
	+ len = uio->uio_resid;
	+ rv = uiomove(((uint8_t *)priv->tldi_cur) + priv->tldi_off,
	+ len, uio);
	+ if (rv != 0)
	+ goto done;
	+ priv->tldi_off += len;
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ counter_u64_add(tcp_log_que_read, len);
	+#endif
	+ }
	+ /* Are we done with this buffer? If so, find the next one. */
	+ if (priv->tldi_off >= priv->tldi_cur->tlch_length) {
	+ KASSERT(priv->tldi_off == priv->tldi_cur->tlch_length,
	+ ("%s: offset (%ju) exceeds length (%ju)", __func__,
	+ (uintmax_t)priv->tldi_off,
	+ (uintmax_t)priv->tldi_cur->tlch_length));
	+ tcp_log_dev_rotate_bufs(priv, &lockstate);
	+ }
	+done:
	+ tcp_log_dev_queue_validate_lock(lockstate);
	+ if (lockstate == QUEUE_LOCKED)
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ return (rv);
	+}
	+
	+static int
	+tcp_log_dev_ioctl(struct cdev *dev __unused, u_long cmd, caddr_t data,
	+ int fflag __unused, struct thread *td __unused)
	+{
	+ struct tcp_log_dev_info *priv;
	+ int rv;
	+
	+ /* Get our private info. */
	+ rv = devfs_get_cdevpriv((void **)&priv);
	+ if (rv)
	+ return (rv);
	+
	+ /*
	+ * Set things. Here, we are most concerned about the non-blocking I/O
	+ * flag.
	+ */
	+ rv = 0;
	+ switch (cmd) {
	+ case FIONBIO:
	+ break;
	+ case FIOASYNC:
	+ if (*(int *)data != 0)
	+ rv = EINVAL;
	+ break;
	+ default:
	+ rv = ENOIOCTL;
	+ }
	+ return (rv);
	+}
	+
	+static int
	+tcp_log_dev_poll(struct cdev *dev __unused, int events, struct thread *td)
	+{
	+ struct tcp_log_dev_info *priv;
	+ int revents;
	+
	+ /*
	+ * Get our private info. If this fails, claim that all events are
	+ * ready. That should prod the user to do something that will
	+ * make the error evident to them.
	+ */
	+ if (devfs_get_cdevpriv((void **)&priv))
	+ return (events);
	+
	+ revents = 0;
	+ if (events & (POLLIN | POLLRDNORM)) {
	+ /*
	+ * We can (probably) read right now if we are partway through
	+ * a buffer or if we are just about to start a buffer.
	+ * Because we are going to read tldi_head, we should acquire
	+ * a read lock on the queue.
	+ */
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+ if ((priv->tldi_head != NULL && priv->tldi_cur == NULL) ||
	+ (priv->tldi_cur != NULL &&
	+ priv->tldi_off < priv->tldi_cur->tlch_length))
	+ revents = events & (POLLIN | POLLRDNORM);
	+ else
	+ selrecord(td, &tcp_log_sel);
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ } else {
	+ /*
	+ * It only makes sense to poll for reading. So, again, prod the
	+ * user to do something that will make the error of their ways
	+ * apparent.
	+ */
	+ revents = events;
	+ }
	+ return (revents);
	+}
	+
	+int
	+tcp_log_dev_add_log(struct tcp_log_dev_queue *entry)
	+{
	+ struct tcp_log_dev_info *priv;
	+ int rv;
	+ bool wakeup_needed;
	+
	+ KASSERT(entry->tldq_buf != NULL || entry->tldq_xform != NULL,
	+ ("%s: Called with both tldq_buf and tldq_xform set to NULL",
	+ __func__));
	+ KASSERT(entry->tldq_dtor != NULL,
	+ ("%s: Called with tldq_dtor set to NULL", __func__));
	+
	+ /* Get a lock on the queue. */
	+ TCP_LOG_DEV_QUEUE_LOCK();
	+
	+ /* If no one is listening, tell the caller to free the resources. */
	+ if (tcp_log_dev_listeners == 0) {
	+ rv = ENXIO;
	+ goto done;
	+ }
	+
	+ /* Add this to the end of the tailq. */
	+ STAILQ_INSERT_TAIL(&tcp_log_dev_queue_head, entry, tldq_queue);
	+
	+ /* Add references for all current listeners. */
	+ refcount_init(&entry->tldq_refcnt, tcp_log_dev_listeners);
	+
	+ /*
	+ * If any listener is currently stuck on NULL, that means they are
	+ * waiting. Point their head to this new entry.
	+ */
	+ wakeup_needed = false;
	+ STAILQ_FOREACH(priv, &tcp_log_dev_reader_head, tldi_list)
	+ if (priv->tldi_head == NULL) {
	+ priv->tldi_head = entry;
	+ wakeup_needed = true;
	+ }
	+
	+ if (wakeup_needed) {
	+ selwakeup(&tcp_log_sel);
	+ wakeup(&tcp_log_dev_listeners);
	+ }
	+
	+ rv = 0;
	+
	+done:
	+ TCP_LOG_DEV_QUEUE_LOCK_ASSERT();
	+ TCP_LOG_DEV_QUEUE_UNLOCK();
	+ return (rv);
	+}
	+
	+static int
	+tcp_log_dev_modevent(module_t mod __unused, int type, void *data __unused)
	+{
	+
	+ /* TODO: Support intelligent unloading. */
	+ switch (type) {
	+ case MOD_LOAD:
	+ if (bootverbose)
	+ printf("tcp_log: tcp_log device\n");
	+ memset(&tcp_log_sel, 0, sizeof(tcp_log_sel));
	+ memset(&tcp_log_dev_queue_lock, 0, sizeof(struct mtx));
	+ mtx_init(&tcp_log_dev_queue_lock, "tcp_log dev",
	+ "tcp_log device queues", MTX_DEF);
	+ tcp_log_dev = make_dev_credf(MAKEDEV_ETERNAL_KLD,
	+ &tcp_log_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
	+ "tcp_log");
	+ break;
	+ default:
	+ return (EOPNOTSUPP);
	+ }
	+
	+ return (0);
	+}
	+
	+DEV_MODULE(tcp_log_dev, tcp_log_dev_modevent, NULL);
	+MODULE_VERSION(tcp_log_dev, 1);
	Index: head/sys/kern/subr_witness.c
	===================================================================
	--- head/sys/kern/subr_witness.c
	+++ head/sys/kern/subr_witness.c
	@@ -640,6 +640,14 @@
	{ "db->db_mtx", &lock_class_sx },
	{ NULL, NULL },
	/*
	+ * TCP log locks
	+ */
	+ { "TCP ID tree", &lock_class_rw },
	+ { "tcp log id bucket", &lock_class_mtx_sleep },
	+ { "tcpinp", &lock_class_rw },
	+ { "TCP log expireq", &lock_class_mtx_sleep },
	+ { NULL, NULL },
	+ /*
	* spin locks
	*/
	#ifdef SMP
	Index: head/sys/netinet/tcp.h
	===================================================================
	--- head/sys/netinet/tcp.h
	+++ head/sys/netinet/tcp.h
	@@ -168,6 +168,12 @@
	#define TCP_NOOPT 8 /* don't use TCP options */
	#define TCP_MD5SIG 16 /* use MD5 digests (RFC2385) */
	#define TCP_INFO 32 /* retrieve tcp_info structure */
	+#define TCP_LOG 34 /* configure event logging for connection */
	+#define TCP_LOGBUF 35 /* retrieve event log for connection */
	+#define TCP_LOGID 36 /* configure log ID to correlate connections */
	+#define TCP_LOGDUMP 37 /* dump connection log events to device */
	+#define TCP_LOGDUMPID 38 /* dump events from connections with same ID to
	+ device */
	#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
	#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
	#define TCP_KEEPINIT 128 /* N, time to establish connection */
	@@ -188,6 +194,9 @@
	#define TCPI_OPT_WSCALE 0x04
	#define TCPI_OPT_ECN 0x08
	#define TCPI_OPT_TOE 0x10
	+
	+/* Maximum length of log ID. */
	+#define TCP_LOG_ID_LEN 64

	/*
	* The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
	Index: head/sys/netinet/tcp_input.c
	===================================================================
	--- head/sys/netinet/tcp_input.c
	+++ head/sys/netinet/tcp_input.c
	@@ -102,6 +102,7 @@
	#include <netinet6/nd6.h>
	#include <netinet/tcp.h>
	#include <netinet/tcp_fsm.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet/tcp_seq.h>
	#include <netinet/tcp_timer.h>
	#include <netinet/tcp_var.h>
	@@ -1592,6 +1593,8 @@
	/* Save segment, if requested. */
	tcp_pcap_add(th, m, &(tp->t_inpkts));
	#endif
	+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
	+ tlen, NULL, true);

	if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
	if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
	Index: head/sys/netinet/tcp_log_buf.h
	===================================================================
	--- head/sys/netinet/tcp_log_buf.h
	+++ head/sys/netinet/tcp_log_buf.h
	@@ -0,0 +1,353 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	+ *
	+ * Copyright (c) 2016-2018
	+ * Netflix Inc. All rights reserved.
	+ *
	+ * Redistribution and use in source and binary forms, with or without
	+ * modification, are permitted provided that the following conditions
	+ * are met:
	+ * 1. Redistributions of source code must retain the above copyright
	+ * notice, this list of conditions and the following disclaimer.
	+ * 2. Redistributions in binary form must reproduce the above copyright
	+ * notice, this list of conditions and the following disclaimer in the
	+ * documentation and/or other materials provided with the distribution.
	+ *
	+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	+ * SUCH DAMAGE.
	+ *
	+ * $FreeBSD$
	+ */
	+
	+#ifndef __tcp_log_buf_h__
	+#define __tcp_log_buf_h__
	+
	+#define TCP_LOG_REASON_LEN 32
	+#define TCP_LOG_BUF_VER (6)
	+
	+/*
	+ * Because the (struct tcp_log_buffer) includes 8-byte uint64_t's, it requires
	+ * 8-byte alignment to work properly on all platforms. Therefore, we will
	+ * enforce 8-byte alignment for all the structures that may appear by
	+ * themselves (instead of being embedded in another structure) in a data
	+ * stream.
	+ */
	+#define ALIGN_TCP_LOG __aligned(8)
	+
	+/* Information about the socketbuffer state. */
	+struct tcp_log_sockbuf
	+{
	+ uint32_t tls_sb_acc; /* available chars (sb->sb_acc) */
	+ uint32_t tls_sb_ccc; /* claimed chars (sb->sb_ccc) */
	+ uint32_t tls_sb_spare; /* spare */
	+};
	+
	+/* Optional, verbose information that may be appended to an event log. */
	+struct tcp_log_verbose
	+{
	+#define TCP_FUNC_LEN 32
	+ char tlv_snd_frm[TCP_FUNC_LEN]; /* tcp_output() caller */
	+ char tlv_trace_func[TCP_FUNC_LEN]; /* Function that
	+ generated trace */
	+ uint32_t tlv_trace_line; /* Line number that generated trace */
	+ uint8_t _pad[4];
	+} ALIGN_TCP_LOG;
	+
	+/* Internal RACK state variables. */
	+struct tcp_log_rack
	+{
	+ uint32_t tlr_rack_rtt; /* rc_rack_rtt */
	+ uint8_t tlr_state; /* Internal RACK state */
	+ uint8_t _pad[3]; /* Padding */
	+};
	+
	+struct tcp_log_bbr {
	+ uint64_t cur_del_rate;
	+ uint64_t delRate;
	+ uint64_t rttProp;
	+ uint64_t bw_inuse;
	+ uint32_t inflight;
	+ uint32_t applimited;
	+ uint32_t delivered;
	+ uint32_t timeStamp;
	+ uint32_t epoch;
	+ uint32_t lt_epoch;
	+ uint32_t pkts_out;
	+ uint32_t flex1;
	+ uint32_t flex2;
	+ uint32_t flex3;
	+ uint32_t flex4;
	+ uint32_t flex5;
	+ uint32_t flex6;
	+ uint32_t lost;
	+ uint16_t pacing_gain;
	+ uint16_t cwnd_gain;
	+ uint16_t flex7;
	+ uint8_t bbr_state;
	+ uint8_t bbr_substate;
	+ uint8_t inpacer;
	+ uint8_t ininput;
	+ uint8_t use_lt_bw;
	+ uint8_t flex8;
	+ uint32_t pkt_epoch;
	+};
	+
	+/* Per-stack stack-specific info. */
	+union tcp_log_stackspecific
	+{
	+ struct tcp_log_rack u_rack;
	+ struct tcp_log_bbr u_bbr;
	+};
	+
	+struct tcp_log_buffer
	+{
	+ /* Event basics */
	+ struct timeval tlb_tv; /* Timestamp of trace */
	+ uint32_t tlb_ticks; /* Timestamp of trace */
	+ uint32_t tlb_sn; /* Serial number */
	+ uint8_t tlb_stackid; /* Stack ID */
	+ uint8_t tlb_eventid; /* Event ID */
	+ uint16_t tlb_eventflags; /* Flags for the record */
	+#define TLB_FLAG_RXBUF 0x0001 /* Includes receive buffer info */
	+#define TLB_FLAG_TXBUF 0x0002 /* Includes send buffer info */
	+#define TLB_FLAG_HDR 0x0004 /* Includes a TCP header */
	+#define TLB_FLAG_VERBOSE 0x0008 /* Includes function/line numbers */
	+#define TLB_FLAG_STACKINFO 0x0010 /* Includes stack-specific info */
	+ int tlb_errno; /* Event error (if any) */
	+
	+ /* Internal session state */
	+ struct tcp_log_sockbuf tlb_rxbuf; /* Receive buffer */
	+ struct tcp_log_sockbuf tlb_txbuf; /* Send buffer */
	+
	+ int tlb_state; /* TCPCB t_state */
	+ uint32_t tlb_starttime; /* TCPCB t_starttime */
	+ uint32_t tlb_iss; /* TCPCB iss */
	+ uint32_t tlb_flags; /* TCPCB flags */
	+ uint32_t tlb_snd_una; /* TCPCB snd_una */
	+ uint32_t tlb_snd_max; /* TCPCB snd_max */
	+ uint32_t tlb_snd_cwnd; /* TCPCB snd_cwnd */
	+ uint32_t tlb_snd_nxt; /* TCPCB snd_nxt */
	+ uint32_t tlb_snd_recover;/* TCPCB snd_recover */
	+ uint32_t tlb_snd_wnd; /* TCPCB snd_wnd */
	+ uint32_t tlb_snd_ssthresh; /* TCPCB snd_ssthresh */
	+ uint32_t tlb_srtt; /* TCPCB t_srtt */
	+ uint32_t tlb_rttvar; /* TCPCB t_rttvar */
	+ uint32_t tlb_rcv_up; /* TCPCB rcv_up */
	+ uint32_t tlb_rcv_adv; /* TCPCB rcv_adv */
	+ uint32_t tlb_rcv_nxt; /* TCPCB rcv_nxt */
	+ tcp_seq tlb_sack_newdata; /* TCPCB sack_newdata */
	+ uint32_t tlb_rcv_wnd; /* TCPCB rcv_wnd */
	+ uint32_t tlb_dupacks; /* TCPCB t_dupacks */
	+ int tlb_segqlen; /* TCPCB segqlen */
	+ int tlb_snd_numholes; /* TCPCB snd_numholes */
	+ uint32_t tlb_flex1; /* Event specific information */
	+ uint32_t tlb_flex2; /* Event specific information */
	+ uint8_t tlb_snd_scale:4, /* TCPCB snd_scale */
	+ tlb_rcv_scale:4; /* TCPCB rcv_scale */
	+ uint8_t _pad[3]; /* Padding */
	+
	+ /* Per-stack info */
	+ union tcp_log_stackspecific tlb_stackinfo;
	+#define tlb_rack tlb_stackinfo.u_rack
	+
	+ /* The packet */
	+ uint32_t tlb_len; /* The packet's data length */
	+ struct tcphdr tlb_th; /* The TCP header */
	+ uint8_t tlb_opts[TCP_MAXOLEN]; /* The TCP options */
	+
	+ /* Verbose information (optional) */
	+ struct tcp_log_verbose tlb_verbose[0];
	+} ALIGN_TCP_LOG;
	+
	+enum tcp_log_events {
	+ TCP_LOG_IN = 1, /* Incoming packet 1 */
	+ TCP_LOG_OUT, /* Transmit (without other event) 2 */
	+ TCP_LOG_RTO, /* Retransmit timeout 3 */
	+ TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */
	+ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
	+ TCP_LOG_PRR, /* Doing PRR 6 */
	+ TCP_LOG_REORDER,/* Detected reorder 7 */
	+ TCP_LOG_PACER, /* Pacer sending a packet 8 */
	+ BBR_LOG_BBRUPD, /* We updated BBR info 9 */
	+ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
	+ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
	+ BBR_LOG_INQUEUE, /* The tcb had a packet input to it 12 */
	+ BBR_LOG_TIMERSTAR, /* Start a timer 13 */
	+ BBR_LOG_TIMERCANC, /* Cancel a timer 14 */
	+ BBR_LOG_ENTREC, /* Entered recovery 15 */
	+ BBR_LOG_EXITREC, /* Exited recovery 16 */
	+ BBR_LOG_CWND, /* Cwnd change 17 */
	+ BBR_LOG_BWSAMP, /* LT B/W sample has been made 18 */
	+ BBR_LOG_MSGSIZE, /* We received a EMSGSIZE error 19 */
	+ BBR_LOG_BBRRTT, /* BBR RTT is updated 20 */
	+ BBR_LOG_JUSTRET, /* We just returned out of output 21 */
	+ BBR_LOG_STATE, /* A BBR state change occurred 22 */
	+ BBR_LOG_PKT_EPOCH, /* A BBR packet epoch occurred 23 */
	+ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
	+ TCP_LOG_FLOWEND, /* End of a flow 25 */
	+ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
	+ BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
	+ BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
	+ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
	+ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
	+ TCP_LOG_USERSEND, /* User level sends data 31 */
	+ UNUSED_32, /* Unused 32 */
	+ UNUSED_33, /* Unused 33 */
	+ BBR_LOG_TIME_EPOCH, /* A time-based epoch occurred 34 */
	+ BBR_LOG_TO_PROCESS, /* A to was processed 35 */
	+ BBR_LOG_BBRTSO, /* TSO update 36 */
	+ BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
	+ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
	+ BBR_LOG_PROGRESS, /* Progress timer event 39 */
	+ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
	+ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
	+ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
	+ BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
	+ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
	+ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
	+ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
	+ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
	+ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
	+ TCP_LOG_END /* End (keep at end) 49 */
	+};
	+
	+/* Per-connection logging modes; the numeric values are explicit. */
	+enum tcp_log_states {
	+ /* Deactivate and clear tracing */
	+ TCP_LOG_STATE_CLEAR = -1,
	+ /* Pause */
	+ TCP_LOG_STATE_OFF = 0,
	+ /* Keep the trailing events */
	+ TCP_LOG_STATE_TAIL = 1,
	+ /* Keep the leading events */
	+ TCP_LOG_STATE_HEAD = 2,
	+ /* Keep the leading events, and automatically dump them to the device */
	+ TCP_LOG_STATE_HEAD_AUTO = 3,
	+ /* Continually dump the data when full */
	+ TCP_LOG_STATE_CONTINUAL = 4,
	+ /* Keep the trailing events, and automatically dump them when the session ends */
	+ TCP_LOG_STATE_TAIL_AUTO = 5,
	+};
	+
	+/* Use this if we don't know whether the operation succeeded. */
	+#define ERRNO_UNK (-1)
	+
	+/*
	+ * If the user included dev/tcp_log/tcp_log_dev.h, then include our private
	+ * headers. Otherwise, there is no reason to pollute all the files with an
	+ * additional include.
	+ *
	+ * This structure is aligned to an 8-byte boundary to match the alignment
	+ * requirements of (struct tcp_log_buffer).
	+ *
	+ * tlh_version, tlh_type, and tlh_length are aliases for the corresponding
	+ * fields of the embedded common header (tlh_common).
	+ *
	+ * (struct tcp_log_dev_log_queue) is only defined for the kernel. It
	+ * carries the same ID, reason, and endpoint fields as the header, plus
	+ * a queue of log entries.
	+ * NOTE(review): tldl_count is presumably the number of entries on
	+ * tldl_entries, and tlh_af/tldl_af the address family of the endpoints;
	+ * confirm against the code that fills these in.
	+ */
	+#ifdef __tcp_log_dev_h__
	+struct tcp_log_header {
	+ struct tcp_log_common_header tlh_common;
	+#define tlh_version tlh_common.tlch_version
	+#define tlh_type tlh_common.tlch_type
	+#define tlh_length tlh_common.tlch_length
	+ struct in_endpoints tlh_ie;
	+ struct timeval tlh_offset; /* Uptime -> UTC offset */
	+ char tlh_id[TCP_LOG_ID_LEN];
	+ char tlh_reason[TCP_LOG_REASON_LEN];
	+ uint8_t tlh_af;
	+ uint8_t _pad[7];
	+} ALIGN_TCP_LOG;
	+
	+#ifdef _KERNEL
	+struct tcp_log_dev_log_queue {
	+ struct tcp_log_dev_queue tldl_common;
	+ char tldl_id[TCP_LOG_ID_LEN];
	+ char tldl_reason[TCP_LOG_REASON_LEN];
	+ struct in_endpoints tldl_ie;
	+ struct tcp_log_stailq tldl_entries;
	+ int tldl_count;
	+ uint8_t tldl_af;
	+};
	+#endif /* _KERNEL */
	+#endif /* __tcp_log_dev_h__ */
	+
	+#ifdef _KERNEL
	+
	+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
	+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
	+
	+/*
	+ * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
	+ * tries to record verbose information.
	+ *
	+ * In addition to the TCP_LOG_EVENT arguments, it takes tv, which is passed
	+ * through as the final argument of tcp_log_event_(). The verbose
	+ * information handed to tcp_log_event_() is tp->t_output_caller and the
	+ * function name and line number of the macro invocation.
	+ *
	+ * Nothing is logged while tp->t_logstate is TCP_LOG_STATE_OFF.
	+ * Note that tp is evaluated more than once.
	+ */
	+#define TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	+ do { \
	+ if ((tp)->t_logstate != TCP_LOG_STATE_OFF) \
	+ tcp_log_event_((tp), (th), (rxbuf), (txbuf), (eventid), \
	+ (errornum), (len), (stackinfo), (th_hostorder), \
	+ (tp)->t_output_caller, __func__, __LINE__, (tv)); \
	+ } while (0)
	+
	+/*
	+ * TCP_LOG_EVENT: This is a macro so we can capture function/line
	+ * information when needed.
	+ *
	+ * Prototype:
	+ * TCP_LOG_EVENT(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
	+ * struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
	+ * union tcp_log_stackspecific *stackinfo, int th_hostorder)
	+ *
	+ * tp is mandatory and must be write locked.
	+ * th is optional; if present, it will appear in the record.
	+ * rxbuf and txbuf are optional; if present, they will appear in the record.
	+ * eventid is mandatory.
	+ * errornum is mandatory (it indicates the success or failure of the
	+ * operation associated with the event).
	+ * len indicates the length of the packet. If no packet, use 0.
	+ * stackinfo is optional; if present, it will appear in the record.
	+ * th_hostorder is passed through to tcp_log_event_() unchanged.
	+ * NOTE(review): presumably it says whether th is already in host byte
	+ * order; confirm against tcp_log_event_().
	+ *
	+ * When TCP_LOG_FORCEVERBOSE is defined, every event is logged through
	+ * TCP_LOG_EVENT_VERBOSE with a NULL tv. The wrapper keeps the
	+ * nine-argument form; a plain alias would hand nine arguments to the
	+ * ten-argument verbose macro and fail to compile.
	+ */
	+#ifdef TCP_LOG_FORCEVERBOSE
	+#define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	+ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \
	+ eventid, errornum, len, stackinfo, \
	+ th_hostorder, NULL)
	+#else
	+#define TCP_LOG_EVENT(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder) \
	+ do { \
	+ if (tcp_log_verbose) \
	+ TCP_LOG_EVENT_VERBOSE(tp, th, rxbuf, txbuf, \
	+ eventid, errornum, len, stackinfo, \
	+ th_hostorder, NULL); \
	+ else if ((tp)->t_logstate != TCP_LOG_STATE_OFF) \
	+ tcp_log_event_((tp), (th), (rxbuf), (txbuf), (eventid), \
	+ (errornum), (len), (stackinfo), (th_hostorder), \
	+ NULL, NULL, 0, NULL); \
	+ } while (0)
	+#endif /* TCP_LOG_FORCEVERBOSE */
	+/*
	+ * TCP_LOG_EVENTP: Like the non-verbose form of TCP_LOG_EVENT, but with an
	+ * extra tv argument that is passed through as the final argument of
	+ * tcp_log_event_(). No caller/function/line information is recorded, and
	+ * tcp_log_verbose is not consulted.
	+ */
	+#define TCP_LOG_EVENTP(tp, th, rxbuf, txbuf, eventid, errornum, len, stackinfo, th_hostorder, tv) \
	+ do { \
	+ if ((tp)->t_logstate != TCP_LOG_STATE_OFF) \
	+ tcp_log_event_((tp), (th), (rxbuf), (txbuf), (eventid), \
	+ (errornum), (len), (stackinfo), (th_hostorder), \
	+ NULL, NULL, 0, (tv)); \
	+ } while (0)
	+
	+
	+/*
	+ * Kernel interface of the TCP log buffer code.
	+ * All tcpcb, buffer, and string arguments are passed by pointer.
	+ */
	+extern bool tcp_log_verbose;
	+void tcp_log_drain(struct tcpcb *tp);
	+int tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force);
	+void tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason);
	+struct tcp_log_buffer *tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
	+ struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
	+ union tcp_log_stackspecific *stackinfo, int th_hostorder,
	+ const char *output_caller, const char *func, int line, const struct timeval *tv);
	+size_t tcp_log_get_id(struct tcpcb *tp, char *buf);
	+u_int tcp_log_get_id_cnt(struct tcpcb *tp);
	+int tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp);
	+void tcp_log_init(void);
	+int tcp_log_set_id(struct tcpcb *tp, char *id);
	+int tcp_log_state_change(struct tcpcb *tp, int state);
	+void tcp_log_tcpcbinit(struct tcpcb *tp);
	+void tcp_log_tcpcbfini(struct tcpcb *tp);
	+void tcp_log_flowend(struct tcpcb *tp);
	+
	+#endif /* _KERNEL */
	+#endif /* __tcp_log_buf_h__ */
	Index: head/sys/netinet/tcp_log_buf.c
	===================================================================
	--- head/sys/netinet/tcp_log_buf.c
	+++ head/sys/netinet/tcp_log_buf.c
	@@ -0,0 +1,2480 @@
	+/*-
	+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
	+ *
	+ * Copyright (c) 2016-2018
	+ * Netflix Inc. All rights reserved.
	+ *
	+ * Redistribution and use in source and binary forms, with or without
	+ * modification, are permitted provided that the following conditions
	+ * are met:
	+ * 1. Redistributions of source code must retain the above copyright
	+ * notice, this list of conditions and the following disclaimer.
	+ * 2. Redistributions in binary form must reproduce the above copyright
	+ * notice, this list of conditions and the following disclaimer in the
	+ * documentation and/or other materials provided with the distribution.
	+ *
	+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
	+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
	+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
	+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
	+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
	+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	+ * SUCH DAMAGE.
	+ *
	+ */
	+
	+#include <sys/cdefs.h>
	+__FBSDID("$FreeBSD$");
	+
	+#include <sys/param.h>
	+#include <sys/kernel.h>
	+#include <sys/lock.h>
	+#include <sys/malloc.h>
	+#include <sys/mutex.h>
	+#include <sys/queue.h>
	+#include <sys/refcount.h>
	+#include <sys/rwlock.h>
	+#include <sys/socket.h>
	+#include <sys/socketvar.h>
	+#include <sys/sysctl.h>
	+#include <sys/tree.h>
	+#include <sys/counter.h>
	+
	+#include <dev/tcp_log/tcp_log_dev.h>
	+
	+#include <net/if.h>
	+#include <net/if_var.h>
	+#include <net/vnet.h>
	+
	+#include <netinet/in.h>
	+#include <netinet/in_pcb.h>
	+#include <netinet/in_var.h>
	+#include <netinet/tcp_var.h>
	+#include <netinet/tcp_log_buf.h>
	+
	+/* Default expiry time */
	+#define TCP_LOG_EXPIRE_TIME ((sbintime_t)60 * SBT_1S)
	+
	+/* Max interval at which to run the expiry timer */
	+#define TCP_LOG_EXPIRE_INTVL ((sbintime_t)5 * SBT_1S)
	+
	+bool tcp_log_verbose;
	+static uma_zone_t tcp_log_bucket_zone, tcp_log_node_zone, tcp_log_zone;
	+static int tcp_log_session_limit = TCP_LOG_BUF_DEFAULT_SESSION_LIMIT;
	+static uint32_t tcp_log_version = TCP_LOG_BUF_VER;
	+RB_HEAD(tcp_log_id_tree, tcp_log_id_bucket);
	+static struct tcp_log_id_tree tcp_log_id_head;
	+static STAILQ_HEAD(, tcp_log_id_node) tcp_log_expireq_head =
	+ STAILQ_HEAD_INITIALIZER(tcp_log_expireq_head);
	+static struct mtx tcp_log_expireq_mtx;
	+static struct callout tcp_log_expireq_callout;
	+static uint64_t tcp_log_auto_ratio = 0;
	+static uint64_t tcp_log_auto_ratio_cur = 0;
	+static uint32_t tcp_log_auto_mode = TCP_LOG_STATE_TAIL;
	+static bool tcp_log_auto_all = false;
	+
	+RB_PROTOTYPE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
	+
	+/* Tunables and statistics exported under net.inet.tcp.bb. */
	+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, bb, CTLFLAG_RW, 0, "TCP Black Box controls");
	+
	+SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_verbose, CTLFLAG_RW, &tcp_log_verbose,
	+ 0, "Force verbose logging for TCP traces");
	+
	+SYSCTL_INT(_net_inet_tcp_bb, OID_AUTO, log_session_limit,
	+ CTLFLAG_RW, &tcp_log_session_limit, 0,
	+ "Maximum number of events maintained for each TCP session");
	+
	+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_global_limit, CTLFLAG_RW,
	+ &tcp_log_zone, "Maximum number of events maintained for all TCP sessions");
	+
	+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_global_entries, CTLFLAG_RD,
	+ &tcp_log_zone, "Current number of events maintained for all TCP sessions");
	+
	+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_limit, CTLFLAG_RW,
	+ &tcp_log_bucket_zone, "Maximum number of log IDs");
	+
	+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_entries, CTLFLAG_RD,
	+ &tcp_log_bucket_zone, "Current number of log IDs");
	+
	+SYSCTL_UMA_MAX(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_limit, CTLFLAG_RW,
	+ &tcp_log_node_zone, "Maximum number of tcpcbs with log IDs");
	+
	+SYSCTL_UMA_CUR(_net_inet_tcp_bb, OID_AUTO, log_id_tcpcb_entries, CTLFLAG_RD,
	+ &tcp_log_node_zone, "Current number of tcpcbs with log IDs");
	+
	+SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_version, CTLFLAG_RD, &tcp_log_version,
	+ 0, "Version of log formats exported");
	+
	+SYSCTL_U64(_net_inet_tcp_bb, OID_AUTO, log_auto_ratio, CTLFLAG_RW,
	+ &tcp_log_auto_ratio, 0, "Do auto capturing for 1 out of N sessions");
	+
	+/*
	+ * The default comes from the initializer of tcp_log_auto_mode
	+ * (TCP_LOG_STATE_TAIL). The value argument is not used when a variable
	+ * pointer is supplied, so pass 0 like the other entries; the description
	+ * names the real default.
	+ */
	+SYSCTL_U32(_net_inet_tcp_bb, OID_AUTO, log_auto_mode, CTLFLAG_RW,
	+ &tcp_log_auto_mode, 0,
	+ "Logging mode for auto-selected sessions (default is TCP_LOG_STATE_TAIL)");
	+
	+SYSCTL_BOOL(_net_inet_tcp_bb, OID_AUTO, log_auto_all, CTLFLAG_RW,
	+ &tcp_log_auto_all, false,
	+ "Auto-select from all sessions (rather than just those with IDs)");
	+
	+/*
	+ * Optional debugging counters, compiled in only when
	+ * TCPLOG_DEBUG_COUNTERS is defined. Each counter is exported read-only
	+ * under net.inet.tcp.bb.
	+ */
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+counter_u64_t tcp_log_queued;
	+counter_u64_t tcp_log_que_fail1;
	+counter_u64_t tcp_log_que_fail2;
	+counter_u64_t tcp_log_que_fail3;
	+counter_u64_t tcp_log_que_fail4;
	+counter_u64_t tcp_log_que_fail5;
	+counter_u64_t tcp_log_que_copyout;
	+counter_u64_t tcp_log_que_read;
	+counter_u64_t tcp_log_que_freed;
	+
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, queued, CTLFLAG_RD,
	+ &tcp_log_queued, "Number of entries queued");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail1, CTLFLAG_RD,
	+ &tcp_log_que_fail1, "Number of entries queued but fail 1");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail2, CTLFLAG_RD,
	+ &tcp_log_que_fail2, "Number of entries queued but fail 2");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail3, CTLFLAG_RD,
	+ &tcp_log_que_fail3, "Number of entries queued but fail 3");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail4, CTLFLAG_RD,
	+ &tcp_log_que_fail4, "Number of entries queued but fail 4");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, fail5, CTLFLAG_RD,
	+ &tcp_log_que_fail5, "Number of entries queued but fail 5");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, copyout, CTLFLAG_RD,
	+ &tcp_log_que_copyout, "Number of entries copied out");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, read, CTLFLAG_RD,
	+ &tcp_log_que_read, "Number of entries read from the queue");
	+SYSCTL_COUNTER_U64(_net_inet_tcp_bb, OID_AUTO, freed, CTLFLAG_RD,
	+ &tcp_log_que_freed, "Number of entries freed after reading");
	+#endif
	+
	+#ifdef INVARIANTS
	+#define TCPLOG_DEBUG_RINGBUF
	+#endif
	+
	+struct tcp_log_mem
	+{
	+ STAILQ_ENTRY(tcp_log_mem) tlm_queue;
	+ struct tcp_log_buffer tlm_buf;
	+ struct tcp_log_verbose tlm_v;
	+#ifdef TCPLOG_DEBUG_RINGBUF
	+ volatile int tlm_refcnt;
	+#endif
	+};
	+
	+/* 60 bytes for the header, + 16 bytes for padding */
	+static uint8_t zerobuf[76];
	+
	+/*
	+ * Lock order:
	+ * 1. TCPID_TREE
	+ * 2. TCPID_BUCKET
	+ * 3. INP
	+ *
	+ * Rules:
	+ * A. You need a lock on the Tree to add/remove buckets.
	+ * B. You need a lock on the bucket to add/remove nodes from the bucket.
	+ * C. To change information in a node, you need the INP lock if the tln_closed
	+ * field is false. Otherwise, you need the bucket lock. (Note that the
	+ * tln_closed field can change at any point, so you need to recheck the
	+ * entry after acquiring the INP lock.)
	+ * D. To remove a node from the bucket, you must have that entry locked,
	+ * according to the criteria of Rule C. Also, the node must not be on
	+ * the expiry queue.
	+ * E. The exception to C is the expiry queue fields, which are locked by
	+ * the TCPLOG_EXPIREQ lock.
	+ *
	+ * Buckets have a reference count. Each node is a reference. Further,
	+ * other callers may add reference counts to keep a bucket from disappearing.
	+ * You can add a reference as long as you own a lock sufficient to keep the
	+ * bucket from disappearing. For example, a common use is:
	+ * a. Have a locked INP, but need to lock the TCPID_BUCKET.
	+ * b. Add a refcount on the bucket. (Safe because the INP lock prevents
	+ * the TCPID_BUCKET from going away.)
	+ * c. Drop the INP lock.
	+ * d. Acquire a lock on the TCPID_BUCKET.
	+ * e. Acquire a lock on the INP.
	+ * f. Drop the refcount on the bucket.
	+ * (At this point, the bucket may disappear.)
	+ *
	+ * Expire queue lock:
	+ * You can acquire this with either the bucket or INP lock. Don't reverse it.
	+ * When the expire code has committed to freeing a node, it resets the expiry
	+ * time to SBT_MAX. That is the signal to everyone else that they should
	+ * leave that node alone.
	+ */
	+static struct rwlock tcp_id_tree_lock;
	+#define TCPID_TREE_WLOCK() rw_wlock(&tcp_id_tree_lock)
	+#define TCPID_TREE_RLOCK() rw_rlock(&tcp_id_tree_lock)
	+#define TCPID_TREE_UPGRADE() rw_try_upgrade(&tcp_id_tree_lock)
	+#define TCPID_TREE_WUNLOCK() rw_wunlock(&tcp_id_tree_lock)
	+#define TCPID_TREE_RUNLOCK() rw_runlock(&tcp_id_tree_lock)
	+#define TCPID_TREE_WLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_WLOCKED)
	+#define TCPID_TREE_RLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_RLOCKED)
	+#define TCPID_TREE_UNLOCK_ASSERT() rw_assert(&tcp_id_tree_lock, RA_UNLOCKED)
	+
	+#define TCPID_BUCKET_LOCK_INIT(tlb) mtx_init(&((tlb)->tlb_mtx), "tcp log id bucket", NULL, MTX_DEF)
	+#define TCPID_BUCKET_LOCK_DESTROY(tlb) mtx_destroy(&((tlb)->tlb_mtx))
	+#define TCPID_BUCKET_LOCK(tlb) mtx_lock(&((tlb)->tlb_mtx))
	+#define TCPID_BUCKET_UNLOCK(tlb) mtx_unlock(&((tlb)->tlb_mtx))
	+#define TCPID_BUCKET_LOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_OWNED)
	+#define TCPID_BUCKET_UNLOCK_ASSERT(tlb) mtx_assert(&((tlb)->tlb_mtx), MA_NOTOWNED)
	+
	+#define TCPID_BUCKET_REF(tlb) refcount_acquire(&((tlb)->tlb_refcnt))
	+#define TCPID_BUCKET_UNREF(tlb) refcount_release(&((tlb)->tlb_refcnt))
	+
	+#define TCPLOG_EXPIREQ_LOCK() mtx_lock(&tcp_log_expireq_mtx)
	+#define TCPLOG_EXPIREQ_UNLOCK() mtx_unlock(&tcp_log_expireq_mtx)
	+
	+SLIST_HEAD(tcp_log_id_head, tcp_log_id_node);
	+
	+struct tcp_log_id_bucket
	+{
	+ /*
	+ * tlb_id must be first. This lets us use strcmp on
	+ * (struct tcp_log_id_bucket *) and (char *) interchangeably.
	+ * (tcp_log_set_id() relies on this when it casts an ID string to a
	+ * bucket pointer for RB_FIND.)
	+ *
	+ * tlb_rb links the bucket into the tcp_log_id_tree, tlb_head is the
	+ * list of nodes in this bucket, tlb_mtx is the bucket lock
	+ * (TCPID_BUCKET_LOCK), and tlb_refcnt is the reference count
	+ * manipulated by TCPID_BUCKET_REF/TCPID_BUCKET_UNREF.
	+ */
	+ char tlb_id[TCP_LOG_ID_LEN];
	+ RB_ENTRY(tcp_log_id_bucket) tlb_rb;
	+ struct tcp_log_id_head tlb_head;
	+ struct mtx tlb_mtx;
	+ volatile u_int tlb_refcnt;
	+};
	+
	+struct tcp_log_id_node
	+{
	+ SLIST_ENTRY(tcp_log_id_node) tln_list;
	+ STAILQ_ENTRY(tcp_log_id_node) tln_expireq; /* Locked by the expireq lock */
	+ sbintime_t tln_expiretime; /* Locked by the expireq lock */
	+
	+ /*
	+ * If INP is NULL, that means the connection has closed. We've
	+ * saved the connection endpoint information and the log entries
	+ * in the tln_ie and tln_entries members. We've also saved a pointer
	+ * to the enclosing bucket here. If INP is not NULL, the information is
	+ * in the PCB and not here.
	+ *
	+ * tln_list (above) links the node into its bucket's tlb_head list.
	+ * tln_closed determines which lock protects the node; see Rule C in
	+ * the lock order comment.
	+ * NOTE(review): tln_count is presumably the number of entries on
	+ * tln_entries, and tln_af the address family of tln_ie; confirm
	+ * against the code that closes a connection.
	+ */
	+ struct inpcb *tln_inp;
	+ struct tcpcb *tln_tp;
	+ struct tcp_log_id_bucket *tln_bucket;
	+ struct in_endpoints tln_ie;
	+ struct tcp_log_stailq tln_entries;
	+ int tln_count;
	+ volatile int tln_closed;
	+ uint8_t tln_af;
	+};
	+
	+enum tree_lock_state {
	+ TREE_UNLOCKED = 0,
	+ TREE_RLOCKED,
	+ TREE_WLOCKED,
	+};
	+
	+/*
	+ * Do we want to select this session for auto-logging?
	+ *
	+ * Returns true for one out of every tcp_log_auto_ratio calls, counted with
	+ * tcp_log_auto_ratio_cur. Returns false whenever the ratio is 0.
	+ */
	+static __inline bool
	+tcp_log_selectauto(void)
	+{
	+ uint64_t ratio;
	+
	+ /*
	+ * If we are doing auto-capturing, figure out whether we will capture
	+ * this session.
	+ *
	+ * Take a single snapshot of the ratio. It is writable through sysctl,
	+ * so testing it and then reading it again for the modulo could divide
	+ * by zero if it is cleared in between.
	+ */
	+ ratio = tcp_log_auto_ratio;
	+ if (ratio != 0 &&
	+ (atomic_fetchadd_64(&tcp_log_auto_ratio_cur, 1) % ratio) == 0)
	+ return (true);
	+ return (false);
	+}
	+
	+/*
	+ * Red-black tree comparison function for log ID buckets.
	+ * Orders buckets by comparing at most TCP_LOG_ID_LEN bytes of tlb_id.
	+ * Returns the strncmp() result (negative, zero, or positive).
	+ */
	+static __inline int
	+tcp_log_id_cmp(struct tcp_log_id_bucket *a, struct tcp_log_id_bucket *b)
	+{
	+ KASSERT(a != NULL, ("tcp_log_id_cmp: argument a is unexpectedly NULL"));
	+ KASSERT(b != NULL, ("tcp_log_id_cmp: argument b is unexpectedly NULL"));
	+ return strncmp(a->tlb_id, b->tlb_id, TCP_LOG_ID_LEN);
	+}
	+
	+RB_GENERATE_STATIC(tcp_log_id_tree, tcp_log_id_bucket, tlb_rb, tcp_log_id_cmp)
	+
	+/*
	+ * With INVARIANTS, assert that the tree lock is really held in the mode
	+ * the caller believes (one of the enum tree_lock_state values), and panic
	+ * on any other value. Without INVARIANTS, this does nothing.
	+ */
	+static __inline void
	+tcp_log_id_validate_tree_lock(int tree_locked)
	+{
	+
	+#ifdef INVARIANTS
	+ if (tree_locked == TREE_WLOCKED)
	+ TCPID_TREE_WLOCK_ASSERT();
	+ else if (tree_locked == TREE_RLOCKED)
	+ TCPID_TREE_RLOCK_ASSERT();
	+ else if (tree_locked == TREE_UNLOCKED)
	+ TCPID_TREE_UNLOCK_ASSERT();
	+ else
	+ kassert_panic("%s:%d: unknown tree lock state", __func__,
	+ __LINE__);
	+#endif
	+}
	+
	+/*
	+ * Remove an empty bucket from the ID tree, destroy its lock, and free it.
	+ * The tree must be write locked and the bucket must have no nodes.
	+ */
	+static __inline void
	+tcp_log_remove_bucket(struct tcp_log_id_bucket *tlb)
	+{
	+
	+ TCPID_TREE_WLOCK_ASSERT();
	+ KASSERT(SLIST_EMPTY(&tlb->tlb_head),
	+ ("%s: Attempt to remove non-empty bucket", __func__));
	+#ifdef INVARIANTS
	+ /* A bucket that is not in the tree is a fatal inconsistency. */
	+ if (RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb) == NULL)
	+ kassert_panic("%s:%d: error removing element from tree",
	+ __func__, __LINE__);
	+#else
	+ (void)RB_REMOVE(tcp_log_id_tree, &tcp_log_id_head, tlb);
	+#endif
	+ TCPID_BUCKET_LOCK_DESTROY(tlb);
	+ uma_zfree(tcp_log_bucket_zone, tlb);
	+}
	+
	+/*
	+ * Call with a referenced and locked bucket.
	+ * Will return true if the bucket was freed; otherwise, false.
	+ * (When it returns false, the bucket is still locked.)
	+ * tlb: The bucket to unreference.
	+ * tree_locked: A pointer to the state of the tree lock. If the tree lock
	+ * state changes, the function will update it.
	+ * inp: If not NULL and the function needs to drop the inp lock to relock the
	+ * tree, it will do so. (The caller must ensure inp will not become invalid,
	+ * probably by holding a reference to it.)
	+ */
	+static bool
	+tcp_log_unref_bucket(struct tcp_log_id_bucket *tlb, int *tree_locked,
	+ struct inpcb *inp)
	+{
	+
	+ KASSERT(tlb != NULL, ("%s: called with NULL tlb", __func__));
	+ KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
	+ __func__));
	+
	+ tcp_log_id_validate_tree_lock(*tree_locked);
	+
	+ /*
	+ * Did we hold the last reference on the tlb? If so, we may need
	+ * to free it. (Note that we can realistically only execute the
	+ * loop twice: once without a write lock and once with a write
	+ * lock.)
	+ */
	+ while (TCPID_BUCKET_UNREF(tlb)) {
	+ /*
	+ * We need a write lock on the tree to free this.
	+ * If we can upgrade the tree lock, this is "easy". If we
	+ * can't upgrade the tree lock, we need to do this the
	+ * "hard" way: unwind all our locks and relock everything.
	+ * In the meantime, anything could have changed. We even
	+ * need to validate that we still need to free the bucket.
	+ */
	+ if (*tree_locked == TREE_RLOCKED && TCPID_TREE_UPGRADE())
	+ *tree_locked = TREE_WLOCKED;
	+ else if (*tree_locked != TREE_WLOCKED) {
	+ /*
	+ * Retake the reference so the bucket cannot vanish
	+ * while we are unlocked; the loop condition drops it
	+ * again after we relock in tree, bucket, INP order.
	+ */
	+ TCPID_BUCKET_REF(tlb);
	+ if (inp != NULL)
	+ INP_WUNLOCK(inp);
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ if (*tree_locked == TREE_RLOCKED)
	+ TCPID_TREE_RUNLOCK();
	+ TCPID_TREE_WLOCK();
	+ *tree_locked = TREE_WLOCKED;
	+ TCPID_BUCKET_LOCK(tlb);
	+ if (inp != NULL)
	+ INP_WLOCK(inp);
	+ continue;
	+ }
	+
	+ /*
	+ * We have an empty bucket and a write lock on the tree.
	+ * Remove the empty bucket.
	+ */
	+ tcp_log_remove_bucket(tlb);
	+ return (true);
	+ }
	+ return (false);
	+}
	+
	+/*
	+ * Call with a locked bucket. This function will release the lock on the
	+ * bucket before returning.
	+ *
	+ * The caller is responsible for freeing the tp->t_lin/tln node!
	+ *
	+ * Note: one of tp or both tlb and tln must be supplied.
	+ *
	+ * inp: A pointer to the inp. If the function needs to drop the inp lock to
	+ * acquire the tree write lock, it will do so. (The caller must ensure inp
	+ * will not become invalid, probably by holding a reference to it.)
	+ * tp: A pointer to the tcpcb. (optional; if specified, tlb and tln are ignored)
	+ * tlb: A pointer to the bucket. (optional; ignored if tp is specified)
	+ * tln: A pointer to the node. (optional; ignored if tp is specified)
	+ * tree_locked: A pointer to the state of the tree lock. If the tree lock
	+ * state changes, the function will update it.
	+ *
	+ * Will return true if the tree lock state changed; otherwise, false.
	+ * The state changes whenever the INP lock was dropped and reacquired, so
	+ * the caller must revalidate the INP on a true return. (It also changes
	+ * after a successful in-place upgrade of the tree lock, in which case the
	+ * revalidation is merely redundant.)
	+ */
	+static bool
	+tcp_log_remove_id_node(struct inpcb *inp, struct tcpcb *tp,
	+ struct tcp_log_id_bucket *tlb, struct tcp_log_id_node *tln,
	+ int *tree_locked)
	+{
	+ int orig_tree_locked;
	+
	+ KASSERT(tp != NULL || (tlb != NULL && tln != NULL),
	+ ("%s: called with tp=%p, tlb=%p, tln=%p", __func__,
	+ tp, tlb, tln));
	+ KASSERT(tree_locked != NULL, ("%s: called with NULL tree_locked",
	+ __func__));
	+
	+ if (tp != NULL) {
	+ tlb = tp->t_lib;
	+ tln = tp->t_lin;
	+ KASSERT(tlb != NULL, ("%s: unexpectedly NULL tlb", __func__));
	+ KASSERT(tln != NULL, ("%s: unexpectedly NULL tln", __func__));
	+ }
	+
	+ tcp_log_id_validate_tree_lock(*tree_locked);
	+ TCPID_BUCKET_LOCK_ASSERT(tlb);
	+
	+ /*
	+ * Remove the node, clear the log bucket and node from the TCPCB, and
	+ * decrement the bucket refcount. In the process, if this is the
	+ * last reference, the bucket will be freed.
	+ */
	+ SLIST_REMOVE(&tlb->tlb_head, tln, tcp_log_id_node, tln_list);
	+ if (tp != NULL) {
	+ tp->t_lib = NULL;
	+ tp->t_lin = NULL;
	+ }
	+ orig_tree_locked = *tree_locked;
	+ /* A freed bucket has no lock left to release. */
	+ if (!tcp_log_unref_bucket(tlb, tree_locked, inp))
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ return (*tree_locked != orig_tree_locked);
	+}
	+
	+/*
	+ * Revalidate the INP after its lock has been dropped and reacquired.
	+ *
	+ * These macros rely on the enclosing function providing the local
	+ * variables inp, tp, and rv, as well as a "done" label. If the INP is in
	+ * TIMEWAIT or has been dropped, they set rv to ECONNRESET, run the cleanup
	+ * statement, and jump to done. Otherwise, they refresh tp from the INP.
	+ */
	+#define RECHECK_INP_CLEAN(cleanup) do { \
	+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { \
	+ rv = ECONNRESET; \
	+ cleanup; \
	+ goto done; \
	+ } \
	+ tp = intotcpcb(inp); \
	+} while (0)
	+
	+#define RECHECK_INP() RECHECK_INP_CLEAN(/* noop */)
	+
	+/*
	+ * Hook called with the INP write locked when a TCPCB is about to be added
	+ * to the bucket with the given ID. It only does work in NETFLIX builds,
	+ * where it may start per-connection stats sampling for the session.
	+ */
	+static void
	+tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
	+{
	+
	+ INP_WLOCK_ASSERT(tp->t_inpcb);
	+
	+#ifdef NETFLIX
	+ if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
	+ (void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
	+#endif
	+}
	+
	+/*
	+ * Set the TCP log ID for a TCPCB.
	+ * Called with INPCB locked. Returns with it unlocked.
	+ *
	+ * tp: The TCPCB whose log ID is being changed.
	+ * id: The new NUL-terminated ID. An empty string removes the TCPCB from
	+ * its current bucket without assigning it to a new one.
	+ *
	+ * Returns 0 on success, ENOBUFS if a node or bucket could not be
	+ * allocated, or ECONNRESET if the INP was found to be dropped or in
	+ * TIMEWAIT after its lock had been released and reacquired.
	+ */
	+int
	+tcp_log_set_id(struct tcpcb *tp, char *id)
	+{
	+ struct tcp_log_id_bucket *tlb, *tmp_tlb;
	+ struct tcp_log_id_node *tln;
	+ struct inpcb *inp;
	+ int tree_locked, rv;
	+ bool bucket_locked;
	+
	+ tlb = NULL;
	+ tln = NULL;
	+ inp = tp->t_inpcb;
	+ tree_locked = TREE_UNLOCKED;
	+ bucket_locked = false;
	+
	+restart:
	+ INP_WLOCK_ASSERT(inp);
	+
	+ /* See if the ID is unchanged. */
	+ if ((tp->t_lib != NULL && !strcmp(tp->t_lib->tlb_id, id)) ||
	+ (tp->t_lib == NULL && *id == 0)) {
	+ rv = 0;
	+ goto done;
	+ }
	+
	+ /*
	+ * If the TCPCB had a previous ID, we need to extricate it from
	+ * the previous list.
	+ *
	+ * Drop the TCPCB lock and lock the tree and the bucket.
	+ * Because this is called in the socket context, we (theoretically)
	+ * don't need to worry about the INPCB completely going away
	+ * while we are gone.
	+ */
	+ if (tp->t_lib != NULL) {
	+ tlb = tp->t_lib;
	+ TCPID_BUCKET_REF(tlb);
	+ INP_WUNLOCK(inp);
	+
	+ if (tree_locked == TREE_UNLOCKED) {
	+ TCPID_TREE_RLOCK();
	+ tree_locked = TREE_RLOCKED;
	+ }
	+ TCPID_BUCKET_LOCK(tlb);
	+ bucket_locked = true;
	+ INP_WLOCK(inp);
	+
	+ /*
	+ * Unreference the bucket. If our bucket went away, it is no
	+ * longer locked or valid.
	+ */
	+ if (tcp_log_unref_bucket(tlb, &tree_locked, inp)) {
	+ bucket_locked = false;
	+ tlb = NULL;
	+ }
	+
	+ /* Validate the INP. */
	+ RECHECK_INP();
	+
	+ /*
	+ * Evaluate whether the bucket changed while we were unlocked.
	+ *
	+ * Possible scenarios here:
	+ * 1. Bucket is unchanged and the same one we started with.
	+ * 2. The TCPCB no longer has a bucket and our bucket was
	+ * freed.
	+ * 3. The TCPCB has a new bucket, whether ours was freed.
	+ * 4. The TCPCB no longer has a bucket and our bucket was
	+ * not freed.
	+ *
	+ * In cases 2-4, we will start over. In case 1, we will
	+ * proceed here to remove the bucket.
	+ */
	+ if (tlb == NULL || tp->t_lib != tlb) {
	+ KASSERT(bucket_locked || tlb == NULL,
	+ ("%s: bucket_locked (%d) and tlb (%p) are "
	+ "inconsistent", __func__, bucket_locked, tlb));
	+
	+ if (bucket_locked) {
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ bucket_locked = false;
	+ tlb = NULL;
	+ }
	+ goto restart;
	+ }
	+
	+ /*
	+ * Store the (struct tcp_log_id_node) for reuse. Then, remove
	+ * it from the bucket. In the process, we may end up relocking.
	+ * If so, we need to validate that the INP is still valid, and
	+ * the TCPCB entries match what we expect.
	+ *
	+ * We will clear tlb and change the bucket_locked state just
	+ * before calling tcp_log_remove_id_node(), since that function
	+ * will unlock the bucket.
	+ */
	+ if (tln != NULL)
	+ uma_zfree(tcp_log_node_zone, tln);
	+ tln = tp->t_lin;
	+ tlb = NULL;
	+ bucket_locked = false;
	+ if (tcp_log_remove_id_node(inp, tp, NULL, NULL, &tree_locked)) {
	+ RECHECK_INP();
	+
	+ /*
	+ * If the TCPCB moved to a new bucket while we had
	+ * dropped the lock, restart.
	+ */
	+ if (tp->t_lib != NULL || tp->t_lin != NULL)
	+ goto restart;
	+ }
	+
	+ /*
	+ * Yay! We successfully removed the TCPCB from its old
	+ * bucket. Phew!
	+ *
	+ * On to bigger and better things...
	+ */
	+ }
	+
	+ /* At this point, the TCPCB should not be in any bucket. */
	+ KASSERT(tp->t_lib == NULL, ("%s: tp->t_lib is not NULL", __func__));
	+
	+ /*
	+ * If the new ID is not empty, we need to now assign this TCPCB to a
	+ * new bucket.
	+ */
	+ if (*id) {
	+ /* Get a new tln, if we don't already have one to reuse. */
	+ if (tln == NULL) {
	+ tln = uma_zalloc(tcp_log_node_zone, M_NOWAIT | M_ZERO);
	+ if (tln == NULL) {
	+ rv = ENOBUFS;
	+ goto done;
	+ }
	+ tln->tln_inp = inp;
	+ tln->tln_tp = tp;
	+ }
	+
	+ /*
	+ * Drop the INP lock for a bit. We don't need it, and dropping
	+ * it prevents lock order reversals.
	+ */
	+ INP_WUNLOCK(inp);
	+
	+ /* Make sure we have at least a read lock on the tree. */
	+ tcp_log_id_validate_tree_lock(tree_locked);
	+ if (tree_locked == TREE_UNLOCKED) {
	+ TCPID_TREE_RLOCK();
	+ tree_locked = TREE_RLOCKED;
	+ }
	+
	+refind:
	+ /*
	+ * Remember that we constructed (struct tcp_log_id_bucket) with
	+ * tlb_id as its first member so we can safely cast the id to
	+ * a bucket pointer for the purposes of finding.
	+ */
	+ KASSERT(tlb == NULL, ("%s:%d tlb unexpectedly non-NULL",
	+ __func__, __LINE__));
	+ tmp_tlb = RB_FIND(tcp_log_id_tree, &tcp_log_id_head,
	+ (struct tcp_log_id_bucket *) id);
	+
	+ /*
	+ * If we didn't find a matching bucket, we need to add a new
	+ * one. This requires a write lock. But, of course, we will
	+ * need to recheck some things when we re-acquire the lock.
	+ */
	+ if (tmp_tlb == NULL && tree_locked != TREE_WLOCKED) {
	+ tree_locked = TREE_WLOCKED;
	+ if (!TCPID_TREE_UPGRADE()) {
	+ TCPID_TREE_RUNLOCK();
	+ TCPID_TREE_WLOCK();
	+
	+ /*
	+ * The tree may have changed while we were
	+ * unlocked.
	+ */
	+ goto refind;
	+ }
	+ }
	+
	+ /* If we need to add a new bucket, do it now. */
	+ if (tmp_tlb == NULL) {
	+ /* Allocate new bucket. */
	+ tlb = uma_zalloc(tcp_log_bucket_zone, M_NOWAIT);
	+ if (tlb == NULL) {
	+ rv = ENOBUFS;
	+ goto done_noinp;
	+ }
	+
	+ /*
	+ * Copy the ID to the bucket.
	+ * NB: Don't use strlcpy() unless you are sure
	+ * we've always validated NULL termination.
	+ *
	+ * TODO: When I'm done writing this, see if we
	+ * have correctly validated NULL termination and
	+ * can use strlcpy(). :-)
	+ */
	+ strncpy(tlb->tlb_id, id, TCP_LOG_ID_LEN - 1);
	+ tlb->tlb_id[TCP_LOG_ID_LEN - 1] = '\0';
	+
	+ /*
	+ * Take the refcount for the first node and go ahead
	+ * and lock this. Note that we zero the tlb_mtx
	+ * structure, since 0xdeadc0de flips the right bits
	+ * for the code to think that this mutex has already
	+ * been initialized. :-(
	+ */
	+ SLIST_INIT(&tlb->tlb_head);
	+ refcount_init(&tlb->tlb_refcnt, 1);
	+ memset(&tlb->tlb_mtx, 0, sizeof(struct mtx));
	+ TCPID_BUCKET_LOCK_INIT(tlb);
	+ TCPID_BUCKET_LOCK(tlb);
	+ bucket_locked = true;
	+
	+#define FREE_NEW_TLB() do { \
	+ TCPID_BUCKET_LOCK_DESTROY(tlb); \
	+ uma_zfree(tcp_log_bucket_zone, tlb); \
	+ bucket_locked = false; \
	+ tlb = NULL; \
	+} while (0)
	+ /*
	+ * Relock the INP and make sure we are still
	+ * unassigned.
	+ */
	+ INP_WLOCK(inp);
	+ RECHECK_INP_CLEAN(FREE_NEW_TLB());
	+ if (tp->t_lib != NULL) {
	+ FREE_NEW_TLB();
	+ goto restart;
	+ }
	+
	+ /* Add the new bucket to the tree. */
	+ tmp_tlb = RB_INSERT(tcp_log_id_tree, &tcp_log_id_head,
	+ tlb);
	+ KASSERT(tmp_tlb == NULL,
	+ ("%s: Unexpected conflicting bucket (%p) while "
	+ "adding new bucket (%p)", __func__, tmp_tlb, tlb));
	+
	+ /*
	+ * If we found a conflicting bucket, free the new
	+ * one we made and fall through to use the existing
	+ * bucket.
	+ */
	+ if (tmp_tlb != NULL) {
	+ FREE_NEW_TLB();
	+ INP_WUNLOCK(inp);
	+ }
	+#undef FREE_NEW_TLB
	+ }
	+
	+ /* If we found an existing bucket, use it. */
	+ if (tmp_tlb != NULL) {
	+ tlb = tmp_tlb;
	+ TCPID_BUCKET_LOCK(tlb);
	+ bucket_locked = true;
	+
	+ /*
	+ * Relock the INP and make sure we are still
	+ * unassigned.
	+ */
	+ INP_UNLOCK_ASSERT(inp);
	+ INP_WLOCK(inp);
	+ RECHECK_INP();
	+ if (tp->t_lib != NULL) {
	+ /*
	+ * Keep bucket_locked in step with the unlock.
	+ * Otherwise, a later jump to done would try
	+ * to unlock a NULL bucket.
	+ */
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ bucket_locked = false;
	+ tlb = NULL;
	+ goto restart;
	+ }
	+
	+ /* Take a reference on the bucket. */
	+ TCPID_BUCKET_REF(tlb);
	+ }
	+
	+ tcp_log_grow_tlb(tlb->tlb_id, tp);
	+
	+ /* Add the new node to the list. */
	+ SLIST_INSERT_HEAD(&tlb->tlb_head, tln, tln_list);
	+ tp->t_lib = tlb;
	+ tp->t_lin = tln;
	+ tln = NULL;
	+ }
	+
	+ rv = 0;
	+
	+done:
	+ /* Unlock things, as needed, and return. */
	+ INP_WUNLOCK(inp);
	+done_noinp:
	+ INP_UNLOCK_ASSERT(inp);
	+ if (bucket_locked) {
	+ TCPID_BUCKET_LOCK_ASSERT(tlb);
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ } else if (tlb != NULL)
	+ TCPID_BUCKET_UNLOCK_ASSERT(tlb);
	+ if (tree_locked == TREE_WLOCKED) {
	+ TCPID_TREE_WLOCK_ASSERT();
	+ TCPID_TREE_WUNLOCK();
	+ } else if (tree_locked == TREE_RLOCKED) {
	+ TCPID_TREE_RLOCK_ASSERT();
	+ TCPID_TREE_RUNLOCK();
	+ } else
	+ TCPID_TREE_UNLOCK_ASSERT();
	+ /* Free a node that was allocated or detached but never inserted. */
	+ if (tln != NULL)
	+ uma_zfree(tcp_log_node_zone, tln);
	+ return (rv);
	+}
	+
	+/*
	+ * Get the TCP log ID for a TCPCB.
	+ * Called with INPCB locked.
	+ * 'buf' must point to a buffer that is at least TCP_LOG_ID_LEN bytes long.
	+ * Returns number of bytes copied, not counting the terminating NUL.
	+ * If the TCPCB has no log ID, 'buf' is set to the empty string and 0 is
	+ * returned.
	+ */
	+size_t
	+tcp_log_get_id(struct tcpcb *tp, char *buf)
	+{
	+ size_t len;
	+
	+ INP_LOCK_ASSERT(tp->t_inpcb);
	+ if (tp->t_lib != NULL) {
	+ len = strlcpy(buf, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
	+ KASSERT(len < TCP_LOG_ID_LEN,
	+ ("%s:%d: tp->t_lib->tlb_id too long (%zu)",
	+ __func__, __LINE__, len));
	+ } else {
	+ *buf = '\0';
	+ len = 0;
	+ }
	+ return (len);
	+}
	+
	+/*
	+ * Report how many references the TCPCB's log ID bucket currently has.
	+ * The log ID is taken from the given TCPCB; a TCPCB without a log ID
	+ * yields 0.
	+ * Called with the INPCB write locked.
	+ */
	+u_int
	+tcp_log_get_id_cnt(struct tcpcb *tp)
	+{
	+ struct tcp_log_id_bucket *bucket;
	+
	+ INP_WLOCK_ASSERT(tp->t_inpcb);
	+ bucket = tp->t_lib;
	+ if (bucket == NULL)
	+ return (0);
	+ return (bucket->tlb_refcnt);
	+}
	+
	+#ifdef TCPLOG_DEBUG_RINGBUF
	+/*
	+ * Functions/macros to increment/decrement reference count for a log
	+ * entry. This should catch when we do a double-free/double-remove or
	+ * a double-add.
	+ *
	+ * An entry's count is expected to be 0 while it is off a list and 1 while
	+ * it is on one; any other observed value panics, reporting the caller's
	+ * function name and line number.
	+ */
	+static inline void
	+_tcp_log_entry_refcnt_add(struct tcp_log_mem *log_entry, const char *func,
	+    int line)
	+{
	+	int refcnt;
	+
	+	/* atomic_fetchadd_int() returns the value before the addition. */
	+	refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, 1);
	+	if (refcnt != 0)
	+		panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 0)",
	+		    func, line, log_entry, refcnt);
	+}
	+#define	tcp_log_entry_refcnt_add(l)	\
	+	_tcp_log_entry_refcnt_add((l), __func__, __LINE__)
	+
	+static inline void
	+_tcp_log_entry_refcnt_rem(struct tcp_log_mem *log_entry, const char *func,
	+    int line)
	+{
	+	int refcnt;
	+
	+	/* The previous value must have been exactly 1. */
	+	refcnt = atomic_fetchadd_int(&log_entry->tlm_refcnt, -1);
	+	if (refcnt != 1)
	+		panic("%s:%d: log_entry(%p)->tlm_refcnt is %d (expected 1)",
	+		    func, line, log_entry, refcnt);
	+}
	+#define	tcp_log_entry_refcnt_rem(l)	\
	+	_tcp_log_entry_refcnt_rem((l), __func__, __LINE__)
	+
	+#else /* !TCPLOG_DEBUG_RINGBUF */
	+
	+/* Without ring-buffer debugging the refcount checks compile to nothing. */
	+#define	tcp_log_entry_refcnt_add(l)
	+#define	tcp_log_entry_refcnt_rem(l)
	+
	+#endif
	+
	+/*
	+ * Cleanup after removing a log entry, but only decrement the count if we
	+ * are running INVARIANTS.
	+ *
	+ * 'log_entry' is returned to tcp_log_zone. 'count' points at the caller's
	+ * entry counter; it is only touched (and checked for underflow) in
	+ * INVARIANTS kernels, hence the __unused annotation.
	+ */
	+static inline void
	+tcp_log_free_log_common(struct tcp_log_mem *log_entry, int *count __unused)
	+{
	+
	+	uma_zfree(tcp_log_zone, log_entry);
	+#ifdef INVARIANTS
	+	(*count)--;
	+	KASSERT(*count >= 0,
	+	    ("%s: count unexpectedly negative", __func__));
	+#endif
	+}
	+
	+/*
	+ * Free every log entry on the list 'head', leaving the list empty.
	+ * 'count' points at the counter associated with the list; it is passed
	+ * through to tcp_log_free_log_common() for each entry freed.
	+ */
	+static void
	+tcp_log_free_entries(struct tcp_log_stailq *head, int *count)
	+{
	+	struct tcp_log_mem *log_entry;
	+
	+	/* Free the entries. */
	+	while ((log_entry = STAILQ_FIRST(head)) != NULL) {
	+		STAILQ_REMOVE_HEAD(head, tlm_queue);
	+		tcp_log_entry_refcnt_rem(log_entry);
	+		tcp_log_free_log_common(log_entry, count);
	+	}
	+}
	+
	+/*
	+ * Cleanup after removing a log entry: return the entry to tcp_log_zone
	+ * and decrement the TCPCB's entry count (tp->t_lognum). The entry must
	+ * already have been unlinked from its list by the caller.
	+ */
	+static inline void
	+tcp_log_remove_log_cleanup(struct tcpcb *tp, struct tcp_log_mem *log_entry)
	+{
	+
	+	uma_zfree(tcp_log_zone, log_entry);
	+	tp->t_lognum--;
	+	KASSERT(tp->t_lognum >= 0,
	+	    ("%s: tp->t_lognum unexpectedly negative", __func__));
	+}
	+
	+/*
	+ * Remove a log entry from the head of a list.
	+ * 'log_entry' must be the first entry of tp->t_logs; it is unlinked,
	+ * freed, and tp->t_lognum is decremented.
	+ */
	+static inline void
	+tcp_log_remove_log_head(struct tcpcb *tp, struct tcp_log_mem *log_entry)
	+{
	+
	+	KASSERT(log_entry == STAILQ_FIRST(&tp->t_logs),
	+	    ("%s: attempt to remove non-HEAD log entry", __func__));
	+	STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
	+	tcp_log_entry_refcnt_rem(log_entry);
	+	tcp_log_remove_log_cleanup(tp, log_entry);
	+}
	+
	+#ifdef TCPLOG_DEBUG_RINGBUF
	+/*
	+ * Initialize the log entry's reference count, which we want to
	+ * survive allocations.
	+ * Registered as the zone's init hook in tcp_log_init(). Always returns 0.
	+ */
	+static int
	+tcp_log_zone_init(void *mem, int size, int flags __unused)
	+{
	+	struct tcp_log_mem *tlm;
	+
	+	KASSERT(size >= sizeof(struct tcp_log_mem),
	+	    ("%s: unexpectedly short (%d) allocation", __func__, size));
	+	tlm = (struct tcp_log_mem *)mem;
	+	tlm->tlm_refcnt = 0;
	+	return (0);
	+}
	+
	+/*
	+ * Double check that the refcnt is zero on allocation and return.
	+ * Registered as the zone's constructor; panics if the entry still
	+ * carries a non-zero reference count. Always returns 0.
	+ */
	+static int
	+tcp_log_zone_ctor(void *mem, int size, void *args __unused, int flags __unused)
	+{
	+	struct tcp_log_mem *tlm;
	+
	+	KASSERT(size >= sizeof(struct tcp_log_mem),
	+	    ("%s: unexpectedly short (%d) allocation", __func__, size));
	+	tlm = (struct tcp_log_mem *)mem;
	+	if (tlm->tlm_refcnt != 0)
	+		panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
	+		    __func__, __LINE__, tlm, tlm->tlm_refcnt);
	+	return (0);
	+}
	+
	+/*
	+ * Zone destructor: panic if an entry is being freed while its reference
	+ * count is still non-zero.
	+ */
	+static void
	+tcp_log_zone_dtor(void *mem, int size, void *args __unused)
	+{
	+	struct tcp_log_mem *tlm;
	+
	+	KASSERT(size >= sizeof(struct tcp_log_mem),
	+	    ("%s: unexpectedly short (%d) allocation", __func__, size));
	+	tlm = (struct tcp_log_mem *)mem;
	+	if (tlm->tlm_refcnt != 0)
	+		panic("%s:%d: tlm(%p)->tlm_refcnt is %d (expected 0)",
	+		    __func__, __LINE__, tlm, tlm->tlm_refcnt);
	+}
	+#endif /* TCPLOG_DEBUG_RINGBUF */
	+
	+/*
	+ * Global, one-time setup for TCP logging: create the UMA zones for log
	+ * entries, ID buckets and ID nodes, allocate the optional debug counters,
	+ * and initialize the ID tree lock plus the expire queue mutex and callout.
	+ */
	+void
	+tcp_log_init(void)
	+{
	+
	+	/*
	+	 * Log entry zone. With ring-buffer debugging enabled the zone gets
	+	 * ctor/dtor/init hooks that police the per-entry reference count.
	+	 */
	+#ifdef TCPLOG_DEBUG_RINGBUF
	+	tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
	+	    tcp_log_zone_ctor, tcp_log_zone_dtor, tcp_log_zone_init,
	+	    NULL, UMA_ALIGN_PTR, 0);
	+#else
	+	tcp_log_zone = uma_zcreate("tcp_log", sizeof(struct tcp_log_mem),
	+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	+#endif
	+	(void)uma_zone_set_max(tcp_log_zone, TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT);
	+
	+	/* Zones backing the log ID buckets and nodes. */
	+	tcp_log_bucket_zone = uma_zcreate("tcp_log_bucket",
	+	    sizeof(struct tcp_log_id_bucket), NULL, NULL, NULL, NULL,
	+	    UMA_ALIGN_PTR, 0);
	+	tcp_log_node_zone = uma_zcreate("tcp_log_node",
	+	    sizeof(struct tcp_log_id_node), NULL, NULL, NULL, NULL,
	+	    UMA_ALIGN_PTR, 0);
	+
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+	/* Debug statistics counters. */
	+	tcp_log_queued = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_fail1 = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_fail2 = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_fail3 = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_fail4 = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_fail5 = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_copyout = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_read = counter_u64_alloc(M_WAITOK);
	+	tcp_log_que_freed = counter_u64_alloc(M_WAITOK);
	+#endif
	+
	+	/* Locks and the callout that drives node expiry. */
	+	rw_init_flags(&tcp_id_tree_lock, "TCP ID tree", RW_NEW);
	+	mtx_init(&tcp_log_expireq_mtx, "TCP log expireq", NULL, MTX_DEF);
	+	callout_init(&tcp_log_expireq_callout, 1);
	+}
	+
	+/*
	+ * Do per-TCPCB initialization: set up the (empty) list of log entries
	+ * and, when tcp_log_selectauto() selects this session for automatic
	+ * capture, put the TCPCB in the configured auto mode and flag it with
	+ * TF2_LOG_AUTO.
	+ */
	+void
	+tcp_log_tcpcbinit(struct tcpcb *tp)
	+{
	+
	+	/* A new TCPCB should start out zero-initialized. */
	+	STAILQ_INIT(&tp->t_logs);
	+
	+	/*
	+	 * If we are doing auto-capturing, figure out whether we will capture
	+	 * this session.
	+	 */
	+	if (tcp_log_selectauto()) {
	+		tp->t_logstate = tcp_log_auto_mode;
	+		tp->t_flags2 |= TF2_LOG_AUTO;
	+	}
	+}
	+
	+
	+/*
	+ * Remove entries from the expire queue.
	+ *
	+ * This is the handler installed on tcp_log_expireq_callout. It walks the
	+ * expire queue from the head and, for each node whose tln_expiretime is
	+ * within one second of now, detaches the node from its ID bucket, drops
	+ * the INP reference held for the node, and frees the node and its log
	+ * entries. If nodes remain afterwards, the callout is rescheduled.
	+ * The argument is unused.
	+ */
	+static void
	+tcp_log_expire(void *unused __unused)
	+{
	+ struct tcp_log_id_bucket *tlb;
	+ struct tcp_log_id_node *tln;
	+ sbintime_t expiry_limit;
	+ int tree_locked;
	+
	+ TCPLOG_EXPIREQ_LOCK();
	+ if (callout_pending(&tcp_log_expireq_callout)) {
	+ /* Callout was reset. */
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ return;
	+ }
	+
	+ /*
	+ * Process entries until we reach one that expires too far in the
	+ * future. Look one second in the future.
	+ */
	+ expiry_limit = getsbinuptime() + SBT_1S;
	+ tree_locked = TREE_UNLOCKED;
	+
	+ while ((tln = STAILQ_FIRST(&tcp_log_expireq_head)) != NULL &&
	+ tln->tln_expiretime <= expiry_limit) {
	+ if (!callout_active(&tcp_log_expireq_callout)) {
	+ /*
	+ * Callout was stopped. I guess we should
	+ * just quit at this point.
	+ */
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ return;
	+ }
	+
	+ /*
	+ * Remove the node from the head of the list and unlock
	+ * the list. Change the expiry time to SBT_MAX as a signal
	+ * to other threads that we now own this.
	+ */
	+ STAILQ_REMOVE_HEAD(&tcp_log_expireq_head, tln_expireq);
	+ tln->tln_expiretime = SBT_MAX;
	+ TCPLOG_EXPIREQ_UNLOCK();
	+
	+ /*
	+ * Remove the node from the bucket.
	+ *
	+ * A non-zero return leaves the tree lock held in the mode
	+ * reported through tree_locked, so it is released here.
	+ * NOTE(review): the bucket lock taken below is never released
	+ * in this function; presumably tcp_log_remove_id_node() drops
	+ * it -- confirm against that function.
	+ */
	+ tlb = tln->tln_bucket;
	+ TCPID_BUCKET_LOCK(tlb);
	+ if (tcp_log_remove_id_node(NULL, NULL, tlb, tln, &tree_locked)) {
	+ tcp_log_id_validate_tree_lock(tree_locked);
	+ if (tree_locked == TREE_WLOCKED)
	+ TCPID_TREE_WUNLOCK();
	+ else
	+ TCPID_TREE_RUNLOCK();
	+ tree_locked = TREE_UNLOCKED;
	+ }
	+
	+ /* Drop the INP reference. (Unlock only if the release did not.) */
	+ INP_WLOCK(tln->tln_inp);
	+ if (!in_pcbrele_wlocked(tln->tln_inp))
	+ INP_WUNLOCK(tln->tln_inp);
	+
	+ /* Free the log records. */
	+ tcp_log_free_entries(&tln->tln_entries, &tln->tln_count);
	+
	+ /* Free the node. */
	+ uma_zfree(tcp_log_node_zone, tln);
	+
	+ /* Relock the expiry queue. */
	+ TCPLOG_EXPIREQ_LOCK();
	+ }
	+
	+ /*
	+ * We've expired all the entries we can. Do we need to reschedule
	+ * ourselves? (At this point tln is either NULL or the first node
	+ * still on the queue, as left by the loop condition above.)
	+ */
	+ callout_deactivate(&tcp_log_expireq_callout);
	+ if (tln != NULL) {
	+ /*
	+ * Get max(now + TCP_LOG_EXPIRE_INTVL, tln->tln_expiretime) and
	+ * set the next callout to that. (This helps ensure we generally
	+ * run the callout no more often than desired.)
	+ */
	+ expiry_limit = getsbinuptime() + TCP_LOG_EXPIRE_INTVL;
	+ if (expiry_limit < tln->tln_expiretime)
	+ expiry_limit = tln->tln_expiretime;
	+ callout_reset_sbt(&tcp_log_expireq_callout, expiry_limit,
	+ SBT_1S, tcp_log_expire, NULL, C_ABSOLUTE);
	+ }
	+
	+ /* We're done. */
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ return;
	+}
	+
	+/*
	+ * Move log data from the TCPCB to a new node. This will reset the TCPCB log
	+ * entries and log count; however, it will not touch other things from the
	+ * TCPCB (e.g. t_lin, t_lib).
	+ *
	+ * The node also records the connection's endpoint information (inc_ie),
	+ * its address family, and the TCPCB's current ID bucket.
	+ *
	+ * NOTE: Must hold a lock on the INP.
	+ */
	+static void
	+tcp_log_move_tp_to_node(struct tcpcb *tp, struct tcp_log_id_node *tln)
	+{
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+
	+	tln->tln_ie = tp->t_inpcb->inp_inc.inc_ie;
	+	if (tp->t_inpcb->inp_inc.inc_flags & INC_ISIPV6)
	+		tln->tln_af = AF_INET6;
	+	else
	+		tln->tln_af = AF_INET;
	+
	+	/*
	+	 * Hand over the entries with STAILQ_CONCAT() instead of a structure
	+	 * copy of the list head: copying an empty head would leave the
	+	 * node's stqh_last pointing back into the TCPCB. STAILQ_CONCAT()
	+	 * also re-initializes tp->t_logs.
	+	 */
	+	STAILQ_INIT(&tln->tln_entries);
	+	STAILQ_CONCAT(&tln->tln_entries, &tp->t_logs);
	+	tln->tln_count = tp->t_lognum;
	+	tln->tln_bucket = tp->t_lib;
	+
	+	/* Clear the remaining log information from the PCB. */
	+	tp->t_lognum = 0;
	+}
	+
	+/*
	+ * Do per-TCPCB cleanup.
	+ *
	+ * For the auto-dump states, first try to dump the log buffer. Then, if
	+ * the TCPCB is attached to an ID node, move its log entries to the node
	+ * and queue the node for expiry; otherwise free the entries. Finally,
	+ * turn logging off for the TCPCB.
	+ *
	+ * Must be called with the INPCB write-locked.
	+ */
	+void
	+tcp_log_tcpcbfini(struct tcpcb *tp)
	+{
	+	struct tcp_log_id_node *tln, *tln_first;
	+	struct tcp_log_mem *log_entry;
	+	sbintime_t callouttime;
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+
	+	/*
	+	 * If we were gathering packets to be automatically dumped, try to do
	+	 * it now. If this succeeds, the log information in the TCPCB will be
	+	 * cleared. Otherwise, we'll handle the log information as we do
	+	 * for other states.
	+	 */
	+	switch(tp->t_logstate) {
	+	case TCP_LOG_STATE_HEAD_AUTO:
	+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
	+		    M_NOWAIT, false);
	+		break;
	+	case TCP_LOG_STATE_TAIL_AUTO:
	+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from tail",
	+		    M_NOWAIT, false);
	+		break;
	+	case TCP_LOG_STATE_CONTINUAL:
	+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
	+		    M_NOWAIT, false);
	+		break;
	+	}
	+
	+	/*
	+	 * There are two ways we could keep logs: per-socket or per-ID. If
	+	 * we are tracking logs with an ID, then the logs survive the
	+	 * destruction of the TCPCB.
	+	 *
	+	 * If the TCPCB is associated with an ID node, move the logs from the
	+	 * TCPCB to the ID node. In theory, this is safe, for reasons which I
	+	 * will now explain for my own benefit when I next need to figure out
	+	 * this code. :-)
	+	 *
	+	 * We own the INP lock. Therefore, no one else can change the contents
	+	 * of this node (Rule C). Further, no one can remove this node from
	+	 * the bucket while we hold the lock (Rule D). Basically, no one can
	+	 * mess with this node. That leaves two states in which we could be:
	+	 *
	+	 * 1. Another thread is currently waiting to acquire the INP lock, with
	+	 *    plans to do something with this node. When we drop the INP lock,
	+	 *    they will have a chance to do that. They will recheck the
	+	 *    tln_closed field (see note to Rule C) and then acquire the
	+	 *    bucket lock before proceeding further.
	+	 *
	+	 * 2. Another thread will try to acquire a lock at some point in the
	+	 *    future. If they try to acquire a lock before we set the
	+	 *    tln_closed field, they will follow state #1. If they try to
	+	 *    acquire a lock after we set the tln_closed field, they will be
	+	 *    able to make changes to the node, at will, following Rule C.
	+	 *
	+	 * Therefore, we currently own this node and can make any changes
	+	 * we want. But, as soon as we set the tln_closed field to true, we
	+	 * have effectively dropped our lock on the node. (For this reason, we
	+	 * also need to make sure our writes are ordered correctly. An atomic
	+	 * operation with "release" semantics should be sufficient.)
	+	 */
	+
	+	if (tp->t_lin != NULL) {
	+		/* Copy the relevant information to the log entry. */
	+		tln = tp->t_lin;
	+		KASSERT(tln->tln_inp == tp->t_inpcb,
	+		    ("%s: Mismatched inp (tln->tln_inp=%p, tp->t_inpcb=%p)",
	+		    __func__, tln->tln_inp, tp->t_inpcb));
	+		tcp_log_move_tp_to_node(tp, tln);
	+
	+		/* Clear information from the PCB. */
	+		tp->t_lin = NULL;
	+		tp->t_lib = NULL;
	+
	+		/*
	+		 * Take a reference on the INP. This ensures that the INP
	+		 * remains valid while the node is on the expiry queue. This
	+		 * ensures the INP is valid for other threads that may be
	+		 * racing to lock this node when we move it to the expire
	+		 * queue.
	+		 */
	+		in_pcbref(tp->t_inpcb);
	+
	+		/*
	+		 * Store the entry on the expiry list. The exact behavior
	+		 * depends on whether we have entries to keep. If so, we
	+		 * put the entry at the tail of the list and expire in
	+		 * TCP_LOG_EXPIRE_TIME. Otherwise, we expire "now" and put
	+		 * the entry at the head of the list. (Handling the cleanup
	+		 * via the expiry timer lets us avoid locking messy-ness here.)
	+		 */
	+		tln->tln_expiretime = getsbinuptime();
	+		TCPLOG_EXPIREQ_LOCK();
	+		if (tln->tln_count) {
	+			tln->tln_expiretime += TCP_LOG_EXPIRE_TIME;
	+			if (STAILQ_EMPTY(&tcp_log_expireq_head) &&
	+			    !callout_active(&tcp_log_expireq_callout)) {
	+				/*
	+				 * We are adding the first entry and a callout
	+				 * is not currently scheduled; therefore, we
	+				 * need to schedule one.
	+				 */
	+				callout_reset_sbt(&tcp_log_expireq_callout,
	+				    tln->tln_expiretime, SBT_1S, tcp_log_expire,
	+				    NULL, C_ABSOLUTE);
	+			}
	+			STAILQ_INSERT_TAIL(&tcp_log_expireq_head, tln,
	+			    tln_expireq);
	+		} else {
	+			callouttime = tln->tln_expiretime +
	+			    TCP_LOG_EXPIRE_INTVL;
	+			tln_first = STAILQ_FIRST(&tcp_log_expireq_head);
	+
	+			if ((tln_first == NULL ||
	+			    callouttime < tln_first->tln_expiretime) &&
	+			    (callout_pending(&tcp_log_expireq_callout) ||
	+			    !callout_active(&tcp_log_expireq_callout))) {
	+				/*
	+				 * The list is empty, or we want to run the
	+				 * expire code before the first entry's timer
	+				 * fires. Also, we are in a case where a callout
	+				 * is not actively running. We want to reset
	+				 * the callout to occur sooner.
	+				 */
	+				callout_reset_sbt(&tcp_log_expireq_callout,
	+				    callouttime, SBT_1S, tcp_log_expire, NULL,
	+				    C_ABSOLUTE);
	+			}
	+
	+			/*
	+			 * Insert to the head, or just after the head, as
	+			 * appropriate. (This might result in small
	+			 * mis-orderings as a bunch of "expire now" entries
	+			 * gather at the start of the list, but that should
	+			 * not produce big problems, since the expire timer
	+			 * will walk through all of them.)
	+			 */
	+			if (tln_first == NULL ||
	+			    tln->tln_expiretime < tln_first->tln_expiretime)
	+				STAILQ_INSERT_HEAD(&tcp_log_expireq_head, tln,
	+				    tln_expireq);
	+			else
	+				STAILQ_INSERT_AFTER(&tcp_log_expireq_head,
	+				    tln_first, tln, tln_expireq);
	+		}
	+		TCPLOG_EXPIREQ_UNLOCK();
	+
	+		/*
	+		 * We are done messing with the tln. After this point, we
	+		 * can't touch it. (Note that the "release" semantics should
	+		 * be included with the TCPLOG_EXPIREQ_UNLOCK() call above.
	+		 * Therefore, they should be unnecessary here. However, it
	+		 * seems like a good idea to include them anyway, since we
	+		 * really are releasing a lock here.)
	+		 */
	+		atomic_store_rel_int(&tln->tln_closed, 1);
	+	} else {
	+		/* Remove log entries. */
	+		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
	+			tcp_log_remove_log_head(tp, log_entry);
	+		KASSERT(tp->t_lognum == 0,
	+		    ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
	+		    __func__, tp->t_lognum));
	+	}
	+
	+	/*
	+	 * Change the log state to off (just in case anything tries to sneak
	+	 * in a last-minute log).
	+	 */
	+	tp->t_logstate = TCP_LOG_STATE_OFF;
	+}
	+
	+/*
	+ * This logs an event for a TCP socket. Normally, this is called via
	+ * TCP_LOG_EVENT or TCP_LOG_EVENT_VERBOSE. See the documentation for
	+ * TCP_LOG_EVENT().
	+ *
	+ * Arguments:
	+ *   tp            TCPCB to log for; its INPCB must be write-locked and
	+ *                 logging must be in one of the active states.
	+ *   th            TCP header to record, or NULL. Any options that follow
	+ *                 the header (per th_off) are recorded as well.
	+ *   rxbuf, txbuf  Socket buffers whose sb_acc/sb_ccc are recorded, or NULL.
	+ *   eventid       Event identifier stored in the record.
	+ *   errornum      Error number stored in the record.
	+ *   len           Length value stored in the record.
	+ *   stackinfo     Stack-specific data to copy into the record, or NULL.
	+ *   th_hostorder  If non-zero, the recorded copy of the header is passed
	+ *                 through tcp_fields_to_net().
	+ *   output_caller, func, line
	+ *                 Verbose information. 'func' and 'line' must either both
	+ *                 be set or both be unset (NULL/0); 'output_caller' may be
	+ *                 NULL.
	+ *   itv           Timestamp to record, or NULL to use getmicrouptime().
	+ *
	+ * Returns a pointer to the log buffer that was filled in, or NULL if no
	+ * record was made.
	+ */
	+
	+struct tcp_log_buffer *
	+tcp_log_event_(struct tcpcb *tp, struct tcphdr *th, struct sockbuf *rxbuf,
	+    struct sockbuf *txbuf, uint8_t eventid, int errornum, uint32_t len,
	+    union tcp_log_stackspecific *stackinfo, int th_hostorder,
	+    const char *output_caller, const char *func, int line, const struct timeval *itv)
	+{
	+	struct tcp_log_mem *log_entry;
	+	struct tcp_log_buffer *log_buf;
	+	int attempt_count = 0;
	+	struct tcp_log_verbose *log_verbose;
	+	uint32_t logsn;
	+
	+	KASSERT((func == NULL && line == 0) || (func != NULL && line > 0),
	+	    ("%s called with inconsistent func (%p) and line (%d) arguments",
	+	    __func__, func, line));
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+
	+	KASSERT(tp->t_logstate == TCP_LOG_STATE_HEAD ||
	+	    tp->t_logstate == TCP_LOG_STATE_TAIL ||
	+	    tp->t_logstate == TCP_LOG_STATE_CONTINUAL ||
	+	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO ||
	+	    tp->t_logstate == TCP_LOG_STATE_TAIL_AUTO,
	+	    ("%s called with unexpected tp->t_logstate (%d)", __func__,
	+	    tp->t_logstate));
	+
	+	/*
	+	 * Get the serial number. We do this early so it will
	+	 * increment even if we end up skipping the log entry for some
	+	 * reason.
	+	 */
	+	logsn = tp->t_logsn++;
	+
	+	/*
	+	 * Can we get a new log entry? If so, increment the lognum counter
	+	 * here.
	+	 */
	+retry:
	+	if (tp->t_lognum < tcp_log_session_limit) {
	+		if ((log_entry = uma_zalloc(tcp_log_zone, M_NOWAIT)) != NULL)
	+			tp->t_lognum++;
	+	} else
	+		log_entry = NULL;
	+
	+	/* Do we need to try to reuse? */
	+	if (log_entry == NULL) {
	+		/*
	+		 * Sacrifice auto-logged sessions without a log ID if
	+		 * tcp_log_auto_all is false. (If they don't have a log
	+		 * ID by now, it is probable that either they won't get one
	+		 * or we are resource-constrained.)
	+		 */
	+		if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
	+		    !tcp_log_auto_all) {
	+			if (tcp_log_state_change(tp, TCP_LOG_STATE_CLEAR)) {
	+#ifdef INVARIANTS
	+				panic("%s:%d: tcp_log_state_change() failed "
	+				    "to set tp %p to TCP_LOG_STATE_CLEAR",
	+				    __func__, __LINE__, tp);
	+#endif
	+				tp->t_logstate = TCP_LOG_STATE_OFF;
	+			}
	+			return (NULL);
	+		}
	+		/*
	+		 * If we are in TCP_LOG_STATE_HEAD_AUTO state, try to dump
	+		 * the buffers. If successful, deactivate tracing. Otherwise,
	+		 * leave it active so we will retry.
	+		 */
	+		if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO &&
	+		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from head",
	+		    M_NOWAIT, false)) {
	+			tp->t_logstate = TCP_LOG_STATE_OFF;
	+			return(NULL);
	+		} else if ((tp->t_logstate == TCP_LOG_STATE_CONTINUAL) &&
	+		    !tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
	+		    M_NOWAIT, false)) {
	+			/* The dump worked; retry the allocation once. */
	+			if (attempt_count == 0) {
	+				attempt_count++;
	+				goto retry;
	+			}
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+			counter_u64_add(tcp_log_que_fail4, 1);
	+#endif
	+			return(NULL);
	+		} else if (tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO)
	+			return(NULL);
	+
	+		/* If in HEAD state, just deactivate the tracing and return. */
	+		if (tp->t_logstate == TCP_LOG_STATE_HEAD) {
	+			tp->t_logstate = TCP_LOG_STATE_OFF;
	+			return(NULL);
	+		}
	+
	+		/*
	+		 * Get a buffer to reuse. If that fails, just give up.
	+		 * (We can't log anything without a buffer in which to
	+		 * put it.)
	+		 *
	+		 * Note that we don't change the t_lognum counter
	+		 * here. Because we are re-using the buffer, the total
	+		 * number won't change.
	+		 */
	+		if ((log_entry = STAILQ_FIRST(&tp->t_logs)) == NULL)
	+			return(NULL);
	+		STAILQ_REMOVE_HEAD(&tp->t_logs, tlm_queue);
	+		tcp_log_entry_refcnt_rem(log_entry);
	+	}
	+
	+	KASSERT(log_entry != NULL,
	+	    ("%s: log_entry unexpectedly NULL", __func__));
	+
	+	/* Extract the log buffer and verbose buffer pointers. */
	+	log_buf = &log_entry->tlm_buf;
	+	log_verbose = &log_entry->tlm_v;
	+
	+	/* Basic entries. */
	+	if (itv == NULL)
	+		getmicrouptime(&log_buf->tlb_tv);
	+	else
	+		memcpy(&log_buf->tlb_tv, itv, sizeof(struct timeval));
	+	log_buf->tlb_ticks = ticks;
	+	log_buf->tlb_sn = logsn;
	+	log_buf->tlb_stackid = tp->t_fb->tfb_id;
	+	log_buf->tlb_eventid = eventid;
	+	log_buf->tlb_eventflags = 0;
	+	log_buf->tlb_errno = errornum;
	+
	+	/* Socket buffers */
	+	if (rxbuf != NULL) {
	+		log_buf->tlb_eventflags |= TLB_FLAG_RXBUF;
	+		log_buf->tlb_rxbuf.tls_sb_acc = rxbuf->sb_acc;
	+		log_buf->tlb_rxbuf.tls_sb_ccc = rxbuf->sb_ccc;
	+		log_buf->tlb_rxbuf.tls_sb_spare = 0;
	+	}
	+	if (txbuf != NULL) {
	+		log_buf->tlb_eventflags |= TLB_FLAG_TXBUF;
	+		log_buf->tlb_txbuf.tls_sb_acc = txbuf->sb_acc;
	+		log_buf->tlb_txbuf.tls_sb_ccc = txbuf->sb_ccc;
	+		log_buf->tlb_txbuf.tls_sb_spare = 0;
	+	}
	+	/* Copy values from tp to the log entry. */
	+#define	COPY_STAT(f)	log_buf->tlb_ ## f = tp->f
	+#define	COPY_STAT_T(f)	log_buf->tlb_ ## f = tp->t_ ## f
	+	COPY_STAT_T(state);
	+	COPY_STAT_T(starttime);
	+	COPY_STAT(iss);
	+	COPY_STAT_T(flags);
	+	COPY_STAT(snd_una);
	+	COPY_STAT(snd_max);
	+	COPY_STAT(snd_cwnd);
	+	COPY_STAT(snd_nxt);
	+	COPY_STAT(snd_recover);
	+	COPY_STAT(snd_wnd);
	+	COPY_STAT(snd_ssthresh);
	+	COPY_STAT_T(srtt);
	+	COPY_STAT_T(rttvar);
	+	COPY_STAT(rcv_up);
	+	COPY_STAT(rcv_adv);
	+	COPY_STAT(rcv_nxt);
	+	COPY_STAT(sack_newdata);
	+	COPY_STAT(rcv_wnd);
	+	COPY_STAT_T(dupacks);
	+	COPY_STAT_T(segqlen);
	+	COPY_STAT(snd_numholes);
	+	COPY_STAT(snd_scale);
	+	COPY_STAT(rcv_scale);
	+#undef COPY_STAT
	+#undef COPY_STAT_T
	+	log_buf->tlb_flex1 = 0;
	+	log_buf->tlb_flex2 = 0;
	+	/* Copy stack-specific info. */
	+	if (stackinfo != NULL) {
	+		memcpy(&log_buf->tlb_stackinfo, stackinfo,
	+		    sizeof(log_buf->tlb_stackinfo));
	+		log_buf->tlb_eventflags |= TLB_FLAG_STACKINFO;
	+	}
	+
	+	/* The packet */
	+	log_buf->tlb_len = len;
	+	if (th) {
	+		int optlen;
	+
	+		log_buf->tlb_eventflags |= TLB_FLAG_HDR;
	+		log_buf->tlb_th = *th;
	+		if (th_hostorder)
	+			tcp_fields_to_net(&log_buf->tlb_th);
	+		/* Options are whatever th_off covers beyond the fixed header. */
	+		optlen = (th->th_off << 2) - sizeof (struct tcphdr);
	+		if (optlen > 0)
	+			memcpy(log_buf->tlb_opts, th + 1, optlen);
	+	}
	+
	+	/* Verbose information */
	+	if (func != NULL) {
	+		log_buf->tlb_eventflags |= TLB_FLAG_VERBOSE;
	+		if (output_caller != NULL)
	+			strlcpy(log_verbose->tlv_snd_frm, output_caller,
	+			    TCP_FUNC_LEN);
	+		else
	+			*log_verbose->tlv_snd_frm = 0;
	+		strlcpy(log_verbose->tlv_trace_func, func, TCP_FUNC_LEN);
	+		log_verbose->tlv_trace_line = line;
	+	}
	+
	+	/* Insert the new log at the tail. */
	+	STAILQ_INSERT_TAIL(&tp->t_logs, log_entry, tlm_queue);
	+	tcp_log_entry_refcnt_add(log_entry);
	+	return (log_buf);
	+}
	+
	+/*
	+ * Switch a TCPCB to a new logging state.
	+ *
	+ * TCP_LOG_STATE_CLEAR discards all queued log entries and leaves logging
	+ * off; TCP_LOG_STATE_OFF just turns logging off; the remaining known
	+ * states are stored as given. Any successful change also clears the
	+ * TF2_LOG_AUTO flag.
	+ *
	+ * Returns 0 on success, or EINVAL for an unrecognized state (in which
	+ * case the TCPCB is left untouched). The INPCB must be write-locked.
	+ */
	+int
	+tcp_log_state_change(struct tcpcb *tp, int state)
	+{
	+	struct tcp_log_mem *head_entry;
	+	int new_state;
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+	switch (state) {
	+	case TCP_LOG_STATE_TAIL:
	+	case TCP_LOG_STATE_HEAD:
	+	case TCP_LOG_STATE_CONTINUAL:
	+	case TCP_LOG_STATE_HEAD_AUTO:
	+	case TCP_LOG_STATE_TAIL_AUTO:
	+		new_state = state;
	+		break;
	+
	+	case TCP_LOG_STATE_CLEAR:
	+		/* Drop everything queued, then end up in the OFF state. */
	+		for (;;) {
	+			head_entry = STAILQ_FIRST(&tp->t_logs);
	+			if (head_entry == NULL)
	+				break;
	+			tcp_log_remove_log_head(tp, head_entry);
	+		}
	+		new_state = TCP_LOG_STATE_OFF;
	+		break;
	+
	+	case TCP_LOG_STATE_OFF:
	+		new_state = TCP_LOG_STATE_OFF;
	+		break;
	+
	+	default:
	+		return (EINVAL);
	+	}
	+
	+	tp->t_logstate = new_state;
	+	tp->t_flags2 &= ~(TF2_LOG_AUTO);
	+
	+	return (0);
	+}
	+
	+/*
	+ * If tcp_drain() is called, flush half the log entries.
	+ *
	+ * In the "head" states the newest entries (tail of the queue) are
	+ * discarded; in the continual state a dump of the buffer is attempted
	+ * instead; in every other state the oldest entries (head of the queue)
	+ * are discarded. Nothing is done when fewer than two entries are queued.
	+ *
	+ * Must be called with the INPCB write-locked.
	+ */
	+void
	+tcp_log_drain(struct tcpcb *tp)
	+{
	+	struct tcp_log_mem *log_entry, *next;
	+	int target, skip;
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+	if ((target = tp->t_lognum / 2) == 0)
	+		return;
	+
	+	/*
	+	 * If we are logging the "head" packets, we want to discard
	+	 * from the tail of the queue. Otherwise, we want to discard
	+	 * from the head.
	+	 */
	+	if (tp->t_logstate == TCP_LOG_STATE_HEAD ||
	+	    tp->t_logstate == TCP_LOG_STATE_HEAD_AUTO) {
	+		/* Walk to the last entry we intend to keep. */
	+		skip = tp->t_lognum - target;
	+		STAILQ_FOREACH(log_entry, &tp->t_logs, tlm_queue)
	+			if (!--skip)
	+				break;
	+		KASSERT(log_entry != NULL,
	+		    ("%s: skipped through all entries!", __func__));
	+		if (log_entry == NULL)
	+			return;
	+		/* Free everything queued after that entry. */
	+		while ((next = STAILQ_NEXT(log_entry, tlm_queue)) != NULL) {
	+			STAILQ_REMOVE_AFTER(&tp->t_logs, log_entry, tlm_queue);
	+			tcp_log_entry_refcnt_rem(next);
	+			tcp_log_remove_log_cleanup(tp, next);
	+#ifdef INVARIANTS
	+			target--;
	+#endif
	+		}
	+		KASSERT(target == 0,
	+		    ("%s: After removing from tail, target was %d", __func__,
	+		    target));
	+	} else if (tp->t_logstate == TCP_LOG_STATE_CONTINUAL) {
	+		(void)tcp_log_dump_tp_logbuf(tp, "auto-dumped from continual",
	+		    M_NOWAIT, false);
	+	} else {
	+		while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL &&
	+		    target--)
	+			tcp_log_remove_log_head(tp, log_entry);
	+		KASSERT(target <= 0,
	+		    ("%s: After removing from head, target was %d", __func__,
	+		    target));
	+		/* Report the value actually being checked (t_lognum). */
	+		KASSERT(tp->t_lognum > 0,
	+		    ("%s: After removing from head, tp->t_lognum was %d",
	+		    __func__, tp->t_lognum));
	+		KASSERT(log_entry != NULL,
	+		    ("%s: After removing from head, the tailq was empty",
	+		    __func__));
	+	}
	+}
	+
	+/*
	+ * Copy 'len' bytes from 'src' to 'dst' on behalf of a socket option
	+ * request. When the request carries a thread (sopt_td != NULL), the copy
	+ * goes through copyout() and its result is returned; otherwise a plain
	+ * bcopy() is done and 0 is returned.
	+ */
	+static inline int
	+tcp_log_copyout(struct sockopt *sopt, void *src, void *dst, size_t len)
	+{
	+
	+	if (sopt->sopt_td != NULL)
	+		return (copyout(src, dst, len));
	+	bcopy(src, dst, len);
	+	return (0);
	+}
	+
	+/*
	+ * Copy the log entries on 'log_tailqp' to the output buffer described by
	+ * 'sopt' (starting at sopt->sopt_val).
	+ *
	+ * Each entry is written as a struct tcp_log_buffer, followed by a
	+ * struct tcp_log_verbose when the entry has TLB_FLAG_VERBOSE set. For
	+ * entries without TLB_FLAG_HDR, the region from tlb_th onward is written
	+ * as zeros rather than copied from the entry.
	+ *
	+ * 'count' is the number of entries the caller expects on the list (only
	+ * used for assertions). On return, '*end' points just past the last
	+ * complete record written.
	+ *
	+ * Returns 0 on success or the error from the failing copy.
	+ */
	+static int
	+tcp_log_logs_to_buf(struct sockopt *sopt, struct tcp_log_stailq *log_tailqp,
	+    struct tcp_log_buffer **end, int count)
	+{
	+	struct tcp_log_buffer *out_entry;
	+	struct tcp_log_mem *log_entry;
	+	size_t entrysize;
	+	int error;
	+#ifdef INVARIANTS
	+	int orig_count = count;
	+#endif
	+
	+	/* Copy the data out. */
	+	error = 0;
	+	out_entry = (struct tcp_log_buffer *) sopt->sopt_val;
	+	STAILQ_FOREACH(log_entry, log_tailqp, tlm_queue) {
	+		count--;
	+		KASSERT(count >= 0,
	+		    ("%s:%d: Exceeded expected count (%d) processing list %p",
	+		    __func__, __LINE__, orig_count, log_tailqp));
	+
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+		counter_u64_add(tcp_log_que_copyout, 1);
	+#endif
	+#if 0
	+		struct tcp_log_buffer *lb = &log_entry->tlm_buf;
	+		int i;
	+
	+		printf("lb = %p:\n", lb);
	+#define	PRINT(f)	printf(#f " = %u\n", (unsigned int)lb->f)
	+		printf("tlb_tv = {%lu, %lu}\n", lb->tlb_tv.tv_sec, lb->tlb_tv.tv_usec);
	+		PRINT(tlb_ticks);
	+		PRINT(tlb_sn);
	+		PRINT(tlb_stackid);
	+		PRINT(tlb_eventid);
	+		PRINT(tlb_eventflags);
	+		PRINT(tlb_errno);
	+		PRINT(tlb_rxbuf.tls_sb_acc);
	+		PRINT(tlb_rxbuf.tls_sb_ccc);
	+		PRINT(tlb_rxbuf.tls_sb_spare);
	+		PRINT(tlb_txbuf.tls_sb_acc);
	+		PRINT(tlb_txbuf.tls_sb_ccc);
	+		PRINT(tlb_txbuf.tls_sb_spare);
	+		PRINT(tlb_state);
	+		PRINT(tlb_flags);
	+		PRINT(tlb_snd_una);
	+		PRINT(tlb_snd_max);
	+		PRINT(tlb_snd_cwnd);
	+		PRINT(tlb_snd_nxt);
	+		PRINT(tlb_snd_recover);
	+		PRINT(tlb_snd_wnd);
	+		PRINT(tlb_snd_ssthresh);
	+		PRINT(tlb_srtt);
	+		PRINT(tlb_rttvar);
	+		PRINT(tlb_rcv_up);
	+		PRINT(tlb_rcv_adv);
	+		PRINT(tlb_rcv_nxt);
	+		PRINT(tlb_sack_newdata);
	+		PRINT(tlb_rcv_wnd);
	+		PRINT(tlb_dupacks);
	+		PRINT(tlb_segqlen);
	+		PRINT(tlb_snd_numholes);
	+		PRINT(tlb_snd_scale);
	+		PRINT(tlb_rcv_scale);
	+		PRINT(tlb_len);
	+		printf("hex dump: ");
	+		for (i = 0; i < sizeof(struct tcp_log_buffer); i++)
	+			printf("%02x", *(((uint8_t *)lb) + i));
	+#undef PRINT
	+#endif
	+		/*
	+		 * Skip copying out the header if it isn't present.
	+		 * Instead, copy out zeros (to ensure we don't leak info).
	+		 * TODO: Make sure we truly do zero everything we don't
	+		 * explicitly set.
	+		 */
	+		if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)
	+			entrysize = sizeof(struct tcp_log_buffer);
	+		else
	+			entrysize = offsetof(struct tcp_log_buffer, tlb_th);
	+		error = tcp_log_copyout(sopt, &log_entry->tlm_buf, out_entry,
	+		    entrysize);
	+		if (error)
	+			break;
	+		if (!(log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_HDR)) {
	+			error = tcp_log_copyout(sopt, zerobuf,
	+			    ((uint8_t *)out_entry) + entrysize,
	+			    sizeof(struct tcp_log_buffer) - entrysize);
	+			/*
	+			 * Stop on failure; otherwise a later copy would
	+			 * overwrite 'error' and the failure would be lost.
	+			 */
	+			if (error)
	+				break;
	+		}
	+
	+		/*
	+		 * Copy out the verbose bit, if needed. Either way,
	+		 * increment the output pointer the correct amount.
	+		 */
	+		if (log_entry->tlm_buf.tlb_eventflags & TLB_FLAG_VERBOSE) {
	+			error = tcp_log_copyout(sopt, &log_entry->tlm_v,
	+			    out_entry->tlb_verbose,
	+			    sizeof(struct tcp_log_verbose));
	+			if (error)
	+				break;
	+			out_entry = (struct tcp_log_buffer *)
	+			    (((uint8_t *) (out_entry + 1)) +
	+			    sizeof(struct tcp_log_verbose));
	+		} else
	+			out_entry++;
	+	}
	+	*end = out_entry;
	+	KASSERT(error || count == 0,
	+	    ("%s:%d: Less than expected count (%d) processing list %p"
	+	    " (%d remain)", __func__, __LINE__, orig_count,
	+	    log_tailqp, count));
	+
	+	return (error);
	+}
	+
	+/*
	+ * Copy out the buffer. Note that we do incremental copying, so
	+ * sooptcopyout() won't work. However, the goal is to produce the same
	+ * end result as if we copied in the entire user buffer, updated it,
	+ * and then used sooptcopyout() to copy it out.
	+ *
	+ * As many entries as fit in sopt->sopt_valsize are detached from the
	+ * TCPCB, copied out, and freed. If the copy fails, the entries are put
	+ * back on the TCPCB (or freed, if the connection went away meanwhile).
	+ * On return, sopt->sopt_valsize holds the number of bytes produced. If
	+ * sopt->sopt_val is NULL nothing is copied or removed; sopt_valsize is
	+ * only lowered to the size estimate when it exceeds it.
	+ *
	+ * Returns 0 on success or the error from the copy.
	+ *
	+ * NOTE: This should be called with a write lock on the PCB; however,
	+ * the function will drop it after it extracts the data from the TCPCB.
	+ */
	+int
	+tcp_log_getlogbuf(struct sockopt *sopt, struct tcpcb *tp)
	+{
	+	struct tcp_log_stailq log_tailq;
	+	struct tcp_log_mem *log_entry, *log_next;
	+	struct tcp_log_buffer *out_entry;
	+	struct inpcb *inp;
	+	size_t outsize, entrysize;
	+	int error, outnum;
	+
	+	INP_WLOCK_ASSERT(tp->t_inpcb);
	+	inp = tp->t_inpcb;
	+
	+	/*
	+	 * Determine which log entries will fit in the buffer. As an
	+	 * optimization, skip this if all the entries will clearly fit
	+	 * in the buffer. (However, get an exact size if we are using
	+	 * INVARIANTS.)
	+	 */
	+#ifndef INVARIANTS
	+	if (sopt->sopt_valsize / (sizeof(struct tcp_log_buffer) +
	+	    sizeof(struct tcp_log_verbose)) >= tp->t_lognum) {
	+		log_entry = STAILQ_LAST(&tp->t_logs, tcp_log_mem, tlm_queue);
	+		log_next = NULL;
	+		outsize = 0;
	+		outnum = tp->t_lognum;
	+	} else {
	+#endif
	+		outsize = outnum = 0;
	+		log_entry = NULL;
	+		STAILQ_FOREACH(log_next, &tp->t_logs, tlm_queue) {
	+			entrysize = sizeof(struct tcp_log_buffer);
	+			if (log_next->tlm_buf.tlb_eventflags &
	+			    TLB_FLAG_VERBOSE)
	+				entrysize += sizeof(struct tcp_log_verbose);
	+			if ((sopt->sopt_valsize - outsize) < entrysize)
	+				break;
	+			outsize += entrysize;
	+			outnum++;
	+			log_entry = log_next;
	+		}
	+		KASSERT(outsize <= sopt->sopt_valsize,
	+		    ("%s: calculated output size (%zu) greater than available "
	+		    "space (%zu)", __func__, outsize, sopt->sopt_valsize));
	+#ifndef INVARIANTS
	+	}
	+#endif
	+
	+	/*
	+	 * Copy traditional sooptcopyout() behavior: if sopt->sopt_val
	+	 * is NULL, silently skip the copy. However, in this case, we
	+	 * will leave the list alone and return. Functionally, this
	+	 * gives userspace a way to poll for an approximate buffer
	+	 * size they will need to get the log entries.
	+	 */
	+	if (sopt->sopt_val == NULL) {
	+		INP_WUNLOCK(inp);
	+		if (outsize == 0) {
	+			outsize = outnum * (sizeof(struct tcp_log_buffer) +
	+			    sizeof(struct tcp_log_verbose));
	+		}
	+		if (sopt->sopt_valsize > outsize)
	+			sopt->sopt_valsize = outsize;
	+		return (0);
	+	}
	+
	+	/*
	+	 * Break apart the list. We'll save the ones we want to copy
	+	 * out locally and remove them from the TCPCB list. We can
	+	 * then drop the INPCB lock while we do the copyout.
	+	 *
	+	 * There are roughly three cases:
	+	 * 1. There was nothing to copy out. That's easy: drop the
	+	 *    lock and return.
	+	 * 2. We are copying out the entire list. Again, that's easy:
	+	 *    move the whole list.
	+	 * 3. We are copying out a partial list. That's harder. We
	+	 *    need to update the list book-keeping entries.
	+	 */
	+	if (log_entry != NULL && log_next == NULL) {
	+		/* Move entire list. */
	+		KASSERT(outnum == tp->t_lognum,
	+		    ("%s:%d: outnum (%d) should match tp->t_lognum (%d)",
	+		    __func__, __LINE__, outnum, tp->t_lognum));
	+		log_tailq = tp->t_logs;
	+		tp->t_lognum = 0;
	+		STAILQ_INIT(&tp->t_logs);
	+	} else if (log_entry != NULL) {
	+		/* Move partial list. */
	+		KASSERT(outnum < tp->t_lognum,
	+		    ("%s:%d: outnum (%d) not less than tp->t_lognum (%d)",
	+		    __func__, __LINE__, outnum, tp->t_lognum));
	+		STAILQ_FIRST(&log_tailq) = STAILQ_FIRST(&tp->t_logs);
	+		STAILQ_FIRST(&tp->t_logs) = STAILQ_NEXT(log_entry, tlm_queue);
	+		KASSERT(STAILQ_NEXT(log_entry, tlm_queue) != NULL,
	+		    ("%s:%d: tp->t_logs is unexpectedly shorter than expected "
	+		    "(tp: %p, log_tailq: %p, outnum: %d, tp->t_lognum: %d)",
	+		    __func__, __LINE__, tp, &log_tailq, outnum, tp->t_lognum));
	+		STAILQ_NEXT(log_entry, tlm_queue) = NULL;
	+		log_tailq.stqh_last = &STAILQ_NEXT(log_entry, tlm_queue);
	+		tp->t_lognum -= outnum;
	+	} else
	+		STAILQ_INIT(&log_tailq);
	+
	+	/* Drop the PCB lock. */
	+	INP_WUNLOCK(inp);
	+
	+	/* Copy the data out. */
	+	error = tcp_log_logs_to_buf(sopt, &log_tailq, &out_entry, outnum);
	+
	+	if (error) {
	+		/* Restore list */
	+		INP_WLOCK(inp);
	+		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0) {
	+			tp = intotcpcb(inp);
	+
	+			/* Merge the two lists. */
	+			STAILQ_CONCAT(&log_tailq, &tp->t_logs);
	+			tp->t_logs = log_tailq;
	+			tp->t_lognum += outnum;
	+		} else {
	+			/*
	+			 * The connection went away while the lock was
	+			 * dropped, so there is nothing to give the entries
	+			 * back to. Free them rather than leaking them.
	+			 */
	+			tcp_log_free_entries(&log_tailq, &outnum);
	+		}
	+		INP_WUNLOCK(inp);
	+	} else {
	+		/* Sanity check entries */
	+		KASSERT(((caddr_t)out_entry - (caddr_t)sopt->sopt_val) ==
	+		    outsize, ("%s: Actual output size (%zu) != "
	+		    "calculated output size (%zu)", __func__,
	+		    (size_t)((caddr_t)out_entry - (caddr_t)sopt->sopt_val),
	+		    outsize));
	+
	+		/* Free the entries we just copied out. */
	+		STAILQ_FOREACH_SAFE(log_entry, &log_tailq, tlm_queue, log_next) {
	+			tcp_log_entry_refcnt_rem(log_entry);
	+			uma_zfree(tcp_log_zone, log_entry);
	+		}
	+	}
	+
	+	sopt->sopt_valsize = (size_t)((caddr_t)out_entry -
	+	    (caddr_t)sopt->sopt_val);
	+	return (error);
	+}
	+
	+/*
	+ * Destructor for a queued log entry: releases the log entries it still
	+ * holds, the expanded buffer (if one was allocated), and finally the
	+ * queue entry itself.
	+ */
	+static void
	+tcp_log_free_queue(struct tcp_log_dev_queue *param)
	+{
	+	struct tcp_log_dev_log_queue *qentry;
	+
	+	KASSERT(param != NULL, ("%s: called with NULL param", __func__));
	+	if (param != NULL) {
	+		qentry = (struct tcp_log_dev_log_queue *)param;
	+
	+		/* Release any log entries still attached. */
	+		tcp_log_free_entries(&qentry->tldl_entries,
	+		    &qentry->tldl_count);
	+
	+		/* Release the expanded buffer only if one exists. */
	+		if (qentry->tldl_common.tldq_buf != NULL)
	+			free(qentry->tldl_common.tldq_buf, M_TCPLOGDEV);
	+
	+		/* Finally, release the queue entry itself. */
	+		free(qentry, M_TCPLOGDEV);
	+	}
	+}
	+
	+/*
	+ * Transform callback for a queued log entry: serialize the queued log
	+ * entries into one freshly allocated buffer that begins with a
	+ * struct tcp_log_header. Returns the buffer (as a common header
	+ * pointer), or NULL if the allocation or the serialization failed.
	+ * On success the queued entries are freed.
	+ */
	+static struct tcp_log_common_header *
	+tcp_log_expandlogbuf(struct tcp_log_dev_queue *param)
	+{
	+	struct tcp_log_dev_log_queue *qentry;
	+	struct tcp_log_header *header;
	+	struct sockopt sopt;
	+	uint8_t *tail;
	+	size_t payload_len;
	+	int error;
	+
	+	qentry = (struct tcp_log_dev_log_queue *)param;
	+
	+	/* Worst-case space estimate for the entries that follow the header. */
	+	payload_len = qentry->tldl_count * (sizeof(struct tcp_log_buffer) +
	+	    sizeof(struct tcp_log_verbose));
	+	header = malloc(sizeof(struct tcp_log_header) + payload_len,
	+	    M_TCPLOGDEV, M_NOWAIT);
	+	if (header == NULL) {
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+		counter_u64_add(tcp_log_que_fail5, qentry->tldl_count);
	+#endif
	+		return (NULL);
	+	}
	+
	+	/* The entries are written immediately after the header. */
	+	sopt.sopt_val = header + 1;
	+	sopt.sopt_valsize = payload_len;
	+	sopt.sopt_td = NULL;
	+
	+	error = tcp_log_logs_to_buf(&sopt, &qentry->tldl_entries,
	+	    (struct tcp_log_buffer **)&tail, qentry->tldl_count);
	+	if (error != 0) {
	+		free(header, M_TCPLOGDEV);
	+		return (NULL);
	+	}
	+
	+	/* The data has been copied out; release the queued entries. */
	+	tcp_log_free_entries(&qentry->tldl_entries, &qentry->tldl_count);
	+	qentry->tldl_count = 0;
	+
	+	/* Fill in the header that precedes the serialized entries. */
	+	memset(header, 0, sizeof(struct tcp_log_header));
	+	header->tlh_version = TCP_LOG_BUF_VER;
	+	header->tlh_type = TCP_LOG_DEV_TYPE_BBR;
	+	header->tlh_length = tail - (uint8_t *)header;
	+	header->tlh_ie = qentry->tldl_ie;
	+	header->tlh_af = qentry->tldl_af;
	+	getboottime(&header->tlh_offset);
	+	strlcpy(header->tlh_id, qentry->tldl_id, TCP_LOG_ID_LEN);
	+	strlcpy(header->tlh_reason, qentry->tldl_reason, TCP_LOG_REASON_LEN);
	+	return ((struct tcp_log_common_header *)header);
	+}
	+
	+/*
	+ * Queue the tcpcb's log buffer for transmission via the log buffer facility.
	+ *
	+ * NOTE: This should be called with a write lock on the PCB.
	+ *
	+ * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
	+ * and reacquire the INP lock if it needs to do so.
	+ *
	+ * If force is false, this will only dump auto-logged sessions if
	+ * tcp_log_auto_all is true or if there is a log ID defined for the session.
	+ *
	+ * reason is copied into the queue entry; if it is NULL, "UNKNOWN" is
	+ * recorded instead.
	+ *
	+ * Returns 0 on success (including when there was nothing to queue or no
	+ * one was listening), ENOBUFS if memory could not be allocated with
	+ * M_NOWAIT, or ECONNRESET if INP_TIMEWAIT or INP_DROPPED was found set on
	+ * the inpcb after the INP lock was reacquired.
	+ */
	+int
	+tcp_log_dump_tp_logbuf(struct tcpcb *tp, char *reason, int how, bool force)
	+{
	+ struct tcp_log_dev_log_queue *entry;
	+ struct inpcb *inp;
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ int num_entries;
	+#endif
	+
	+ inp = tp->t_inpcb;
	+ INP_WLOCK_ASSERT(inp);
	+
	+ /* If there are no log entries, there is nothing to do. */
	+ if (tp->t_lognum == 0)
	+ return (0);
	+
	+ /* Check for a log ID. */
	+ if (tp->t_lib == NULL && (tp->t_flags2 & TF2_LOG_AUTO) &&
	+ !tcp_log_auto_all && !force) {
	+ struct tcp_log_mem *log_entry;
	+
	+ /*
	+ * We needed a log ID and none was found. Free the log entries
	+ * and return success. Also, cancel further logging. If the
	+ * session doesn't have a log ID by now, we'll assume it isn't
	+ * going to get one.
	+ */
	+ while ((log_entry = STAILQ_FIRST(&tp->t_logs)) != NULL)
	+ tcp_log_remove_log_head(tp, log_entry);
	+ KASSERT(tp->t_lognum == 0,
	+ ("%s: After freeing entries, tp->t_lognum=%d (expected 0)",
	+ __func__, tp->t_lognum));
	+ tp->t_logstate = TCP_LOG_STATE_OFF;
	+ return (0);
	+ }
	+
	+ /*
	+ * Allocate memory. If we must wait, we'll need to drop the locks
	+ * and reacquire them (and do all the related business that goes
	+ * along with that).
	+ */
	+ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
	+ M_NOWAIT);
	+ if (entry == NULL && (how & M_NOWAIT)) {
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ counter_u64_add(tcp_log_que_fail3, 1);
	+#endif
	+ return (ENOBUFS);
	+ }
	+ if (entry == NULL) {
	+ INP_WUNLOCK(inp);
	+ entry = malloc(sizeof(struct tcp_log_dev_log_queue),
	+ M_TCPLOGDEV, M_WAITOK);
	+ INP_WLOCK(inp);
	+ /*
	+ * Note that this check is slightly overly-restrictive in
	+ * that the TCB can survive either of these events.
	+ * However, there is currently not a good way to ensure
	+ * that is the case. So, if we hit this M_WAIT path, we
	+ * may end up dropping some entries. That seems like a
	+ * small price to pay for safety.
	+ */
	+ if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
	+ free(entry, M_TCPLOGDEV);
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ counter_u64_add(tcp_log_que_fail2, 1);
	+#endif
	+ return (ECONNRESET);
	+ }
	+ tp = intotcpcb(inp);
	+ if (tp->t_lognum == 0) {
	+ free(entry, M_TCPLOGDEV);
	+ return (0);
	+ }
	+ }
	+
	+ /* Fill in the unique parts of the queue entry. */
	+ if (tp->t_lib != NULL)
	+ strlcpy(entry->tldl_id, tp->t_lib->tlb_id, TCP_LOG_ID_LEN);
	+ else
	+ strlcpy(entry->tldl_id, "UNKNOWN", TCP_LOG_ID_LEN);
	+ /* Both copies into tldl_reason are bounded by TCP_LOG_REASON_LEN. */
	+ if (reason != NULL)
	+ strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
	+ else
	+ strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
	+ entry->tldl_ie = inp->inp_inc.inc_ie;
	+ if (inp->inp_inc.inc_flags & INC_ISIPV6)
	+ entry->tldl_af = AF_INET6;
	+ else
	+ entry->tldl_af = AF_INET;
	+ entry->tldl_entries = tp->t_logs;
	+ entry->tldl_count = tp->t_lognum;
	+
	+ /* Fill in the common parts of the queue entry. */
	+ entry->tldl_common.tldq_buf = NULL;
	+ entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
	+ entry->tldl_common.tldq_dtor = tcp_log_free_queue;
	+
	+ /* Clear the log data from the TCPCB. */
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ num_entries = tp->t_lognum;
	+#endif
	+ tp->t_lognum = 0;
	+ STAILQ_INIT(&tp->t_logs);
	+
	+ /* Add the entry. If no one is listening, free the entry. */
	+ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry)) {
	+ tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
	+#ifdef TCPLOG_DEBUG_COUNTERS
	+ counter_u64_add(tcp_log_que_fail1, num_entries);
	+ } else {
	+ counter_u64_add(tcp_log_queued, num_entries);
	+#endif
	+ }
	+ return (0);
	+}
	+
	+/*
	+ * Queue the log_id_node's log buffers for transmission via the log buffer
	+ * facility.
	+ *
	+ * NOTE: This should be called with the bucket locked and referenced.
	+ *
	+ * how should be M_WAITOK or M_NOWAIT. If M_WAITOK, the function will drop
	+ * and reacquire the bucket lock if it needs to do so. (The caller must
	+ * ensure that the tln is no longer on any lists so no one else will mess
	+ * with this while the lock is dropped!)
	+ *
	+ * reason is copied into the queue entry; if it is NULL, "UNKNOWN" is
	+ * recorded instead.
	+ *
	+ * Returns 0 on success (including when the node holds no entries or no
	+ * one was listening), or ENOBUFS if memory could not be allocated with
	+ * M_NOWAIT.
	+ */
	+static int
	+tcp_log_dump_node_logbuf(struct tcp_log_id_node *tln, char *reason, int how)
	+{
	+ struct tcp_log_dev_log_queue *entry;
	+ struct tcp_log_id_bucket *tlb;
	+
	+ tlb = tln->tln_bucket;
	+ TCPID_BUCKET_LOCK_ASSERT(tlb);
	+ KASSERT(tlb->tlb_refcnt > 0,
	+ ("%s:%d: Called with unreferenced bucket (tln=%p, tlb=%p)",
	+ __func__, __LINE__, tln, tlb));
	+ KASSERT(tln->tln_closed,
	+ ("%s:%d: Called for node with tln_closed==false (tln=%p)",
	+ __func__, __LINE__, tln));
	+
	+ /* If there are no log entries, there is nothing to do. */
	+ if (tln->tln_count == 0)
	+ return (0);
	+
	+ /*
	+ * Allocate memory. If we must wait, we'll need to drop the locks
	+ * and reacquire them (and do all the related business that goes
	+ * along with that).
	+ */
	+ entry = malloc(sizeof(struct tcp_log_dev_log_queue), M_TCPLOGDEV,
	+ M_NOWAIT);
	+ if (entry == NULL && (how & M_NOWAIT))
	+ return (ENOBUFS);
	+ if (entry == NULL) {
	+ TCPID_BUCKET_UNLOCK(tlb);
	+ entry = malloc(sizeof(struct tcp_log_dev_log_queue),
	+ M_TCPLOGDEV, M_WAITOK);
	+ TCPID_BUCKET_LOCK(tlb);
	+ }
	+
	+ /* Fill in the common parts of the queue entry. */
	+ entry->tldl_common.tldq_buf = NULL;
	+ entry->tldl_common.tldq_xform = tcp_log_expandlogbuf;
	+ entry->tldl_common.tldq_dtor = tcp_log_free_queue;
	+
	+ /* Fill in the unique parts of the queue entry. */
	+ strlcpy(entry->tldl_id, tlb->tlb_id, TCP_LOG_ID_LEN);
	+ /* Both copies into tldl_reason are bounded by TCP_LOG_REASON_LEN. */
	+ if (reason != NULL)
	+ strlcpy(entry->tldl_reason, reason, TCP_LOG_REASON_LEN);
	+ else
	+ strlcpy(entry->tldl_reason, "UNKNOWN", TCP_LOG_REASON_LEN);
	+ entry->tldl_ie = tln->tln_ie;
	+ entry->tldl_entries = tln->tln_entries;
	+ entry->tldl_count = tln->tln_count;
	+ entry->tldl_af = tln->tln_af;
	+
	+ /* Add the entry. If no one is listening, free the entry. */
	+ if (tcp_log_dev_add_log((struct tcp_log_dev_queue *)entry))
	+ tcp_log_free_queue((struct tcp_log_dev_queue *)entry);
	+
	+ return (0);
	+}
	+
	+
	+/*
	+ * Queue the log buffers for all sessions in a bucket for transmissions via
	+ * the log buffer facility.
	+ *
	+ * NOTE: This should be called with a locked bucket; however, the function
	+ * will drop the lock.
	+ *
	+ * reason is passed through to tcp_log_dump_node_logbuf() and
	+ * tcp_log_dump_tp_logbuf() for every session that is dumped.
	+ */
	+#define LOCAL_SAVE 10
	+static void
	+tcp_log_dumpbucketlogs(struct tcp_log_id_bucket *tlb, char *reason)
	+{
	+ struct tcp_log_id_node local_entries[LOCAL_SAVE];
	+ struct inpcb *inp;
	+ struct tcpcb *tp;
	+ struct tcp_log_id_node *cur_tln, *prev_tln, *tmp_tln;
	+ int i, num_local_entries, tree_locked;
	+ bool expireq_locked;
	+
	+ TCPID_BUCKET_LOCK_ASSERT(tlb);
	+
	+ /*
	+ * Take a reference on the bucket to keep it from disappearing until
	+ * we are done.
	+ */
	+ TCPID_BUCKET_REF(tlb);
	+
	+ /*
	+ * We'll try to create these without dropping locks. However, we
	+ * might very well need to drop locks to get memory. If that's the
	+ * case, we'll save up to 10 on the stack, and sacrifice the rest.
	+ * (Otherwise, we need to worry about finding our place again in a
	+ * potentially changed list. It just doesn't seem worth the trouble
	+ * to do that.)
	+ */
	+ expireq_locked = false;
	+ num_local_entries = 0;
	+ prev_tln = NULL;
	+ tree_locked = TREE_UNLOCKED;
	+ SLIST_FOREACH_SAFE(cur_tln, &tlb->tlb_head, tln_list, tmp_tln) {
	+ /*
	+ * If this isn't associated with a TCPCB, we can pull it off
	+ * the list now. We need to be careful that the expire timer
	+ * hasn't already taken ownership (tln_expiretime == SBT_MAX).
	+ * If so, we let the expire timer code free the data.
	+ */
	+ if (cur_tln->tln_closed) {
	+no_inp:
	+ /*
	+ * Get the expireq lock so we can get a consistent
	+ * read of tln_expiretime and so we can remove this
	+ * from the expireq.
	+ */
	+ if (!expireq_locked) {
	+ TCPLOG_EXPIREQ_LOCK();
	+ expireq_locked = true;
	+ }
	+
	+ /*
	+ * We ignore entries with tln_expiretime == SBT_MAX.
	+ * The expire timer code already owns those.
	+ */
	+ KASSERT(cur_tln->tln_expiretime > (sbintime_t) 0,
	+ ("%s:%d: node on the expire queue without positive "
	+ "expire time", __func__, __LINE__));
	+ if (cur_tln->tln_expiretime == SBT_MAX) {
	+ prev_tln = cur_tln;
	+ continue;
	+ }
	+
	+ /* Remove the entry from the expireq. */
	+ STAILQ_REMOVE(&tcp_log_expireq_head, cur_tln,
	+ tcp_log_id_node, tln_expireq);
	+
	+ /* Remove the entry from the bucket. */
	+ if (prev_tln != NULL)
	+ SLIST_REMOVE_AFTER(prev_tln, tln_list);
	+ else
	+ SLIST_REMOVE_HEAD(&tlb->tlb_head, tln_list);
	+
	+ /*
	+ * Drop the INP and bucket reference counts. Due to
	+ * lock-ordering rules, we need to drop the expire
	+ * queue lock.
	+ */
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ expireq_locked = false;
	+
	+ /* Drop the INP reference. */
	+ INP_WLOCK(cur_tln->tln_inp);
	+ if (!in_pcbrele_wlocked(cur_tln->tln_inp))
	+ INP_WUNLOCK(cur_tln->tln_inp);
	+
	+ if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
	+#ifdef INVARIANTS
	+ panic("%s: Bucket refcount unexpectedly 0.",
	+ __func__);
	+#endif
	+ /*
	+ * Recover as best we can: free the entry we
	+ * own.
	+ */
	+ tcp_log_free_entries(&cur_tln->tln_entries,
	+ &cur_tln->tln_count);
	+ uma_zfree(tcp_log_node_zone, cur_tln);
	+ goto done;
	+ }
	+
	+ if (tcp_log_dump_node_logbuf(cur_tln, reason,
	+ M_NOWAIT)) {
	+ /*
	+ * If we have space, save the entries locally.
	+ * Otherwise, free them.
	+ */
	+ if (num_local_entries < LOCAL_SAVE) {
	+ local_entries[num_local_entries] =
	+ *cur_tln;
	+ num_local_entries++;
	+ } else {
	+ tcp_log_free_entries(
	+ &cur_tln->tln_entries,
	+ &cur_tln->tln_count);
	+ }
	+ }
	+
	+ /* No matter what, we are done with the node now. */
	+ uma_zfree(tcp_log_node_zone, cur_tln);
	+
	+ /*
	+ * Because we removed this entry from the list, prev_tln
	+ * (which tracks the previous entry still on the tlb
	+ * list) remains unchanged.
	+ */
	+ continue;
	+ }
	+
	+ /*
	+ * If we get to this point, the session data is still held in
	+ * the TCPCB. So, we need to pull the data out of that.
	+ *
	+ * We will need to drop the expireq lock so we can lock the INP.
	+ * We can then try to extract the data the "easy" way. If that
	+ * fails, we'll save the log entries for later.
	+ */
	+ if (expireq_locked) {
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ expireq_locked = false;
	+ }
	+
	+ /* Lock the INP and then re-check the state. */
	+ inp = cur_tln->tln_inp;
	+ INP_WLOCK(inp);
	+ /*
	+ * If we caught this while it was transitioning, the data
	+ * might have moved from the TCPCB to the tln (signified by
	+ * setting tln_closed to true). If so, treat this like an
	+ * inactive connection.
	+ */
	+ if (cur_tln->tln_closed) {
	+ /*
	+ * It looks like we may have caught this connection
	+ * while it was transitioning from active to inactive.
	+ * Treat this like an inactive connection.
	+ */
	+ INP_WUNLOCK(inp);
	+ goto no_inp;
	+ }
	+
	+ /*
	+ * Try to dump the data from the tp without dropping the lock.
	+ * If this fails, try to save off the data locally.
	+ */
	+ tp = cur_tln->tln_tp;
	+ if (tcp_log_dump_tp_logbuf(tp, reason, M_NOWAIT, true) &&
	+ num_local_entries < LOCAL_SAVE) {
	+ tcp_log_move_tp_to_node(tp,
	+ &local_entries[num_local_entries]);
	+ local_entries[num_local_entries].tln_closed = 1;
	+ KASSERT(local_entries[num_local_entries].tln_bucket ==
	+ tlb, ("%s: %d: bucket mismatch for node %p",
	+ __func__, __LINE__, cur_tln));
	+ num_local_entries++;
	+ }
	+
	+ INP_WUNLOCK(inp);
	+
	+ /*
	+ * We are going to leave the current tln on the list. It will
	+ * become the previous tln.
	+ */
	+ prev_tln = cur_tln;
	+ }
	+
	+ /* Drop our locks, if any. */
	+ KASSERT(tree_locked == TREE_UNLOCKED,
	+ ("%s: %d: tree unexpectedly locked", __func__, __LINE__));
	+ switch (tree_locked) {
	+ case TREE_WLOCKED:
	+ TCPID_TREE_WUNLOCK();
	+ tree_locked = TREE_UNLOCKED;
	+ break;
	+ case TREE_RLOCKED:
	+ TCPID_TREE_RUNLOCK();
	+ tree_locked = TREE_UNLOCKED;
	+ break;
	+ }
	+ if (expireq_locked) {
	+ TCPLOG_EXPIREQ_UNLOCK();
	+ expireq_locked = false;
	+ }
	+
	+ /*
	+ * Try again for any saved entries. tcp_log_dump_node_logbuf() is
	+ * guaranteed to free the log entries within the node. And, since
	+ * the node itself is on our stack, we don't need to free it.
	+ */
	+ for (i = 0; i < num_local_entries; i++)
	+ tcp_log_dump_node_logbuf(&local_entries[i], reason, M_WAITOK);
	+
	+ /* Drop our reference. */
	+ if (!tcp_log_unref_bucket(tlb, &tree_locked, NULL))
	+ TCPID_BUCKET_UNLOCK(tlb);
	+
	+done:
	+ /* Drop our locks, if any. */
	+ switch (tree_locked) {
	+ case TREE_WLOCKED:
	+ TCPID_TREE_WUNLOCK();
	+ break;
	+ case TREE_RLOCKED:
	+ TCPID_TREE_RUNLOCK();
	+ break;
	+ }
	+ if (expireq_locked)
	+ TCPLOG_EXPIREQ_UNLOCK();
	+}
	+#undef LOCAL_SAVE
	+
	+
	+/*
	+ * Queue the log buffers for all sessions in a bucket for transmissions via
	+ * the log buffer facility.
	+ *
	+ * NOTE: This should be called with a locked INP; however, the function
	+ * will drop the lock.
	+ *
	+ * If the tcpcb has no bucket (tp->t_lib == NULL), only this session's
	+ * log buffer is dumped. reason is passed through to the dump functions.
	+ */
	+void
	+tcp_log_dump_tp_bucket_logbufs(struct tcpcb *tp, char *reason)
	+{
	+ struct tcp_log_id_bucket *tlb;
	+ int tree_locked;
	+
	+ /* Figure out our bucket and lock it. */
	+ INP_WLOCK_ASSERT(tp->t_inpcb);
	+ tlb = tp->t_lib;
	+ if (tlb == NULL) {
	+ /*
	+ * No bucket; treat this like a request to dump a single
	+ * session's traces.
	+ */
	+ (void)tcp_log_dump_tp_logbuf(tp, reason, M_WAITOK, true);
	+ INP_WUNLOCK(tp->t_inpcb);
	+ return;
	+ }
	+ TCPID_BUCKET_REF(tlb);
	+ INP_WUNLOCK(tp->t_inpcb);
	+ TCPID_BUCKET_LOCK(tlb);
	+
	+ /* If we are the last reference, we have nothing more to do here. */
	+ tree_locked = TREE_UNLOCKED;
	+ if (tcp_log_unref_bucket(tlb, &tree_locked, NULL)) {
	+ switch (tree_locked) {
	+ case TREE_WLOCKED:
	+ TCPID_TREE_WUNLOCK();
	+ break;
	+ case TREE_RLOCKED:
	+ TCPID_TREE_RUNLOCK();
	+ break;
	+ }
	+ return;
	+ }
	+
	+ /* Turn this over to tcp_log_dumpbucketlogs() to finish the work. */
	+ tcp_log_dumpbucketlogs(tlb, reason);
	+}
	+
	+/*
	+ * Emit a TCP_LOG_FLOWEND event for this connection, provided logging is
	+ * not turned off for it. A stack may add stack-specific info to this
	+ * trace event by overriding this function (see bbr_log_flowend() for
	+ * example).
	+ */
	+void
	+tcp_log_flowend(struct tcpcb *tp)
	+{
	+	struct socket *so;
	+
	+	/* Nothing to record when logging is off for this session. */
	+	if (tp->t_logstate == TCP_LOG_STATE_OFF)
	+		return;
	+
	+	so = tp->t_inpcb->inp_socket;
	+	TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd,
	+	    TCP_LOG_FLOWEND, 0, 0, NULL, false);
	+}
	+
	Index: head/sys/netinet/tcp_output.c
	===================================================================
	--- head/sys/netinet/tcp_output.c
	+++ head/sys/netinet/tcp_output.c
	@@ -74,6 +74,7 @@
	#include <netinet/tcp.h>
	#define TCPOUTFLAGS
	#include <netinet/tcp_fsm.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet/tcp_seq.h>
	#include <netinet/tcp_timer.h>
	#include <netinet/tcp_var.h>
	@@ -1310,6 +1311,10 @@
	}
	#endif

	+ /* We're getting ready to send; log now. */
	+ TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK,
	+ len, NULL, false);
	+
	/*
	* Enable TSO and specify the size of the segments.
	* The TCP pseudo header checksum is always provided.
	@@ -1549,6 +1554,9 @@
	}

	if (error) {
	+ /* Record the error. */
	+ TCP_LOG_EVENT(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_OUT,
	+ error, 0, NULL, false);

	/*
	* We know that the packet was lost, so back out the
	Index: head/sys/netinet/tcp_subr.c
	===================================================================
	--- head/sys/netinet/tcp_subr.c
	+++ head/sys/netinet/tcp_subr.c
	@@ -98,6 +98,7 @@
	#include <netinet/tcp_seq.h>
	#include <netinet/tcp_timer.h>
	#include <netinet/tcp_var.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet/tcp_syncache.h>
	#include <netinet/cc/cc.h>
	#ifdef INET6
	@@ -426,6 +427,71 @@
	"list available TCP Function sets");

	/*
	+ * Exports one (struct tcp_function_id) for each non-alias.
	+ */
	+static int
	+sysctl_net_inet_list_func_ids(SYSCTL_HANDLER_ARGS)
	+{
	+	struct tcp_function_id id_entry;
	+	struct tcp_function *func;
	+	int error, nids;
	+
	+	/* This node is read-only; reject any attempt to set it. */
	+	if (req->newptr != NULL)
	+		return (EINVAL);
	+
	+	/*
	+	 * When the caller supplied an output buffer, wire it first so
	+	 * the entries can be copied to user space without dropping
	+	 * the lock.
	+	 */
	+	if (req->oldptr != NULL) {
	+		error = sysctl_wire_old_buffer(req, 0);
	+		if (error != 0)
	+			return (error);
	+	}
	+
	+	/*
	+	 * An entry whose name differs from its function block's name
	+	 * is an alias and is skipped. The remaining entries are either
	+	 * copied out or merely counted, depending on whether an output
	+	 * buffer was supplied.
	+	 */
	+	nids = 0;
	+	rw_rlock(&tcp_function_lock);
	+	TAILQ_FOREACH(func, &t_functions, tf_next) {
	+		if (strncmp(func->tf_name, func->tf_fb->tfb_tcp_block_name,
	+		    TCP_FUNCTION_NAME_LEN_MAX) != 0)
	+			continue;
	+		if (req->oldptr == NULL) {
	+			nids++;
	+			continue;
	+		}
	+		id_entry.tfi_id = func->tf_fb->tfb_id;
	+		(void)strncpy(id_entry.tfi_name, func->tf_name,
	+		    TCP_FUNCTION_NAME_LEN_MAX);
	+		id_entry.tfi_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
	+		/*
	+		 * Keep going even if this fails: that is the mechanism
	+		 * used to accumulate length information if the buffer
	+		 * was too short.
	+		 */
	+		error = SYSCTL_OUT(req, &id_entry, sizeof(id_entry));
	+	}
	+	rw_runlock(&tcp_function_lock);
	+
	+	/* Size-only query: report room for one more entry than counted. */
	+	if (req->oldptr == NULL)
	+		error = SYSCTL_OUT(req, NULL,
	+		    (nids + 1) * sizeof(struct tcp_function_id));
	+
	+	return (error);
	+}
	+
	+/* Read-only opaque sysctl node served by sysctl_net_inet_list_func_ids(). */
	+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_ids,
	+ CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
	+ NULL, 0, sysctl_net_inet_list_func_ids, "S,tcp_function_id",
	+ "List TCP function block name-to-ID mappings");
	+
	+/*
	* Target size of TCP PCB hash tables. Must be a power of two.
	*
	* Note that this can be overridden by the kernel environment
	@@ -504,6 +570,8 @@
	return (hashsize);
	}

	+static volatile int next_tcp_stack_id = 1;
	+
	/*
	* Register a TCP function block with the name provided in the names
	* array. (Note that this function does NOT automatically register
	@@ -563,6 +631,7 @@

	refcount_init(&blk->tfb_refcnt, 0);
	blk->tfb_flags = 0;
	+ blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
	for (i = 0; i < *num_names; i++) {
	n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
	if (n == NULL) {
	@@ -779,6 +848,8 @@
	/* Setup the tcp function block list */
	init_tcp_functions();
	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
	+ /* Initialize the TCP logging data. */
	+ tcp_log_init();

	if (tcp_soreceive_stream) {
	#ifdef INET
	@@ -1360,6 +1431,8 @@
	*/
	tcp_pcap_tcpcb_init(tp);
	#endif
	+ /* Initialize the per-TCPCB log data. */
	+ tcp_log_tcpcbinit(tp);
	if (tp->t_fb->tfb_tcp_fb_init) {
	(*tp->t_fb->tfb_tcp_fb_init)(tp);
	}
	@@ -1577,6 +1650,7 @@
	inp->inp_ppcb = NULL;
	if (tp->t_timers->tt_draincnt == 0) {
	/* We own the last reference on tcpcb, let's free it. */
	+ tcp_log_tcpcbfini(tp);
	TCPSTATES_DEC(tp->t_state);
	if (tp->t_fb->tfb_tcp_fb_fini)
	(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
	@@ -1607,6 +1681,7 @@
	tp->t_timers->tt_draincnt--;
	if (tp->t_timers->tt_draincnt == 0) {
	/* We own the last reference on this tcpcb, let's free it. */
	+ tcp_log_tcpcbfini(tp);
	TCPSTATES_DEC(tp->t_state);
	if (tp->t_fb->tfb_tcp_fb_fini)
	(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
	@@ -1700,6 +1775,7 @@
	if ((tcpb = intotcpcb(inpb)) != NULL) {
	tcp_reass_flush(tcpb);
	tcp_clean_sackreport(tcpb);
	+ tcp_log_drain(tcpb);
	#ifdef TCPPCAP
	if (tcp_pcap_aggressive_free) {
	/* Free the TCP PCAP queues. */
	@@ -2856,6 +2932,7 @@
	xt->t_state = TCPS_TIME_WAIT;
	} else {
	xt->t_state = tp->t_state;
	+ xt->t_logstate = tp->t_logstate;
	xt->t_flags = tp->t_flags;
	xt->t_sndzerowin = tp->t_sndzerowin;
	xt->t_sndrexmitpack = tp->t_sndrexmitpack;
	@@ -2879,6 +2956,8 @@

	bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
	TCP_FUNCTION_NAME_LEN_MAX);
	+ bzero(xt->xt_logid, TCP_LOG_ID_LEN);
	+ (void)tcp_log_get_id(tp, xt->xt_logid);
	}

	xt->xt_len = sizeof(struct xtcpcb);
	Index: head/sys/netinet/tcp_timer.c
	===================================================================
	--- head/sys/netinet/tcp_timer.c
	+++ head/sys/netinet/tcp_timer.c
	@@ -68,6 +68,7 @@
	#include <netinet/ip_var.h>
	#include <netinet/tcp.h>
	#include <netinet/tcp_fsm.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet/tcp_timer.h>
	#include <netinet/tcp_var.h>
	#include <netinet/cc/cc.h>
	@@ -644,6 +645,7 @@
	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
	("%s: tp %p tcpcb can't be stopped here", __func__, tp));
	tcp_free_sackholes(tp);
	+ TCP_LOG_EVENT(tp, NULL, NULL, NULL, TCP_LOG_RTO, 0, 0, NULL, false);
	if (tp->t_fb->tfb_tcp_rexmit_tmr) {
	/* The stack has a timer action too. */
	(*tp->t_fb->tfb_tcp_rexmit_tmr)(tp);
	Index: head/sys/netinet/tcp_usrreq.c
	===================================================================
	--- head/sys/netinet/tcp_usrreq.c
	+++ head/sys/netinet/tcp_usrreq.c
	@@ -90,6 +90,7 @@
	#include <netinet/tcp_seq.h>
	#include <netinet/tcp_timer.h>
	#include <netinet/tcp_var.h>
	+#include <netinet/tcp_log_buf.h>
	#include <netinet/tcpip.h>
	#include <netinet/cc/cc.h>
	#include <netinet/tcp_fastopen.h>
	@@ -1026,6 +1027,11 @@
	tp->t_flags &= ~TF_FORCEDATA;
	}
	}
	+ TCP_LOG_EVENT(tp, NULL,
	+ &inp->inp_socket->so_rcv,
	+ &inp->inp_socket->so_snd,
	+ TCP_LOG_USERSEND, error,
	+ 0, NULL, false);
	out:
	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
	((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
	@@ -1533,6 +1539,15 @@
	return (tp->t_fb->tfb_tcp_ctloutput(so, sopt, inp, tp));
	}

	+/*
	+ * If this assert becomes untrue, we need to change the size of the buf
	+ * variable in tcp_default_ctloutput().
	+ */
	+#ifdef CTASSERT
	+CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
	+CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
	+#endif
	+
	int
	tcp_default_ctloutput(struct socket so, struct sockopt sopt, struct inpcb inp, struct tcpcb tp)
	{
	@@ -1540,7 +1555,7 @@
	u_int ui;
	struct tcp_info ti;
	struct cc_algo *algo;
	- char *pbuf, buf[TCP_CA_NAME_MAX];
	+ char *pbuf, buf[TCP_LOG_ID_LEN];
	size_t len;

	/*
	@@ -1822,6 +1837,55 @@
	goto unlock_and_done;
	}

	+ case TCP_LOG:
	+ INP_WUNLOCK(inp);
	+ error = sooptcopyin(sopt, &optval, sizeof optval,
	+ sizeof optval);
	+ if (error)
	+ return (error);
	+
	+ INP_WLOCK_RECHECK(inp);
	+ error = tcp_log_state_change(tp, optval);
	+ goto unlock_and_done;
	+
	+ case TCP_LOGBUF:
	+ INP_WUNLOCK(inp);
	+ error = EINVAL;
	+ break;
	+
	+ case TCP_LOGID:
	+ INP_WUNLOCK(inp);
	+ error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
	+ if (error)
	+ break;
	+ buf[sopt->sopt_valsize] = '\0';
	+ INP_WLOCK_RECHECK(inp);
	+ error = tcp_log_set_id(tp, buf);
	+ /* tcp_log_set_id() unlocks the INP. */
	+ break;
	+
	+ case TCP_LOGDUMP:
	+ case TCP_LOGDUMPID:
	+ INP_WUNLOCK(inp);
	+ error =
	+ sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
	+ if (error)
	+ break;
	+ buf[sopt->sopt_valsize] = '\0';
	+ INP_WLOCK_RECHECK(inp);
	+ if (sopt->sopt_name == TCP_LOGDUMP) {
	+ error = tcp_log_dump_tp_logbuf(tp, buf,
	+ M_WAITOK, true);
	+ INP_WUNLOCK(inp);
	+ } else {
	+ tcp_log_dump_tp_bucket_logbufs(tp, buf);
	+ /*
	+ * tcp_log_dump_tp_bucket_logbufs() drops the
	+ * INP lock.
	+ */
	+ }
	+ break;
	+
	default:
	INP_WUNLOCK(inp);
	error = ENOPROTOOPT;
	@@ -1906,6 +1970,25 @@
	optval = tp->t_flags & TF_FASTOPEN;
	INP_WUNLOCK(inp);
	error = sooptcopyout(sopt, &optval, sizeof optval);
	+ break;
	+ case TCP_LOG:
	+ optval = tp->t_logstate;
	+ INP_WUNLOCK(inp);
	+ error = sooptcopyout(sopt, &optval, sizeof(optval));
	+ break;
	+ case TCP_LOGBUF:
	+ /* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
	+ error = tcp_log_getlogbuf(sopt, tp);
	+ break;
	+ case TCP_LOGID:
	+ len = tcp_log_get_id(tp, buf);
	+ INP_WUNLOCK(inp);
	+ error = sooptcopyout(sopt, buf, len + 1);
	+ break;
	+ case TCP_LOGDUMP:
	+ case TCP_LOGDUMPID:
	+ INP_WUNLOCK(inp);
	+ error = EINVAL;
	break;
	default:
	INP_WUNLOCK(inp);
	Index: head/sys/netinet/tcp_var.h
	===================================================================
	--- head/sys/netinet/tcp_var.h
	+++ head/sys/netinet/tcp_var.h
	@@ -79,6 +79,8 @@
	uint64_t _pad[1]; /* TBD */
	};

	+STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
	+
	/*
	* Tcp control block, one per tcp; fields:
	* Organized for 16 byte cacheline efficiency.
	@@ -189,6 +191,13 @@
	u_int t_tsomaxsegcount; /* TSO maximum segment count */
	u_int t_tsomaxsegsize; /* TSO maximum segment size in bytes */
	u_int t_flags2; /* More tcpcb flags storage */
	+ int t_logstate; /* State of "black box" logging */
	+ struct tcp_log_stailq t_logs; /* Log buffer */
	+ int t_lognum; /* Number of log entries */
	+ uint32_t t_logsn; /* Log "serial number" */
	+ struct tcp_log_id_node *t_lin;
	+ struct tcp_log_id_bucket *t_lib;
	+ const char *t_output_caller; /* Function that called tcp_output */
	struct tcp_function_block *t_fb;/* TCP function call block */
	void *t_fb_ptr; /* Pointer to t_fb specific data */
	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
	@@ -267,6 +276,7 @@
	int (tfb_tcp_handoff_ok)(struct tcpcb );
	volatile uint32_t tfb_refcnt;
	uint32_t tfb_flags;
	+ uint8_t tfb_id;
	};

	struct tcp_function {
	@@ -339,11 +349,12 @@
	#define TCPOOB_HADDATA 0x02

	/*
	- * Flags for PLPMTU handling, t_flags2
	+ * Flags for the extended TCP flags field, t_flags2
	*/
	#define TF2_PLPMTU_BLACKHOLE 0x00000001 /* Possible PLPMTUD Black Hole. */
	#define TF2_PLPMTU_PMTUD 0x00000002 /* Allowed to attempt PLPMTUD. */
	#define TF2_PLPMTU_MAXSEGSNT 0x00000004 /* Last seg sent was full seg. */
	+#define TF2_LOG_AUTO 0x00000008 /* Session is auto-logging. */

	/*
	* Structure to hold TCP options that are only used during segment
	@@ -654,6 +665,7 @@
	size_t xt_len; /* length of this structure */
	struct xinpcb xt_inp;
	char xt_stack[TCP_FUNCTION_NAME_LEN_MAX]; /* (s) */
	+ char xt_logid[TCP_LOG_ID_LEN]; /* (s) */
	int64_t spare64[8];
	int32_t t_state; /* (s,p) */
	uint32_t t_flags; /* (s,p) */
	@@ -666,12 +678,22 @@
	int32_t tt_keep; /* (s) */
	int32_t tt_2msl; /* (s) */
	int32_t tt_delack; /* (s) */
	+ int32_t t_logstate; /* (3) */
	int32_t spare32[32];
	} __aligned(8);
	+
	#ifdef _KERNEL
	void tcp_inptoxtp(const struct inpcb , struct xtcpcb );
	#endif
	#endif
	+
	+/*
	+ * TCP function name-to-id mapping exported to user-land via sysctl(3).
	+ */
	+struct tcp_function_id {
	+ uint8_t tfi_id; /* ID of the function block (its tfb_id) */
	+ char tfi_name[TCP_FUNCTION_NAME_LEN_MAX]; /* NUL-terminated function name (tf_name) */
	+};

	/*
	* Identifiers for TCP sysctl nodes
	Index: head/usr.bin/netstat/inet.c
	===================================================================
	--- head/usr.bin/netstat/inet.c
	+++ head/usr.bin/netstat/inet.c
	@@ -321,7 +321,7 @@
	"Proto", "Recv-Q", "Send-Q",
	"Local Address", "Foreign Address");
	if (!xflag && !Rflag)
	- xo_emit(" (state)");
	+ xo_emit(" {T:/%-11.11s}", "(state)");
	}
	if (xflag) {
	xo_emit(" {T:/%-6.6s} {T:/%-6.6s} {T:/%-6.6s} "
	@@ -339,6 +339,8 @@
	xo_emit(" {T:/%8.8s} {T:/%5.5s}",
	"flowid", "ftype");
	}
	+ if (Pflag)
	+ xo_emit(" {T:/%s}", "Log ID");
	xo_emit("\n");
	first = 0;
	}
	@@ -478,9 +480,9 @@
	}
	if (istcp && !Lflag && !xflag && !Tflag && !Rflag) {
	if (tp->t_state < 0 \|\| tp->t_state >= TCP_NSTATES)
	- xo_emit("{:tcp-state/%d}", tp->t_state);
	+ xo_emit("{:tcp-state/%-11d}", tp->t_state);
	else {
	- xo_emit("{:tcp-state/%s}",
	+ xo_emit("{:tcp-state/%-11s}",
	tcpstates[tp->t_state]);
	#if defined(TF_NEEDSYN) && defined(TF_NEEDFIN)
	/* Show T/TCP `hidden state' */
	@@ -495,6 +497,9 @@
	inp->inp_flowid,
	inp->inp_flowtype);
	}
	+ if (istcp && Pflag)
	+ xo_emit(" {:log-id/%s}", tp->xt_logid[0] == '\0' ?
	+ "-" : tp->xt_logid);
	xo_emit("\n");
	xo_close_instance("socket");
	}
	Index: head/usr.bin/netstat/main.c
	===================================================================
	--- head/usr.bin/netstat/main.c
	+++ head/usr.bin/netstat/main.c
	@@ -214,6 +214,7 @@
	int noutputs = 0; /* how much outputs before we exit */
	int numeric_addr; /* show addresses numerically */
	int numeric_port; /* show ports numerically */
	+int Pflag; /* show TCP log ID */
	static int pflag; /* show given protocol */
	static int Qflag; /* show netisr information */
	int rflag; /* show routing tables (or routing stats) */
	@@ -247,7 +248,7 @@
	if (argc < 0)
	exit(EXIT_FAILURE);

	- while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:np:Qq:RrSTsuWw:xz"))
	+ while ((ch = getopt(argc, argv, "46AaBbdF:f:ghI:iLlM:mN:nPp:Qq:RrSTsuWw:xz"))
	!= -1)
	switch(ch) {
	case '4':
	@@ -343,6 +344,9 @@
	break;
	case 'n':
	numeric_addr = numeric_port = 1;
	+ break;
	+ case 'P':
	+ Pflag = 1;
	break;
	case 'p':
	if ((tp = name2protox(optarg)) == NULL) {
	Index: head/usr.bin/netstat/netstat.h
	===================================================================
	--- head/usr.bin/netstat/netstat.h
	+++ head/usr.bin/netstat/netstat.h
	@@ -50,6 +50,7 @@
	extern int noutputs; /* how much outputs before we exit */
	extern int numeric_addr; /* show addresses numerically */
	extern int numeric_port; /* show ports numerically */
	+extern int Pflag; /* show TCP log ID */
	extern int rflag; /* show routing tables (or routing stats) */
	extern int Rflag; /* show flowid / RSS information */
	extern int sflag; /* show protocol statistics */
	Index: head/usr.bin/netstat/netstat.1
	===================================================================
	--- head/usr.bin/netstat/netstat.1
	+++ head/usr.bin/netstat/netstat.1
	@@ -39,7 +39,7 @@
	.Bl -tag -width "netstat"
	.It Nm
	.Op Fl -libxo
	-.Op Fl 46AaLnRSTWx
	+.Op Fl 46AaLnPRSTWx
	.Op Fl f Ar protocol_family \| Fl p Ar protocol
	.Op Fl M Ar core
	.Op Fl N Ar system
	@@ -181,6 +181,8 @@
	Do not resolve numeric addresses and port numbers to names.
	See
	.Sx GENERAL OPTIONS .
	+.It Fl P
	+Display the log ID for each socket.
	.It Fl R
	Display the flowid and flowtype for each socket.
	flowid is a 32 bit hardware specific identifier for each flow.

File Metadata

Mime Type: text/plain
Expires: Fri, Nov 21, 8:55 AM (19 h, 41 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 25757968
Default Alt Text: D11085.id.diff (126 KB)

D11085.id.diffNo OneTemporaryActions

D11085.id.diffView Options

File Metadata

Event Timeline

D11085.id.diff
No OneTemporary
Actions

D11085.id.diff
View Options