diff --git a/sys/compat/linuxkpi/common/include/linux/net.h b/sys/compat/linuxkpi/common/include/linux/net.h
index 5438fccb8512..d5752093da74 100644
--- a/sys/compat/linuxkpi/common/include/linux/net.h
+++ b/sys/compat/linuxkpi/common/include/linux/net.h
@@ -1,89 +1,89 @@
 /*-
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * Copyright (c) 2013, 2014 Mellanox Technologies, Ltd.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 #ifndef	_LINUXKPI_LINUX_NET_H_
 #define	_LINUXKPI_LINUX_NET_H_
 
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/errno.h>
 
 static inline int
 sock_create_kern(int family, int type, int proto, struct socket **res)
 {
 	return -socreate(family, res, type, proto, curthread->td_ucred,
 	    curthread);
 }
 
 static inline int
 sock_getname(struct socket *so, struct sockaddr *addr, int *sockaddr_len,
     int peer)
 {
 	struct sockaddr *nam;
 	int error;
 
 	nam = NULL;
 	if (peer) {
 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
 			return (-ENOTCONN);
 
-		error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &nam);
+		error = so->so_proto->pr_peeraddr(so, &nam);
 	} else
-		error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &nam);
+		error = so->so_proto->pr_sockaddr(so, &nam);
 	if (error)
 		return (-error);
 	*addr = *nam;
 	*sockaddr_len = addr->sa_len;
 
 	free(nam, M_SONAME);
 	return (0);
 }
 
 static inline void
 sock_release(struct socket *so)
 {
 	soclose(so);
 }
 
 
 int linuxkpi_net_ratelimit(void);
 
 static inline int
 net_ratelimit(void)
 {
 
 	return (linuxkpi_net_ratelimit());
 }
 
 #endif	/* _LINUXKPI_LINUX_NET_H_ */
diff --git a/sys/dev/cxgbe/tom/t4_tom.c b/sys/dev/cxgbe/tom/t4_tom.c
index 9bd6145c9acc..825a4e96ce81 100644
--- a/sys/dev/cxgbe/tom/t4_tom.c
+++ b/sys/dev/cxgbe/tom/t4_tom.c
@@ -1,2114 +1,2099 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2012 Chelsio Communications, Inc.
  * All rights reserved.
  * Written by: Navdeep Parhar <np@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/limits.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/refcount.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/if_vlan_var.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet6/scope6_var.h>
 #define TCPSTATES
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/toecore.h>
 #include <netinet/cc/cc.h>
 
 #ifdef TCP_OFFLOAD
 #include "common/common.h"
 #include "common/t4_msg.h"
 #include "common/t4_regs.h"
 #include "common/t4_regs_values.h"
 #include "common/t4_tcb.h"
 #include "t4_clip.h"
 #include "tom/t4_tom_l2t.h"
 #include "tom/t4_tom.h"
 #include "tom/t4_tls.h"
 
-static struct protosw *tcp_protosw;
 static struct protosw toe_protosw;
-static struct pr_usrreqs toe_usrreqs;
-
-static struct protosw *tcp6_protosw;
 static struct protosw toe6_protosw;
-static struct pr_usrreqs toe6_usrreqs;
 
 /* Module ops */
 static int t4_tom_mod_load(void);
 static int t4_tom_mod_unload(void);
 static int t4_tom_modevent(module_t, int, void *);
 
 /* ULD ops and helpers */
 static int t4_tom_activate(struct adapter *);
 static int t4_tom_deactivate(struct adapter *);
 
 static struct uld_info tom_uld_info = {
 	.uld_id = ULD_TOM,
 	.activate = t4_tom_activate,
 	.deactivate = t4_tom_deactivate,
 };
 
 static void release_offload_resources(struct toepcb *);
 static int alloc_tid_tabs(struct tid_info *);
 static void free_tid_tabs(struct tid_info *);
 static void free_tom_data(struct adapter *, struct tom_data *);
 static void reclaim_wr_resources(void *, int);
 
 struct toepcb *
 alloc_toepcb(struct vi_info *vi, int flags)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct toepcb *toep;
 	int tx_credits, txsd_total, len;
 
 	/*
 	 * The firmware counts tx work request credits in units of 16 bytes
 	 * each.  Reserve room for an ABORT_REQ so the driver never has to worry
 	 * about tx credits if it wants to abort a connection.
 	 */
 	tx_credits = sc->params.ofldq_wr_cred;
 	tx_credits -= howmany(sizeof(struct cpl_abort_req), 16);
 
 	/*
 	 * Shortest possible tx work request is a fw_ofld_tx_data_wr + 1 byte
 	 * immediate payload, and firmware counts tx work request credits in
 	 * units of 16 byte.  Calculate the maximum work requests possible.
 	 */
 	txsd_total = tx_credits /
 	    howmany(sizeof(struct fw_ofld_tx_data_wr) + 1, 16);
 
 	len = offsetof(struct toepcb, txsd) +
 	    txsd_total * sizeof(struct ofld_tx_sdesc);
 
 	toep = malloc(len, M_CXGBE, M_ZERO | flags);
 	if (toep == NULL)
 		return (NULL);
 
 	refcount_init(&toep->refcount, 1);
 	toep->td = sc->tom_softc;
 	toep->vi = vi;
 	toep->tid = -1;
 	toep->tx_total = tx_credits;
 	toep->tx_credits = tx_credits;
 	mbufq_init(&toep->ulp_pduq, INT_MAX);
 	mbufq_init(&toep->ulp_pdu_reclaimq, INT_MAX);
 	toep->txsd_total = txsd_total;
 	toep->txsd_avail = txsd_total;
 	toep->txsd_pidx = 0;
 	toep->txsd_cidx = 0;
 	aiotx_init_toep(toep);
 
 	return (toep);
 }
 
 /*
  * Initialize a toepcb after its params have been filled out.
  */
 int
 init_toepcb(struct vi_info *vi, struct toepcb *toep)
 {
 	struct conn_params *cp = &toep->params;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct tx_cl_rl_params *tc;
 
 	if (cp->tc_idx >= 0 && cp->tc_idx < sc->params.nsched_cls) {
 		tc = &pi->sched_params->cl_rl[cp->tc_idx];
 		mtx_lock(&sc->tc_lock);
 		if (tc->state != CS_HW_CONFIGURED) {
 			CH_ERR(vi, "tid %d cannot be bound to traffic class %d "
 			    "because it is not configured (its state is %d)\n",
 			    toep->tid, cp->tc_idx, tc->state);
 			cp->tc_idx = -1;
 		} else {
 			tc->refcount++;
 		}
 		mtx_unlock(&sc->tc_lock);
 	}
 	toep->ofld_txq = &sc->sge.ofld_txq[cp->txq_idx];
 	toep->ofld_rxq = &sc->sge.ofld_rxq[cp->rxq_idx];
 	toep->ctrlq = &sc->sge.ctrlq[pi->port_id];
 
 	tls_init_toep(toep);
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		ddp_init_toep(toep);
 
 	toep->flags |= TPF_INITIALIZED;
 
 	return (0);
 }
 
 struct toepcb *
 hold_toepcb(struct toepcb *toep)
 {
 
 	refcount_acquire(&toep->refcount);
 	return (toep);
 }
 
 void
 free_toepcb(struct toepcb *toep)
 {
 
 	if (refcount_release(&toep->refcount) == 0)
 		return;
 
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: attached to an inpcb", __func__));
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: CPL pending", __func__));
 
 	if (toep->flags & TPF_INITIALIZED) {
 		if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 			ddp_uninit_toep(toep);
 		tls_uninit_toep(toep);
 	}
 	free(toep, M_CXGBE);
 }
 
 /*
  * Set up the socket for TCP offload.
  */
 void
 offload_socket(struct socket *so, struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/* Update socket */
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags |= SB_NOCOALESCE;
 	if (inp->inp_vflag & INP_IPV6)
 		so->so_proto = &toe6_protosw;
 	else
 		so->so_proto = &toe_protosw;
 	SOCKBUF_UNLOCK(sb);
 
 	/* Update TCP PCB */
 	tp->tod = &td->tod;
 	tp->t_toe = toep;
 	tp->t_flags |= TF_TOE;
 
 	/* Install an extra hold on inp */
 	toep->inp = inp;
 	toep->flags |= TPF_ATTACHED;
 	in_pcbref(inp);
 
 	/* Add the TOE PCB to the active list */
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_INSERT_HEAD(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 void
 restore_so_proto(struct socket *so, bool v6)
 {
 	if (v6)
-		so->so_proto = tcp6_protosw;
+		so->so_proto = &tcp6_protosw;
 	else
-		so->so_proto = tcp_protosw;
+		so->so_proto = &tcp_protosw;
 }
 
 /* This is _not_ the normal way to "unoffload" a socket. */
 void
 undo_offload_socket(struct socket *so)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	struct toepcb *toep = tp->t_toe;
 	struct tom_data *td = toep->td;
 	struct sockbuf *sb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	SOCKBUF_UNLOCK(sb);
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	sb->sb_flags &= ~SB_NOCOALESCE;
 	restore_so_proto(so, inp->inp_vflag & INP_IPV6);
 	SOCKBUF_UNLOCK(sb);
 
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 
 	toep->inp = NULL;
 	toep->flags &= ~TPF_ATTACHED;
 	if (in_pcbrele_wlocked(inp))
 		panic("%s: inp freed.", __func__);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 }
 
 static void
 release_offload_resources(struct toepcb *toep)
 {
 	struct tom_data *td = toep->td;
 	struct adapter *sc = td_adapter(td);
 	int tid = toep->tid;
 
 	KASSERT(!(toep->flags & TPF_CPL_PENDING),
 	    ("%s: %p has CPL pending.", __func__, toep));
 	KASSERT(!(toep->flags & TPF_ATTACHED),
 	    ("%s: %p is still attached.", __func__, toep));
 
 	CTR5(KTR_CXGBE, "%s: toep %p (tid %d, l2te %p, ce %p)",
 	    __func__, toep, tid, toep->l2te, toep->ce);
 
 	/*
 	 * These queues should have been emptied at approximately the same time
 	 * that a normal connection's socket's so_snd would have been purged or
 	 * drained.  Do _not_ clean up here.
 	 */
 	MPASS(mbufq_len(&toep->ulp_pduq) == 0);
 	MPASS(mbufq_len(&toep->ulp_pdu_reclaimq) == 0);
 #ifdef INVARIANTS
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		ddp_assert_empty(toep);
 #endif
 	MPASS(TAILQ_EMPTY(&toep->aiotx_jobq));
 
 	if (toep->l2te)
 		t4_l2t_release(toep->l2te);
 
 	if (tid >= 0) {
 		remove_tid(sc, tid, toep->ce ? 2 : 1);
 		release_tid(sc, tid, toep->ctrlq);
 	}
 
 	if (toep->ce)
 		t4_release_clip_entry(sc, toep->ce);
 
 	if (toep->params.tc_idx != -1)
 		t4_release_cl_rl(sc, toep->vi->pi->port_id, toep->params.tc_idx);
 
 	mtx_lock(&td->toep_list_lock);
 	TAILQ_REMOVE(&td->toep_list, toep, link);
 	mtx_unlock(&td->toep_list_lock);
 
 	free_toepcb(toep);
 }
 
 /*
  * The kernel is done with the TCP PCB and this is our opportunity to unhook the
  * toepcb hanging off of it.  If the TOE driver is also done with the toepcb (no
  * pending CPL) then it is time to release all resources tied to the toepcb.
  *
  * Also gets called when an offloaded active open fails and the TOM wants the
  * kernel to take the TCP PCB back.
  */
 static void
 t4_pcb_detach(struct toedev *tod __unused, struct tcpcb *tp)
 {
 #if defined(KTR) || defined(INVARIANTS)
 	struct inpcb *inp = tp->t_inpcb;
 #endif
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(inp);
 
 	KASSERT(toep != NULL, ("%s: toep is NULL", __func__));
 	KASSERT(toep->flags & TPF_ATTACHED,
 	    ("%s: not attached", __func__));
 
 #ifdef KTR
 	if (tp->t_state == TCPS_SYN_SENT) {
 		CTR6(KTR_CXGBE, "%s: atid %d, toep %p (0x%x), inp %p (0x%x)",
 		    __func__, toep->tid, toep, toep->flags, inp,
 		    inp->inp_flags);
 	} else {
 		CTR6(KTR_CXGBE,
 		    "t4_pcb_detach: tid %d (%s), toep %p (0x%x), inp %p (0x%x)",
 		    toep->tid, tcpstates[tp->t_state], toep, toep->flags, inp,
 		    inp->inp_flags);
 	}
 #endif
 
 	if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_detach(toep);
 
 	tp->tod = NULL;
 	tp->t_toe = NULL;
 	tp->t_flags &= ~TF_TOE;
 	toep->flags &= ~TPF_ATTACHED;
 
 	if (!(toep->flags & TPF_CPL_PENDING))
 		release_offload_resources(toep);
 }
 
 /*
  * setsockopt handler.
  */
 static void
 t4_ctloutput(struct toedev *tod, struct tcpcb *tp, int dir, int name)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	if (dir == SOPT_GET)
 		return;
 
 	CTR4(KTR_CXGBE, "%s: tp %p, dir %u, name %u", __func__, tp, dir, name);
 
 	switch (name) {
 	case TCP_NODELAY:
 		if (tp->t_state != TCPS_ESTABLISHED)
 			break;
 		toep->params.nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
 		t4_set_tcb_field(sc, toep->ctrlq, toep, W_TCB_T_FLAGS,
 		    V_TF_NAGLE(1), V_TF_NAGLE(toep->params.nagle), 0, 0);
 		break;
 	default:
 		break;
 	}
 }
 
 static inline uint64_t
 get_tcb_tflags(const uint64_t *tcb)
 {
 
 	return ((be64toh(tcb[14]) << 32) | (be64toh(tcb[15]) >> 32));
 }
 
 static inline uint32_t
 get_tcb_field(const uint64_t *tcb, u_int word, uint32_t mask, u_int shift)
 {
 #define LAST_WORD ((TCB_SIZE / 4) - 1)
 	uint64_t t1, t2;
 	int flit_idx;
 
 	MPASS(mask != 0);
 	MPASS(word <= LAST_WORD);
 	MPASS(shift < 32);
 
 	flit_idx = (LAST_WORD - word) / 2;
 	if (word & 0x1)
 		shift += 32;
 	t1 = be64toh(tcb[flit_idx]) >> shift;
 	t2 = 0;
 	if (fls(mask) > 64 - shift) {
 		/*
 		 * Will spill over into the next logical flit, which is the flit
 		 * before this one.  The flit_idx before this one must be valid.
 		 */
 		MPASS(flit_idx > 0);
 		t2 = be64toh(tcb[flit_idx - 1]) << (64 - shift);
 	}
 	return ((t2 | t1) & mask);
 #undef LAST_WORD
 }
 #define GET_TCB_FIELD(tcb, F) \
     get_tcb_field(tcb, W_TCB_##F, M_TCB_##F, S_TCB_##F)
 
 /*
  * Issues a CPL_GET_TCB to read the entire TCB for the tid.
  */
 static int
 send_get_tcb(struct adapter *sc, u_int tid)
 {
 	struct cpl_get_tcb *cpl;
 	struct wrq_cookie cookie;
 
 	MPASS(tid < sc->tids.ntids);
 
 	cpl = start_wrq_wr(&sc->sge.ctrlq[0], howmany(sizeof(*cpl), 16),
 	    &cookie);
 	if (__predict_false(cpl == NULL))
 		return (ENOMEM);
 	bzero(cpl, sizeof(*cpl));
 	INIT_TP_WR(cpl, tid);
 	OPCODE_TID(cpl) = htobe32(MK_OPCODE_TID(CPL_GET_TCB, tid));
 	cpl->reply_ctrl = htobe16(V_REPLY_CHAN(0) |
 	    V_QUEUENO(sc->sge.ofld_rxq[0].iq.cntxt_id));
 	cpl->cookie = 0xff;
 	commit_wrq_wr(&sc->sge.ctrlq[0], cpl, &cookie);
 
 	return (0);
 }
 
 static struct tcb_histent *
 alloc_tcb_histent(struct adapter *sc, u_int tid, int flags)
 {
 	struct tcb_histent *te;
 
 	MPASS(flags == M_NOWAIT || flags == M_WAITOK);
 
 	te = malloc(sizeof(*te), M_CXGBE, M_ZERO | flags);
 	if (te == NULL)
 		return (NULL);
 	mtx_init(&te->te_lock, "TCB entry", NULL, MTX_DEF);
 	callout_init_mtx(&te->te_callout, &te->te_lock, 0);
 	te->te_adapter = sc;
 	te->te_tid = tid;
 
 	return (te);
 }
 
 static void
 free_tcb_histent(struct tcb_histent *te)
 {
 
 	mtx_destroy(&te->te_lock);
 	free(te, M_CXGBE);
 }
 
 /*
  * Start tracking the tid in the TCB history.
  */
 int
 add_tid_to_history(struct adapter *sc, u_int tid)
 {
 	struct tcb_histent *te = NULL;
 	struct tom_data *td = sc->tom_softc;
 	int rc;
 
 	MPASS(tid < sc->tids.ntids);
 
 	if (td->tcb_history == NULL)
 		return (ENXIO);
 
 	rw_wlock(&td->tcb_history_lock);
 	if (td->tcb_history[tid] != NULL) {
 		rc = EEXIST;
 		goto done;
 	}
 	te = alloc_tcb_histent(sc, tid, M_NOWAIT);
 	if (te == NULL) {
 		rc = ENOMEM;
 		goto done;
 	}
 	mtx_lock(&te->te_lock);
 	rc = send_get_tcb(sc, tid);
 	if (rc == 0) {
 		te->te_flags |= TE_RPL_PENDING;
 		td->tcb_history[tid] = te;
 	} else {
 		free(te, M_CXGBE);
 	}
 	mtx_unlock(&te->te_lock);
 done:
 	rw_wunlock(&td->tcb_history_lock);
 	return (rc);
 }
 
 static void
 remove_tcb_histent(struct tcb_histent *te)
 {
 	struct adapter *sc = te->te_adapter;
 	struct tom_data *td = sc->tom_softc;
 
 	rw_assert(&td->tcb_history_lock, RA_WLOCKED);
 	mtx_assert(&te->te_lock, MA_OWNED);
 	MPASS(td->tcb_history[te->te_tid] == te);
 
 	td->tcb_history[te->te_tid] = NULL;
 	free_tcb_histent(te);
 	rw_wunlock(&td->tcb_history_lock);
 }
 
 static inline struct tcb_histent *
 lookup_tcb_histent(struct adapter *sc, u_int tid, bool addrem)
 {
 	struct tcb_histent *te;
 	struct tom_data *td = sc->tom_softc;
 
 	MPASS(tid < sc->tids.ntids);
 
 	if (td->tcb_history == NULL)
 		return (NULL);
 
 	if (addrem)
 		rw_wlock(&td->tcb_history_lock);
 	else
 		rw_rlock(&td->tcb_history_lock);
 	te = td->tcb_history[tid];
 	if (te != NULL) {
 		mtx_lock(&te->te_lock);
 		return (te);	/* with both locks held */
 	}
 	if (addrem)
 		rw_wunlock(&td->tcb_history_lock);
 	else
 		rw_runlock(&td->tcb_history_lock);
 
 	return (te);
 }
 
 static inline void
 release_tcb_histent(struct tcb_histent *te)
 {
 	struct adapter *sc = te->te_adapter;
 	struct tom_data *td = sc->tom_softc;
 
 	mtx_assert(&te->te_lock, MA_OWNED);
 	mtx_unlock(&te->te_lock);
 	rw_assert(&td->tcb_history_lock, RA_RLOCKED);
 	rw_runlock(&td->tcb_history_lock);
 }
 
 static void
 request_tcb(void *arg)
 {
 	struct tcb_histent *te = arg;
 
 	mtx_assert(&te->te_lock, MA_OWNED);
 
 	/* Noone else is supposed to update the histent. */
 	MPASS(!(te->te_flags & TE_RPL_PENDING));
 	if (send_get_tcb(te->te_adapter, te->te_tid) == 0)
 		te->te_flags |= TE_RPL_PENDING;
 	else
 		callout_schedule(&te->te_callout, hz / 100);
 }
 
 static void
 update_tcb_histent(struct tcb_histent *te, const uint64_t *tcb)
 {
 	struct tom_data *td = te->te_adapter->tom_softc;
 	uint64_t tflags = get_tcb_tflags(tcb);
 	uint8_t sample = 0;
 
 	if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != GET_TCB_FIELD(tcb, SND_UNA_RAW)) {
 		if (GET_TCB_FIELD(tcb, T_RXTSHIFT) != 0)
 			sample |= TS_RTO;
 		if (GET_TCB_FIELD(tcb, T_DUPACKS) != 0)
 			sample |= TS_DUPACKS;
 		if (GET_TCB_FIELD(tcb, T_DUPACKS) >= td->dupack_threshold)
 			sample |= TS_FASTREXMT;
 	}
 
 	if (GET_TCB_FIELD(tcb, SND_MAX_RAW) != 0) {
 		uint32_t snd_wnd;
 
 		sample |= TS_SND_BACKLOGGED;	/* for whatever reason. */
 
 		snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
 		if (tflags & V_TF_RECV_SCALE(1))
 			snd_wnd <<= GET_TCB_FIELD(tcb, RCV_SCALE);
 		if (GET_TCB_FIELD(tcb, SND_CWND) < snd_wnd)
 			sample |= TS_CWND_LIMITED;	/* maybe due to CWND */
 	}
 
 	if (tflags & V_TF_CCTRL_ECN(1)) {
 
 		/*
 		 * CE marker on incoming IP hdr, echoing ECE back in the TCP
 		 * hdr.  Indicates congestion somewhere on the way from the peer
 		 * to this node.
 		 */
 		if (tflags & V_TF_CCTRL_ECE(1))
 			sample |= TS_ECN_ECE;
 
 		/*
 		 * ECE seen and CWR sent (or about to be sent).  Might indicate
 		 * congestion on the way to the peer.  This node is reducing its
 		 * congestion window in response.
 		 */
 		if (tflags & (V_TF_CCTRL_CWR(1) | V_TF_CCTRL_RFR(1)))
 			sample |= TS_ECN_CWR;
 	}
 
 	te->te_sample[te->te_pidx] = sample;
 	if (++te->te_pidx == nitems(te->te_sample))
 		te->te_pidx = 0;
 	memcpy(te->te_tcb, tcb, TCB_SIZE);
 	te->te_flags |= TE_ACTIVE;
 }
 
 static int
 do_get_tcb_rpl(struct sge_iq *iq, const struct rss_header *rss, struct mbuf *m)
 {
 	struct adapter *sc = iq->adapter;
 	const struct cpl_get_tcb_rpl *cpl = mtod(m, const void *);
 	const uint64_t *tcb = (const uint64_t *)(const void *)(cpl + 1);
 	struct tcb_histent *te;
 	const u_int tid = GET_TID(cpl);
 	bool remove;
 
 	remove = GET_TCB_FIELD(tcb, T_STATE) == TCPS_CLOSED;
 	te = lookup_tcb_histent(sc, tid, remove);
 	if (te == NULL) {
 		/* Not in the history.  Who issued the GET_TCB for this? */
 		device_printf(sc->dev, "tcb %u: flags 0x%016jx, state %u, "
 		    "srtt %u, sscale %u, rscale %u, cookie 0x%x\n", tid,
 		    (uintmax_t)get_tcb_tflags(tcb), GET_TCB_FIELD(tcb, T_STATE),
 		    GET_TCB_FIELD(tcb, T_SRTT), GET_TCB_FIELD(tcb, SND_SCALE),
 		    GET_TCB_FIELD(tcb, RCV_SCALE), cpl->cookie);
 		goto done;
 	}
 
 	MPASS(te->te_flags & TE_RPL_PENDING);
 	te->te_flags &= ~TE_RPL_PENDING;
 	if (remove) {
 		remove_tcb_histent(te);
 	} else {
 		update_tcb_histent(te, tcb);
 		callout_reset(&te->te_callout, hz / 10, request_tcb, te);
 		release_tcb_histent(te);
 	}
 done:
 	m_freem(m);
 	return (0);
 }
 
 static void
 fill_tcp_info_from_tcb(struct adapter *sc, uint64_t *tcb, struct tcp_info *ti)
 {
 	uint32_t v;
 
 	ti->tcpi_state = GET_TCB_FIELD(tcb, T_STATE);
 
 	v = GET_TCB_FIELD(tcb, T_SRTT);
 	ti->tcpi_rtt = tcp_ticks_to_us(sc, v);
 
 	v = GET_TCB_FIELD(tcb, T_RTTVAR);
 	ti->tcpi_rttvar = tcp_ticks_to_us(sc, v);
 
 	ti->tcpi_snd_ssthresh = GET_TCB_FIELD(tcb, SND_SSTHRESH);
 	ti->tcpi_snd_cwnd = GET_TCB_FIELD(tcb, SND_CWND);
 	ti->tcpi_rcv_nxt = GET_TCB_FIELD(tcb, RCV_NXT);
 
 	v = GET_TCB_FIELD(tcb, TX_MAX);
 	ti->tcpi_snd_nxt = v - GET_TCB_FIELD(tcb, SND_NXT_RAW);
 
 	/* Receive window being advertised by us. */
 	ti->tcpi_rcv_wscale = GET_TCB_FIELD(tcb, SND_SCALE);	/* Yes, SND. */
 	ti->tcpi_rcv_space = GET_TCB_FIELD(tcb, RCV_WND);
 
 	/* Send window */
 	ti->tcpi_snd_wscale = GET_TCB_FIELD(tcb, RCV_SCALE);	/* Yes, RCV. */
 	ti->tcpi_snd_wnd = GET_TCB_FIELD(tcb, RCV_ADV);
 	if (get_tcb_tflags(tcb) & V_TF_RECV_SCALE(1))
 		ti->tcpi_snd_wnd <<= ti->tcpi_snd_wscale;
 	else
 		ti->tcpi_snd_wscale = 0;
 
 }
 
 static void
 fill_tcp_info_from_history(struct adapter *sc, struct tcb_histent *te,
     struct tcp_info *ti)
 {
 
 	fill_tcp_info_from_tcb(sc, te->te_tcb, ti);
 }
 
 /*
  * Reads the TCB for the given tid using a memory window and copies it to 'buf'
  * in the same format as CPL_GET_TCB_RPL.
  */
 static void
 read_tcb_using_memwin(struct adapter *sc, u_int tid, uint64_t *buf)
 {
 	int i, j, k, rc;
 	uint32_t addr;
 	u_char *tcb, tmp;
 
 	MPASS(tid < sc->tids.ntids);
 
 	addr = t4_read_reg(sc, A_TP_CMM_TCB_BASE) + tid * TCB_SIZE;
 	rc = read_via_memwin(sc, 2, addr, (uint32_t *)buf, TCB_SIZE);
 	if (rc != 0)
 		return;
 
 	tcb = (u_char *)buf;
 	for (i = 0, j = TCB_SIZE - 16; i < j; i += 16, j -= 16) {
 		for (k = 0; k < 16; k++) {
 			tmp = tcb[i + k];
 			tcb[i + k] = tcb[j + k];
 			tcb[j + k] = tmp;
 		}
 	}
 }
 
 static void
 fill_tcp_info(struct adapter *sc, u_int tid, struct tcp_info *ti)
 {
 	uint64_t tcb[TCB_SIZE / sizeof(uint64_t)];
 	struct tcb_histent *te;
 
 	ti->tcpi_toe_tid = tid;
 	te = lookup_tcb_histent(sc, tid, false);
 	if (te != NULL) {
 		fill_tcp_info_from_history(sc, te, ti);
 		release_tcb_histent(te);
 	} else {
 		if (!(sc->debug_flags & DF_DISABLE_TCB_CACHE)) {
 			/* XXX: tell firmware to flush TCB cache. */
 		}
 		read_tcb_using_memwin(sc, tid, tcb);
 		fill_tcp_info_from_tcb(sc, tcb, ti);
 	}
 }
 
 /*
  * Called by the kernel to allow the TOE driver to "refine" values filled up in
  * the tcp_info for an offloaded connection.
  */
 static void
 t4_tcp_info(struct toedev *tod, struct tcpcb *tp, struct tcp_info *ti)
 {
 	struct adapter *sc = tod->tod_softc;
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	MPASS(ti != NULL);
 
 	fill_tcp_info(sc, toep->tid, ti);
 }
 
 #ifdef KERN_TLS
 static int
 t4_alloc_tls_session(struct toedev *tod, struct tcpcb *tp,
     struct ktls_session *tls, int direction)
 {
 	struct toepcb *toep = tp->t_toe;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	MPASS(tls != NULL);
 
 	return (tls_alloc_ktls(toep, tls, direction));
 }
 #endif
 
 /* SET_TCB_FIELD sent as a ULP command looks like this */
 #define LEN__SET_TCB_FIELD_ULP (sizeof(struct ulp_txpkt) + \
     sizeof(struct ulptx_idata) + sizeof(struct cpl_set_tcb_field_core))
 
 static void *
 mk_set_tcb_field_ulp(struct ulp_txpkt *ulpmc, uint64_t word, uint64_t mask,
 		uint64_t val, uint32_t tid)
 {
 	struct ulptx_idata *ulpsc;
 	struct cpl_set_tcb_field_core *req;
 
 	ulpmc->cmd_dest = htonl(V_ULPTX_CMD(ULP_TX_PKT) | V_ULP_TXPKT_DEST(0));
 	ulpmc->len = htobe32(howmany(LEN__SET_TCB_FIELD_ULP, 16));
 
 	ulpsc = (struct ulptx_idata *)(ulpmc + 1);
 	ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_IMM));
 	ulpsc->len = htobe32(sizeof(*req));
 
 	req = (struct cpl_set_tcb_field_core *)(ulpsc + 1);
 	OPCODE_TID(req) = htobe32(MK_OPCODE_TID(CPL_SET_TCB_FIELD, tid));
 	req->reply_ctrl = htobe16(V_NO_REPLY(1));
 	req->word_cookie = htobe16(V_WORD(word) | V_COOKIE(0));
 	req->mask = htobe64(mask);
 	req->val = htobe64(val);
 
 	ulpsc = (struct ulptx_idata *)(req + 1);
 	if (LEN__SET_TCB_FIELD_ULP % 16) {
 		ulpsc->cmd_more = htobe32(V_ULPTX_CMD(ULP_TX_SC_NOOP));
 		ulpsc->len = htobe32(0);
 		return (ulpsc + 1);
 	}
 	return (ulpsc);
 }
 
 static void
 send_mss_flowc_wr(struct adapter *sc, struct toepcb *toep)
 {
 	struct wrq_cookie cookie;
 	struct fw_flowc_wr *flowc;
 	struct ofld_tx_sdesc *txsd;
 	const int flowclen = sizeof(*flowc) + sizeof(struct fw_flowc_mnemval);
 	const int flowclen16 = howmany(flowclen, 16);
 
 	if (toep->tx_credits < flowclen16 || toep->txsd_avail == 0) {
 		CH_ERR(sc, "%s: tid %u out of tx credits (%d, %d).\n", __func__,
 		    toep->tid, toep->tx_credits, toep->txsd_avail);
 		return;
 	}
 
 	flowc = start_wrq_wr(&toep->ofld_txq->wrq, flowclen16, &cookie);
 	if (__predict_false(flowc == NULL)) {
 		CH_ERR(sc, "ENOMEM in %s for tid %u.\n", __func__, toep->tid);
 		return;
 	}
 	flowc->op_to_nparams = htobe32(V_FW_WR_OP(FW_FLOWC_WR) |
 	    V_FW_FLOWC_WR_NPARAMS(1));
 	flowc->flowid_len16 = htonl(V_FW_WR_LEN16(flowclen16) |
 	    V_FW_WR_FLOWID(toep->tid));
 	flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_MSS;
 	flowc->mnemval[0].val = htobe32(toep->params.emss);
 
 	txsd = &toep->txsd[toep->txsd_pidx];
 	txsd->tx_credits = flowclen16;
 	txsd->plen = 0;
 	toep->tx_credits -= txsd->tx_credits;
 	if (__predict_false(++toep->txsd_pidx == toep->txsd_total))
 		toep->txsd_pidx = 0;
 	toep->txsd_avail--;
 	commit_wrq_wr(&toep->ofld_txq->wrq, flowc, &cookie);
 }
 
 static void
 t4_pmtu_update(struct toedev *tod, struct tcpcb *tp, tcp_seq seq, int mtu)
 {
 	struct work_request_hdr *wrh;
 	struct ulp_txpkt *ulpmc;
 	int idx, len;
 	struct wrq_cookie cookie;
 	struct inpcb *inp = tp->t_inpcb;
 	struct toepcb *toep = tp->t_toe;
 	struct adapter *sc = td_adapter(toep->td);
 	unsigned short *mtus = &sc->params.mtus[0];
 
 	INP_WLOCK_ASSERT(inp);
 	MPASS(mtu > 0);	/* kernel is supposed to provide something usable. */
 
 	/* tp->snd_una and snd_max are in host byte order too. */
 	seq = be32toh(seq);
 
 	CTR6(KTR_CXGBE, "%s: tid %d, seq 0x%08x, mtu %u, mtu_idx %u (%d)",
 	    __func__, toep->tid, seq, mtu, toep->params.mtu_idx,
 	    mtus[toep->params.mtu_idx]);
 
 	if (ulp_mode(toep) == ULP_MODE_NONE &&	/* XXX: Read TCB otherwise? */
 	    (SEQ_LT(seq, tp->snd_una) || SEQ_GEQ(seq, tp->snd_max))) {
 		CTR5(KTR_CXGBE,
 		    "%s: tid %d, seq 0x%08x not in range [0x%08x, 0x%08x).",
 		    __func__, toep->tid, seq, tp->snd_una, tp->snd_max);
 		return;
 	}
 
 	/* Find the best mtu_idx for the suggested MTU. */
 	for (idx = 0; idx < NMTUS - 1 && mtus[idx + 1] <= mtu; idx++)
 		continue;
 	if (idx >= toep->params.mtu_idx)
 		return;	/* Never increase the PMTU (just like the kernel). */
 
 	/*
 	 * We'll send a compound work request with 2 SET_TCB_FIELDs -- the first
 	 * one updates the mtu_idx and the second one triggers a retransmit.
 	 */
 	len = sizeof(*wrh) + 2 * roundup2(LEN__SET_TCB_FIELD_ULP, 16);
 	wrh = start_wrq_wr(toep->ctrlq, howmany(len, 16), &cookie);
 	if (wrh == NULL) {
 		CH_ERR(sc, "failed to change mtu_idx of tid %d (%u -> %u).\n",
 		    toep->tid, toep->params.mtu_idx, idx);
 		return;
 	}
 	INIT_ULPTX_WRH(wrh, len, 1, 0);	/* atomic */
 	ulpmc = (struct ulp_txpkt *)(wrh + 1);
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_T_MAXSEG,
 	    V_TCB_T_MAXSEG(M_TCB_T_MAXSEG), V_TCB_T_MAXSEG(idx), toep->tid);
 	ulpmc = mk_set_tcb_field_ulp(ulpmc, W_TCB_TIMESTAMP,
 	    V_TCB_TIMESTAMP(0x7FFFFULL << 11), 0, toep->tid);
 	commit_wrq_wr(toep->ctrlq, wrh, &cookie);
 
 	/* Update the software toepcb and tcpcb. */
 	toep->params.mtu_idx = idx;
 	tp->t_maxseg = mtus[toep->params.mtu_idx];
 	if (inp->inp_inc.inc_flags & INC_ISIPV6)
 		tp->t_maxseg -= sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		tp->t_maxseg -= sizeof(struct ip) + sizeof(struct tcphdr);
 	toep->params.emss = tp->t_maxseg;
 	if (tp->t_flags & TF_RCVD_TSTMP)
 		toep->params.emss -= TCPOLEN_TSTAMP_APPA;
 
 	/* Update the firmware flowc. */
 	send_mss_flowc_wr(sc, toep);
 
 	/* Update the MTU in the kernel's hostcache. */
 	if (sc->tt.update_hc_on_pmtu_change != 0) {
 		struct in_conninfo inc = {0};
 
 		inc.inc_fibnum = inp->inp_inc.inc_fibnum;
 		if (inp->inp_inc.inc_flags & INC_ISIPV6) {
 			inc.inc_flags |= INC_ISIPV6;
 			inc.inc6_faddr = inp->inp_inc.inc6_faddr;
 		} else {
 			inc.inc_faddr = inp->inp_inc.inc_faddr;
 		}
 		tcp_hc_updatemtu(&inc, mtu);
 	}
 
 	CTR6(KTR_CXGBE, "%s: tid %d, mtu_idx %u (%u), t_maxseg %u, emss %u",
 	    __func__, toep->tid, toep->params.mtu_idx,
 	    mtus[toep->params.mtu_idx], tp->t_maxseg, toep->params.emss);
 }
 
 /*
  * The TOE driver will not receive any more CPLs for the tid associated with the
  * toepcb; release the hold on the inpcb.
  */
 void
 final_cpl_received(struct toepcb *toep)
 {
 	struct inpcb *inp = toep->inp;
 	bool need_wakeup;
 
 	KASSERT(inp != NULL, ("%s: inp is NULL", __func__));
 	INP_WLOCK_ASSERT(inp);
 	KASSERT(toep->flags & TPF_CPL_PENDING,
 	    ("%s: CPL not pending already?", __func__));
 
 	CTR6(KTR_CXGBE, "%s: tid %d, toep %p (0x%x), inp %p (0x%x)",
 	    __func__, toep->tid, toep, toep->flags, inp, inp->inp_flags);
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP)
 		release_ddp_resources(toep);
 	else if (ulp_mode(toep) == ULP_MODE_TLS)
 		tls_detach(toep);
 	toep->inp = NULL;
 	need_wakeup = (toep->flags & TPF_WAITING_FOR_FINAL) != 0;
 	toep->flags &= ~(TPF_CPL_PENDING | TPF_WAITING_FOR_FINAL);
 	mbufq_drain(&toep->ulp_pduq);
 	mbufq_drain(&toep->ulp_pdu_reclaimq);
 
 	if (!(toep->flags & TPF_ATTACHED))
 		release_offload_resources(toep);
 
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 
 	if (need_wakeup) {
 		struct mtx *lock = mtx_pool_find(mtxpool_sleep, toep);
 
 		mtx_lock(lock);
 		wakeup(toep);
 		mtx_unlock(lock);
 	}
 }
 
 void
 insert_tid(struct adapter *sc, int tid, void *ctx, int ntids)
 {
 	struct tid_info *t = &sc->tids;
 
 	MPASS(tid >= t->tid_base);
 	MPASS(tid - t->tid_base < t->ntids);
 
 	t->tid_tab[tid - t->tid_base] = ctx;
 	atomic_add_int(&t->tids_in_use, ntids);
 }
 
 void *
 lookup_tid(struct adapter *sc, int tid)
 {
 	struct tid_info *t = &sc->tids;
 
 	return (t->tid_tab[tid - t->tid_base]);
 }
 
 void
 update_tid(struct adapter *sc, int tid, void *ctx)
 {
 	struct tid_info *t = &sc->tids;
 
 	t->tid_tab[tid - t->tid_base] = ctx;
 }
 
 void
 remove_tid(struct adapter *sc, int tid, int ntids)
 {
 	struct tid_info *t = &sc->tids;
 
 	t->tid_tab[tid - t->tid_base] = NULL;
 	atomic_subtract_int(&t->tids_in_use, ntids);
 }
 
 /*
  * What mtu_idx to use, given a 4-tuple.  Note that both s->mss and tcp_mssopt
  * have the MSS that we should advertise in our SYN.  Advertised MSS doesn't
  * account for any TCP options so the effective MSS (only payload, no headers or
  * options) could be different.
  */
 static int
 find_best_mtu_idx(struct adapter *sc, struct in_conninfo *inc,
     struct offload_settings *s)
 {
 	unsigned short *mtus = &sc->params.mtus[0];
 	int i, mss, mtu;
 
 	MPASS(inc != NULL);
 
 	mss = s->mss > 0 ? s->mss : tcp_mssopt(inc);
 	if (inc->inc_flags & INC_ISIPV6)
 		mtu = mss + sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
 	else
 		mtu = mss + sizeof(struct ip) + sizeof(struct tcphdr);
 
 	for (i = 0; i < NMTUS - 1 && mtus[i + 1] <= mtu; i++)
 		continue;
 
 	return (i);
 }
 
 /*
  * Determine the receive window size for a socket.
  */
 u_long
 select_rcv_wnd(struct socket *so)
 {
 	unsigned long wnd;
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	wnd = sbspace(&so->so_rcv);
 	if (wnd < MIN_RCV_WND)
 		wnd = MIN_RCV_WND;
 
 	return min(wnd, MAX_RCV_WND);
 }
 
 int
 select_rcv_wscale(void)
 {
 	int wscale = 0;
 	unsigned long space = sb_max;
 
 	if (space > MAX_RCV_WND)
 		space = MAX_RCV_WND;
 
 	while (wscale < TCP_MAX_WINSHIFT && (TCP_MAXWIN << wscale) < space)
 		wscale++;
 
 	return (wscale);
 }
 
 __be64
 calc_options0(struct vi_info *vi, struct conn_params *cp)
 {
 	uint64_t opt0 = 0;
 
 	opt0 |= F_TCAM_BYPASS;
 
 	MPASS(cp->wscale >= 0 && cp->wscale <= M_WND_SCALE);
 	opt0 |= V_WND_SCALE(cp->wscale);
 
 	MPASS(cp->mtu_idx >= 0 && cp->mtu_idx < NMTUS);
 	opt0 |= V_MSS_IDX(cp->mtu_idx);
 
 	MPASS(cp->ulp_mode >= 0 && cp->ulp_mode <= M_ULP_MODE);
 	opt0 |= V_ULP_MODE(cp->ulp_mode);
 
 	MPASS(cp->opt0_bufsize >= 0 && cp->opt0_bufsize <= M_RCV_BUFSIZ);
 	opt0 |= V_RCV_BUFSIZ(cp->opt0_bufsize);
 
 	MPASS(cp->l2t_idx >= 0 && cp->l2t_idx < vi->adapter->vres.l2t.size);
 	opt0 |= V_L2T_IDX(cp->l2t_idx);
 
 	opt0 |= V_SMAC_SEL(vi->smt_idx);
 	opt0 |= V_TX_CHAN(vi->pi->tx_chan);
 
 	MPASS(cp->keepalive == 0 || cp->keepalive == 1);
 	opt0 |= V_KEEP_ALIVE(cp->keepalive);
 
 	MPASS(cp->nagle == 0 || cp->nagle == 1);
 	opt0 |= V_NAGLE(cp->nagle);
 
 	return (htobe64(opt0));
 }
 
 __be32
 calc_options2(struct vi_info *vi, struct conn_params *cp)
 {
 	uint32_t opt2 = 0;
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 
 	/*
 	 * rx flow control, rx coalesce, congestion control, and tx pace are all
 	 * explicitly set by the driver.  On T5+ the ISS is also set by the
 	 * driver to the value picked by the kernel.
 	 */
 	if (is_t4(sc)) {
 		opt2 |= F_RX_FC_VALID | F_RX_COALESCE_VALID;
 		opt2 |= F_CONG_CNTRL_VALID | F_PACE_VALID;
 	} else {
 		opt2 |= F_T5_OPT_2_VALID;	/* all 4 valid */
 		opt2 |= F_T5_ISS;		/* ISS provided in CPL */
 	}
 
 	MPASS(cp->sack == 0 || cp->sack == 1);
 	opt2 |= V_SACK_EN(cp->sack);
 
 	MPASS(cp->tstamp == 0 || cp->tstamp == 1);
 	opt2 |= V_TSTAMPS_EN(cp->tstamp);
 
 	if (cp->wscale > 0)
 		opt2 |= F_WND_SCALE_EN;
 
 	MPASS(cp->ecn == 0 || cp->ecn == 1);
 	opt2 |= V_CCTRL_ECN(cp->ecn);
 
 	/* XXX: F_RX_CHANNEL for multiple rx c-chan support goes here. */
 
 	opt2 |= V_TX_QUEUE(sc->params.tp.tx_modq[pi->tx_chan]);
 	opt2 |= V_PACE(0);
 	opt2 |= F_RSS_QUEUE_VALID;
 	opt2 |= V_RSS_QUEUE(sc->sge.ofld_rxq[cp->rxq_idx].iq.abs_id);
 
 	MPASS(cp->cong_algo >= 0 && cp->cong_algo <= M_CONG_CNTRL);
 	opt2 |= V_CONG_CNTRL(cp->cong_algo);
 
 	MPASS(cp->rx_coalesce == 0 || cp->rx_coalesce == 1);
 	if (cp->rx_coalesce == 1)
 		opt2 |= V_RX_COALESCE(M_RX_COALESCE);
 
 	opt2 |= V_RX_FC_DDP(0) | V_RX_FC_DISABLE(0);
 #ifdef USE_DDP_RX_FLOW_CONTROL
 	if (cp->ulp_mode == ULP_MODE_TCPDDP)
 		opt2 |= F_RX_FC_DDP;
 #endif
 
 	return (htobe32(opt2));
 }
 
 uint64_t
 select_ntuple(struct vi_info *vi, struct l2t_entry *e)
 {
 	struct adapter *sc = vi->adapter;
 	struct tp_params *tp = &sc->params.tp;
 	uint64_t ntuple = 0;
 
 	/*
 	 * Initialize each of the fields which we care about which are present
 	 * in the Compressed Filter Tuple.
 	 */
 	if (tp->vlan_shift >= 0 && EVL_VLANOFTAG(e->vlan) != CPL_L2T_VLAN_NONE)
 		ntuple |= (uint64_t)(F_FT_VLAN_VLD | e->vlan) << tp->vlan_shift;
 
 	if (tp->port_shift >= 0)
 		ntuple |= (uint64_t)e->lport << tp->port_shift;
 
 	if (tp->protocol_shift >= 0)
 		ntuple |= (uint64_t)IPPROTO_TCP << tp->protocol_shift;
 
 	if (tp->vnic_shift >= 0 && tp->vnic_mode == FW_VNIC_MODE_PF_VF) {
 		ntuple |= (uint64_t)(V_FT_VNID_ID_VF(vi->vin) |
 		    V_FT_VNID_ID_PF(sc->pf) | V_FT_VNID_ID_VLD(vi->vfvld)) <<
 		    tp->vnic_shift;
 	}
 
 	if (is_t4(sc))
 		return (htobe32((uint32_t)ntuple));
 	else
 		return (htobe64(V_FILTER_TUPLE(ntuple)));
 }
 
 static int
 is_tls_sock(struct socket *so, struct adapter *sc)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int i, rc;
 
 	/* XXX: Eventually add a SO_WANT_TLS socket option perhaps? */
 	rc = 0;
 	ADAPTER_LOCK(sc);
 	for (i = 0; i < sc->tt.num_tls_rx_ports; i++) {
 		if (inp->inp_lport == htons(sc->tt.tls_rx_ports[i]) ||
 		    inp->inp_fport == htons(sc->tt.tls_rx_ports[i])) {
 			rc = 1;
 			break;
 		}
 	}
 	ADAPTER_UNLOCK(sc);
 	return (rc);
 }
 
 /*
  * Initialize various connection parameters.
  */
 void
 init_conn_params(struct vi_info *vi , struct offload_settings *s,
     struct in_conninfo *inc, struct socket *so,
     const struct tcp_options *tcpopt, int16_t l2t_idx, struct conn_params *cp)
 {
 	struct port_info *pi = vi->pi;
 	struct adapter *sc = pi->adapter;
 	struct tom_tunables *tt = &sc->tt;
 	struct inpcb *inp = sotoinpcb(so);
 	struct tcpcb *tp = intotcpcb(inp);
 	u_long wnd;
 	u_int q_idx;
 
 	MPASS(s->offload != 0);
 
 	/* Congestion control algorithm */
 	if (s->cong_algo >= 0)
 		cp->cong_algo = s->cong_algo & M_CONG_CNTRL;
 	else if (sc->tt.cong_algorithm >= 0)
 		cp->cong_algo = tt->cong_algorithm & M_CONG_CNTRL;
 	else {
 		struct cc_algo *cc = CC_ALGO(tp);
 
 		if (strcasecmp(cc->name, "reno") == 0)
 			cp->cong_algo = CONG_ALG_RENO;
 		else if (strcasecmp(cc->name, "tahoe") == 0)
 			cp->cong_algo = CONG_ALG_TAHOE;
 		if (strcasecmp(cc->name, "newreno") == 0)
 			cp->cong_algo = CONG_ALG_NEWRENO;
 		if (strcasecmp(cc->name, "highspeed") == 0)
 			cp->cong_algo = CONG_ALG_HIGHSPEED;
 		else {
 			/*
 			 * Use newreno in case the algorithm selected by the
 			 * host stack is not supported by the hardware.
 			 */
 			cp->cong_algo = CONG_ALG_NEWRENO;
 		}
 	}
 
 	/* Tx traffic scheduling class. */
 	if (s->sched_class >= 0 && s->sched_class < sc->params.nsched_cls)
 		cp->tc_idx = s->sched_class;
 	else
 		cp->tc_idx = -1;
 
 	/* Nagle's algorithm. */
 	if (s->nagle >= 0)
 		cp->nagle = s->nagle > 0 ? 1 : 0;
 	else
 		cp->nagle = tp->t_flags & TF_NODELAY ? 0 : 1;
 
 	/* TCP Keepalive. */
 	if (V_tcp_always_keepalive || so_options_get(so) & SO_KEEPALIVE)
 		cp->keepalive = 1;
 	else
 		cp->keepalive = 0;
 
 	/* Optimization that's specific to T5 @ 40G. */
 	if (tt->tx_align >= 0)
 		cp->tx_align =  tt->tx_align > 0 ? 1 : 0;
 	else if (chip_id(sc) == CHELSIO_T5 &&
 	    (port_top_speed(pi) > 10 || sc->params.nports > 2))
 		cp->tx_align = 1;
 	else
 		cp->tx_align = 0;
 
 	/* ULP mode. */
 	if (can_tls_offload(sc) &&
 	    (s->tls > 0 || (s->tls < 0 && is_tls_sock(so, sc))))
 		cp->ulp_mode = ULP_MODE_TLS;
 	else if (s->ddp > 0 ||
 	    (s->ddp < 0 && sc->tt.ddp && (so_options_get(so) & SO_NO_DDP) == 0))
 		cp->ulp_mode = ULP_MODE_TCPDDP;
 	else
 		cp->ulp_mode = ULP_MODE_NONE;
 
 	/* Rx coalescing. */
 	if (s->rx_coalesce >= 0)
 		cp->rx_coalesce = s->rx_coalesce > 0 ? 1 : 0;
 	else if (cp->ulp_mode == ULP_MODE_TLS)
 		cp->rx_coalesce = 0;
 	else if (tt->rx_coalesce >= 0)
 		cp->rx_coalesce = tt->rx_coalesce > 0 ? 1 : 0;
 	else
 		cp->rx_coalesce = 1;	/* default */
 
 	/*
 	 * Index in the PMTU table.  This controls the MSS that we announce in
 	 * our SYN initially, but after ESTABLISHED it controls the MSS that we
 	 * use to send data.
 	 */
 	cp->mtu_idx = find_best_mtu_idx(sc, inc, s);
 
 	/* Tx queue for this connection. */
 	if (s->txq == QUEUE_RANDOM)
 		q_idx = arc4random();
 	else if (s->txq == QUEUE_ROUNDROBIN)
 		q_idx = atomic_fetchadd_int(&vi->txq_rr, 1);
 	else
 		q_idx = s->txq;
 	cp->txq_idx = vi->first_ofld_txq + q_idx % vi->nofldtxq;
 
 	/* Rx queue for this connection. */
 	if (s->rxq == QUEUE_RANDOM)
 		q_idx = arc4random();
 	else if (s->rxq == QUEUE_ROUNDROBIN)
 		q_idx = atomic_fetchadd_int(&vi->rxq_rr, 1);
 	else
 		q_idx = s->rxq;
 	cp->rxq_idx = vi->first_ofld_rxq + q_idx % vi->nofldrxq;
 
 	if (SOLISTENING(so)) {
 		/* Passive open */
 		MPASS(tcpopt != NULL);
 
 		/* TCP timestamp option */
 		if (tcpopt->tstamp &&
 		    (s->tstamp > 0 || (s->tstamp < 0 && V_tcp_do_rfc1323)))
 			cp->tstamp = 1;
 		else
 			cp->tstamp = 0;
 
 		/* SACK */
 		if (tcpopt->sack &&
 		    (s->sack > 0 || (s->sack < 0 && V_tcp_do_sack)))
 			cp->sack = 1;
 		else
 			cp->sack = 0;
 
 		/* Receive window scaling. */
 		if (tcpopt->wsf > 0 && tcpopt->wsf < 15 && V_tcp_do_rfc1323)
 			cp->wscale = select_rcv_wscale();
 		else
 			cp->wscale = 0;
 
 		/* ECN */
 		if (tcpopt->ecn &&	/* XXX: review. */
 		    (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn)))
 			cp->ecn = 1;
 		else
 			cp->ecn = 0;
 
 		wnd = max(so->sol_sbrcv_hiwat, MIN_RCV_WND);
 		cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
 
 		if (tt->sndbuf > 0)
 			cp->sndbuf = tt->sndbuf;
 		else if (so->sol_sbsnd_flags & SB_AUTOSIZE &&
 		    V_tcp_do_autosndbuf)
 			cp->sndbuf = 256 * 1024;
 		else
 			cp->sndbuf = so->sol_sbsnd_hiwat;
 	} else {
 		/* Active open */
 
 		/* TCP timestamp option */
 		if (s->tstamp > 0 ||
 		    (s->tstamp < 0 && (tp->t_flags & TF_REQ_TSTMP)))
 			cp->tstamp = 1;
 		else
 			cp->tstamp = 0;
 
 		/* SACK */
 		if (s->sack > 0 ||
 		    (s->sack < 0 && (tp->t_flags & TF_SACK_PERMIT)))
 			cp->sack = 1;
 		else
 			cp->sack = 0;
 
 		/* Receive window scaling */
 		if (tp->t_flags & TF_REQ_SCALE)
 			cp->wscale = select_rcv_wscale();
 		else
 			cp->wscale = 0;
 
 		/* ECN */
 		if (s->ecn > 0 || (s->ecn < 0 && V_tcp_do_ecn == 1))
 			cp->ecn = 1;
 		else
 			cp->ecn = 0;
 
 		SOCKBUF_LOCK(&so->so_rcv);
 		wnd = max(select_rcv_wnd(so), MIN_RCV_WND);
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		cp->opt0_bufsize = min(wnd >> 10, M_RCV_BUFSIZ);
 
 		if (tt->sndbuf > 0)
 			cp->sndbuf = tt->sndbuf;
 		else {
 			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_flags & SB_AUTOSIZE &&
 			    V_tcp_do_autosndbuf)
 				cp->sndbuf = 256 * 1024;
 			else
 				cp->sndbuf = so->so_snd.sb_hiwat;
 			SOCKBUF_UNLOCK(&so->so_snd);
 		}
 	}
 
 	cp->l2t_idx = l2t_idx;
 
 	/* This will be initialized on ESTABLISHED. */
 	cp->emss = 0;
 }
 
 int
 negative_advice(int status)
 {
 
 	return (status == CPL_ERR_RTX_NEG_ADVICE ||
 	    status == CPL_ERR_PERSIST_NEG_ADVICE ||
 	    status == CPL_ERR_KEEPALV_NEG_ADVICE);
 }
 
 static int
 alloc_tid_tab(struct tid_info *t, int flags)
 {
 
 	MPASS(t->ntids > 0);
 	MPASS(t->tid_tab == NULL);
 
 	t->tid_tab = malloc(t->ntids * sizeof(*t->tid_tab), M_CXGBE,
 	    M_ZERO | flags);
 	if (t->tid_tab == NULL)
 		return (ENOMEM);
 	atomic_store_rel_int(&t->tids_in_use, 0);
 
 	return (0);
 }
 
 static void
 free_tid_tab(struct tid_info *t)
 {
 
 	KASSERT(t->tids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->tids_in_use));
 
 	free(t->tid_tab, M_CXGBE);
 	t->tid_tab = NULL;
 }
 
 static int
 alloc_stid_tab(struct tid_info *t, int flags)
 {
 
 	MPASS(t->nstids > 0);
 	MPASS(t->stid_tab == NULL);
 
 	t->stid_tab = malloc(t->nstids * sizeof(*t->stid_tab), M_CXGBE,
 	    M_ZERO | flags);
 	if (t->stid_tab == NULL)
 		return (ENOMEM);
 	mtx_init(&t->stid_lock, "stid lock", NULL, MTX_DEF);
 	t->stids_in_use = 0;
 	TAILQ_INIT(&t->stids);
 	t->nstids_free_head = t->nstids;
 
 	return (0);
 }
 
 static void
 free_stid_tab(struct tid_info *t)
 {
 
 	KASSERT(t->stids_in_use == 0,
 	    ("%s: %d tids still in use.", __func__, t->stids_in_use));
 
 	if (mtx_initialized(&t->stid_lock))
 		mtx_destroy(&t->stid_lock);
 	free(t->stid_tab, M_CXGBE);
 	t->stid_tab = NULL;
 }
 
 static void
 free_tid_tabs(struct tid_info *t)
 {
 
 	free_tid_tab(t);
 	free_stid_tab(t);
 }
 
 static int
 alloc_tid_tabs(struct tid_info *t)
 {
 	int rc;
 
 	rc = alloc_tid_tab(t, M_NOWAIT);
 	if (rc != 0)
 		goto failed;
 
 	rc = alloc_stid_tab(t, M_NOWAIT);
 	if (rc != 0)
 		goto failed;
 
 	return (0);
 failed:
 	free_tid_tabs(t);
 	return (rc);
 }
 
 static inline void
 alloc_tcb_history(struct adapter *sc, struct tom_data *td)
 {
 
 	if (sc->tids.ntids == 0 || sc->tids.ntids > 1024)
 		return;
 	rw_init(&td->tcb_history_lock, "TCB history");
 	td->tcb_history = malloc(sc->tids.ntids * sizeof(*td->tcb_history),
 	    M_CXGBE, M_ZERO | M_NOWAIT);
 	td->dupack_threshold = G_DUPACKTHRESH(t4_read_reg(sc, A_TP_PARA_REG0));
 }
 
 static inline void
 free_tcb_history(struct adapter *sc, struct tom_data *td)
 {
 #ifdef INVARIANTS
 	int i;
 
 	if (td->tcb_history != NULL) {
 		for (i = 0; i < sc->tids.ntids; i++) {
 			MPASS(td->tcb_history[i] == NULL);
 		}
 	}
 #endif
 	free(td->tcb_history, M_CXGBE);
 	if (rw_initialized(&td->tcb_history_lock))
 		rw_destroy(&td->tcb_history_lock);
 }
 
 static void
 free_tom_data(struct adapter *sc, struct tom_data *td)
 {
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	KASSERT(TAILQ_EMPTY(&td->toep_list),
 	    ("%s: TOE PCB list is not empty.", __func__));
 	KASSERT(td->lctx_count == 0,
 	    ("%s: lctx hash table is not empty.", __func__));
 
 	t4_free_ppod_region(&td->pr);
 
 	if (td->listen_mask != 0)
 		hashdestroy(td->listen_hash, M_CXGBE, td->listen_mask);
 
 	if (mtx_initialized(&td->unsent_wr_lock))
 		mtx_destroy(&td->unsent_wr_lock);
 	if (mtx_initialized(&td->lctx_hash_lock))
 		mtx_destroy(&td->lctx_hash_lock);
 	if (mtx_initialized(&td->toep_list_lock))
 		mtx_destroy(&td->toep_list_lock);
 
 	free_tcb_history(sc, td);
 	free_tid_tabs(&sc->tids);
 	free(td, M_CXGBE);
 }
 
 static char *
 prepare_pkt(int open_type, uint16_t vtag, struct inpcb *inp, int *pktlen,
     int *buflen)
 {
 	char *pkt;
 	struct tcphdr *th;
 	int ipv6, len;
 	const int maxlen =
 	    max(sizeof(struct ether_header), sizeof(struct ether_vlan_header)) +
 	    max(sizeof(struct ip), sizeof(struct ip6_hdr)) +
 	    sizeof(struct tcphdr);
 
 	MPASS(open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN);
 
 	pkt = malloc(maxlen, M_CXGBE, M_ZERO | M_NOWAIT);
 	if (pkt == NULL)
 		return (NULL);
 
 	ipv6 = inp->inp_vflag & INP_IPV6;
 	len = 0;
 
 	if (EVL_VLANOFTAG(vtag) == 0xfff) {
 		struct ether_header *eh = (void *)pkt;
 
 		if (ipv6)
 			eh->ether_type = htons(ETHERTYPE_IPV6);
 		else
 			eh->ether_type = htons(ETHERTYPE_IP);
 
 		len += sizeof(*eh);
 	} else {
 		struct ether_vlan_header *evh = (void *)pkt;
 
 		evh->evl_encap_proto = htons(ETHERTYPE_VLAN);
 		evh->evl_tag = htons(vtag);
 		if (ipv6)
 			evh->evl_proto = htons(ETHERTYPE_IPV6);
 		else
 			evh->evl_proto = htons(ETHERTYPE_IP);
 
 		len += sizeof(*evh);
 	}
 
 	if (ipv6) {
 		struct ip6_hdr *ip6 = (void *)&pkt[len];
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_nxt = IPPROTO_TCP;
 		if (open_type == OPEN_TYPE_ACTIVE) {
 			ip6->ip6_src = inp->in6p_laddr;
 			ip6->ip6_dst = inp->in6p_faddr;
 		} else if (open_type == OPEN_TYPE_LISTEN) {
 			ip6->ip6_src = inp->in6p_laddr;
 			ip6->ip6_dst = ip6->ip6_src;
 		}
 
 		len += sizeof(*ip6);
 	} else {
 		struct ip *ip = (void *)&pkt[len];
 
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = sizeof(*ip) >> 2;
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_p = IPPROTO_TCP;
 		if (open_type == OPEN_TYPE_ACTIVE) {
 			ip->ip_src = inp->inp_laddr;
 			ip->ip_dst = inp->inp_faddr;
 		} else if (open_type == OPEN_TYPE_LISTEN) {
 			ip->ip_src = inp->inp_laddr;
 			ip->ip_dst = ip->ip_src;
 		}
 
 		len += sizeof(*ip);
 	}
 
 	th = (void *)&pkt[len];
 	if (open_type == OPEN_TYPE_ACTIVE) {
 		th->th_sport = inp->inp_lport;	/* network byte order already */
 		th->th_dport = inp->inp_fport;	/* ditto */
 	} else if (open_type == OPEN_TYPE_LISTEN) {
 		th->th_sport = inp->inp_lport;	/* network byte order already */
 		th->th_dport = th->th_sport;
 	}
 	len += sizeof(th);
 
 	*pktlen = *buflen = len;
 	return (pkt);
 }
 
 const struct offload_settings *
 lookup_offload_policy(struct adapter *sc, int open_type, struct mbuf *m,
     uint16_t vtag, struct inpcb *inp)
 {
 	const struct t4_offload_policy *op;
 	char *pkt;
 	struct offload_rule *r;
 	int i, matched, pktlen, buflen;
 	static const struct offload_settings allow_offloading_settings = {
 		.offload = 1,
 		.rx_coalesce = -1,
 		.cong_algo = -1,
 		.sched_class = -1,
 		.tstamp = -1,
 		.sack = -1,
 		.nagle = -1,
 		.ecn = -1,
 		.ddp = -1,
 		.tls = -1,
 		.txq = QUEUE_RANDOM,
 		.rxq = QUEUE_RANDOM,
 		.mss = -1,
 	};
 	static const struct offload_settings disallow_offloading_settings = {
 		.offload = 0,
 		/* rest is irrelevant when offload is off. */
 	};
 
 	rw_assert(&sc->policy_lock, RA_LOCKED);
 
 	/*
 	 * If there's no Connection Offloading Policy attached to the device
 	 * then we need to return a default static policy.  If
 	 * "cop_managed_offloading" is true, then we need to disallow
 	 * offloading until a COP is attached to the device.  Otherwise we
 	 * allow offloading ...
 	 */
 	op = sc->policy;
 	if (op == NULL) {
 		if (sc->tt.cop_managed_offloading)
 			return (&disallow_offloading_settings);
 		else
 			return (&allow_offloading_settings);
 	}
 
 	switch (open_type) {
 	case OPEN_TYPE_ACTIVE:
 	case OPEN_TYPE_LISTEN:
 		pkt = prepare_pkt(open_type, vtag, inp, &pktlen, &buflen);
 		break;
 	case OPEN_TYPE_PASSIVE:
 		MPASS(m != NULL);
 		pkt = mtod(m, char *);
 		MPASS(*pkt == CPL_PASS_ACCEPT_REQ);
 		pkt += sizeof(struct cpl_pass_accept_req);
 		pktlen = m->m_pkthdr.len - sizeof(struct cpl_pass_accept_req);
 		buflen = m->m_len - sizeof(struct cpl_pass_accept_req);
 		break;
 	default:
 		MPASS(0);
 		return (&disallow_offloading_settings);
 	}
 
 	if (pkt == NULL || pktlen == 0 || buflen == 0)
 		return (&disallow_offloading_settings);
 
 	matched = 0;
 	r = &op->rule[0];
 	for (i = 0; i < op->nrules; i++, r++) {
 		if (r->open_type != open_type &&
 		    r->open_type != OPEN_TYPE_DONTCARE) {
 			continue;
 		}
 		matched = bpf_filter(r->bpf_prog.bf_insns, pkt, pktlen, buflen);
 		if (matched)
 			break;
 	}
 
 	if (open_type == OPEN_TYPE_ACTIVE || open_type == OPEN_TYPE_LISTEN)
 		free(pkt, M_CXGBE);
 
 	return (matched ? &r->settings : &disallow_offloading_settings);
 }
 
 static void
 reclaim_wr_resources(void *arg, int count)
 {
 	struct tom_data *td = arg;
 	STAILQ_HEAD(, wrqe) twr_list = STAILQ_HEAD_INITIALIZER(twr_list);
 	struct cpl_act_open_req *cpl;
 	u_int opcode, atid, tid;
 	struct wrqe *wr;
 	struct adapter *sc = td_adapter(td);
 
 	mtx_lock(&td->unsent_wr_lock);
 	STAILQ_SWAP(&td->unsent_wr_list, &twr_list, wrqe);
 	mtx_unlock(&td->unsent_wr_lock);
 
 	while ((wr = STAILQ_FIRST(&twr_list)) != NULL) {
 		STAILQ_REMOVE_HEAD(&twr_list, link);
 
 		cpl = wrtod(wr);
 		opcode = GET_OPCODE(cpl);
 
 		switch (opcode) {
 		case CPL_ACT_OPEN_REQ:
 		case CPL_ACT_OPEN_REQ6:
 			atid = G_TID_TID(be32toh(OPCODE_TID(cpl)));
 			CTR2(KTR_CXGBE, "%s: atid %u ", __func__, atid);
 			act_open_failure_cleanup(sc, atid, EHOSTUNREACH);
 			free(wr, M_CXGBE);
 			break;
 		case CPL_PASS_ACCEPT_RPL:
 			tid = GET_TID(cpl);
 			CTR2(KTR_CXGBE, "%s: tid %u ", __func__, tid);
 			synack_failure_cleanup(sc, tid);
 			free(wr, M_CXGBE);
 			break;
 		default:
 			log(LOG_ERR, "%s: leaked work request %p, wr_len %d, "
 			    "opcode %x\n", __func__, wr, wr->wr_len, opcode);
 			/* WR not freed here; go look at it with a debugger.  */
 		}
 	}
 }
 
 /*
  * Ground control to Major TOM
  * Commencing countdown, engines on
  */
 static int
 t4_tom_activate(struct adapter *sc)
 {
 	struct tom_data *td;
 	struct toedev *tod;
 	struct vi_info *vi;
 	int i, rc, v;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	/* per-adapter softc for TOM */
 	td = malloc(sizeof(*td), M_CXGBE, M_ZERO | M_NOWAIT);
 	if (td == NULL)
 		return (ENOMEM);
 
 	/* List of TOE PCBs and associated lock */
 	mtx_init(&td->toep_list_lock, "PCB list lock", NULL, MTX_DEF);
 	TAILQ_INIT(&td->toep_list);
 
 	/* Listen context */
 	mtx_init(&td->lctx_hash_lock, "lctx hash lock", NULL, MTX_DEF);
 	td->listen_hash = hashinit_flags(LISTEN_HASH_SIZE, M_CXGBE,
 	    &td->listen_mask, HASH_NOWAIT);
 
 	/* List of WRs for which L2 resolution failed */
 	mtx_init(&td->unsent_wr_lock, "Unsent WR list lock", NULL, MTX_DEF);
 	STAILQ_INIT(&td->unsent_wr_list);
 	TASK_INIT(&td->reclaim_wr_resources, 0, reclaim_wr_resources, td);
 
 	/* TID tables */
 	rc = alloc_tid_tabs(&sc->tids);
 	if (rc != 0)
 		goto done;
 
 	rc = t4_init_ppod_region(&td->pr, &sc->vres.ddp,
 	    t4_read_reg(sc, A_ULP_RX_TDDP_PSZ), "TDDP page pods");
 	if (rc != 0)
 		goto done;
 	t4_set_reg_field(sc, A_ULP_RX_TDDP_TAGMASK,
 	    V_TDDPTAGMASK(M_TDDPTAGMASK), td->pr.pr_tag_mask);
 
 	alloc_tcb_history(sc, td);
 
 	/* toedev ops */
 	tod = &td->tod;
 	init_toedev(tod);
 	tod->tod_softc = sc;
 	tod->tod_connect = t4_connect;
 	tod->tod_listen_start = t4_listen_start;
 	tod->tod_listen_stop = t4_listen_stop;
 	tod->tod_rcvd = t4_rcvd;
 	tod->tod_output = t4_tod_output;
 	tod->tod_send_rst = t4_send_rst;
 	tod->tod_send_fin = t4_send_fin;
 	tod->tod_pcb_detach = t4_pcb_detach;
 	tod->tod_l2_update = t4_l2_update;
 	tod->tod_syncache_added = t4_syncache_added;
 	tod->tod_syncache_removed = t4_syncache_removed;
 	tod->tod_syncache_respond = t4_syncache_respond;
 	tod->tod_offload_socket = t4_offload_socket;
 	tod->tod_ctloutput = t4_ctloutput;
 	tod->tod_tcp_info = t4_tcp_info;
 #ifdef KERN_TLS
 	tod->tod_alloc_tls_session = t4_alloc_tls_session;
 #endif
 	tod->tod_pmtu_update = t4_pmtu_update;
 
 	for_each_port(sc, i) {
 		for_each_vi(sc->port[i], v, vi) {
 			TOEDEV(vi->ifp) = &td->tod;
 		}
 	}
 
 	sc->tom_softc = td;
 	register_toedev(sc->tom_softc);
 
 done:
 	if (rc != 0)
 		free_tom_data(sc, td);
 	return (rc);
 }
 
 static int
 t4_tom_deactivate(struct adapter *sc)
 {
 	int rc = 0;
 	struct tom_data *td = sc->tom_softc;
 
 	ASSERT_SYNCHRONIZED_OP(sc);
 
 	if (td == NULL)
 		return (0);	/* XXX. KASSERT? */
 
 	if (sc->offload_map != 0)
 		return (EBUSY);	/* at least one port has IFCAP_TOE enabled */
 
 	if (uld_active(sc, ULD_IWARP) || uld_active(sc, ULD_ISCSI))
 		return (EBUSY);	/* both iWARP and iSCSI rely on the TOE. */
 
 	mtx_lock(&td->toep_list_lock);
 	if (!TAILQ_EMPTY(&td->toep_list))
 		rc = EBUSY;
 	mtx_unlock(&td->toep_list_lock);
 
 	mtx_lock(&td->lctx_hash_lock);
 	if (td->lctx_count > 0)
 		rc = EBUSY;
 	mtx_unlock(&td->lctx_hash_lock);
 
 	taskqueue_drain(taskqueue_thread, &td->reclaim_wr_resources);
 	mtx_lock(&td->unsent_wr_lock);
 	if (!STAILQ_EMPTY(&td->unsent_wr_list))
 		rc = EBUSY;
 	mtx_unlock(&td->unsent_wr_lock);
 
 	if (rc == 0) {
 		unregister_toedev(sc->tom_softc);
 		free_tom_data(sc, td);
 		sc->tom_softc = NULL;
 	}
 
 	return (rc);
 }
 
 static int
 t4_aio_queue_tom(struct socket *so, struct kaiocb *job)
 {
 	struct tcpcb *tp = so_sototcpcb(so);
 	struct toepcb *toep = tp->t_toe;
 	int error;
 
 	/*
 	 * No lock is needed as TOE sockets never change between
 	 * active and passive.
 	 */
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	if (ulp_mode(toep) == ULP_MODE_TCPDDP) {
 		error = t4_aio_queue_ddp(so, job);
 		if (error != EOPNOTSUPP)
 			return (error);
 	}
 
 	return (t4_aio_queue_aiotx(so, job));
 }
 
 static int
 t4_tom_mod_load(void)
 {
 	/* CPL handlers */
 	t4_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
 	t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, do_l2t_write_rpl2,
 	    CPL_COOKIE_TOM);
 	t4_init_connect_cpl_handlers();
 	t4_init_listen_cpl_handlers();
 	t4_init_cpl_io_handlers();
 
 	t4_ddp_mod_load();
 	t4_tls_mod_load();
 
-	tcp_protosw = pffindproto(PF_INET, IPPROTO_TCP, SOCK_STREAM);
-	if (tcp_protosw == NULL)
-		return (ENOPROTOOPT);
-	bcopy(tcp_protosw, &toe_protosw, sizeof(toe_protosw));
-	bcopy(tcp_protosw->pr_usrreqs, &toe_usrreqs, sizeof(toe_usrreqs));
-	toe_usrreqs.pru_aio_queue = t4_aio_queue_tom;
-	toe_protosw.pr_usrreqs = &toe_usrreqs;
-
-	tcp6_protosw = pffindproto(PF_INET6, IPPROTO_TCP, SOCK_STREAM);
-	if (tcp6_protosw == NULL)
-		return (ENOPROTOOPT);
-	bcopy(tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
-	bcopy(tcp6_protosw->pr_usrreqs, &toe6_usrreqs, sizeof(toe6_usrreqs));
-	toe6_usrreqs.pru_aio_queue = t4_aio_queue_tom;
-	toe6_protosw.pr_usrreqs = &toe6_usrreqs;
+	bcopy(&tcp_protosw, &toe_protosw, sizeof(toe_protosw));
+	toe_protosw.pr_aio_queue = t4_aio_queue_tom;
+
+	bcopy(&tcp6_protosw, &toe6_protosw, sizeof(toe6_protosw));
+	toe6_protosw.pr_aio_queue = t4_aio_queue_tom;
 
 	return (t4_register_uld(&tom_uld_info));
 }
 
 static void
 tom_uninit(struct adapter *sc, void *arg __unused)
 {
 	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4tomun"))
 		return;
 
 	/* Try to free resources (works only if no port has IFCAP_TOE) */
 	if (uld_active(sc, ULD_TOM))
 		t4_deactivate_uld(sc, ULD_TOM);
 
 	end_synchronized_op(sc, 0);
 }
 
 static int
 t4_tom_mod_unload(void)
 {
 	t4_iterate(tom_uninit, NULL);
 
 	if (t4_unregister_uld(&tom_uld_info) == EBUSY)
 		return (EBUSY);
 
 	t4_tls_mod_unload();
 	t4_ddp_mod_unload();
 
 	t4_uninit_connect_cpl_handlers();
 	t4_uninit_listen_cpl_handlers();
 	t4_uninit_cpl_io_handlers();
 	t4_register_shared_cpl_handler(CPL_L2T_WRITE_RPL, NULL, CPL_COOKIE_TOM);
 	t4_register_cpl_handler(CPL_GET_TCB_RPL, NULL);
 
 	return (0);
 }
 #endif	/* TCP_OFFLOAD */
 
 static int
 t4_tom_modevent(module_t mod, int cmd, void *arg)
 {
 	int rc = 0;
 
 #ifdef TCP_OFFLOAD
 	switch (cmd) {
 	case MOD_LOAD:
 		rc = t4_tom_mod_load();
 		break;
 
 	case MOD_UNLOAD:
 		rc = t4_tom_mod_unload();
 		break;
 
 	default:
 		rc = EINVAL;
 	}
 #else
 	printf("t4_tom: compiled without TCP_OFFLOAD support.\n");
 	rc = EOPNOTSUPP;
 #endif
 	return (rc);
 }
 
 static moduledata_t t4_tom_moddata= {
 	"t4_tom",
 	t4_tom_modevent,
 	0
 };
 
 MODULE_VERSION(t4_tom, 1);
 MODULE_DEPEND(t4_tom, toecore, 1, 1, 1);
 MODULE_DEPEND(t4_tom, t4nex, 1, 1, 1);
 DECLARE_MODULE(t4_tom, t4_tom_moddata, SI_SUB_EXEC, SI_ORDER_ANY);
diff --git a/sys/dev/hyperv/hvsock/hv_sock.c b/sys/dev/hyperv/hvsock/hv_sock.c
index 8c327a22e6fd..f728383a1b6f 100644
--- a/sys/dev/hyperv/hvsock/hv_sock.c
+++ b/sys/dev/hyperv/hvsock/hv_sock.c
@@ -1,1758 +1,1745 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2020 Microsoft Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/malloc.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/systm.h>
 #include <sys/sockbuf.h>
 #include <sys/sx.h>
 #include <sys/uio.h>
 
 #include <net/vnet.h>
 
 #include <dev/hyperv/vmbus/vmbus_reg.h>
 
 #include "hv_sock.h"
 
 #define HVSOCK_DBG_NONE			0x0
 #define HVSOCK_DBG_INFO			0x1
 #define HVSOCK_DBG_ERR			0x2
 #define HVSOCK_DBG_VERBOSE		0x3
 
 
 SYSCTL_NODE(_net, OID_AUTO, hvsock, CTLFLAG_RD, 0, "HyperV socket");
 
 static int hvs_dbg_level;
 SYSCTL_INT(_net_hvsock, OID_AUTO, hvs_dbg_level, CTLFLAG_RWTUN, &hvs_dbg_level,
     0, "hyperv socket debug level: 0 = none, 1 = info, 2 = error, 3 = verbose");
 
 
 #define HVSOCK_DBG(level, ...) do {					\
 	if (hvs_dbg_level >= (level))					\
 		printf(__VA_ARGS__);					\
 	} while (0)
 
 MALLOC_DEFINE(M_HVSOCK, "hyperv_socket", "hyperv socket control structures");
 
 static int hvs_dom_probe(void);
 
 /* The MTU is 16KB per host side's design */
 #define HVSOCK_MTU_SIZE		(1024 * 16)
 #define HVSOCK_SEND_BUF_SZ	(PAGE_SIZE - sizeof(struct vmpipe_proto_header))
 
 #define HVSOCK_HEADER_LEN	(sizeof(struct hvs_pkt_header))
 
 #define HVSOCK_PKT_LEN(payload_len)	(HVSOCK_HEADER_LEN + \
 					 roundup2(payload_len, 8) + \
 					 sizeof(uint64_t))
 
-
-static struct domain		hv_socket_domain;
-
 /*
  * HyperV Transport sockets
  */
-static struct pr_usrreqs	hvs_trans_usrreqs = {
-	.pru_attach =		hvs_trans_attach,
-	.pru_bind =		hvs_trans_bind,
-	.pru_listen =		hvs_trans_listen,
-	.pru_accept =		hvs_trans_accept,
-	.pru_connect =		hvs_trans_connect,
-	.pru_peeraddr =		hvs_trans_peeraddr,
-	.pru_sockaddr =		hvs_trans_sockaddr,
-	.pru_soreceive =	hvs_trans_soreceive,
-	.pru_sosend =		hvs_trans_sosend,
-	.pru_disconnect =	hvs_trans_disconnect,
-	.pru_close =		hvs_trans_close,
-	.pru_detach =		hvs_trans_detach,
-	.pru_shutdown =		hvs_trans_shutdown,
-	.pru_abort =		hvs_trans_abort,
-};
-
-/*
- * Definitions of protocols supported in HyperV socket domain
- */
-static struct protosw		hv_socket_protosw[] = {
-{
+static struct protosw hv_socket_protosw = {
 	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&hv_socket_domain,
 	.pr_protocol =		HYPERV_SOCK_PROTO_TRANS,
 	.pr_flags =		PR_CONNREQUIRED,
-	.pr_usrreqs =		&hvs_trans_usrreqs,
-},
+	.pr_attach =		hvs_trans_attach,
+	.pr_bind =		hvs_trans_bind,
+	.pr_listen =		hvs_trans_listen,
+	.pr_accept =		hvs_trans_accept,
+	.pr_connect =		hvs_trans_connect,
+	.pr_peeraddr =		hvs_trans_peeraddr,
+	.pr_sockaddr =		hvs_trans_sockaddr,
+	.pr_soreceive =		hvs_trans_soreceive,
+	.pr_sosend =		hvs_trans_sosend,
+	.pr_disconnect =	hvs_trans_disconnect,
+	.pr_close =		hvs_trans_close,
+	.pr_detach =		hvs_trans_detach,
+	.pr_shutdown =		hvs_trans_shutdown,
+	.pr_abort =		hvs_trans_abort,
 };
 
 static struct domain		hv_socket_domain = {
 	.dom_family =		AF_HYPERV,
 	.dom_name =		"hyperv",
 	.dom_probe =		hvs_dom_probe,
-	.dom_protosw =		hv_socket_protosw,
-	.dom_protoswNPROTOSW =	&hv_socket_protosw[nitems(hv_socket_protosw)]
+	.dom_nprotosw =		1,
+	.dom_protosw =		{ &hv_socket_protosw },
 };
 
 DOMAIN_SET(hv_socket_);
 
 #define MAX_PORT			((uint32_t)0xFFFFFFFF)
 #define MIN_PORT			((uint32_t)0x0)
 
 /* 00000000-facb-11e6-bd58-64006a7986d3 */
 static const struct hyperv_guid srv_id_template = {
 	.hv_guid = {
 	    0x00, 0x00, 0x00, 0x00, 0xcb, 0xfa, 0xe6, 0x11,
 	    0xbd, 0x58, 0x64, 0x00, 0x6a, 0x79, 0x86, 0xd3 }
 };
 
 static int		hvsock_br_callback(void *, int, void *);
 static uint32_t		hvsock_canread_check(struct hvs_pcb *);
 static uint32_t		hvsock_canwrite_check(struct hvs_pcb *);
 static int		hvsock_send_data(struct vmbus_channel *chan,
     struct uio *uio, uint32_t to_write, struct sockbuf *sb);
 
 
 
 /* Globals */
 static struct sx		hvs_trans_socks_sx;
 static struct mtx		hvs_trans_socks_mtx;
 static LIST_HEAD(, hvs_pcb)	hvs_trans_bound_socks;
 static LIST_HEAD(, hvs_pcb)	hvs_trans_connected_socks;
 static uint32_t			previous_auto_bound_port;
 
 static void
 hvsock_print_guid(struct hyperv_guid *guid)
 {
 	unsigned char *p = (unsigned char *)guid;
 
 	HVSOCK_DBG(HVSOCK_DBG_INFO,
 	    "0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x-0x%x\n",
 	    *(unsigned int *)p,
 	    *((unsigned short *) &p[4]),
 	    *((unsigned short *) &p[6]),
 	    p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15]);
 }
 
 static bool
 is_valid_srv_id(const struct hyperv_guid *id)
 {
 	return !memcmp(&id->hv_guid[4],
 	    &srv_id_template.hv_guid[4], sizeof(struct hyperv_guid) - 4);
 }
 
 static unsigned int
 get_port_by_srv_id(const struct hyperv_guid *srv_id)
 {
 	return *((const unsigned int *)srv_id);
 }
 
 static void
 set_port_by_srv_id(struct hyperv_guid *srv_id, unsigned int port)
 {
 	*((unsigned int *)srv_id) = port;
 }
 
 
 static void
 __hvs_remove_pcb_from_list(struct hvs_pcb *pcb, unsigned char list)
 {
 	struct hvs_pcb *p = NULL;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
 
 	if (!pcb)
 		return;
 
 	if (list & HVS_LIST_BOUND) {
 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
 			if  (p == pcb)
 				LIST_REMOVE(p, bound_next);
 	}
 
 	if (list & HVS_LIST_CONNECTED) {
 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
 			if (p == pcb)
 				LIST_REMOVE(pcb, connected_next);
 	}
 }
 
 static void
 __hvs_remove_socket_from_list(struct socket *so, unsigned char list)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "%s: pcb is %p\n", __func__, pcb);
 
 	__hvs_remove_pcb_from_list(pcb, list);
 }
 
 static void
 __hvs_insert_socket_on_list(struct socket *so, unsigned char list)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	if (list & HVS_LIST_BOUND)
 		LIST_INSERT_HEAD(&hvs_trans_bound_socks,
 		   pcb, bound_next);
 
 	if (list & HVS_LIST_CONNECTED)
 		LIST_INSERT_HEAD(&hvs_trans_connected_socks,
 		   pcb, connected_next);
 }
 
 void
 hvs_remove_socket_from_list(struct socket *so, unsigned char list)
 {
 	if (!so || !so->so_pcb) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: socket or so_pcb is null\n", __func__);
 		return;
 	}
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	__hvs_remove_socket_from_list(so, list);
 	mtx_unlock(&hvs_trans_socks_mtx);
 }
 
 static void
 hvs_insert_socket_on_list(struct socket *so, unsigned char list)
 {
 	if (!so || !so->so_pcb) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: socket or so_pcb is null\n", __func__);
 		return;
 	}
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	__hvs_insert_socket_on_list(so, list);
 	mtx_unlock(&hvs_trans_socks_mtx);
 }
 
 static struct socket *
 __hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
 {
 	struct hvs_pcb *p = NULL;
 
 	if (list & HVS_LIST_BOUND)
 		LIST_FOREACH(p, &hvs_trans_bound_socks, bound_next)
 			if (p->so != NULL &&
 			    addr->hvs_port == p->local_addr.hvs_port)
 				return p->so;
 
 	if (list & HVS_LIST_CONNECTED)
 		LIST_FOREACH(p, &hvs_trans_connected_socks, connected_next)
 			if (p->so != NULL &&
 			    addr->hvs_port == p->local_addr.hvs_port)
 				return p->so;
 
 	return NULL;
 }
 
 static struct socket *
 hvs_find_socket_on_list(struct sockaddr_hvs *addr, unsigned char list)
 {
 	struct socket *s = NULL;
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	s = __hvs_find_socket_on_list(addr, list);
 	mtx_unlock(&hvs_trans_socks_mtx);
 
 	return s;
 }
 
 static inline void
 hvs_addr_set(struct sockaddr_hvs *addr, unsigned int port)
 {
 	memset(addr, 0, sizeof(*addr));
 	addr->sa_family = AF_HYPERV;
 	addr->sa_len = sizeof(*addr);
 	addr->hvs_port = port;
 }
 
 void
 hvs_addr_init(struct sockaddr_hvs *addr, const struct hyperv_guid *svr_id)
 {
 	hvs_addr_set(addr, get_port_by_srv_id(svr_id));
 }
 
 int
 hvs_trans_lock(void)
 {
 	sx_xlock(&hvs_trans_socks_sx);
 	return (0);
 }
 
 void
 hvs_trans_unlock(void)
 {
 	sx_xunlock(&hvs_trans_socks_sx);
 }
 
 static int
 hvs_dom_probe(void)
 {
 
 	/* Don't even give us a chance to attach on non-HyperV. */
 	if (vm_guest != VM_GUEST_HV)
 		return (ENXIO);
 	return (0);
 }
 
 static void
 hvs_trans_init(void *arg __unused)
 {
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_init called\n", __func__);
 
 	/* Initialize Globals */
 	previous_auto_bound_port = MAX_PORT;
 	sx_init(&hvs_trans_socks_sx, "hvs_trans_sock_sx");
 	mtx_init(&hvs_trans_socks_mtx,
 	    "hvs_trans_socks_mtx", NULL, MTX_DEF);
 	LIST_INIT(&hvs_trans_bound_socks);
 	LIST_INIT(&hvs_trans_connected_socks);
 }
 SYSINIT(hvs_trans_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     hvs_trans_init, NULL);
 
 /*
  * Called in two cases:
  * 1) When user calls socket();
  * 2) When we accept new incoming conneciton and call sonewconn().
  */
 int
 hvs_trans_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_attach called\n", __func__);
 
 	if (so->so_type != SOCK_STREAM)
 		return (ESOCKTNOSUPPORT);
 
 	if (proto != 0 && proto != HYPERV_SOCK_PROTO_TRANS)
 		return (EPROTONOSUPPORT);
 
 	if (pcb != NULL)
 		return (EISCONN);
 	pcb = malloc(sizeof(struct hvs_pcb), M_HVSOCK, M_NOWAIT | M_ZERO);
 	if (pcb == NULL)
 		return (ENOMEM);
 
 	pcb->so = so;
 	so->so_pcb = (void *)pcb;
 
 	return (0);
 }
 
 void
 hvs_trans_detach(struct socket *so)
 {
 	struct hvs_pcb *pcb;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_detach called\n", __func__);
 
 	(void) hvs_trans_lock();
 	pcb = so2hvspcb(so);
 	if (pcb == NULL) {
 		hvs_trans_unlock();
 		return;
 	}
 
 	if (SOLISTENING(so)) {
 		bzero(pcb, sizeof(*pcb));
 		free(pcb, M_HVSOCK);
 	}
 
 	so->so_pcb = NULL;
 
 	hvs_trans_unlock();
 }
 
 int
 hvs_trans_bind(struct socket *so, struct sockaddr *addr, struct thread *td)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct sockaddr_hvs *sa = (struct sockaddr_hvs *) addr;
 	int error = 0;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_bind called\n", __func__);
 
 	if (sa == NULL) {
 		return (EINVAL);
 	}
 
 	if (pcb == NULL) {
 		return (EINVAL);
 	}
 
 	if (sa->sa_family != AF_HYPERV) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: Not supported, sa_family is %u\n",
 		    __func__, sa->sa_family);
 		return (EAFNOSUPPORT);
 	}
 	if (sa->sa_len != sizeof(*sa)) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: Not supported, sa_len is %u\n",
 		    __func__, sa->sa_len);
 		return (EINVAL);
 	}
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: binding port = 0x%x\n", __func__, sa->hvs_port);
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	if (__hvs_find_socket_on_list(sa,
 	    HVS_LIST_BOUND | HVS_LIST_CONNECTED)) {
 		error = EADDRINUSE;
 	} else {
 		/*
 		 * The address is available for us to bind.
 		 * Add socket to the bound list.
 		 */
 		hvs_addr_set(&pcb->local_addr, sa->hvs_port);
 		hvs_addr_set(&pcb->remote_addr, HVADDR_PORT_ANY);
 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
 	}
 	mtx_unlock(&hvs_trans_socks_mtx);
 
 	return (error);
 }
 
 int
 hvs_trans_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct socket *bound_so;
 	int error;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_listen called\n", __func__);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/* Check if the address is already bound and it was by us. */
 	bound_so = hvs_find_socket_on_list(&pcb->local_addr, HVS_LIST_BOUND);
 	if (bound_so == NULL || bound_so != so) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: Address not bound or not by us.\n", __func__);
 		return (EADDRNOTAVAIL);
 	}
 
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0)
 		solisten_proto(so, backlog);
 	SOCK_UNLOCK(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket listen error = %d\n", __func__, error);
 	return (error);
 }
 
 int
 hvs_trans_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_accept called\n", __func__);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr,
 	    M_NOWAIT);
 
 	return ((*nam == NULL) ? ENOMEM : 0);
 }
 
 int
 hvs_trans_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct sockaddr_hvs *raddr = (struct sockaddr_hvs *)nam;
 	bool found_auto_bound_port = false;
 	int i, error = 0;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_connect called, remote port is %x\n",
 	    __func__, raddr->hvs_port);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/* Verify the remote address */
 	if (raddr == NULL)
 		return (EINVAL);
 	if (raddr->sa_family != AF_HYPERV)
 		return (EAFNOSUPPORT);
 	if (raddr->sa_len != sizeof(*raddr))
 		return (EINVAL);
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	if (so->so_state &
 	    (SS_ISCONNECTED|SS_ISDISCONNECTING|SS_ISCONNECTING)) {
 			HVSOCK_DBG(HVSOCK_DBG_ERR,
 			    "%s: socket connect in progress\n",
 			    __func__);
 			error = EINPROGRESS;
 			goto out;
 	}
 
 	/*
 	 * Find an available port for us to auto bind the local
 	 * address.
 	 */
 	hvs_addr_set(&pcb->local_addr, 0);
 
 	for (i = previous_auto_bound_port - 1;
 	    i != previous_auto_bound_port; i --) {
 		if (i == MIN_PORT)
 			i = MAX_PORT;
 
 		pcb->local_addr.hvs_port = i;
 
 		if (__hvs_find_socket_on_list(&pcb->local_addr,
 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED) == NULL) {
 			found_auto_bound_port = true;
 			previous_auto_bound_port = i;
 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 			    "%s: found local bound port is %x\n",
 			    __func__, pcb->local_addr.hvs_port);
 			break;
 		}
 	}
 
 	if (found_auto_bound_port == true) {
 		/* Found available port for auto bound, put on list */
 		__hvs_insert_socket_on_list(so, HVS_LIST_BOUND);
 		/* Set VM service ID */
 		pcb->vm_srv_id = srv_id_template;
 		set_port_by_srv_id(&pcb->vm_srv_id, pcb->local_addr.hvs_port);
 		/* Set host service ID and remote port */
 		pcb->host_srv_id = srv_id_template;
 		set_port_by_srv_id(&pcb->host_srv_id, raddr->hvs_port);
 		hvs_addr_set(&pcb->remote_addr, raddr->hvs_port);
 
 		/* Change the socket state to SS_ISCONNECTING */
 		soisconnecting(so);
 	} else {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: No local port available for auto bound\n",
 		    __func__);
 		error = EADDRINUSE;
 	}
 
 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect vm_srv_id is ");
 	hvsock_print_guid(&pcb->vm_srv_id);
 	HVSOCK_DBG(HVSOCK_DBG_INFO, "Connect host_srv_id is ");
 	hvsock_print_guid(&pcb->host_srv_id);
 
 out:
 	mtx_unlock(&hvs_trans_socks_mtx);
 
 	if (found_auto_bound_port == true)
 		 vmbus_req_tl_connect(&pcb->vm_srv_id, &pcb->host_srv_id);
 
 	return (error);
 }
 
 int
 hvs_trans_disconnect(struct socket *so)
 {
 	struct hvs_pcb *pcb;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_disconnect called\n", __func__);
 
 	(void) hvs_trans_lock();
 	pcb = so2hvspcb(so);
 	if (pcb == NULL) {
 		hvs_trans_unlock();
 		return (EINVAL);
 	}
 
 	/* If socket is already disconnected, skip this */
 	if ((so->so_state & SS_ISDISCONNECTED) == 0)
 		soisdisconnecting(so);
 
 	hvs_trans_unlock();
 
 	return (0);
 }
 
 struct hvs_callback_arg {
 	struct uio *uio;
 	struct sockbuf *sb;
 };
 
 int
 hvs_trans_soreceive(struct socket *so, struct sockaddr **paddr,
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct sockbuf *sb;
 	ssize_t orig_resid;
 	uint32_t canread, to_read;
 	int flags, error = 0;
 	struct hvs_callback_arg cbarg;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_soreceive called\n", __func__);
 
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (pcb == NULL)
 		return (EINVAL);
 
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 
 	if (flags & MSG_PEEK)
 		return (EOPNOTSUPP);
 
 	/* If no space to copy out anything */
 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_READ)
 		return (EINVAL);
 
 	orig_resid = uio->uio_resid;
 
 	/* Prevent other readers from entering the socket. */
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: soiolock returned error = %d\n", __func__, error);
 		return (error);
 	}
 
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 
 	cbarg.uio = uio;
 	cbarg.sb = sb;
 	/*
 	 * If the socket is closing, there might still be some data
 	 * in rx br to read. However we need to make sure
 	 * the channel is still open.
 	 */
 	if ((sb->sb_state & SBS_CANTRCVMORE) &&
 	    (so->so_state & SS_ISDISCONNECTED)) {
 		/* Other thread already closed the channel */
 		error = EPIPE;
 		goto out;
 	}
 
 	while (true) {
 		while (uio->uio_resid > 0 &&
 		    (canread = hvsock_canread_check(pcb)) > 0) {
 			to_read = MIN(canread, uio->uio_resid);
 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 			    "%s: to_read = %u, skip = %u\n", __func__, to_read,
 			    (unsigned int)(sizeof(struct hvs_pkt_header) +
 			    pcb->recv_data_off));
 
 			error = vmbus_chan_recv_peek_call(pcb->chan, to_read,
 			    sizeof(struct hvs_pkt_header) + pcb->recv_data_off,
 			    hvsock_br_callback, (void *)&cbarg);
 			/*
 			 * It is possible socket is disconnected becasue
 			 * we released lock in hvsock_br_callback. So we
 			 * need to check the state to make sure it is not
 			 * disconnected.
 			 */
 			if (error || so->so_state & SS_ISDISCONNECTED) {
 				break;
 			}
 
 			pcb->recv_data_len -= to_read;
 			pcb->recv_data_off += to_read;
 		}
 
 		if (error)
 			break;
 
 		/* Abort if socket has reported problems. */
 		if (so->so_error) {
 			if (so->so_error == ESHUTDOWN &&
 			    orig_resid > uio->uio_resid) {
 				/*
 				 * Although we got a FIN, we also received
 				 * some data in this round. Delivery it
 				 * to user.
 				 */
 				error = 0;
 			} else {
 				if (so->so_error != ESHUTDOWN)
 					error = so->so_error;
 			}
 
 			break;
 		}
 
 		/* Cannot received more. */
 		if (sb->sb_state & SBS_CANTRCVMORE)
 			break;
 
 		/* We are done if buffer has been filled */
 		if (uio->uio_resid == 0)
 			break;
 
 		if (!(flags & MSG_WAITALL) && orig_resid > uio->uio_resid)
 			break;
 
 		/* Buffer ring is empty and we shall not block */
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 			if (orig_resid == uio->uio_resid) {
 				/* We have not read anything */
 				error = EAGAIN;
 			}
 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 			    "%s: non blocked read return, error %d.\n",
 			    __func__, error);
 			break;
 		}
 
 		/*
 		 * Wait and block until (more) data comes in.
 		 * Note: Drops the sockbuf lock during wait.
 		 */
 		error = sbwait(so, SO_RCV);
 
 		if (error)
 			break;
 
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: wake up from sbwait, read available is %u\n",
 		    __func__, vmbus_chan_read_available(pcb->chan));
 	}
 
 out:
 	SOCKBUF_UNLOCK(sb);
 	SOCK_IO_RECV_UNLOCK(so);
 
 	/* We recieved a FIN in this call */
 	if (so->so_error == ESHUTDOWN) {
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			/* Send has already closed */
 			soisdisconnecting(so);
 		} else {
 			/* Just close the receive side */
 			socantrcvmore(so);
 		}
 	}
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: returning error = %d, so_error = %d\n",
 	    __func__, error, so->so_error);
 
 	return (error);
 }
 
 int
 hvs_trans_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *controlp, int flags, struct thread *td)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct sockbuf *sb;
 	ssize_t orig_resid;
 	uint32_t canwrite, to_write;
 	int error = 0;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_sosend called, uio_resid = %zd\n",
 	    __func__, uio->uio_resid);
 
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/* If nothing to send */
 	if (uio->uio_resid == 0 || uio->uio_rw != UIO_WRITE)
 		return (EINVAL);
 
 	orig_resid = uio->uio_resid;
 
 	/* Prevent other writers from entering the socket. */
 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
 	if (error) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: soiolocak returned error = %d\n", __func__, error);
 		return (error);
 	}
 
 	sb = &so->so_snd;
 	SOCKBUF_LOCK(sb);
 
 	if ((sb->sb_state & SBS_CANTSENDMORE) ||
 	    so->so_error == ESHUTDOWN) {
 		error = EPIPE;
 		goto out;
 	}
 
 	while (uio->uio_resid > 0) {
 		canwrite = hvsock_canwrite_check(pcb);
 		if (canwrite == 0) {
 			/* We have sent some data */
 			if (orig_resid > uio->uio_resid)
 				break;
 			/*
 			 * We have not sent any data and it is
 			 * non-blocked io
 			 */
 			if (so->so_state & SS_NBIO ||
 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
 				error = EWOULDBLOCK;
 				break;
 			} else {
 				/*
 				 * We are here because there is no space on
 				 * send buffer ring. Signal the other side
 				 * to read and free more space.
 				 * Sleep wait until space avaiable to send
 				 * Note: Drops the sockbuf lock during wait.
 				 */
 				error = sbwait(so, SO_SND);
 
 				if (error)
 					break;
 
 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 				    "%s: wake up from sbwait, space avail on "
 				    "tx ring is %u\n",
 				    __func__,
 				    vmbus_chan_write_available(pcb->chan));
 
 				continue;
 			}
 		}
 		to_write = MIN(canwrite, uio->uio_resid);
 		to_write = MIN(to_write, HVSOCK_SEND_BUF_SZ);
 
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: canwrite is %u, to_write = %u\n", __func__,
 		    canwrite, to_write);
 		error = hvsock_send_data(pcb->chan, uio, to_write, sb);
 
 		if (error)
 			break;
 	}
 
 out:
 	SOCKBUF_UNLOCK(sb);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	return (error);
 }
 
 int
 hvs_trans_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_peeraddr called\n", __func__);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	*nam = sodupsockaddr((struct sockaddr *) &pcb->remote_addr, M_NOWAIT);
 
 	return ((*nam == NULL)? ENOMEM : 0);
 }
 
 int
 hvs_trans_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_sockaddr called\n", __func__);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	*nam = sodupsockaddr((struct sockaddr *) &pcb->local_addr, M_NOWAIT);
 
 	return ((*nam == NULL)? ENOMEM : 0);
 }
 
 void
 hvs_trans_close(struct socket *so)
 {
 	struct hvs_pcb *pcb;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_close called\n", __func__);
 
 	(void) hvs_trans_lock();
 	pcb = so2hvspcb(so);
 	if (!pcb) {
 		hvs_trans_unlock();
 		return;
 	}
 
 	if (so->so_state & SS_ISCONNECTED) {
 		/* Send a FIN to peer */
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: hvs_trans_close sending a FIN to host\n", __func__);
 		(void) hvsock_send_data(pcb->chan, NULL, 0, NULL);
 	}
 
 	if (so->so_state &
 	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
 		soisdisconnected(so);
 
 	pcb->chan = NULL;
 	pcb->so = NULL;
 
 	if (SOLISTENING(so)) {
 		mtx_lock(&hvs_trans_socks_mtx);
 		/* Remove from bound list */
 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
 		mtx_unlock(&hvs_trans_socks_mtx);
 	}
 
 	hvs_trans_unlock();
 
 	return;
 }
 
 void
 hvs_trans_abort(struct socket *so)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_abort called\n", __func__);
 
 	(void) hvs_trans_lock();
 	if (pcb == NULL) {
 		hvs_trans_unlock();
 		return;
 	}
 
 	if (SOLISTENING(so)) {
 		mtx_lock(&hvs_trans_socks_mtx);
 		/* Remove from bound list */
 		__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
 		mtx_unlock(&hvs_trans_socks_mtx);
 	}
 
 	if (so->so_state & SS_ISCONNECTED) {
 		(void) sodisconnect(so);
 	}
 	hvs_trans_unlock();
 
 	return;
 }
 
 int
 hvs_trans_shutdown(struct socket *so)
 {
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	struct sockbuf *sb;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: HyperV Socket hvs_trans_shutdown called\n", __func__);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/*
 	 * Only get called with the shutdown method is SHUT_WR or
 	 * SHUT_RDWR.
 	 * When the method is SHUT_RD or SHUT_RDWR, the caller
 	 * already set the SBS_CANTRCVMORE on receive side socket
 	 * buffer.
 	 */
 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 		/*
 		 * SHUT_WR only case.
 		 * Receive side is still open. Just close
 		 * the send side.
 		 */
 		socantsendmore(so);
 	} else {
 		/* SHUT_RDWR case */
 		if (so->so_state & SS_ISCONNECTED) {
 			/* Send a FIN to peer */
 			sb = &so->so_snd;
 			SOCKBUF_LOCK(sb);
 			(void) hvsock_send_data(pcb->chan, NULL, 0, sb);
 			SOCKBUF_UNLOCK(sb);
 
 			soisdisconnecting(so);
 		}
 	}
 
 	return (0);
 }
 
 /* In the VM, we support Hyper-V Sockets with AF_HYPERV, and the endpoint is
  * <port> (see struct sockaddr_hvs).
  *
  * On the host, Hyper-V Sockets are supported by Winsock AF_HYPERV:
  * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/user-
  * guide/make-integration-service, and the endpoint is <VmID, ServiceId> with
  * the below sockaddr:
  *
  * struct SOCKADDR_HV
  * {
  *    ADDRESS_FAMILY Family;
  *    USHORT Reserved;
  *    GUID VmId;
  *    GUID ServiceId;
  * };
  * Note: VmID is not used by FreeBSD VM and actually it isn't transmitted via
  * VMBus, because here it's obvious the host and the VM can easily identify
  * each other. Though the VmID is useful on the host, especially in the case
  * of Windows container, FreeBSD VM doesn't need it at all.
  *
  * To be compatible with similar infrastructure in Linux VMs, we have
  * to limit the available GUID space of SOCKADDR_HV so that we can create
  * a mapping between FreeBSD AF_HYPERV port and SOCKADDR_HV Service GUID.
  * The rule of writing Hyper-V Sockets apps on the host and in FreeBSD VM is:
  *
  ****************************************************************************
  * The only valid Service GUIDs, from the perspectives of both the host and *
  * FreeBSD VM, that can be connected by the other end, must conform to this *
  * format: <port>-facb-11e6-bd58-64006a7986d3.                              *
  ****************************************************************************
  *
  * When we write apps on the host to connect(), the GUID ServiceID is used.
  * When we write apps in FreeBSD VM to connect(), we only need to specify the
  * port and the driver will form the GUID and use that to request the host.
  *
  * From the perspective of FreeBSD VM, the remote ephemeral port (i.e. the
  * auto-generated remote port for a connect request initiated by the host's
  * connect()) is set to HVADDR_PORT_UNKNOWN, which is not realy used on the
  * FreeBSD guest.
  */
 
 /*
  * Older HyperV hosts (vmbus version 'VMBUS_VERSION_WIN10' or before)
  * restricts HyperV socket ring buffer size to six 4K pages. Newer
  * HyperV hosts doen't have this limit.
  */
 #define HVS_RINGBUF_RCV_SIZE	(PAGE_SIZE * 6)
 #define HVS_RINGBUF_SND_SIZE	(PAGE_SIZE * 6)
 #define HVS_RINGBUF_MAX_SIZE	(PAGE_SIZE * 64)
 
 struct hvsock_sc {
 	device_t		dev;
 	struct hvs_pcb		*pcb;
 	struct vmbus_channel	*channel;
 };
 
 static bool
 hvsock_chan_readable(struct vmbus_channel *chan)
 {
 	uint32_t readable = vmbus_chan_read_available(chan);
 
 	return (readable >= HVSOCK_PKT_LEN(0));
 }
 
 static void
 hvsock_chan_cb(struct vmbus_channel *chan, void *context)
 {
 	struct hvs_pcb *pcb = (struct hvs_pcb *) context;
 	struct socket *so;
 	uint32_t canwrite;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: host send us a wakeup on rb data, pcb = %p\n",
 	    __func__, pcb);
 
 	/*
 	 * Check if the socket is still attached and valid.
 	 * Here we know channel is still open. Need to make
 	 * sure the socket has not been closed or freed.
 	 */
 	(void) hvs_trans_lock();
 	so = hsvpcb2so(pcb);
 
 	if (pcb->chan != NULL && so != NULL) {
 		/*
 		 * Wake up reader if there are data to read.
 		 */
 		SOCKBUF_LOCK(&(so)->so_rcv);
 
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: read available = %u\n", __func__,
 		    vmbus_chan_read_available(pcb->chan));
 
 		if (hvsock_chan_readable(pcb->chan))
 			sorwakeup_locked(so);
 		else
 			SOCKBUF_UNLOCK(&(so)->so_rcv);
 
 		/*
 		 * Wake up sender if space becomes available to write.
 		 */
 		SOCKBUF_LOCK(&(so)->so_snd);
 		canwrite = hvsock_canwrite_check(pcb);
 
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: canwrite = %u\n", __func__, canwrite);
 
 		if (canwrite > 0) {
 			sowwakeup_locked(so);
 		} else {
 			SOCKBUF_UNLOCK(&(so)->so_snd);
 		}
 	}
 
 	hvs_trans_unlock();
 
 	return;
 }
 
 static int
 hvsock_br_callback(void *datap, int cplen, void *cbarg)
 {
 	struct hvs_callback_arg *arg = (struct hvs_callback_arg *)cbarg;
 	struct uio *uio = arg->uio;
 	struct sockbuf *sb = arg->sb;
 	int error = 0;
 
 	if (cbarg == NULL || datap == NULL)
 		return (EINVAL);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: called, uio_rw = %s, uio_resid = %zd, cplen = %u, "
 	    "datap = %p\n",
 	    __func__, (uio->uio_rw == UIO_READ) ? "read from br":"write to br",
 	    uio->uio_resid, cplen, datap);
 
 	if (sb)
 		SOCKBUF_UNLOCK(sb);
 
 	error = uiomove(datap, cplen, uio);
 
 	if (sb)
 		SOCKBUF_LOCK(sb);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: after uiomove, uio_resid = %zd, error = %d\n",
 	    __func__, uio->uio_resid, error);
 
 	return (error);
 }
 
 static int
 hvsock_send_data(struct vmbus_channel *chan, struct uio *uio,
     uint32_t to_write, struct sockbuf *sb)
 {
 	struct hvs_pkt_header hvs_pkt;
 	int hvs_pkthlen, hvs_pktlen, pad_pktlen, hlen, error = 0;
 	uint64_t pad = 0;
 	struct iovec iov[3];
 	struct hvs_callback_arg cbarg;
 
 	if (chan == NULL)
 		return (ENOTCONN);
 
 	hlen = sizeof(struct vmbus_chanpkt_hdr);
 	hvs_pkthlen = sizeof(struct hvs_pkt_header);
 	hvs_pktlen = hvs_pkthlen + to_write;
 	pad_pktlen = VMBUS_CHANPKT_TOTLEN(hvs_pktlen);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: hlen = %u, hvs_pkthlen = %u, hvs_pktlen = %u, "
 	    "pad_pktlen = %u, data_len = %u\n",
 	    __func__, hlen, hvs_pkthlen, hvs_pktlen, pad_pktlen, to_write);
 
 	hvs_pkt.chan_pkt_hdr.cph_type = VMBUS_CHANPKT_TYPE_INBAND;
 	hvs_pkt.chan_pkt_hdr.cph_flags = 0;
 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_hlen, hlen);
 	VMBUS_CHANPKT_SETLEN(hvs_pkt.chan_pkt_hdr.cph_tlen, pad_pktlen);
 	hvs_pkt.chan_pkt_hdr.cph_xactid = 0;
 
 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_pkt_type = 1;
 	hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size = to_write;
 
 	cbarg.uio = uio;
 	cbarg.sb = sb;
 
 	if (uio && to_write > 0) {
 		iov[0].iov_base = &hvs_pkt;
 		iov[0].iov_len = hvs_pkthlen;
 		iov[1].iov_base = NULL;
 		iov[1].iov_len = to_write;
 		iov[2].iov_base = &pad;
 		iov[2].iov_len = pad_pktlen - hvs_pktlen;
 
 		error = vmbus_chan_iov_send(chan, iov, 3,
 		    hvsock_br_callback, &cbarg);
 	} else {
 		if (to_write == 0) {
 			iov[0].iov_base = &hvs_pkt;
 			iov[0].iov_len = hvs_pkthlen;
 			iov[1].iov_base = &pad;
 			iov[1].iov_len = pad_pktlen - hvs_pktlen;
 			error = vmbus_chan_iov_send(chan, iov, 2, NULL, NULL);
 		}
 	}
 
 	if (error) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: error = %d\n", __func__, error);
 	}
 
 	return (error);
 }
 
 /*
  * Check if we have data on current ring buffer to read
  * or not. If not, advance the ring buffer read index to
  * next packet. Update the recev_data_len and recev_data_off
  * to new value.
  * Return the number of bytes can read.
  */
 static uint32_t
 hvsock_canread_check(struct hvs_pcb *pcb)
 {
 	uint32_t advance;
 	uint32_t tlen, hlen, dlen;
 	uint32_t bytes_canread = 0;
 	int error;
 
 	if (pcb == NULL || pcb->chan == NULL) {
 		pcb->so->so_error = EIO;
 		return (0);
 	}
 
 	/* Still have data not read yet on current packet */
 	if (pcb->recv_data_len > 0)
 		return (pcb->recv_data_len);
 
 	if (pcb->rb_init)
 		advance =
 		    VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
 	else
 		advance = 0;
 
 	bytes_canread = vmbus_chan_read_available(pcb->chan);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: bytes_canread on br = %u, advance = %u\n",
 	    __func__, bytes_canread, advance);
 
 	if (pcb->rb_init && bytes_canread == (advance + sizeof(uint64_t))) {
 		/*
 		 * Nothing to read. Need to advance the rindex before
 		 * calling sbwait, so host knows to wake us up when data
 		 * is available to read on rb.
 		 */
 		error = vmbus_chan_recv_idxadv(pcb->chan, advance);
 		if (error) {
 			HVSOCK_DBG(HVSOCK_DBG_ERR,
 			    "%s: after calling vmbus_chan_recv_idxadv, "
 			    "got error = %d\n",  __func__, error);
 			return (0);
 		} else {
 			pcb->rb_init = false;
 			pcb->recv_data_len = 0;
 			pcb->recv_data_off = 0;
 			bytes_canread = vmbus_chan_read_available(pcb->chan);
 
 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 			    "%s: advanced %u bytes, "
 			    " bytes_canread on br now = %u\n",
 			    __func__, advance, bytes_canread);
 
 			if (bytes_canread == 0)
 				return (0);
 			else
 				advance = 0;
 		}
 	}
 
 	if (bytes_canread <
 	    advance + (sizeof(struct hvs_pkt_header) + sizeof(uint64_t)))
 		return (0);
 
 	error = vmbus_chan_recv_peek(pcb->chan, &pcb->hvs_pkt,
 	    sizeof(struct hvs_pkt_header), advance);
 
 	/* Don't have anything to read */
 	if (error) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: after calling vmbus_chan_recv_peek, got error = %d\n",
 		    __func__, error);
 		return (0);
 	}
 
 	/*
 	 * We just read in a new packet header. Do some sanity checks.
 	 */
 	tlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_tlen);
 	hlen = VMBUS_CHANPKT_GETLEN(pcb->hvs_pkt.chan_pkt_hdr.cph_hlen);
 	dlen = pcb->hvs_pkt.vmpipe_pkt_hdr.vmpipe_data_size;
 	if (__predict_false(hlen < sizeof(struct vmbus_chanpkt_hdr)) ||
 	    __predict_false(hlen > tlen) ||
 	    __predict_false(tlen < dlen + sizeof(struct hvs_pkt_header))) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "invalid tlen(%u), hlen(%u) or dlen(%u)\n",
 		    tlen, hlen, dlen);
 		pcb->so->so_error = EIO;
 		return (0);
 	}
 	if (pcb->rb_init == false)
 		pcb->rb_init = true;
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "Got new pkt tlen(%u), hlen(%u) or dlen(%u)\n",
 	    tlen, hlen, dlen);
 
 	/* The other side has sent a close FIN */
 	if (dlen == 0) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "%s: Received FIN from other side\n", __func__);
 		/* inform the caller by seting so_error to ESHUTDOWN */
 		pcb->so->so_error = ESHUTDOWN;
 	}
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: canread on receive ring is %u \n", __func__, dlen);
 
 	pcb->recv_data_len = dlen;
 	pcb->recv_data_off = 0;
 
 	return (pcb->recv_data_len);
 }
 
 static uint32_t
 hvsock_canwrite_check(struct hvs_pcb *pcb)
 {
 	uint32_t writeable;
 	uint32_t ret;
 
 	if (pcb == NULL || pcb->chan == NULL)
 		return (0);
 
 	writeable = vmbus_chan_write_available(pcb->chan);
 
 	/*
 	 * We must always reserve a 0-length-payload packet for the FIN.
 	 */
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: writeable is %u, should be greater than %ju\n",
 	    __func__, writeable,
 	    (uintmax_t)(HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)));
 
 	if (writeable < HVSOCK_PKT_LEN(1) + HVSOCK_PKT_LEN(0)) {
 		/*
 		 * The Tx ring seems full.
 		 */
 		return (0);
 	}
 
 	ret = writeable - HVSOCK_PKT_LEN(0) - HVSOCK_PKT_LEN(0);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 	    "%s: available size is %u\n", __func__, rounddown2(ret, 8));
 
 	return (rounddown2(ret, 8));
 }
 
 static void
 hvsock_set_chan_pending_send_size(struct vmbus_channel *chan)
 {
 	vmbus_chan_set_pending_send_size(chan,
 	    HVSOCK_PKT_LEN(HVSOCK_SEND_BUF_SZ));
 }
 
 static int
 hvsock_open_channel(struct vmbus_channel *chan, struct socket *so)
 {
 	unsigned int rcvbuf, sndbuf;
 	struct hvs_pcb *pcb = so2hvspcb(so);
 	int ret;
 
 	if (vmbus_current_version < VMBUS_VERSION_WIN10_V5) {
 		sndbuf = HVS_RINGBUF_SND_SIZE;
 		rcvbuf = HVS_RINGBUF_RCV_SIZE;
 	} else {
 		sndbuf = MAX(so->so_snd.sb_hiwat, HVS_RINGBUF_SND_SIZE);
 		sndbuf = MIN(sndbuf, HVS_RINGBUF_MAX_SIZE);
 		sndbuf = rounddown2(sndbuf, PAGE_SIZE);
 		rcvbuf = MAX(so->so_rcv.sb_hiwat, HVS_RINGBUF_RCV_SIZE);
 		rcvbuf = MIN(rcvbuf, HVS_RINGBUF_MAX_SIZE);
 		rcvbuf = rounddown2(rcvbuf, PAGE_SIZE);
 	}
 
 	/*
 	 * Can only read whatever user provided size of data
 	 * from ring buffer. Turn off batched reading.
 	 */
 	vmbus_chan_set_readbatch(chan, false);
 
 	ret = vmbus_chan_open(chan, sndbuf, rcvbuf, NULL, 0,
 	    hvsock_chan_cb, pcb);
 
 	if (ret != 0) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: failed to open hvsock channel, sndbuf = %u, "
 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
 	} else {
 		HVSOCK_DBG(HVSOCK_DBG_INFO,
 		    "%s: hvsock channel opened, sndbuf = %u, i"
 		    "rcvbuf = %u\n", __func__, sndbuf, rcvbuf);
 		/*
 		 * Se the pending send size so to receive wakeup
 		 * signals from host when there is enough space on
 		 * rx buffer ring to write.
 		 */
 		hvsock_set_chan_pending_send_size(chan);
 	}
 
 	return ret;
 }
 
 /*
  * Guest is listening passively on the socket. Open channel and
  * create a new socket for the conneciton.
  */
 static void
 hvsock_open_conn_passive(struct vmbus_channel *chan, struct socket *so,
     struct hvsock_sc *sc)
 {
 	struct socket *new_so;
 	struct hvs_pcb *new_pcb, *pcb;
 	int error;
 
 	/* Do nothing if socket is not listening */
 	if (!SOLISTENING(so)) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: socket is not a listening one\n", __func__);
 		return;
 	}
 
 	/*
 	 * Create a new socket. This will call pru_attach to complete
 	 * the socket initialization and put the new socket onto
 	 * listening socket's sol_incomp list, waiting to be promoted
 	 * to sol_comp list.
 	 * The new socket created has ref count 0. There is no other
 	 * thread that changes the state of this new one at the
 	 * moment, so we don't need to hold its lock while opening
 	 * channel and filling out its pcb information.
 	 */
 	new_so = sonewconn(so, 0);
 	if (!new_so)
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: creating new socket failed\n", __func__);
 
 	/*
 	 * Now open the vmbus channel. If it fails, the socket will be
 	 * on the listening socket's sol_incomp queue until it is
 	 * replaced and aborted.
 	 */
 	error = hvsock_open_channel(chan, new_so);
 	if (error) {
 		new_so->so_error = error;
 		return;
 	}
 
 	pcb = so->so_pcb;
 	new_pcb = new_so->so_pcb;
 
 	hvs_addr_set(&(new_pcb->local_addr), pcb->local_addr.hvs_port);
 	/* Remote port is unknown to guest in this type of conneciton */
 	hvs_addr_set(&(new_pcb->remote_addr), HVADDR_PORT_UNKNOWN);
 	new_pcb->chan = chan;
 	new_pcb->recv_data_len = 0;
 	new_pcb->recv_data_off = 0;
 	new_pcb->rb_init = false;
 
 	new_pcb->vm_srv_id = *vmbus_chan_guid_type(chan);
 	new_pcb->host_srv_id = *vmbus_chan_guid_inst(chan);
 
 	hvs_insert_socket_on_list(new_so, HVS_LIST_CONNECTED);
 
 	sc->pcb = new_pcb;
 
 	/*
 	 * Change the socket state to SS_ISCONNECTED. This will promote
 	 * the socket to sol_comp queue and wake up the thread which
 	 * is accepting connection.
 	 */
 	soisconnected(new_so);
 }
 
 
 /*
  * Guest is actively connecting to host.
  */
 static void
 hvsock_open_conn_active(struct vmbus_channel *chan, struct socket *so)
 {
 	struct hvs_pcb *pcb;
 	int error;
 
 	error = hvsock_open_channel(chan, so);
 	if (error) {
 		so->so_error = error;
 		return;
 	}
 
 	pcb = so->so_pcb;
 	pcb->chan = chan;
 	pcb->recv_data_len = 0;
 	pcb->recv_data_off = 0;
 	pcb->rb_init = false;
 
 	mtx_lock(&hvs_trans_socks_mtx);
 	__hvs_remove_socket_from_list(so, HVS_LIST_BOUND);
 	__hvs_insert_socket_on_list(so, HVS_LIST_CONNECTED);
 	mtx_unlock(&hvs_trans_socks_mtx);
 
 	/*
 	 * Change the socket state to SS_ISCONNECTED. This will wake up
 	 * the thread sleeping in connect call.
 	 */
 	soisconnected(so);
 }
 
 static void
 hvsock_open_connection(struct vmbus_channel *chan, struct hvsock_sc *sc)
 {
 	struct hyperv_guid *inst_guid, *type_guid;
 	bool conn_from_host;
 	struct sockaddr_hvs addr;
 	struct socket *so;
 	struct hvs_pcb *pcb;
 
 	type_guid = (struct hyperv_guid *) vmbus_chan_guid_type(chan);
 	inst_guid = (struct hyperv_guid *) vmbus_chan_guid_inst(chan);
 	conn_from_host = vmbus_chan_is_hvs_conn_from_host(chan);
 
 	HVSOCK_DBG(HVSOCK_DBG_INFO, "type_guid is ");
 	hvsock_print_guid(type_guid);
 	HVSOCK_DBG(HVSOCK_DBG_INFO, "inst_guid is ");
 	hvsock_print_guid(inst_guid);
 	HVSOCK_DBG(HVSOCK_DBG_INFO, "connection %s host\n",
 	    (conn_from_host == true ) ? "from" : "to");
 
 	/*
 	 * The listening port should be in [0, MAX_LISTEN_PORT]
 	 */
 	if (!is_valid_srv_id(type_guid))
 		return;
 
 	/*
 	 * There should be a bound socket already created no matter
 	 * it is a passive or active connection.
 	 * For host initiated connection (passive on guest side),
 	 * the  type_guid contains the port which guest is bound and
 	 * listening.
 	 * For the guest initiated connection (active on guest side),
 	 * the inst_guid contains the port that guest has auto bound
 	 * to.
 	 */
 	hvs_addr_init(&addr, conn_from_host ? type_guid : inst_guid);
 	so = hvs_find_socket_on_list(&addr, HVS_LIST_BOUND);
 	if (!so) {
 		HVSOCK_DBG(HVSOCK_DBG_ERR,
 		    "%s: no bound socket found for port %u\n",
 		    __func__, addr.hvs_port);
 		return;
 	}
 
 	if (conn_from_host) {
 		hvsock_open_conn_passive(chan, so, sc);
 	} else {
 		(void) hvs_trans_lock();
 		pcb = so->so_pcb;
 		if (pcb && pcb->so) {
 			sc->pcb = so2hvspcb(so);
 			hvsock_open_conn_active(chan, so);
 		} else {
 			HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 			    "%s: channel detached before open\n", __func__);
 		}
 		hvs_trans_unlock();
 	}
 
 }
 
 static int
 hvsock_probe(device_t dev)
 {
 	struct vmbus_channel *channel = vmbus_get_channel(dev);
 
 	if (!channel || !vmbus_chan_is_hvs(channel)) {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "hvsock_probe called but not a hvsock channel id %u\n",
 		    vmbus_chan_id(channel));
 
 		return ENXIO;
 	} else {
 		HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 		    "hvsock_probe got a hvsock channel id %u\n",
 		    vmbus_chan_id(channel));
 
 		return BUS_PROBE_DEFAULT;
 	}
 }
 
 static int
 hvsock_attach(device_t dev)
 {
 	struct vmbus_channel *channel = vmbus_get_channel(dev);
 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_attach called.\n");
 
 	hvsock_open_connection(channel, sc);
 
 	/*
 	 * Always return success. On error the host will rescind the device
 	 * in 30 seconds and we can do cleanup at that time in
 	 * vmbus_chan_msgproc_chrescind().
 	 */
 	return (0);
 }
 
 static int
 hvsock_detach(device_t dev)
 {
 	struct hvsock_sc *sc = (struct hvsock_sc *)device_get_softc(dev);
 	struct socket *so;
 	int retry;
 
 	if (bootverbose)
 		device_printf(dev, "hvsock_detach called.\n");
 
 	HVSOCK_DBG(HVSOCK_DBG_VERBOSE, "hvsock_detach called.\n");
 
 	if (sc->pcb != NULL) {
 		(void) hvs_trans_lock();
 
 		so = hsvpcb2so(sc->pcb);
 		if (so) {
 			/* Close the connection */
 			if (so->so_state &
 			    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
 				soisdisconnected(so);
 		}
 
 		mtx_lock(&hvs_trans_socks_mtx);
 		__hvs_remove_pcb_from_list(sc->pcb,
 		    HVS_LIST_BOUND | HVS_LIST_CONNECTED);
 		mtx_unlock(&hvs_trans_socks_mtx);
 
 		/*
 		 * Close channel while no reader and sender are working
 		 * on the buffer rings.
 		 */
 		if (so) {
 			retry = 0;
 			while (SOCK_IO_RECV_LOCK(so, 0) == EWOULDBLOCK) {
 				/*
 				 * Someone is reading, rx br is busy
 				 */
 				soisdisconnected(so);
 				DELAY(500);
 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 				    "waiting for rx reader to exit, "
 				    "retry = %d\n", retry++);
 			}
 			retry = 0;
 			while (SOCK_IO_SEND_LOCK(so, 0) == EWOULDBLOCK) {
 				/*
 				 * Someone is sending, tx br is busy
 				 */
 				soisdisconnected(so);
 				DELAY(500);
 				HVSOCK_DBG(HVSOCK_DBG_VERBOSE,
 				    "waiting for tx sender to exit, "
 				    "retry = %d\n", retry++);
 			}
 		}
 
 
 		bzero(sc->pcb, sizeof(struct hvs_pcb));
 		free(sc->pcb, M_HVSOCK);
 		sc->pcb = NULL;
 
 		if (so) {
 			SOCK_IO_RECV_UNLOCK(so);
 			SOCK_IO_SEND_UNLOCK(so);
 			so->so_pcb = NULL;
 		}
 
 		hvs_trans_unlock();
 	}
 
 	vmbus_chan_close(vmbus_get_channel(dev));
 
 	return (0);
 }
 
 static device_method_t hvsock_methods[] = {
 	/* Device interface */
 	DEVMETHOD(device_probe, hvsock_probe),
 	DEVMETHOD(device_attach, hvsock_attach),
 	DEVMETHOD(device_detach, hvsock_detach),
 	DEVMETHOD_END
 };
 
 static driver_t hvsock_driver = {
 	"hv_sock",
 	hvsock_methods,
 	sizeof(struct hvsock_sc)
 };
 
 DRIVER_MODULE(hvsock, vmbus, hvsock_driver, NULL, NULL);
 MODULE_VERSION(hvsock, 1);
 MODULE_DEPEND(hvsock, vmbus, 1, 1, 1);
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index 2de015254ab9..f444e38e153d 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -1,1370 +1,1369 @@
 /*-
  * Copyright (c) 2013-2015 Gleb Smirnoff <glebius@FreeBSD.org>
  * Copyright (c) 1998, David Greenman. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/ktls.h>
 #include <sys/mutex.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mount.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_pager.h>
 
 static MALLOC_DEFINE(M_SENDFILE, "sendfile", "sendfile dynamic memory");
 
 #define	EXT_FLAG_SYNC		EXT_FLAG_VENDOR1
 #define	EXT_FLAG_NOCACHE	EXT_FLAG_VENDOR2
 #define	EXT_FLAG_CACHE_LAST	EXT_FLAG_VENDOR3
 
 /*
  * Structure describing a single sendfile(2) I/O, which may consist of
  * several underlying pager I/Os.
  *
  * The syscall context allocates the structure and initializes 'nios'
  * to 1.  As sendfile_swapin() runs through pages and starts asynchronous
  * paging operations, it increments 'nios'.
  *
  * Every I/O completion calls sendfile_iodone(), which decrements the 'nios',
  * and the syscall also calls sendfile_iodone() after allocating all mbufs,
  * linking them and sending to socket.  Whoever reaches zero 'nios' is
  * responsible to * call pru_ready on the socket, to notify it of readyness
  * of the data.
  */
 struct sf_io {
 	volatile u_int	nios;
 	u_int		error;
 	int		npages;
 	struct socket	*so;
 	struct mbuf	*m;
 	vm_object_t	obj;
 	vm_pindex_t	pindex0;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	vm_page_t	pa[];
 };
 
 /*
  * Structure used to track requests with SF_SYNC flag.
  */
 struct sendfile_sync {
 	struct mtx	mtx;
 	struct cv	cv;
 	unsigned	count;
 	bool		waiting;
 };
 
 static void
 sendfile_sync_destroy(struct sendfile_sync *sfs)
 {
 	KASSERT(sfs->count == 0, ("sendfile sync %p still busy", sfs));
 
 	cv_destroy(&sfs->cv);
 	mtx_destroy(&sfs->mtx);
 	free(sfs, M_SENDFILE);
 }
 
 static void
 sendfile_sync_signal(struct sendfile_sync *sfs)
 {
 	mtx_lock(&sfs->mtx);
 	KASSERT(sfs->count > 0, ("sendfile sync %p not busy", sfs));
 	if (--sfs->count == 0) {
 		if (!sfs->waiting) {
 			/* The sendfile() waiter was interrupted by a signal. */
 			sendfile_sync_destroy(sfs);
 			return;
 		} else {
 			cv_signal(&sfs->cv);
 		}
 	}
 	mtx_unlock(&sfs->mtx);
 }
 
 counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 
 static void
 sfstat_init(const void *unused)
 {
 
 	COUNTER_ARRAY_ALLOC(sfstat, sizeof(struct sfstat) / sizeof(uint64_t),
 	    M_WAITOK);
 }
 SYSINIT(sfstat, SI_SUB_MBUF, SI_ORDER_FIRST, sfstat_init, NULL);
 
 static int
 sfstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct sfstat s;
 
 	COUNTER_ARRAY_COPY(sfstat, &s, sizeof(s) / sizeof(uint64_t));
 	if (req->newptr)
 		COUNTER_ARRAY_ZERO(sfstat, sizeof(s) / sizeof(uint64_t));
 	return (SYSCTL_OUT(req, &s, sizeof(s)));
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, sfstat,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
     sfstat_sysctl, "I",
     "sendfile statistics");
 
 static void
 sendfile_free_mext(struct mbuf *m)
 {
 	struct sf_buf *sf;
 	vm_page_t pg;
 	int flags;
 
 	KASSERT(m->m_flags & M_EXT && m->m_ext.ext_type == EXT_SFBUF,
 	    ("%s: m %p !M_EXT or !EXT_SFBUF", __func__, m));
 
 	sf = m->m_ext.ext_arg1;
 	pg = sf_buf_page(sf);
 	flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0;
 
 	sf_buf_free(sf);
 	vm_page_release(pg, flags);
 
 	if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
 		struct sendfile_sync *sfs = m->m_ext.ext_arg2;
 		sendfile_sync_signal(sfs);
 	}
 }
 
 static void
 sendfile_free_mext_pg(struct mbuf *m)
 {
 	vm_page_t pg;
 	int flags, i;
 	bool cache_last;
 
 	M_ASSERTEXTPG(m);
 
 	cache_last = m->m_ext.ext_flags & EXT_FLAG_CACHE_LAST;
 	flags = (m->m_ext.ext_flags & EXT_FLAG_NOCACHE) != 0 ? VPR_TRYFREE : 0;
 
 	for (i = 0; i < m->m_epg_npgs; i++) {
 		if (cache_last && i == m->m_epg_npgs - 1)
 			flags = 0;
 		pg = PHYS_TO_VM_PAGE(m->m_epg_pa[i]);
 		vm_page_release(pg, flags);
 	}
 
 	if (m->m_ext.ext_flags & EXT_FLAG_SYNC) {
 		struct sendfile_sync *sfs = m->m_ext.ext_arg1;
 		sendfile_sync_signal(sfs);
 	}
 }
 
 /*
  * Helper function to calculate how much data to put into page i of n.
  * Only first and last pages are special.
  */
 static inline off_t
 xfsize(int i, int n, off_t off, off_t len)
 {
 
 	if (i == 0)
 		return (omin(PAGE_SIZE - (off & PAGE_MASK), len));
 
 	if (i == n - 1 && ((off + len) & PAGE_MASK) > 0)
 		return ((off + len) & PAGE_MASK);
 
 	return (PAGE_SIZE);
 }
 
 /*
  * Helper function to get offset within object for i page.
  */
 static inline vm_ooffset_t
 vmoff(int i, off_t off)
 {
 
 	if (i == 0)
 		return ((vm_ooffset_t)off);
 
 	return (trunc_page(off + i * PAGE_SIZE));
 }
 
 /*
  * Helper function used when allocation of a page or sf_buf failed.
  * Pretend as if we don't have enough space, subtract xfsize() of
  * all pages that failed.
  */
 static inline void
 fixspace(int old, int new, off_t off, int *space)
 {
 
 	KASSERT(old > new, ("%s: old %d new %d", __func__, old, new));
 
 	/* Subtract last one. */
 	*space -= xfsize(old - 1, old, off, *space);
 	old--;
 
 	if (new == old)
 		/* There was only one page. */
 		return;
 
 	/* Subtract first one. */
 	if (new == 0) {
 		*space -= xfsize(0, old, off, *space);
 		new++;
 	}
 
 	/* Rest of pages are full sized. */
 	*space -= (old - new) * PAGE_SIZE;
 
 	KASSERT(*space >= 0, ("%s: space went backwards", __func__));
 }
 
 /*
  * Wait for all in-flight ios to complete, we must not unwire pages
  * under them.
  */
 static void
 sendfile_iowait(struct sf_io *sfio, const char *wmesg)
 {
 	while (atomic_load_int(&sfio->nios) != 1)
 		pause(wmesg, 1);
 }
 
 /*
  * I/O completion callback.
  */
 static void
 sendfile_iodone(void *arg, vm_page_t *pa, int count, int error)
 {
 	struct sf_io *sfio = arg;
 	struct socket *so;
 	int i;
 
 	if (error != 0)
 		sfio->error = error;
 
 	/*
 	 * Restore the valid page pointers.  They are already
 	 * unbusied, but still wired.
 	 *
 	 * XXXKIB since pages are only wired, and we do not
 	 * own the object lock, other users might have
 	 * invalidated them in meantime.  Similarly, after we
 	 * unbusied the swapped-in pages, they can become
 	 * invalid under us.
 	 */
 	MPASS(count == 0 || pa[0] != bogus_page);
 	for (i = 0; i < count; i++) {
 		if (pa[i] == bogus_page) {
 			sfio->pa[(pa[0]->pindex - sfio->pindex0) + i] =
 			    pa[i] = vm_page_relookup(sfio->obj,
 			    pa[0]->pindex + i);
 			KASSERT(pa[i] != NULL,
 			    ("%s: page %p[%d] disappeared",
 			    __func__, pa, i));
 		} else {
 			vm_page_xunbusy_unchecked(pa[i]);
 		}
 	}
 
 	if (!refcount_release(&sfio->nios))
 		return;
 
 #ifdef INVARIANTS
 	for (i = 1; i < sfio->npages; i++) {
 		if (sfio->pa[i] == NULL)
 			break;
 		KASSERT(vm_page_wired(sfio->pa[i]),
 		    ("sfio %p page %d %p not wired", sfio, i, sfio->pa[i]));
 		if (i == 0)
 			continue;
 		KASSERT(sfio->pa[0]->object == sfio->pa[i]->object,
 		    ("sfio %p page %d %p wrong owner %p %p", sfio, i,
 		    sfio->pa[i], sfio->pa[0]->object, sfio->pa[i]->object));
 		KASSERT(sfio->pa[0]->pindex + i == sfio->pa[i]->pindex,
 		    ("sfio %p page %d %p wrong index %jx %jx", sfio, i,
 		    sfio->pa[i], (uintmax_t)sfio->pa[0]->pindex,
 		    (uintmax_t)sfio->pa[i]->pindex));
 	}
 #endif
 
 	vm_object_pip_wakeup(sfio->obj);
 
 	if (sfio->m == NULL) {
 		/*
 		 * Either I/O operation failed, or we failed to allocate
 		 * buffers, or we bailed out on first busy page, or we
 		 * succeeded filling the request without any I/Os. Anyway,
 		 * pru_send hadn't been executed - nothing had been sent
 		 * to the socket yet.
 		 */
 		MPASS((curthread->td_pflags & TDP_KTHREAD) == 0);
 		free(sfio, M_SENDFILE);
 		return;
 	}
 
 #if defined(KERN_TLS) && defined(INVARIANTS)
 	if ((sfio->m->m_flags & M_EXTPG) != 0)
 		KASSERT(sfio->tls == sfio->m->m_epg_tls,
 		    ("TLS session mismatch"));
 	else
 		KASSERT(sfio->tls == NULL,
 		    ("non-ext_pgs mbuf with TLS session"));
 #endif
 	so = sfio->so;
 	CURVNET_SET(so->so_vnet);
 	if (__predict_false(sfio->error)) {
 		/*
 		 * I/O operation failed.  The state of data in the socket
 		 * is now inconsistent, and all what we can do is to tear
 		 * it down. Protocol abort method would tear down protocol
 		 * state, free all ready mbufs and detach not ready ones.
 		 * We will free the mbufs corresponding to this I/O manually.
 		 *
 		 * The socket would be marked with EIO and made available
 		 * for read, so that application receives EIO on next
 		 * syscall and eventually closes the socket.
 		 */
-		so->so_proto->pr_usrreqs->pru_abort(so);
+		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 
 		mb_free_notready(sfio->m, sfio->npages);
 #ifdef KERN_TLS
 	} else if (sfio->tls != NULL && sfio->tls->mode == TCP_TLS_MODE_SW) {
 		/*
 		 * I/O operation is complete, but we still need to
 		 * encrypt.  We cannot do this in the interrupt thread
 		 * of the disk controller, so forward the mbufs to a
 		 * different thread.
 		 *
 		 * Donate the socket reference from sfio to rather
 		 * than explicitly invoking soref().
 		 */
 		ktls_enqueue(sfio->m, so, sfio->npages);
 		goto out_with_ref;
 #endif
 	} else
-		(void)(so->so_proto->pr_usrreqs->pru_ready)(so, sfio->m,
-		    sfio->npages);
+		(void)so->so_proto->pr_ready(so, sfio->m, sfio->npages);
 
 	sorele(so);
 #ifdef KERN_TLS
 out_with_ref:
 #endif
 	CURVNET_RESTORE();
 	free(sfio, M_SENDFILE);
 }
 
 /*
  * Iterate through pages vector and request paging for non-valid pages.
  */
 static int
 sendfile_swapin(vm_object_t obj, struct sf_io *sfio, int *nios, off_t off,
     off_t len, int rhpages, int flags)
 {
 	vm_page_t *pa;
 	int a, count, count1, grabbed, i, j, npages, rv;
 
 	pa = sfio->pa;
 	npages = sfio->npages;
 	*nios = 0;
 	flags = (flags & SF_NODISKIO) ? VM_ALLOC_NOWAIT : 0;
 	sfio->pindex0 = OFF_TO_IDX(off);
 
 	/*
 	 * First grab all the pages and wire them.  Note that we grab
 	 * only required pages.  Readahead pages are dealt with later.
 	 */
 	grabbed = vm_page_grab_pages_unlocked(obj, OFF_TO_IDX(off),
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | flags, pa, npages);
 	if (grabbed < npages) {
 		for (int i = grabbed; i < npages; i++)
 			pa[i] = NULL;
 		npages = grabbed;
 		rhpages = 0;
 	}
 
 	for (i = 0; i < npages;) {
 		/* Skip valid pages. */
 		if (vm_page_is_valid(pa[i], vmoff(i, off) & PAGE_MASK,
 		    xfsize(i, npages, off, len))) {
 			vm_page_xunbusy(pa[i]);
 			SFSTAT_INC(sf_pages_valid);
 			i++;
 			continue;
 		}
 
 		/*
 		 * Next page is invalid.  Check if it belongs to pager.  It
 		 * may not be there, which is a regular situation for shmem
 		 * pager.  For vnode pager this happens only in case of
 		 * a sparse file.
 		 *
 		 * Important feature of vm_pager_has_page() is the hint
 		 * stored in 'a', about how many pages we can pagein after
 		 * this page in a single I/O.
 		 */
 		VM_OBJECT_RLOCK(obj);
 		if (!vm_pager_has_page(obj, OFF_TO_IDX(vmoff(i, off)), NULL,
 		    &a)) {
 			VM_OBJECT_RUNLOCK(obj);
 			pmap_zero_page(pa[i]);
 			vm_page_valid(pa[i]);
 			MPASS(pa[i]->dirty == 0);
 			vm_page_xunbusy(pa[i]);
 			i++;
 			continue;
 		}
 		VM_OBJECT_RUNLOCK(obj);
 
 		/*
 		 * We want to pagein as many pages as possible, limited only
 		 * by the 'a' hint and actual request.
 		 */
 		count = min(a + 1, npages - i);
 
 		/*
 		 * We should not pagein into a valid page because
 		 * there might be still unfinished write tracked by
 		 * e.g. a buffer, thus we substitute any valid pages
 		 * with the bogus one.
 		 *
 		 * We must not leave around xbusy pages which are not
 		 * part of the run passed to vm_pager_getpages(),
 		 * otherwise pager might deadlock waiting for the busy
 		 * status of the page, e.g. if it constitues the
 		 * buffer needed to validate other page.
 		 *
 		 * First trim the end of the run consisting of the
 		 * valid pages, then replace the rest of the valid
 		 * with bogus.
 		 */
 		count1 = count;
 		for (j = i + count - 1; j > i; j--) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				count--;
 			} else {
 				break;
 			}
 		}
 
 		/*
 		 * The last page in the run pa[i + count - 1] is
 		 * guaranteed to be invalid by the trim above, so it
 		 * is not replaced with bogus, thus -1 in the loop end
 		 * condition.
 		 */
 		MPASS(pa[i + count - 1]->valid != VM_PAGE_BITS_ALL);
 		for (j = i + 1; j < i + count - 1; j++) {
 			if (vm_page_is_valid(pa[j], vmoff(j, off) & PAGE_MASK,
 			    xfsize(j, npages, off, len))) {
 				vm_page_xunbusy(pa[j]);
 				SFSTAT_INC(sf_pages_valid);
 				SFSTAT_INC(sf_pages_bogus);
 				pa[j] = bogus_page;
 			}
 		}
 
 		refcount_acquire(&sfio->nios);
 		rv = vm_pager_get_pages_async(obj, pa + i, count, NULL,
 		    i + count == npages ? &rhpages : NULL,
 		    &sendfile_iodone, sfio);
 		if (__predict_false(rv != VM_PAGER_OK)) {
 			sendfile_iowait(sfio, "sferrio");
 
 			/*
 			 * Do remaining pages recovery before returning EIO.
 			 * Pages from 0 to npages are wired.
 			 * Pages from (i + count1) to npages are busied.
 			 */
 			for (j = 0; j < npages; j++) {
 				if (j >= i + count1)
 					vm_page_xunbusy(pa[j]);
 				KASSERT(pa[j] != NULL && pa[j] != bogus_page,
 				    ("%s: page %p[%d] I/O recovery failure",
 				    __func__, pa, j));
 				vm_page_unwire(pa[j], PQ_INACTIVE);
 				pa[j] = NULL;
 			}
 			return (EIO);
 		}
 
 		SFSTAT_INC(sf_iocnt);
 		SFSTAT_ADD(sf_pages_read, count);
 		if (i + count == npages)
 			SFSTAT_ADD(sf_rhpages_read, rhpages);
 
 		i += count1;
 		(*nios)++;
 	}
 
 	if (*nios == 0 && npages != 0)
 		SFSTAT_INC(sf_noiocnt);
 
 	return (0);
 }
 
 static int
 sendfile_getobj(struct thread *td, struct file *fp, vm_object_t *obj_res,
     struct vnode **vp_res, struct shmfd **shmfd_res, off_t *obj_size,
     int *bsize)
 {
 	struct vattr va;
 	vm_object_t obj;
 	struct vnode *vp;
 	struct shmfd *shmfd;
 	int error;
 
 	error = 0;
 	vp = *vp_res = NULL;
 	obj = NULL;
 	shmfd = *shmfd_res = NULL;
 	*bsize = 0;
 
 	/*
 	 * The file descriptor must be a regular file and have a
 	 * backing VM object.
 	 */
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
 		vn_lock(vp, LK_SHARED | LK_RETRY);
 		if (vp->v_type != VREG) {
 			error = EINVAL;
 			goto out;
 		}
 		*bsize = vp->v_mount->mnt_stat.f_iosize;
 		obj = vp->v_object;
 		if (obj == NULL) {
 			error = EINVAL;
 			goto out;
 		}
 
 		/*
 		 * Use the pager size when available to simplify synchronization
 		 * with filesystems, which otherwise must atomically update both
 		 * the vnode pager size and file size.
 		 */
 		if (obj->type == OBJT_VNODE) {
 			VM_OBJECT_RLOCK(obj);
 			*obj_size = obj->un_pager.vnp.vnp_size;
 		} else {
 			error = VOP_GETATTR(vp, &va, td->td_ucred);
 			if (error != 0)
 				goto out;
 			*obj_size = va.va_size;
 			VM_OBJECT_RLOCK(obj);
 		}
 	} else if (fp->f_type == DTYPE_SHM) {
 		shmfd = fp->f_data;
 		obj = shmfd->shm_object;
 		VM_OBJECT_RLOCK(obj);
 		*obj_size = shmfd->shm_size;
 	} else {
 		error = EINVAL;
 		goto out;
 	}
 
 	if ((obj->flags & OBJ_DEAD) != 0) {
 		VM_OBJECT_RUNLOCK(obj);
 		error = EBADF;
 		goto out;
 	}
 
 	/*
 	 * Temporarily increase the backing VM object's reference
 	 * count so that a forced reclamation of its vnode does not
 	 * immediately destroy it.
 	 */
 	vm_object_reference_locked(obj);
 	VM_OBJECT_RUNLOCK(obj);
 	*obj_res = obj;
 	*vp_res = vp;
 	*shmfd_res = shmfd;
 
 out:
 	if (vp != NULL)
 		VOP_UNLOCK(vp);
 	return (error);
 }
 
 static int
 sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
     struct socket **so)
 {
 	int error;
 
 	*sock_fp = NULL;
 	*so = NULL;
 
 	/*
 	 * The socket must be a stream socket and connected.
 	 */
 	error = getsock_cap(td, s, &cap_send_rights,
 	    sock_fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	*so = (*sock_fp)->f_data;
 	if ((*so)->so_type != SOCK_STREAM)
 		return (EINVAL);
 	/*
 	 * SCTP one-to-one style sockets currently don't work with
 	 * sendfile(). So indicate EINVAL for now.
 	 */
 	if ((*so)->so_proto->pr_protocol == IPPROTO_SCTP)
 		return (EINVAL);
 	return (0);
 }
 
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
     struct thread *td)
 {
 	struct file *sock_fp;
 	struct vnode *vp;
 	struct vm_object *obj;
 	vm_page_t pga;
 	struct socket *so;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
 	struct mbuf *m, *mh, *mhtail;
 	struct sf_buf *sf;
 	struct shmfd *shmfd;
 	struct sendfile_sync *sfs;
 	struct vattr va;
 	off_t off, sbytes, rem, obj_size, nobj_size;
 	int bsize, error, ext_pgs_idx, hdrlen, max_pgs, softerr;
 #ifdef KERN_TLS
 	int tls_enq_cnt;
 #endif
 	bool use_ext_pgs;
 
 	obj = NULL;
 	so = NULL;
 	m = mh = NULL;
 	sfs = NULL;
 #ifdef KERN_TLS
 	tls = NULL;
 #endif
 	hdrlen = sbytes = 0;
 	softerr = 0;
 	use_ext_pgs = false;
 
 	error = sendfile_getobj(td, fp, &obj, &vp, &shmfd, &obj_size, &bsize);
 	if (error != 0)
 		return (error);
 
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0)
 		goto out;
 #endif
 
 	SFSTAT_INC(sf_syscalls);
 	SFSTAT_ADD(sf_rhpages_requested, SF_READAHEAD(flags));
 
 	if (flags & SF_SYNC) {
 		sfs = malloc(sizeof(*sfs), M_SENDFILE, M_WAITOK | M_ZERO);
 		mtx_init(&sfs->mtx, "sendfile", NULL, MTX_DEF);
 		cv_init(&sfs->cv, "sendfile");
 		sfs->waiting = true;
 	}
 
 	rem = nbytes ? omin(nbytes, obj_size - offset) : obj_size - offset;
 
 	/*
 	 * Protect against multiple writers to the socket.
 	 *
 	 * XXXRW: Historically this has assumed non-interruptibility, so now
 	 * we implement that, but possibly shouldn't.
 	 */
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT | SBL_NOINTR);
 	if (error != 0)
 		goto out;
 #ifdef KERN_TLS
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 #endif
 
 	/*
 	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
 	 * This is done in two loops.  The inner loop turns as many pages
 	 * as it can, up to available socket buffer space, without blocking
 	 * into mbufs to have it bulk delivered into the socket send buffer.
 	 * The outer loop checks the state and available space of the socket
 	 * and takes care of the overall progress.
 	 */
 	for (off = offset; rem > 0; ) {
 		struct sf_io *sfio;
 		vm_page_t *pa;
 		struct mbuf *m0, *mtail;
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
 		/*
 		 * Check the socket state for ongoing connection,
 		 * no errors and space in socket buffer.
 		 * If space is low allow for the remainder of the
 		 * file to be processed if it fits the socket buffer.
 		 * Otherwise block in waiting for sufficient space
 		 * to proceed, or if the socket is nonblocking, return
 		 * to userland with EAGAIN while reporting how far
 		 * we've come.
 		 * We wait until the socket buffer has significant free
 		 * space to do bulk sends.  This makes good use of file
 		 * system read ahead and allows packet segmentation
 		 * offloading hardware to take over lots of work.  If
 		 * we were not careful here we would send off only one
 		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
 			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
 retry_space:
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			error = EPIPE;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		} else if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto done;
 		}
 
 		space = sbspace(&so->so_snd);
 		if (space < rem &&
 		    (space <= 0 ||
 		     space < so->so_snd.sb_lowat)) {
 			if (so->so_state & SS_NBIO) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EAGAIN;
 				goto done;
 			}
 			/*
 			 * sbwait drops the lock while sleeping.
 			 * When we loop back to retry_space the
 			 * state may have changed and we retest
 			 * for it.
 			 */
 			error = sbwait(so, SO_SND);
 			/*
 			 * An error from sbwait usually indicates that we've
 			 * been interrupted by a signal. If we've sent anything
 			 * then return bytes sent, otherwise return the error.
 			 */
 			if (error != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				goto done;
 			}
 			goto retry_space;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
 		 * the socket buffer by the size of the header mbuf chain.
 		 * Clear hdr_uio here and hdrlen at the end of the first loop.
 		 */
 		if (hdr_uio != NULL && hdr_uio->uio_resid > 0) {
 			hdr_uio->uio_td = td;
 			hdr_uio->uio_rw = UIO_WRITE;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				mh = m_uiotombuf(hdr_uio, M_WAITOK, space,
 				    tls->params.max_frame_len, M_EXTPG);
 			else
 #endif
 				mh = m_uiotombuf(hdr_uio, M_WAITOK,
 				    space, 0, 0);
 			hdrlen = m_length(mh, &mhtail);
 			space -= hdrlen;
 			/*
 			 * If header consumed all the socket buffer space,
 			 * don't waste CPU cycles and jump to the end.
 			 */
 			if (space == 0) {
 				sfio = NULL;
 				nios = 0;
 				goto prepend_header;
 			}
 			hdr_uio = NULL;
 		}
 
 		if (vp != NULL) {
 			error = vn_lock(vp, LK_SHARED);
 			if (error != 0)
 				goto done;
 
 			/*
 			 * Check to see if the file size has changed.
 			 */
 			if (obj->type == OBJT_VNODE) {
 				VM_OBJECT_RLOCK(obj);
 				nobj_size = obj->un_pager.vnp.vnp_size;
 				VM_OBJECT_RUNLOCK(obj);
 			} else {
 				error = VOP_GETATTR(vp, &va, td->td_ucred);
 				if (error != 0) {
 					VOP_UNLOCK(vp);
 					goto done;
 				}
 				nobj_size = va.va_size;
 			}
 			if (off >= nobj_size) {
 				VOP_UNLOCK(vp);
 				goto done;
 			}
 			if (nobj_size != obj_size) {
 				obj_size = nobj_size;
 				rem = nbytes ? omin(nbytes + offset, obj_size) :
 				    obj_size;
 				rem -= off;
 			}
 		}
 
 		if (space > rem)
 			space = rem;
 		else if (space > PAGE_SIZE) {
 			/*
 			 * Use page boundaries when possible for large
 			 * requests.
 			 */
 			if (off & PAGE_MASK)
 				space -= (PAGE_SIZE - (off & PAGE_MASK));
 			space = trunc_page(space);
 			if (off & PAGE_MASK)
 				space += (PAGE_SIZE - (off & PAGE_MASK));
 		}
 
 		npages = howmany(space + (off & PAGE_MASK), PAGE_SIZE);
 
 		/*
 		 * Calculate maximum allowed number of pages for readahead
 		 * at this iteration.  If SF_USER_READAHEAD was set, we don't
 		 * do any heuristics and use exactly the value supplied by
 		 * application.  Otherwise, we allow readahead up to "rem".
 		 * If application wants more, let it be, but there is no
 		 * reason to go above maxphys.  Also check against "obj_size",
 		 * since vm_pager_has_page() can hint beyond EOF.
 		 */
 		if (flags & SF_USER_READAHEAD) {
 			rhpages = SF_READAHEAD(flags);
 		} else {
 			rhpages = howmany(rem + (off & PAGE_MASK), PAGE_SIZE) -
 			    npages;
 			rhpages += SF_READAHEAD(flags);
 		}
 		rhpages = min(howmany(maxphys, PAGE_SIZE), rhpages);
 		rhpages = min(howmany(obj_size - trunc_page(off), PAGE_SIZE) -
 		    npages, rhpages);
 
 		sfio = malloc(sizeof(struct sf_io) +
 		    npages * sizeof(vm_page_t), M_SENDFILE, M_WAITOK);
 		refcount_init(&sfio->nios, 1);
 		sfio->obj = obj;
 		sfio->error = 0;
 		sfio->m = NULL;
 		sfio->npages = npages;
 #ifdef KERN_TLS
 		/*
 		 * This doesn't use ktls_hold() because sfio->m will
 		 * also have a reference on 'tls' that will be valid
 		 * for all of sfio's lifetime.
 		 */
 		sfio->tls = tls;
 #endif
 		vm_object_pip_add(obj, 1);
 		error = sendfile_swapin(obj, sfio, &nios, off, space, rhpages,
 		    flags);
 		if (error != 0) {
 			if (vp != NULL)
 				VOP_UNLOCK(vp);
 			sendfile_iodone(sfio, NULL, 0, error);
 			goto done;
 		}
 
 		/*
 		 * Loop and construct maximum sized mbuf chain to be bulk
 		 * dumped into socket buffer.
 		 */
 		pa = sfio->pa;
 
 		/*
 		 * Use unmapped mbufs if enabled for TCP.  Unmapped
 		 * bufs are restricted to TCP as that is what has been
 		 * tested.  In particular, unmapped mbufs have not
 		 * been tested with UNIX-domain sockets.
 		 *
 		 * TLS frames always require unmapped mbufs.
 		 */
 		if ((mb_use_ext_pgs &&
 		    so->so_proto->pr_protocol == IPPROTO_TCP)
 #ifdef KERN_TLS
 		    || tls != NULL
 #endif
 		    ) {
 			use_ext_pgs = true;
 #ifdef KERN_TLS
 			if (tls != NULL)
 				max_pgs = num_pages(tls->params.max_frame_len);
 			else
 #endif
 				max_pgs = MBUF_PEXT_MAX_PGS;
 
 			/* Start at last index, to wrap on first use. */
 			ext_pgs_idx = max_pgs - 1;
 		}
 
 		for (int i = 0; i < npages; i++) {
 			/*
 			 * If a page wasn't grabbed successfully, then
 			 * trim the array. Can happen only with SF_NODISKIO.
 			 */
 			if (pa[i] == NULL) {
 				SFSTAT_INC(sf_busy);
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				softerr = EBUSY;
 				break;
 			}
 			pga = pa[i];
 			if (pga == bogus_page)
 				pga = vm_page_relookup(obj, sfio->pindex0 + i);
 
 			if (use_ext_pgs) {
 				off_t xfs;
 
 				ext_pgs_idx++;
 				if (ext_pgs_idx == max_pgs) {
 					m0 = mb_alloc_ext_pgs(M_WAITOK,
 					    sendfile_free_mext_pg);
 
 					if (flags & SF_NOCACHE) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_NOCACHE;
 
 						/*
 						 * See comment below regarding
 						 * ignoring SF_NOCACHE for the
 						 * last page.
 						 */
 						if ((npages - i <= max_pgs) &&
 						    ((off + space) & PAGE_MASK) &&
 						    (rem > space || rhpages > 0))
 							m0->m_ext.ext_flags |=
 							    EXT_FLAG_CACHE_LAST;
 					}
 					if (sfs != NULL) {
 						m0->m_ext.ext_flags |=
 						    EXT_FLAG_SYNC;
 						m0->m_ext.ext_arg1 = sfs;
 						mtx_lock(&sfs->mtx);
 						sfs->count++;
 						mtx_unlock(&sfs->mtx);
 					}
 					ext_pgs_idx = 0;
 
 					/* Append to mbuf chain. */
 					if (mtail != NULL)
 						mtail->m_next = m0;
 					else
 						m = m0;
 					mtail = m0;
 					m0->m_epg_1st_off =
 					    vmoff(i, off) & PAGE_MASK;
 				}
 				if (nios) {
 					mtail->m_flags |= M_NOTREADY;
 					m0->m_epg_nrdy++;
 				}
 
 				m0->m_epg_pa[ext_pgs_idx] = VM_PAGE_TO_PHYS(pga);
 				m0->m_epg_npgs++;
 				xfs = xfsize(i, npages, off, space);
 				m0->m_epg_last_len = xfs;
 				MBUF_EXT_PGS_ASSERT_SANITY(m0);
 				mtail->m_len += xfs;
 				mtail->m_ext.ext_size += PAGE_SIZE;
 				continue;
 			}
 
 			/*
 			 * Get a sendfile buf.  When allocating the
 			 * first buffer for mbuf chain, we usually
 			 * wait as long as necessary, but this wait
 			 * can be interrupted.  For consequent
 			 * buffers, do not sleep, since several
 			 * threads might exhaust the buffers and then
 			 * deadlock.
 			 */
 			sf = sf_buf_alloc(pga,
 			    m != NULL ? SFB_NOWAIT : SFB_CATCH);
 			if (sf == NULL) {
 				SFSTAT_INC(sf_allocfail);
 				sendfile_iowait(sfio, "sfnosf");
 				for (int j = i; j < npages; j++) {
 					vm_page_unwire(pa[j], PQ_INACTIVE);
 					pa[j] = NULL;
 				}
 				if (m == NULL)
 					softerr = ENOBUFS;
 				fixspace(npages, i, off, &space);
 				sfio->npages = i;
 				break;
 			}
 
 			m0 = m_get(M_WAITOK, MT_DATA);
 			m0->m_ext.ext_buf = (char *)sf_buf_kva(sf);
 			m0->m_ext.ext_size = PAGE_SIZE;
 			m0->m_ext.ext_arg1 = sf;
 			m0->m_ext.ext_type = EXT_SFBUF;
 			m0->m_ext.ext_flags = EXT_FLAG_EMBREF;
 			m0->m_ext.ext_free = sendfile_free_mext;
 			/*
 			 * SF_NOCACHE sets the page as being freed upon send.
 			 * However, we ignore it for the last page in 'space',
 			 * if the page is truncated, and we got more data to
 			 * send (rem > space), or if we have readahead
 			 * configured (rhpages > 0).
 			 */
 			if ((flags & SF_NOCACHE) &&
 			    (i != npages - 1 ||
 			    !((off + space) & PAGE_MASK) ||
 			    !(rem > space || rhpages > 0)))
 				m0->m_ext.ext_flags |= EXT_FLAG_NOCACHE;
 			if (sfs != NULL) {
 				m0->m_ext.ext_flags |= EXT_FLAG_SYNC;
 				m0->m_ext.ext_arg2 = sfs;
 				mtx_lock(&sfs->mtx);
 				sfs->count++;
 				mtx_unlock(&sfs->mtx);
 			}
 			m0->m_ext.ext_count = 1;
 			m0->m_flags |= (M_EXT | M_RDONLY);
 			if (nios)
 				m0->m_flags |= M_NOTREADY;
 			m0->m_data = (char *)sf_buf_kva(sf) +
 			    (vmoff(i, off) & PAGE_MASK);
 			m0->m_len = xfsize(i, npages, off, space);
 
 			/* Append to mbuf chain. */
 			if (mtail != NULL)
 				mtail->m_next = m0;
 			else
 				m = m0;
 			mtail = m0;
 		}
 
 		if (vp != NULL)
 			VOP_UNLOCK(vp);
 
 		/* Keep track of bytes processed. */
 		off += space;
 		rem -= space;
 
 		/*
 		 * Prepend header, if any.  Save pointer to first mbuf
 		 * with a page.
 		 */
 		if (hdrlen) {
 prepend_header:
 			m0 = mhtail->m_next = m;
 			m = mh;
 			mh = NULL;
 		} else
 			m0 = m;
 
 		if (m == NULL) {
 			KASSERT(softerr, ("%s: m NULL, no error", __func__));
 			error = softerr;
 			sendfile_iodone(sfio, NULL, 0, 0);
 			goto done;
 		}
 
 		/* Add the buffer chain to the socket buffer. */
 		KASSERT(m_length(m, NULL) == space + hdrlen,
 		    ("%s: mlen %u space %d hdrlen %d",
 		    __func__, m_length(m, NULL), space, hdrlen));
 
 		CURVNET_SET(so->so_vnet);
 #ifdef KERN_TLS
 		if (tls != NULL)
 			ktls_frame(m, tls, &tls_enq_cnt, TLS_RLTYPE_APP);
 #endif
 		if (nios == 0) {
 			/*
 			 * If sendfile_swapin() didn't initiate any I/Os,
 			 * which happens if all data is cached in VM, or if
 			 * the header consumed all socket buffer space and
 			 * sfio is NULL, then we can send data right now
 			 * without the PRUS_NOTREADY flag.
 			 */
 			if (sfio != NULL)
 				sendfile_iodone(sfio, NULL, 0, 0);
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
-				error = (*so->so_proto->pr_usrreqs->pru_send)
-				    (so, PRUS_NOTREADY, m, NULL, NULL, td);
+				error = so->so_proto->pr_send(so,
+				    PRUS_NOTREADY, m, NULL, NULL, td);
 				if (error != 0) {
 					m_freem(m);
 				} else {
 					soref(so);
 					ktls_enqueue(m, so, tls_enq_cnt);
 				}
 			} else
 #endif
-				error = (*so->so_proto->pr_usrreqs->pru_send)
-				    (so, 0, m, NULL, NULL, td);
+				error = so->so_proto->pr_send(so, 0, m, NULL,
+				    NULL, td);
 		} else {
 			sfio->so = so;
 			sfio->m = m0;
 			soref(so);
-			error = (*so->so_proto->pr_usrreqs->pru_send)
-			    (so, PRUS_NOTREADY, m, NULL, NULL, td);
+			error = so->so_proto->pr_send(so, PRUS_NOTREADY, m,
+			    NULL, NULL, td);
 			sendfile_iodone(sfio, NULL, 0, error);
 		}
 		CURVNET_RESTORE();
 
 		m = NULL;
 		if (error)
 			goto done;
 		sbytes += space + hdrlen;
 		if (hdrlen)
 			hdrlen = 0;
 		if (softerr) {
 			error = softerr;
 			goto done;
 		}
 	}
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
 	if (trl_uio != NULL) {
 		SOCK_IO_SEND_UNLOCK(so);
 		error = kern_writev(td, sockfd, trl_uio);
 		if (error == 0)
 			sbytes += td->td_retval[0];
 		goto out;
 	}
 
 done:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
 	 */
 	if (error == 0) {
 		td->td_retval[0] = 0;
 	}
 	if (sent != NULL) {
 		(*sent) = sbytes;
 	}
 	if (obj != NULL)
 		vm_object_deallocate(obj);
 	if (so)
 		fdrop(sock_fp, td);
 	if (m)
 		m_freem(m);
 	if (mh)
 		m_freem(mh);
 
 	if (sfs != NULL) {
 		mtx_lock(&sfs->mtx);
 		if (sfs->count != 0)
 			error = cv_wait_sig(&sfs->cv, &sfs->mtx);
 		if (sfs->count == 0) {
 			sendfile_sync_destroy(sfs);
 		} else {
 			sfs->waiting = false;
 			mtx_unlock(&sfs->mtx);
 		}
 	}
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
 
 static int
 sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
 	struct sf_hdtr hdtr;
 	struct uio *hdr_uio, *trl_uio;
 	struct file *fp;
 	off_t sbytes;
 	int error;
 
 	/*
 	 * File offset must be positive.  If it goes beyond EOF
 	 * we send only the header/trailer and no payload data.
 	 */
 	if (uap->offset < 0)
 		return (EINVAL);
 
 	sbytes = 0;
 	hdr_uio = trl_uio = NULL;
 
 	if (uap->hdtr != NULL) {
 		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
 		if (error != 0)
 			goto out;
 		if (hdtr.headers != NULL) {
 			error = copyinuio(hdtr.headers, hdtr.hdr_cnt,
 			    &hdr_uio);
 			if (error != 0)
 				goto out;
 #ifdef COMPAT_FREEBSD4
 			/*
 			 * In FreeBSD < 5.0 the nbytes to send also included
 			 * the header.  If compat is specified subtract the
 			 * header size from nbytes.
 			 */
 			if (compat) {
 				if (uap->nbytes > hdr_uio->uio_resid)
 					uap->nbytes -= hdr_uio->uio_resid;
 				else
 					uap->nbytes = 0;
 			}
 #endif
 		}
 		if (hdtr.trailers != NULL) {
 			error = copyinuio(hdtr.trailers, hdtr.trl_cnt,
 			    &trl_uio);
 			if (error != 0)
 				goto out;
 		}
 	}
 
 	AUDIT_ARG_FD(uap->fd);
 
 	/*
 	 * sendfile(2) can start at any offset within a file so we require
 	 * CAP_READ+CAP_SEEK = CAP_PREAD.
 	 */
 	if ((error = fget_read(td, uap->fd, &cap_pread_rights, &fp)) != 0)
 		goto out;
 
 	error = fo_sendfile(fp, uap->s, hdr_uio, trl_uio, uap->offset,
 	    uap->nbytes, &sbytes, uap->flags, td);
 	fdrop(fp, td);
 
 	if (uap->sbytes != NULL)
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 
 out:
 	free(hdr_uio, M_IOV);
 	free(trl_uio, M_IOV);
 	return (error);
 }
 
 /*
  * sendfile(2)
  * 
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *       struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  * 
  * Send a file specified by 'fd' and starting at 'offset' to a socket
  * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
  * 0.  Optionally add a header and/or trailer to the socket output.  If
  * specified, write the total number of bytes sent into *sbytes.
  */
 int
 sys_sendfile(struct thread *td, struct sendfile_args *uap)
 {
 
 	return (sendfile(td, uap, 0));
 }
 
 #ifdef COMPAT_FREEBSD4
 int
 freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
 {
 	struct sendfile_args args;
 
 	args.fd = uap->fd;
 	args.s = uap->s;
 	args.offset = uap->offset;
 	args.nbytes = uap->nbytes;
 	args.hdtr = uap->hdtr;
 	args.sbytes = uap->sbytes;
 	args.flags = uap->flags;
 
 	return (sendfile(td, &args, 1));
 }
 #endif /* COMPAT_FREEBSD4 */
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index 7f39e58f4ce4..5b8aadc12e08 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -1,851 +1,850 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)sys_socket.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/aio.h>
 #include <sys/domain.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/malloc.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sigio.h>
 #include <sys/signal.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/filio.h>			/* XXX */
 #include <sys/sockio.h>
 #include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/ucred.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/user.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 
 static SYSCTL_NODE(_kern_ipc, OID_AUTO, aio, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
     "socket AIO stats");
 
 static int empty_results;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_results, CTLFLAG_RD, &empty_results,
     0, "socket operation returned EAGAIN");
 
 static int empty_retries;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, empty_retries, CTLFLAG_RD, &empty_retries,
     0, "socket operation retries");
 
 static fo_rdwr_t soo_read;
 static fo_rdwr_t soo_write;
 static fo_ioctl_t soo_ioctl;
 static fo_poll_t soo_poll;
 extern fo_kqfilter_t soo_kqfilter;
 static fo_stat_t soo_stat;
 static fo_close_t soo_close;
 static fo_fill_kinfo_t soo_fill_kinfo;
 static fo_aio_queue_t soo_aio_queue;
 
 static void	soo_aio_cancel(struct kaiocb *job);
 
 struct fileops	socketops = {
 	.fo_read = soo_read,
 	.fo_write = soo_write,
 	.fo_truncate = invfo_truncate,
 	.fo_ioctl = soo_ioctl,
 	.fo_poll = soo_poll,
 	.fo_kqfilter = soo_kqfilter,
 	.fo_stat = soo_stat,
 	.fo_close = soo_close,
 	.fo_chmod = invfo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_fill_kinfo = soo_fill_kinfo,
 	.fo_aio_queue = soo_aio_queue,
 	.fo_flags = DFLAG_PASSABLE
 };
 
 static int
 soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_receive(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = soreceive(so, 0, uio, 0, 0, 0);
 	return (error);
 }
 
 static int
 soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
     int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 #ifdef MAC
 	error = mac_socket_check_send(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
 	if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 		PROC_LOCK(uio->uio_td->td_proc);
 		tdsignal(uio->uio_td, SIGPIPE);
 		PROC_UNLOCK(uio->uio_td->td_proc);
 	}
 	return (error);
 }
 
 static int
 soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error = 0;
 
 	switch (cmd) {
 	case FIONBIO:
 		SOCK_LOCK(so);
 		if (*(int *)data)
 			so->so_state |= SS_NBIO;
 		else
 			so->so_state &= ~SS_NBIO;
 		SOCK_UNLOCK(so);
 		break;
 
 	case FIOASYNC:
 		if (*(int *)data) {
 			SOCK_LOCK(so);
 			so->so_state |= SS_ASYNC;
 			if (SOLISTENING(so)) {
 				so->sol_sbrcv_flags |= SB_ASYNC;
 				so->sol_sbsnd_flags |= SB_ASYNC;
 			} else {
 				SOCK_RECVBUF_LOCK(so);
 				so->so_rcv.sb_flags |= SB_ASYNC;
 				SOCK_RECVBUF_UNLOCK(so);
 				SOCK_SENDBUF_LOCK(so);
 				so->so_snd.sb_flags |= SB_ASYNC;
 				SOCK_SENDBUF_UNLOCK(so);
 			}
 			SOCK_UNLOCK(so);
 		} else {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ASYNC;
 			if (SOLISTENING(so)) {
 				so->sol_sbrcv_flags &= ~SB_ASYNC;
 				so->sol_sbsnd_flags &= ~SB_ASYNC;
 			} else {
 				SOCK_RECVBUF_LOCK(so);
 				so->so_rcv.sb_flags &= ~SB_ASYNC;
 				SOCK_RECVBUF_UNLOCK(so);
 				SOCK_SENDBUF_LOCK(so);
 				so->so_snd.sb_flags &= ~SB_ASYNC;
 				SOCK_SENDBUF_UNLOCK(so);
 			}
 			SOCK_UNLOCK(so);
 		}
 		break;
 
 	case FIONREAD:
 		SOCK_RECVBUF_LOCK(so);
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
 		}
 		SOCK_RECVBUF_UNLOCK(so);
 		break;
 
 	case FIONWRITE:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = sbavail(&so->so_snd);
 		}
 		break;
 
 	case FIONSPACE:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			if ((so->so_snd.sb_hiwat < sbused(&so->so_snd)) ||
 			    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt)) {
 				*(int *)data = 0;
 			} else {
 				*(int *)data = sbspace(&so->so_snd);
 			}
 		}
 		break;
 
 	case FIOSETOWN:
 		error = fsetown(*(int *)data, &so->so_sigio);
 		break;
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&so->so_sigio);
 		break;
 
 	case SIOCSPGRP:
 		error = fsetown(-(*(int *)data), &so->so_sigio);
 		break;
 
 	case SIOCGPGRP:
 		*(int *)data = -fgetown(&so->so_sigio);
 		break;
 
 	case SIOCATMARK:
 		/* Unlocked read. */
 		if (SOLISTENING(so)) {
 			error = EINVAL;
 		} else {
 			*(int *)data = (so->so_rcv.sb_state & SBS_RCVATMARK) != 0;
 		}
 		break;
 	default:
 		/*
 		 * Interface/routing/protocol specific ioctls: interface and
 		 * routing ioctls should have a different entry since a
 		 * socket is unnecessary.
 		 */
 		if (IOCGROUP(cmd) == 'i')
 			error = ifioctl(so, cmd, data, td);
 		else if (IOCGROUP(cmd) == 'r') {
 			CURVNET_SET(so->so_vnet);
 			error = rtioctl_fib(cmd, data, so->so_fibnum);
 			CURVNET_RESTORE();
 		} else {
 			CURVNET_SET(so->so_vnet);
-			error = ((*so->so_proto->pr_usrreqs->pru_control)
-			    (so, cmd, data, 0, td));
+			error = so->so_proto->pr_control(so, cmd, data, 0, td);
 			CURVNET_RESTORE();
 		}
 		break;
 	}
 	return (error);
 }
 
 static int
 soo_poll(struct file *fp, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	struct socket *so = fp->f_data;
 #ifdef MAC
 	int error;
 
 	error = mac_socket_check_poll(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	return (sopoll(so, events, fp->f_cred, td));
 }
 
 static int
 soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
 	bzero((caddr_t)ub, sizeof (*ub));
 	ub->st_mode = S_IFSOCK;
 #ifdef MAC
 	error = mac_socket_check_stat(active_cred, so);
 	if (error)
 		return (error);
 #endif
 	SOCK_LOCK(so);
 	if (!SOLISTENING(so)) {
 		struct sockbuf *sb;
 
 		/*
 		 * If SBS_CANTRCVMORE is set, but there's still data left
 		 * in the receive buffer, the socket is still readable.
 		 */
 		sb = &so->so_rcv;
 		SOCK_RECVBUF_LOCK(so);
 		if ((sb->sb_state & SBS_CANTRCVMORE) == 0 || sbavail(sb))
 			ub->st_mode |= S_IRUSR | S_IRGRP | S_IROTH;
 		ub->st_size = sbavail(sb) - sb->sb_ctl;
 		SOCK_RECVBUF_UNLOCK(so);
 
 		sb = &so->so_snd;
 		SOCK_SENDBUF_LOCK(so);
 		if ((sb->sb_state & SBS_CANTSENDMORE) == 0)
 			ub->st_mode |= S_IWUSR | S_IWGRP | S_IWOTH;
 		SOCK_SENDBUF_UNLOCK(so);
 	}
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
-	error = so->so_proto->pr_usrreqs->pru_sense(so, ub);
+	error = so->so_proto->pr_sense(so, ub);
 	SOCK_UNLOCK(so);
 	return (error);
 }
 
 /*
  * API socket close on file pointer.  We call soclose() to close the socket
  * (including initiating closing protocols).  soclose() will sorele() the
  * file reference but the actual socket will not go away until the socket's
  * ref count hits 0.
  */
 static int
 soo_close(struct file *fp, struct thread *td)
 {
 	int error = 0;
 	struct socket *so;
 
 	so = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	if (so)
 		error = soclose(so);
 	return (error);
 }
 
 static int
 soo_fill_kinfo(struct file *fp, struct kinfo_file *kif, struct filedesc *fdp)
 {
 	struct sockaddr *sa;
 	struct inpcb *inpcb;
 	struct unpcb *unpcb;
 	struct socket *so;
 	int error;
 
 	kif->kf_type = KF_TYPE_SOCKET;
 	so = fp->f_data;
 	CURVNET_SET(so->so_vnet);
 	kif->kf_un.kf_sock.kf_sock_domain0 =
 	    so->so_proto->pr_domain->dom_family;
 	kif->kf_un.kf_sock.kf_sock_type0 = so->so_type;
 	kif->kf_un.kf_sock.kf_sock_protocol0 = so->so_proto->pr_protocol;
 	kif->kf_un.kf_sock.kf_sock_pcb = (uintptr_t)so->so_pcb;
 	switch (kif->kf_un.kf_sock.kf_sock_domain0) {
 	case AF_INET:
 	case AF_INET6:
 		if (so->so_pcb != NULL) {
 			inpcb = (struct inpcb *)(so->so_pcb);
 			kif->kf_un.kf_sock.kf_sock_inpcb =
 			    (uintptr_t)inpcb->inp_ppcb;
 		}
 		kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
 		    so->so_rcv.sb_state;
 		kif->kf_un.kf_sock.kf_sock_snd_sb_state =
 		    so->so_snd.sb_state;
 		kif->kf_un.kf_sock.kf_sock_sendq =
 		    sbused(&so->so_snd);
 		kif->kf_un.kf_sock.kf_sock_recvq =
 		    sbused(&so->so_rcv);
 		break;
 	case AF_UNIX:
 		if (so->so_pcb != NULL) {
 			unpcb = (struct unpcb *)(so->so_pcb);
 			if (unpcb->unp_conn) {
 				kif->kf_un.kf_sock.kf_sock_unpconn =
 				    (uintptr_t)unpcb->unp_conn;
 				kif->kf_un.kf_sock.kf_sock_rcv_sb_state =
 				    so->so_rcv.sb_state;
 				kif->kf_un.kf_sock.kf_sock_snd_sb_state =
 				    so->so_snd.sb_state;
 				kif->kf_un.kf_sock.kf_sock_sendq =
 				    sbused(&so->so_snd);
 				kif->kf_un.kf_sock.kf_sock_recvq =
 				    sbused(&so->so_rcv);
 			}
 		}
 		break;
 	}
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	if (error == 0 &&
 	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_local)) {
 		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_local, sa->sa_len);
 		free(sa, M_SONAME);
 	}
-	error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
+	error = so->so_proto->pr_peeraddr(so, &sa);
 	if (error == 0 &&
 	    sa->sa_len <= sizeof(kif->kf_un.kf_sock.kf_sa_peer)) {
 		bcopy(sa, &kif->kf_un.kf_sock.kf_sa_peer, sa->sa_len);
 		free(sa, M_SONAME);
 	}
 	strncpy(kif->kf_path, so->so_proto->pr_domain->dom_name,
 	    sizeof(kif->kf_path));
 	CURVNET_RESTORE();
 	return (0);	
 }
 
 /*
  * Use the 'backend3' field in AIO jobs to store the amount of data
  * completed by the AIO job so far.
  */
 #define	aio_done	backend3
 
 static STAILQ_HEAD(, task) soaio_jobs;
 static struct mtx soaio_jobs_lock;
 static struct task soaio_kproc_task;
 static int soaio_starting, soaio_idle, soaio_queued;
 static struct unrhdr *soaio_kproc_unr;
 
 static int soaio_max_procs = MAX_AIO_PROCS;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, max_procs, CTLFLAG_RW, &soaio_max_procs, 0,
     "Maximum number of kernel processes to use for async socket IO");
 
 static int soaio_num_procs;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, num_procs, CTLFLAG_RD, &soaio_num_procs, 0,
     "Number of active kernel processes for async socket IO");
 
 static int soaio_target_procs = TARGET_AIO_PROCS;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, target_procs, CTLFLAG_RD,
     &soaio_target_procs, 0,
     "Preferred number of ready kernel processes for async socket IO");
 
 static int soaio_lifetime;
 SYSCTL_INT(_kern_ipc_aio, OID_AUTO, lifetime, CTLFLAG_RW, &soaio_lifetime, 0,
     "Maximum lifetime for idle aiod");
 
 static void
 soaio_kproc_loop(void *arg)
 {
 	struct proc *p;
 	struct vmspace *myvm;
 	struct task *task;
 	int error, id, pending;
 
 	id = (intptr_t)arg;
 
 	/*
 	 * Grab an extra reference on the daemon's vmspace so that it
 	 * doesn't get freed by jobs that switch to a different
 	 * vmspace.
 	 */
 	p = curproc;
 	myvm = vmspace_acquire_ref(p);
 
 	mtx_lock(&soaio_jobs_lock);
 	MPASS(soaio_starting > 0);
 	soaio_starting--;
 	for (;;) {
 		while (!STAILQ_EMPTY(&soaio_jobs)) {
 			task = STAILQ_FIRST(&soaio_jobs);
 			STAILQ_REMOVE_HEAD(&soaio_jobs, ta_link);
 			soaio_queued--;
 			pending = task->ta_pending;
 			task->ta_pending = 0;
 			mtx_unlock(&soaio_jobs_lock);
 
 			task->ta_func(task->ta_context, pending);
 
 			mtx_lock(&soaio_jobs_lock);
 		}
 		MPASS(soaio_queued == 0);
 
 		if (p->p_vmspace != myvm) {
 			mtx_unlock(&soaio_jobs_lock);
 			vmspace_switch_aio(myvm);
 			mtx_lock(&soaio_jobs_lock);
 			continue;
 		}
 
 		soaio_idle++;
 		error = mtx_sleep(&soaio_idle, &soaio_jobs_lock, 0, "-",
 		    soaio_lifetime);
 		soaio_idle--;
 		if (error == EWOULDBLOCK && STAILQ_EMPTY(&soaio_jobs) &&
 		    soaio_num_procs > soaio_target_procs)
 			break;
 	}
 	soaio_num_procs--;
 	mtx_unlock(&soaio_jobs_lock);
 	free_unr(soaio_kproc_unr, id);
 	kproc_exit(0);
 }
 
 static void
 soaio_kproc_create(void *context, int pending)
 {
 	struct proc *p;
 	int error, id;
 
 	mtx_lock(&soaio_jobs_lock);
 	for (;;) {
 		if (soaio_num_procs < soaio_target_procs) {
 			/* Must create */
 		} else if (soaio_num_procs >= soaio_max_procs) {
 			/*
 			 * Hit the limit on kernel processes, don't
 			 * create another one.
 			 */
 			break;
 		} else if (soaio_queued <= soaio_idle + soaio_starting) {
 			/*
 			 * No more AIO jobs waiting for a process to be
 			 * created, so stop.
 			 */
 			break;
 		}
 		soaio_starting++;
 		mtx_unlock(&soaio_jobs_lock);
 
 		id = alloc_unr(soaio_kproc_unr);
 		error = kproc_create(soaio_kproc_loop, (void *)(intptr_t)id,
 		    &p, 0, 0, "soaiod%d", id);
 		if (error != 0) {
 			free_unr(soaio_kproc_unr, id);
 			mtx_lock(&soaio_jobs_lock);
 			soaio_starting--;
 			break;
 		}
 
 		mtx_lock(&soaio_jobs_lock);
 		soaio_num_procs++;
 	}
 	mtx_unlock(&soaio_jobs_lock);
 }
 
 void
 soaio_enqueue(struct task *task)
 {
 
 	mtx_lock(&soaio_jobs_lock);
 	MPASS(task->ta_pending == 0);
 	task->ta_pending++;
 	STAILQ_INSERT_TAIL(&soaio_jobs, task, ta_link);
 	soaio_queued++;
 	if (soaio_queued <= soaio_idle)
 		wakeup_one(&soaio_idle);
 	else if (soaio_num_procs < soaio_max_procs)
 		taskqueue_enqueue(taskqueue_thread, &soaio_kproc_task);
 	mtx_unlock(&soaio_jobs_lock);
 }
 
 static void
 soaio_init(void)
 {
 
 	soaio_lifetime = AIOD_LIFETIME_DEFAULT;
 	STAILQ_INIT(&soaio_jobs);
 	mtx_init(&soaio_jobs_lock, "soaio jobs", NULL, MTX_DEF);
 	soaio_kproc_unr = new_unrhdr(1, INT_MAX, NULL);
 	TASK_INIT(&soaio_kproc_task, 0, soaio_kproc_create, NULL);
 }
 SYSINIT(soaio, SI_SUB_VFS, SI_ORDER_ANY, soaio_init, NULL);
 
 static __inline int
 soaio_ready(struct socket *so, struct sockbuf *sb)
 {
 	return (sb == &so->so_rcv ? soreadable(so) : sowriteable(so));
 }
 
 static void
 soaio_process_job(struct socket *so, sb_which which, struct kaiocb *job)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct sockbuf *sb = sobuf(so, which);
 #ifdef MAC
 	struct file *fp = job->fd_file;
 #endif
 	size_t cnt, done, job_total_nbytes __diagused;
 	long ru_before;
 	int error, flags;
 
 	SOCK_BUF_UNLOCK(so, which);
 	aio_switch_vmspace(job);
 	td = curthread;
 retry:
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 
 	job_total_nbytes = job->uiop->uio_resid + job->aio_done;
 	done = job->aio_done;
 	cnt = job->uiop->uio_resid;
 	job->uiop->uio_offset = 0;
 	job->uiop->uio_td = td;
 	flags = MSG_NBIO;
 
 	/*
 	 * For resource usage accounting, only count a completed request
 	 * as a single message to avoid counting multiple calls to
 	 * sosend/soreceive on a blocking socket.
 	 */
 
 	if (sb == &so->so_rcv) {
 		ru_before = td->td_ru.ru_msgrcv;
 #ifdef MAC
 		error = mac_socket_check_receive(fp->f_cred, so);
 		if (error == 0)
 
 #endif
 			error = soreceive(so, NULL, job->uiop, NULL, NULL,
 			    &flags);
 		if (td->td_ru.ru_msgrcv != ru_before)
 			job->msgrcv = 1;
 	} else {
 		if (!TAILQ_EMPTY(&sb->sb_aiojobq))
 			flags |= MSG_MORETOCOME;
 		ru_before = td->td_ru.ru_msgsnd;
 #ifdef MAC
 		error = mac_socket_check_send(fp->f_cred, so);
 		if (error == 0)
 #endif
 			error = sosend(so, NULL, job->uiop, NULL, NULL, flags,
 			    td);
 		if (td->td_ru.ru_msgsnd != ru_before)
 			job->msgsnd = 1;
 		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
 	done += cnt - job->uiop->uio_resid;
 	job->aio_done = done;
 	td->td_ucred = td_savedcred;
 
 	if (error == EWOULDBLOCK) {
 		/*
 		 * The request was either partially completed or not
 		 * completed at all due to racing with a read() or
 		 * write() on the socket.  If the socket is
 		 * non-blocking, return with any partial completion.
 		 * If the socket is blocking or if no progress has
 		 * been made, requeue this request at the head of the
 		 * queue to try again when the socket is ready.
 		 */
 		MPASS(done != job_total_nbytes);
 		SOCK_BUF_LOCK(so, which);
 		if (done == 0 || !(so->so_state & SS_NBIO)) {
 			empty_results++;
 			if (soaio_ready(so, sb)) {
 				empty_retries++;
 				SOCK_BUF_UNLOCK(so, which);
 				goto retry;
 			}
 			
 			if (!aio_set_cancel_function(job, soo_aio_cancel)) {
 				SOCK_BUF_UNLOCK(so, which);
 				if (done != 0)
 					aio_complete(job, done, 0);
 				else
 					aio_cancel(job);
 				SOCK_BUF_LOCK(so, which);
 			} else {
 				TAILQ_INSERT_HEAD(&sb->sb_aiojobq, job, list);
 			}
 			return;
 		}
 		SOCK_BUF_UNLOCK(so, which);
 	}		
 	if (done != 0 && (error == ERESTART || error == EINTR ||
 	    error == EWOULDBLOCK))
 		error = 0;
 	if (error)
 		aio_complete(job, -1, error);
 	else
 		aio_complete(job, done, 0);
 	SOCK_BUF_LOCK(so, which);
 }
 
 static void
 soaio_process_sb(struct socket *so, sb_which which)
 {
 	struct kaiocb *job;
 	struct sockbuf *sb = sobuf(so, which);
 
 	CURVNET_SET(so->so_vnet);
 	SOCK_BUF_LOCK(so, which);
 	while (!TAILQ_EMPTY(&sb->sb_aiojobq) && soaio_ready(so, sb)) {
 		job = TAILQ_FIRST(&sb->sb_aiojobq);
 		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
 		if (!aio_clear_cancel_function(job))
 			continue;
 
 		soaio_process_job(so, which, job);
 	}
 
 	/*
 	 * If there are still pending requests, the socket must not be
 	 * ready so set SB_AIO to request a wakeup when the socket
 	 * becomes ready.
 	 */
 	if (!TAILQ_EMPTY(&sb->sb_aiojobq))
 		sb->sb_flags |= SB_AIO;
 	sb->sb_flags &= ~SB_AIO_RUNNING;
 	SOCK_BUF_UNLOCK(so, which);
 
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 void
 soaio_rcv(void *context, int pending)
 {
 	struct socket *so;
 
 	so = context;
 	soaio_process_sb(so, SO_RCV);
 }
 
 void
 soaio_snd(void *context, int pending)
 {
 	struct socket *so;
 
 	so = context;
 	soaio_process_sb(so, SO_SND);
 }
 
 void
 sowakeup_aio(struct socket *so, sb_which which)
 {
 	struct sockbuf *sb = sobuf(so, which);
 
 	SOCK_BUF_LOCK_ASSERT(so, which);
 
 	sb->sb_flags &= ~SB_AIO;
 	if (sb->sb_flags & SB_AIO_RUNNING)
 		return;
 	sb->sb_flags |= SB_AIO_RUNNING;
 	soref(so);
 	soaio_enqueue(&sb->sb_aiotask);
 }
 
 static void
 soo_aio_cancel(struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	long done;
 	int opcode;
 	sb_which which;
 
 	so = job->fd_file->f_data;
 	opcode = job->uaiocb.aio_lio_opcode;
 	if (opcode & LIO_READ) {
 		sb = &so->so_rcv;
 		which = SO_RCV;
 	} else {
 		MPASS(opcode & LIO_WRITE);
 		sb = &so->so_snd;
 		which = SO_SND;
 	}
 
 	SOCK_BUF_LOCK(so, which);
 	if (!aio_cancel_cleared(job))
 		TAILQ_REMOVE(&sb->sb_aiojobq, job, list);
 	if (TAILQ_EMPTY(&sb->sb_aiojobq))
 		sb->sb_flags &= ~SB_AIO;
 	SOCK_BUF_UNLOCK(so, which);
 
 	done = job->aio_done;
 	if (done != 0)
 		aio_complete(job, done, 0);
 	else
 		aio_cancel(job);
 }
 
 static int
 soo_aio_queue(struct file *fp, struct kaiocb *job)
 {
 	struct socket *so;
 	struct sockbuf *sb;
 	sb_which which;
 	int error;
 
 	so = fp->f_data;
-	error = (*so->so_proto->pr_usrreqs->pru_aio_queue)(so, job);
+	error = so->so_proto->pr_aio_queue(so, job);
 	if (error == 0)
 		return (0);
 
 	/* Lock through the socket, since this may be a listening socket. */
 	switch (job->uaiocb.aio_lio_opcode & (LIO_WRITE | LIO_READ)) {
 	case LIO_READ:
 		SOCK_RECVBUF_LOCK(so);
 		sb = &so->so_rcv;
 		which = SO_RCV;
 		break;
 	case LIO_WRITE:
 		SOCK_SENDBUF_LOCK(so);
 		sb = &so->so_snd;
 		which = SO_SND;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	if (SOLISTENING(so)) {
 		SOCK_BUF_UNLOCK(so, which);
 		return (EINVAL);
 	}
 
 	if (!aio_set_cancel_function(job, soo_aio_cancel))
 		panic("new job was cancelled");
 	TAILQ_INSERT_TAIL(&sb->sb_aiojobq, job, list);
 	if (!(sb->sb_flags & SB_AIO_RUNNING)) {
 		if (soaio_ready(so, sb))
 			sowakeup_aio(so, which);
 		else
 			sb->sb_flags |= SB_AIO;
 	}
 	SOCK_BUF_UNLOCK(so, which);
 	return (0);
 }
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
index c6a79d34beb2..d2b8d9095f13 100644
--- a/sys/kern/uipc_domain.c
+++ b/sys/kern/uipc_domain.c
@@ -1,444 +1,512 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_domain.c	8.2 (Berkeley) 10/18/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/epoch.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/socketvar.h>
 #include <sys/systm.h>
+#include <sys/stat.h>		/* XXXGL: remove */
 
 #include <machine/atomic.h>
 
 #include <net/vnet.h>
 
 /*
  * System initialization
  *
  * Note: domain initialization takes place on a per domain basis
  * as a result of traversing a SYSINIT linker set.  Most likely,
  * each domain would want to call DOMAIN_SET(9) itself, which
  * would cause the domain to be added just after domaininit()
  * is called during startup.
  *
  * See DOMAIN_SET(9) for details on its use.
  */
 
 static void domaininit(void *);
 SYSINIT(domain, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, domaininit, NULL);
 
 static void domainfinalize(void *);
 SYSINIT(domainfin, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST, domainfinalize,
     NULL);
 
 struct domain *domains;		/* registered protocol domains */
 int domain_init_status = 0;
 static struct mtx dom_mtx;		/* domain list lock */
 MTX_SYSINIT(domain, &dom_mtx, "domain list", MTX_DEF);
 
+static int
+pr_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
+    struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_control_notsupp(struct socket *so, u_long cmd, void *data,
+    struct ifnet *ifp, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_disconnect_notsupp(struct socket *so)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_listen_notsupp(struct socket *so, int backlog, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_rcvd_notsupp(struct socket *so, int flags)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_send_notsupp(struct socket *so, int flags, struct mbuf *m,
+    struct sockaddr *addr, struct mbuf *control, struct thread *td)
+{
+	if (control != NULL)
+		m_freem(control);
+	if ((flags & PRUS_NOTREADY) == 0)
+		m_freem(m);
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_ready_notsupp(struct socket *so, struct mbuf *m, int count)
+{
+	return (EOPNOTSUPP);
+}
+
 /*
- * Dummy protocol specific user requests function pointer array.
- * All functions return EOPNOTSUPP.
+ * This isn't really a ``null'' operation, but it's the default one and
+ * doesn't do anything destructive.
  */
-struct pr_usrreqs nousrreqs = {
-	.pru_accept =		pru_accept_notsupp,
-	.pru_attach =		pru_attach_notsupp,
-	.pru_bind =		pru_bind_notsupp,
-	.pru_connect =		pru_connect_notsupp,
-	.pru_connect2 =		pru_connect2_notsupp,
-	.pru_control =		pru_control_notsupp,
-	.pru_disconnect	=	pru_disconnect_notsupp,
-	.pru_listen =		pru_listen_notsupp,
-	.pru_peeraddr =		pru_peeraddr_notsupp,
-	.pru_rcvd =		pru_rcvd_notsupp,
-	.pru_rcvoob =		pru_rcvoob_notsupp,
-	.pru_send =		pru_send_notsupp,
-	.pru_sense =		pru_sense_null,
-	.pru_shutdown =		pru_shutdown_notsupp,
-	.pru_sockaddr =		pru_sockaddr_notsupp,
-	.pru_sosend =		pru_sosend_notsupp,
-	.pru_soreceive =	pru_soreceive_notsupp,
-	.pru_sopoll =		pru_sopoll_notsupp,
-};
+static int
+pr_sense_notsupp(struct socket *so, struct stat *sb)
+{
+	sb->st_blksize = so->so_snd.sb_hiwat;
+	return (0);
+}
 
-static void
-pr_usrreqs_init(struct protosw *pr)
+static int
+pr_shutdown_notsupp(struct socket *so)
 {
-	struct pr_usrreqs *pu;
+	return (EOPNOTSUPP);
+}
 
-	pu = pr->pr_usrreqs;
-	KASSERT(pu != NULL, ("%s: %ssw[%d] has no usrreqs!", __func__,
-	    pr->pr_domain->dom_name,
-	    (int)(pr - pr->pr_domain->dom_protosw)));
+static int
+pr_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+	return (EOPNOTSUPP);
+}
 
-	/*
-	 * Protocol switch methods fall into three categories: mandatory,
-	 * mandatory but protosw_init() provides a default, and optional.
-	 *
-	 * For true protocols (i.e., pru_attach != NULL), KASSERT truly
-	 * mandatory methods with no defaults, and initialize defaults for
-	 * other mandatory methods if the protocol hasn't defined an
-	 * implementation (NULL function pointer).
-	 */
-#if 0
-	if (pu->pru_attach != NULL) {
-		KASSERT(pu->pru_abort != NULL,
-		    ("protosw_init: %ssw[%d] pru_abort NULL",
-		    pr->pr_domain->dom_name,
-		    (int)(pr - pr->pr_domain->dom_protosw)));
-		KASSERT(pu->pru_send != NULL,
-		    ("protosw_init: %ssw[%d] pru_send NULL",
-		    pr->pr_domain->dom_name,
-		    (int)(pr - pr->pr_domain->dom_protosw)));
-	}
-#endif
+static int
+pr_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
 
-#define DEFAULT(foo, bar)	if ((foo) == NULL)  (foo) = (bar)
-	DEFAULT(pu->pru_accept, pru_accept_notsupp);
-	DEFAULT(pu->pru_aio_queue, pru_aio_queue_notsupp);
-	DEFAULT(pu->pru_bind, pru_bind_notsupp);
-	DEFAULT(pu->pru_bindat, pru_bindat_notsupp);
-	DEFAULT(pu->pru_connect, pru_connect_notsupp);
-	DEFAULT(pu->pru_connect2, pru_connect2_notsupp);
-	DEFAULT(pu->pru_connectat, pru_connectat_notsupp);
-	DEFAULT(pu->pru_control, pru_control_notsupp);
-	DEFAULT(pu->pru_disconnect, pru_disconnect_notsupp);
-	DEFAULT(pu->pru_listen, pru_listen_notsupp);
-	DEFAULT(pu->pru_peeraddr, pru_peeraddr_notsupp);
-	DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
-	DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
-	DEFAULT(pu->pru_sense, pru_sense_null);
-	DEFAULT(pu->pru_shutdown, pru_shutdown_notsupp);
-	DEFAULT(pu->pru_sockaddr, pru_sockaddr_notsupp);
-	DEFAULT(pu->pru_sosend, sosend_generic);
-	DEFAULT(pu->pru_soreceive, soreceive_generic);
-	DEFAULT(pu->pru_sopoll, sopoll_generic);
-	DEFAULT(pu->pru_ready, pru_ready_notsupp);
-#undef DEFAULT
+static int
+pr_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
+    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+pr_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
+    struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static void
+pr_init(struct protosw *pr)
+{
+
+	KASSERT(pr->pr_attach != NULL,
+	    ("%s: protocol doesn't have pr_attach", __func__));
+
+#define	DEFAULT(foo, bar)	if (pr->foo == NULL) pr->foo = bar
+	DEFAULT(pr_sosend, sosend_generic);
+	DEFAULT(pr_soreceive, soreceive_generic);
+	DEFAULT(pr_sopoll, sopoll_generic);
+
+#define NOTSUPP(foo)	if (pr->foo == NULL)  pr->foo = foo ## _notsupp
+	NOTSUPP(pr_accept);
+	NOTSUPP(pr_aio_queue);
+	NOTSUPP(pr_bind);
+	NOTSUPP(pr_bindat);
+	NOTSUPP(pr_connect);
+	NOTSUPP(pr_connect2);
+	NOTSUPP(pr_connectat);
+	NOTSUPP(pr_control);
+	NOTSUPP(pr_disconnect);
+	NOTSUPP(pr_listen);
+	NOTSUPP(pr_peeraddr);
+	NOTSUPP(pr_rcvd);
+	NOTSUPP(pr_rcvoob);
+	NOTSUPP(pr_send);
+	NOTSUPP(pr_sense);
+	NOTSUPP(pr_shutdown);
+	NOTSUPP(pr_sockaddr);
+	NOTSUPP(pr_sosend);
+	NOTSUPP(pr_soreceive);
+	NOTSUPP(pr_sopoll);
+	NOTSUPP(pr_ready);
 }
 
 /*
  * Add a new protocol domain to the list of supported domains
  * Note: you cant unload it again because a socket may be using it.
  * XXX can't fail at this time.
  */
 void
 domain_init(void *arg)
 {
 	struct domain *dp = arg;
 	struct protosw *pr;
 	int flags;
 
 	MPASS(IS_DEFAULT_VNET(curvnet));
 
 	flags = atomic_load_acq_int(&dp->dom_flags);
 	if ((flags & DOMF_SUPPORTED) == 0)
 		return;
 	MPASS((flags & DOMF_INITED) == 0);
 
-	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
-		pr_usrreqs_init(pr);
-	}
+	for (int i = 0; i < dp->dom_nprotosw; i++)
+		if ((pr = dp->dom_protosw[i]) != NULL) {
+			pr->pr_domain = dp;
+			pr_init(pr);
+		}
 
 	/*
 	 * update global information about maximums
 	 */
 	max_hdr = max_linkhdr + max_protohdr;
 	max_datalen = MHLEN - max_hdr;
 	if (max_datalen < 1)
 		panic("%s: max_datalen < 1", __func__);
 	atomic_set_rel_int(&dp->dom_flags, DOMF_INITED);
 }
 
 /*
  * Add a new protocol domain to the list of supported domains
  * Note: you cant unload it again because a socket may be using it.
  * XXX can't fail at this time.
  */
 void
 domain_add(void *data)
 {
 	struct domain *dp;
 
 	dp = (struct domain *)data;
 	if (dp->dom_probe != NULL && (*dp->dom_probe)() != 0)
 		return;
 	atomic_set_rel_int(&dp->dom_flags, DOMF_SUPPORTED);
 	mtx_lock(&dom_mtx);
 	dp->dom_next = domains;
 	domains = dp;
 
 	KASSERT(domain_init_status >= 1,
 	    ("attempt to domain_add(%s) before domaininit()",
 	    dp->dom_name));
 #ifndef INVARIANTS
 	if (domain_init_status < 1)
 		printf("WARNING: attempt to domain_add(%s) before "
 		    "domaininit()\n", dp->dom_name);
 #endif
 	mtx_unlock(&dom_mtx);
 }
 
 void
 domain_remove(void *data)
 {
 	struct domain *dp = (struct domain *)data;
 
 	if ((dp->dom_flags & DOMF_UNLOADABLE) == 0)
 		return;
 
 	mtx_lock(&dom_mtx);
 	if (domains == dp) {
 		domains = dp->dom_next;
 	} else {
 		struct domain *curr;
 		for (curr = domains; curr != NULL; curr = curr->dom_next) {
 			if (curr->dom_next == dp) {
 				curr->dom_next = dp->dom_next;
 				break;
 			}
 		}
 	}
 	mtx_unlock(&dom_mtx);
 }
 
 /* ARGSUSED*/
 static void
 domaininit(void *dummy)
 {
 
 	if (max_linkhdr < 16)		/* XXX */
 		max_linkhdr = 16;
 
 	mtx_lock(&dom_mtx);
 	KASSERT(domain_init_status == 0, ("domaininit called too late!"));
 	domain_init_status = 1;
 	mtx_unlock(&dom_mtx);
 }
 
 /* ARGSUSED*/
 static void
 domainfinalize(void *dummy)
 {
 
 	mtx_lock(&dom_mtx);
 	KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
 	domain_init_status = 2;
 	mtx_unlock(&dom_mtx);	
 }
 
 struct domain *
 pffinddomain(int family)
 {
 	struct domain *dp;
 
 	for (dp = domains; dp != NULL; dp = dp->dom_next)
 		if (dp->dom_family == family)
 			return (dp);
 	return (NULL);
 }
 
 struct protosw *
 pffindtype(int family, int type)
 {
 	struct domain *dp;
 	struct protosw *pr;
 
 	dp = pffinddomain(family);
 	if (dp == NULL)
 		return (NULL);
 
-	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
-		if (pr->pr_type && pr->pr_type == type)
+	for (int i = 0; i < dp->dom_nprotosw; i++)
+		if ((pr = dp->dom_protosw[i]) != NULL && pr->pr_type == type)
 			return (pr);
+
 	return (NULL);
 }
 
 struct protosw *
 pffindproto(int family, int protocol, int type)
 {
 	struct domain *dp;
 	struct protosw *pr;
 	struct protosw *maybe;
 
-	maybe = NULL;
-	if (family == 0)
-		return (NULL);
-
 	dp = pffinddomain(family);
 	if (dp == NULL)
 		return (NULL);
 
-	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
+	maybe = NULL;
+	for (int i = 0; i < dp->dom_nprotosw; i++) {
+		if ((pr = dp->dom_protosw[i]) == NULL)
+			continue;
 		if ((pr->pr_protocol == protocol) && (pr->pr_type == type))
 			return (pr);
 
+		/* XXX: raw catches all. Why? */
 		if (type == SOCK_RAW && pr->pr_type == SOCK_RAW &&
 		    pr->pr_protocol == 0 && maybe == NULL)
 			maybe = pr;
 	}
 	return (maybe);
 }
 
 /*
  * The caller must make sure that the new protocol is fully set up and ready to
  * accept requests before it is registered.
  */
 int
-pf_proto_register(int family, struct protosw *npr)
+protosw_register(struct domain *dp, struct protosw *npr)
 {
-	struct domain *dp;
-	struct protosw *pr, *fpr;
-
-	/* Sanity checks. */
-	if (family == 0)
-		return (EPFNOSUPPORT);
-	if (npr->pr_type == 0)
-		return (EPROTOTYPE);
-	if (npr->pr_protocol == 0)
-		return (EPROTONOSUPPORT);
-	if (npr->pr_usrreqs == NULL)
-		return (ENXIO);
-
-	/* Try to find the specified domain based on the family. */
-	dp = pffinddomain(family);
-	if (dp == NULL)
-		return (EPFNOSUPPORT);
+	struct protosw **prp;
 
-	/* Initialize backpointer to struct domain. */
-	npr->pr_domain = dp;
-	fpr = NULL;
+	MPASS(dp);
+	MPASS(npr && npr->pr_type > 0 && npr->pr_protocol > 0);
 
+	prp = NULL;
 	/*
 	 * Protect us against races when two protocol registrations for
 	 * the same protocol happen at the same time.
 	 */
 	mtx_lock(&dom_mtx);
+	for (int i = 0; i < dp->dom_nprotosw; i++) {
+		if (dp->dom_protosw[i] == NULL) {
+			/* Remember the first free spacer. */
+			if (prp == NULL)
+				prp = &dp->dom_protosw[i];
+		} else {
+			/*
+			 * The new protocol must not yet exist.
+			 * XXXAO: Check only protocol?
+			 * XXXGL: Maybe assert that it doesn't exist?
+			 */
+			if ((dp->dom_protosw[i]->pr_type == npr->pr_type) &&
+			    (dp->dom_protosw[i]->pr_protocol ==
+			    npr->pr_protocol)) {
+				mtx_unlock(&dom_mtx);
+				return (EEXIST);
+			}
 
-	/* The new protocol must not yet exist. */
-	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
-		if ((pr->pr_type == npr->pr_type) &&
-		    (pr->pr_protocol == npr->pr_protocol)) {
-			mtx_unlock(&dom_mtx);
-			return (EEXIST);	/* XXX: Check only protocol? */
 		}
-		/* While here, remember the first free spacer. */
-		if ((fpr == NULL) && (pr->pr_protocol == PROTO_SPACER))
-			fpr = pr;
 	}
 
 	/* If no free spacer is found we can't add the new protocol. */
-	if (fpr == NULL) {
+	if (prp == NULL) {
 		mtx_unlock(&dom_mtx);
 		return (ENOMEM);
 	}
 
-	/* Copy the new struct protosw over the spacer. */
-	bcopy(npr, fpr, sizeof(*fpr));
-
-	pr_usrreqs_init(fpr);
-
-	/* Job is done, no more protection required. */
+	npr->pr_domain = dp;
+	pr_init(npr);
+	*prp = npr;
 	mtx_unlock(&dom_mtx);
 
 	return (0);
 }
 
 /*
  * The caller must make sure the protocol and its functions correctly shut down
  * all sockets and release all locks and memory references.
  */
 int
-pf_proto_unregister(int family, int protocol, int type)
+protosw_unregister(struct protosw *pr)
 {
 	struct domain *dp;
-	struct protosw *pr, *dpr;
+	struct protosw **prp;
 
-	/* Sanity checks. */
-	if (family == 0)
-		return (EPFNOSUPPORT);
-	if (protocol == 0)
-		return (EPROTONOSUPPORT);
-	if (type == 0)
-		return (EPROTOTYPE);
+	dp = pr->pr_domain;
+	prp = NULL;
 
-	/* Try to find the specified domain based on the family type. */
-	dp = pffinddomain(family);
-	if (dp == NULL)
-		return (EPFNOSUPPORT);
-
-	dpr = NULL;
-
-	/* Lock out everyone else while we are manipulating the protosw. */
 	mtx_lock(&dom_mtx);
-
 	/* The protocol must exist and only once. */
-	for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) {
-		if ((pr->pr_type == type) && (pr->pr_protocol == protocol)) {
-			if (dpr != NULL) {
-				mtx_unlock(&dom_mtx);
-				return (EMLINK);   /* Should not happen! */
-			} else
-				dpr = pr;
+	for (int i = 0; i < dp->dom_nprotosw; i++) {
+		if (dp->dom_protosw[i] == pr) {
+			KASSERT(prp == NULL,
+			    ("%s: domain %p protocol %p registered twice\n",
+			    __func__, dp, pr));
+			prp = &dp->dom_protosw[i];
 		}
 	}
 
-	/* Protocol does not exist. */
-	if (dpr == NULL) {
+	/* Protocol does not exist.  XXXGL: assert that it does? */
+	if (prp == NULL) {
 		mtx_unlock(&dom_mtx);
 		return (EPROTONOSUPPORT);
 	}
 
 	/* De-orbit the protocol and make the slot available again. */
-	dpr->pr_type = 0;
-	dpr->pr_domain = dp;
-	dpr->pr_protocol = PROTO_SPACER;
-	dpr->pr_flags = 0;
-	dpr->pr_ctloutput = NULL;
-	dpr->pr_usrreqs = &nousrreqs;
-
-	/* Job is done, not more protection required. */
+	*prp = NULL;
 	mtx_unlock(&dom_mtx);
 
 	return (0);
 }
diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 5855e62bd983..ff20b3652407 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -1,3252 +1,3252 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause
  *
  * Copyright (c) 2014-2019 Netflix Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/domainset.h>
 #include <sys/endian.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/rmlock.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/refcount.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/kthread.h>
 #include <sys/uio.h>
 #include <sys/vmmeter.h>
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 #include <machine/pcb.h>
 #endif
 #include <machine/vmparam.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #ifdef RSS
 #include <net/netisr.h>
 #include <net/rss_config.h>
 #endif
 #include <net/route.h>
 #include <net/route/nhop.h>
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #endif
 #include <netinet/tcp_var.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <opencrypto/cryptodev.h>
 #include <opencrypto/ktls.h>
 #include <vm/uma_dbg.h>
 #include <vm/vm.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pagequeue.h>
 
 struct ktls_wq {
 	struct mtx	mtx;
 	STAILQ_HEAD(, mbuf) m_head;
 	STAILQ_HEAD(, socket) so_head;
 	bool		running;
 	int		lastallocfail;
 } __aligned(CACHE_LINE_SIZE);
 
 struct ktls_alloc_thread {
 	uint64_t wakeups;
 	uint64_t allocs;
 	struct thread *td;
 	int running;
 };
 
 struct ktls_domain_info {
 	int count;
 	int cpu[MAXCPU];
 	struct ktls_alloc_thread alloc_td;
 };
 
 struct ktls_domain_info ktls_domains[MAXMEMDOM];
 static struct ktls_wq *ktls_wq;
 static struct proc *ktls_proc;
 static uma_zone_t ktls_session_zone;
 static uma_zone_t ktls_buffer_zone;
 static uint16_t ktls_cpuid_lookup[MAXCPU];
 static int ktls_init_state;
 static struct sx ktls_init_lock;
 SX_SYSINIT(ktls_init_lock, &ktls_init_lock, "ktls init");
 
 SYSCTL_NODE(_kern_ipc, OID_AUTO, tls, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Kernel TLS offload stats");
 
 #ifdef RSS
 static int ktls_bind_threads = 1;
 #else
 static int ktls_bind_threads;
 #endif
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, bind_threads, CTLFLAG_RDTUN,
     &ktls_bind_threads, 0,
     "Bind crypto threads to cores (1) or cores and domains (2) at boot");
 
 static u_int ktls_maxlen = 16384;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, maxlen, CTLFLAG_RDTUN,
     &ktls_maxlen, 0, "Maximum TLS record size");
 
 static int ktls_number_threads;
 SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
 unsigned int ktls_ifnet_max_rexmit_pct = 2;
 SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
     &ktls_ifnet_max_rexmit_pct, 2,
     "Max percent bytes retransmitted before ifnet TLS is disabled");
 
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
     &ktls_offload_enable, 0,
     "Enable support for kernel TLS offload");
 
 static bool ktls_cbc_enable = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, cbc_enable, CTLFLAG_RWTUN,
     &ktls_cbc_enable, 1,
     "Enable Support of AES-CBC crypto for kernel TLS");
 
 static bool ktls_sw_buffer_cache = true;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, sw_buffer_cache, CTLFLAG_RDTUN,
     &ktls_sw_buffer_cache, 1,
     "Enable caching of output buffers for SW encryption");
 
 static int ktls_max_alloc = 128;
 SYSCTL_INT(_kern_ipc_tls, OID_AUTO, max_alloc, CTLFLAG_RWTUN,
     &ktls_max_alloc, 128,
     "Max number of 16k buffers to allocate in thread context");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_tasks_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls, OID_AUTO, tasks_active, CTLFLAG_RD,
     &ktls_tasks_active, "Number of active tasks");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_pending);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_pending, CTLFLAG_RD,
     &ktls_cnt_tx_pending,
     "Number of TLS 1.0 records waiting for earlier TLS records");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_tx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_tx_inqueue, CTLFLAG_RD,
     &ktls_cnt_tx_queued,
     "Number of TLS records in queue to tasks for SW encryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_cnt_rx_queued);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, sw_rx_inqueue, CTLFLAG_RD,
     &ktls_cnt_rx_queued,
     "Number of TLS sockets in queue to tasks for SW decryption");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_total);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, offload_total,
     CTLFLAG_RD, &ktls_offload_total,
     "Total successful TLS setups (parameters set)");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_enable_calls);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, enable_calls,
     CTLFLAG_RD, &ktls_offload_enable_calls,
     "Total number of TLS enable calls made");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_active);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, active, CTLFLAG_RD,
     &ktls_offload_active, "Total Active TLS sessions");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_corrupted_records);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, corrupted_records, CTLFLAG_RD,
     &ktls_offload_corrupted_records, "Total corrupted TLS records received");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_offload_failed_crypto);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, failed_crypto, CTLFLAG_RD,
     &ktls_offload_failed_crypto, "Total TLS crypto failures");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_ifnet);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_ifnet, CTLFLAG_RD,
     &ktls_switch_to_ifnet, "TLS sessions switched from SW to ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_to_sw);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_to_sw, CTLFLAG_RD,
     &ktls_switch_to_sw, "TLS sessions switched from ifnet to SW");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
     &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
     &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
 
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Hardware (ifnet) TLS session stats");
 #ifdef TCP_OFFLOAD
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, toe, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "TOE TLS session stats");
 #endif
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, cbc, CTLFLAG_RD, &ktls_sw_cbc,
     "Active number of software TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, gcm, CTLFLAG_RD, &ktls_sw_gcm,
     "Active number of software TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_sw_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_sw, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_sw_chacha20,
     "Active number of software TLS sessions using Chacha20-Poly1305");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_ifnet_cbc,
     "Active number of ifnet TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_ifnet_gcm,
     "Active number of ifnet TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_ifnet_chacha20,
     "Active number of ifnet TLS sessions using Chacha20-Poly1305");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset, CTLFLAG_RD,
     &ktls_ifnet_reset, "TLS sessions updated to a new ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_dropped);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_dropped, CTLFLAG_RD,
     &ktls_ifnet_reset_dropped,
     "TLS sessions dropped after failing to update ifnet send tag");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_reset_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_ifnet, OID_AUTO, reset_failed, CTLFLAG_RD,
     &ktls_ifnet_reset_failed,
     "TLS sessions that failed to allocate a new ifnet send tag");
 
 static int ktls_ifnet_permitted;
 SYSCTL_UINT(_kern_ipc_tls_ifnet, OID_AUTO, permitted, CTLFLAG_RWTUN,
     &ktls_ifnet_permitted, 1,
     "Whether to permit hardware (ifnet) TLS sessions");
 
 #ifdef TCP_OFFLOAD
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_cbc);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, cbc, CTLFLAG_RD,
     &ktls_toe_cbc,
     "Active number of TOE TLS sessions using AES-CBC");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_gcm);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, gcm, CTLFLAG_RD,
     &ktls_toe_gcm,
     "Active number of TOE TLS sessions using AES-GCM");
 
 static COUNTER_U64_DEFINE_EARLY(ktls_toe_chacha20);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_toe, OID_AUTO, chacha20, CTLFLAG_RD,
     &ktls_toe_chacha20,
     "Active number of TOE TLS sessions using Chacha20-Poly1305");
 #endif
 
 static MALLOC_DEFINE(M_KTLS, "ktls", "Kernel TLS");
 
 static void ktls_cleanup(struct ktls_session *tls);
 #if defined(INET) || defined(INET6)
 static void ktls_reset_receive_tag(void *context, int pending);
 static void ktls_reset_send_tag(void *context, int pending);
 #endif
 static void ktls_work_thread(void *ctx);
 static void ktls_alloc_thread(void *ctx);
 
 #if defined(INET) || defined(INET6)
 static u_int
 ktls_get_cpu(struct socket *so)
 {
 	struct inpcb *inp;
 #ifdef NUMA
 	struct ktls_domain_info *di;
 #endif
 	u_int cpuid;
 
 	inp = sotoinpcb(so);
 #ifdef RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid != NETISR_CPUID_NONE)
 		return (cpuid);
 #endif
 	/*
 	 * Just use the flowid to shard connections in a repeatable
 	 * fashion.  Note that TLS 1.0 sessions rely on the
 	 * serialization provided by having the same connection use
 	 * the same queue.
 	 */
 #ifdef NUMA
 	if (ktls_bind_threads > 1 && inp->inp_numa_domain != M_NODOM) {
 		di = &ktls_domains[inp->inp_numa_domain];
 		cpuid = di->cpu[inp->inp_flowid % di->count];
 	} else
 #endif
 		cpuid = ktls_cpuid_lookup[inp->inp_flowid % ktls_number_threads];
 	return (cpuid);
 }
 #endif
 
 static int
 ktls_buffer_import(void *arg, void **store, int count, int domain, int flags)
 {
 	vm_page_t m;
 	int i, req;
 
 	KASSERT((ktls_maxlen & PAGE_MASK) == 0,
 	    ("%s: ktls max length %d is not page size-aligned",
 	    __func__, ktls_maxlen));
 
 	req = VM_ALLOC_WIRED | VM_ALLOC_NODUMP | malloc2vm_flags(flags);
 	for (i = 0; i < count; i++) {
 		m = vm_page_alloc_noobj_contig_domain(domain, req,
 		    atop(ktls_maxlen), 0, ~0ul, PAGE_SIZE, 0,
 		    VM_MEMATTR_DEFAULT);
 		if (m == NULL)
 			break;
 		store[i] = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	}
 	return (i);
 }
 
 static void
 ktls_buffer_release(void *arg __unused, void **store, int count)
 {
 	vm_page_t m;
 	int i, j;
 
 	for (i = 0; i < count; i++) {
 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)store[i]));
 		for (j = 0; j < atop(ktls_maxlen); j++) {
 			(void)vm_page_unwire_noq(m + j);
 			vm_page_free(m + j);
 		}
 	}
 }
 
 static void
 ktls_free_mext_contig(struct mbuf *m)
 {
 	M_ASSERTEXTPG(m);
 	uma_zfree(ktls_buffer_zone, (void *)PHYS_TO_DMAP(m->m_epg_pa[0]));
 }
 
 static int
 ktls_init(void)
 {
 	struct thread *td;
 	struct pcpu *pc;
 	int count, domain, error, i;
 
 	ktls_wq = malloc(sizeof(*ktls_wq) * (mp_maxid + 1), M_KTLS,
 	    M_WAITOK | M_ZERO);
 
 	ktls_session_zone = uma_zcreate("ktls_session",
 	    sizeof(struct ktls_session),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 
 	if (ktls_sw_buffer_cache) {
 		ktls_buffer_zone = uma_zcache_create("ktls_buffers",
 		    roundup2(ktls_maxlen, PAGE_SIZE), NULL, NULL, NULL, NULL,
 		    ktls_buffer_import, ktls_buffer_release, NULL,
 		    UMA_ZONE_FIRSTTOUCH);
 	}
 
 	/*
 	 * Initialize the workqueues to run the TLS work.  We create a
 	 * work queue for each CPU.
 	 */
 	CPU_FOREACH(i) {
 		STAILQ_INIT(&ktls_wq[i].m_head);
 		STAILQ_INIT(&ktls_wq[i].so_head);
 		mtx_init(&ktls_wq[i].mtx, "ktls work queue", NULL, MTX_DEF);
 		if (ktls_bind_threads > 1) {
 			pc = pcpu_find(i);
 			domain = pc->pc_domain;
 			count = ktls_domains[domain].count;
 			ktls_domains[domain].cpu[count] = i;
 			ktls_domains[domain].count++;
 		}
 		ktls_cpuid_lookup[ktls_number_threads] = i;
 		ktls_number_threads++;
 	}
 
 	/*
 	 * If we somehow have an empty domain, fall back to choosing
 	 * among all KTLS threads.
 	 */
 	if (ktls_bind_threads > 1) {
 		for (i = 0; i < vm_ndomains; i++) {
 			if (ktls_domains[i].count == 0) {
 				ktls_bind_threads = 1;
 				break;
 			}
 		}
 	}
 
 	/* Start kthreads for each workqueue. */
 	CPU_FOREACH(i) {
 		error = kproc_kthread_add(ktls_work_thread, &ktls_wq[i],
 		    &ktls_proc, &td, 0, 0, "KTLS", "thr_%d", i);
 		if (error) {
 			printf("Can't add KTLS thread %d error %d\n", i, error);
 			return (error);
 		}
 	}
 
 	/*
 	 * Start an allocation thread per-domain to perform blocking allocations
 	 * of 16k physically contiguous TLS crypto destination buffers.
 	 */
 	if (ktls_sw_buffer_cache) {
 		for (domain = 0; domain < vm_ndomains; domain++) {
 			if (VM_DOMAIN_EMPTY(domain))
 				continue;
 			if (CPU_EMPTY(&cpuset_domain[domain]))
 				continue;
 			error = kproc_kthread_add(ktls_alloc_thread,
 			    &ktls_domains[domain], &ktls_proc,
 			    &ktls_domains[domain].alloc_td.td,
 			    0, 0, "KTLS", "alloc_%d", domain);
 			if (error) {
 				printf("Can't add KTLS alloc thread %d error %d\n",
 				    domain, error);
 				return (error);
 			}
 		}
 	}
 
 	if (bootverbose)
 		printf("KTLS: Initialized %d threads\n", ktls_number_threads);
 	return (0);
 }
 
 static int
 ktls_start_kthreads(void)
 {
 	int error, state;
 
 start:
 	state = atomic_load_acq_int(&ktls_init_state);
 	if (__predict_true(state > 0))
 		return (0);
 	if (state < 0)
 		return (ENXIO);
 
 	sx_xlock(&ktls_init_lock);
 	if (ktls_init_state != 0) {
 		sx_xunlock(&ktls_init_lock);
 		goto start;
 	}
 
 	error = ktls_init();
 	if (error == 0)
 		state = 1;
 	else
 		state = -1;
 	atomic_store_rel_int(&ktls_init_state, state);
 	sx_xunlock(&ktls_init_lock);
 	return (error);
 }
 
 #if defined(INET) || defined(INET6)
 static int
 ktls_create_session(struct socket *so, struct tls_enable *en,
     struct ktls_session **tlsp, int direction)
 {
 	struct ktls_session *tls;
 	int error;
 
 	/* Only TLS 1.0 - 1.3 are supported. */
 	if (en->tls_vmajor != TLS_MAJOR_VER_ONE)
 		return (EINVAL);
 	if (en->tls_vminor < TLS_MINOR_VER_ZERO ||
 	    en->tls_vminor > TLS_MINOR_VER_THREE)
 		return (EINVAL);
 
 	if (en->auth_key_len < 0 || en->auth_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->cipher_key_len < 0 || en->cipher_key_len > TLS_MAX_PARAM_SIZE)
 		return (EINVAL);
 	if (en->iv_len < 0 || en->iv_len > sizeof(tls->params.iv))
 		return (EINVAL);
 
 	/* All supported algorithms require a cipher key. */
 	if (en->cipher_key_len == 0)
 		return (EINVAL);
 
 	/* No flags are currently supported. */
 	if (en->flags != 0)
 		return (EINVAL);
 
 	/* Common checks for supported algorithms. */
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * auth_algorithm isn't used, but permit GMAC values
 		 * for compatibility.
 		 */
 		switch (en->auth_algorithm) {
 		case 0:
 #ifdef COMPAT_FREEBSD12
 		/* XXX: Really 13.0-current COMPAT. */
 		case CRYPTO_AES_128_NIST_GMAC:
 		case CRYPTO_AES_192_NIST_GMAC:
 		case CRYPTO_AES_256_NIST_GMAC:
 #endif
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len != 0)
 			return (EINVAL);
 		switch (en->tls_vminor) {
 		case TLS_MINOR_VER_TWO:
 			if (en->iv_len != TLS_AEAD_GCM_LEN)
 				return (EINVAL);
 			break;
 		case TLS_MINOR_VER_THREE:
 			if (en->iv_len != TLS_1_3_GCM_IV_LEN)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 		case CRYPTO_SHA2_384_HMAC:
 			if (en->tls_vminor != TLS_MINOR_VER_TWO)
 				return (EINVAL);
 			break;
 		default:
 			return (EINVAL);
 		}
 		if (en->auth_key_len == 0)
 			return (EINVAL);
 
 		/*
 		 * TLS 1.0 requires an implicit IV.  TLS 1.1 and 1.2
 		 * use explicit IVs.
 		 */
 		switch (en->tls_vminor) {
 		case TLS_MINOR_VER_ZERO:
 			if (en->iv_len != TLS_CBC_IMPLICIT_IV_LEN)
 				return (EINVAL);
 			break;
 		case TLS_MINOR_VER_ONE:
 		case TLS_MINOR_VER_TWO:
 			/* Ignore any supplied IV. */
 			en->iv_len = 0;
 			break;
 		default:
 			return (EINVAL);
 		}
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		if (en->auth_algorithm != 0 || en->auth_key_len != 0)
 			return (EINVAL);
 		if (en->tls_vminor != TLS_MINOR_VER_TWO &&
 		    en->tls_vminor != TLS_MINOR_VER_THREE)
 			return (EINVAL);
 		if (en->iv_len != TLS_CHACHA20_IV_LEN)
 			return (EINVAL);
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	error = ktls_start_kthreads();
 	if (error != 0)
 		return (error);
 
 	tls = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls->refcount, 1);
 	if (direction == KTLS_RX)
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_receive_tag, tls);
 	else
 		TASK_INIT(&tls->reset_tag_task, 0, ktls_reset_send_tag, tls);
 
 	tls->wq_index = ktls_get_cpu(so);
 
 	tls->params.cipher_algorithm = en->cipher_algorithm;
 	tls->params.auth_algorithm = en->auth_algorithm;
 	tls->params.tls_vmajor = en->tls_vmajor;
 	tls->params.tls_vminor = en->tls_vminor;
 	tls->params.flags = en->flags;
 	tls->params.max_frame_len = min(TLS_MAX_MSG_SIZE_V10_2, ktls_maxlen);
 
 	/* Set the header and trailer lengths. */
 	tls->params.tls_hlen = sizeof(struct tls_record_layer);
 	switch (en->cipher_algorithm) {
 	case CRYPTO_AES_NIST_GCM_16:
 		/*
 		 * TLS 1.2 uses a 4 byte implicit IV with an explicit 8 byte
 		 * nonce.  TLS 1.3 uses a 12 byte implicit IV.
 		 */
 		if (en->tls_vminor < TLS_MINOR_VER_THREE)
 			tls->params.tls_hlen += sizeof(uint64_t);
 		tls->params.tls_tlen = AES_GMAC_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	case CRYPTO_AES_CBC:
 		switch (en->auth_algorithm) {
 		case CRYPTO_SHA1_HMAC:
 			if (en->tls_vminor == TLS_MINOR_VER_ZERO) {
 				/* Implicit IV, no nonce. */
 				tls->sequential_records = true;
 				tls->next_seqno = be64dec(en->rec_seq);
 				STAILQ_INIT(&tls->pending_records);
 			} else {
 				tls->params.tls_hlen += AES_BLOCK_LEN;
 			}
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA1_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_256_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_256_HASH_LEN;
 			break;
 		case CRYPTO_SHA2_384_HMAC:
 			tls->params.tls_hlen += AES_BLOCK_LEN;
 			tls->params.tls_tlen = AES_BLOCK_LEN +
 			    SHA2_384_HASH_LEN;
 			break;
 		default:
 			panic("invalid hmac");
 		}
 		tls->params.tls_bs = AES_BLOCK_LEN;
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		/*
 		 * Chacha20 uses a 12 byte implicit IV.
 		 */
 		tls->params.tls_tlen = POLY1305_HASH_LEN;
 		tls->params.tls_bs = 1;
 		break;
 	default:
 		panic("invalid cipher");
 	}
 
 	/*
 	 * TLS 1.3 includes optional padding which we do not support,
 	 * and also puts the "real" record type at the end of the
 	 * encrypted data.
 	 */
 	if (en->tls_vminor == TLS_MINOR_VER_THREE)
 		tls->params.tls_tlen += sizeof(uint8_t);
 
 	KASSERT(tls->params.tls_hlen <= MBUF_PEXT_HDR_LEN,
 	    ("TLS header length too long: %d", tls->params.tls_hlen));
 	KASSERT(tls->params.tls_tlen <= MBUF_PEXT_TRAIL_LEN,
 	    ("TLS trailer length too long: %d", tls->params.tls_tlen));
 
 	if (en->auth_key_len != 0) {
 		tls->params.auth_key_len = en->auth_key_len;
 		tls->params.auth_key = malloc(en->auth_key_len, M_KTLS,
 		    M_WAITOK);
 		error = copyin(en->auth_key, tls->params.auth_key,
 		    en->auth_key_len);
 		if (error)
 			goto out;
 	}
 
 	tls->params.cipher_key_len = en->cipher_key_len;
 	tls->params.cipher_key = malloc(en->cipher_key_len, M_KTLS, M_WAITOK);
 	error = copyin(en->cipher_key, tls->params.cipher_key,
 	    en->cipher_key_len);
 	if (error)
 		goto out;
 
 	/*
 	 * This holds the implicit portion of the nonce for AEAD
 	 * ciphers and the initial implicit IV for TLS 1.0.  The
 	 * explicit portions of the IV are generated in ktls_frame().
 	 */
 	if (en->iv_len != 0) {
 		tls->params.iv_len = en->iv_len;
 		error = copyin(en->iv, tls->params.iv, en->iv_len);
 		if (error)
 			goto out;
 
 		/*
 		 * For TLS 1.2 with GCM, generate an 8-byte nonce as a
 		 * counter to generate unique explicit IVs.
 		 *
 		 * Store this counter in the last 8 bytes of the IV
 		 * array so that it is 8-byte aligned.
 		 */
 		if (en->cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    en->tls_vminor == TLS_MINOR_VER_TWO)
 			arc4rand(tls->params.iv + 8, sizeof(uint64_t), 0);
 	}
 
 	*tlsp = tls;
 	return (0);
 
 out:
 	ktls_cleanup(tls);
 	return (error);
 }
 
 static struct ktls_session *
 ktls_clone_session(struct ktls_session *tls, int direction)
 {
 	struct ktls_session *tls_new;
 
 	tls_new = uma_zalloc(ktls_session_zone, M_WAITOK | M_ZERO);
 
 	counter_u64_add(ktls_offload_active, 1);
 
 	refcount_init(&tls_new->refcount, 1);
 	if (direction == KTLS_RX)
 		TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_receive_tag,
 		    tls_new);
 	else
 		TASK_INIT(&tls_new->reset_tag_task, 0, ktls_reset_send_tag,
 		    tls_new);
 
 	/* Copy fields from existing session. */
 	tls_new->params = tls->params;
 	tls_new->wq_index = tls->wq_index;
 
 	/* Deep copy keys. */
 	if (tls_new->params.auth_key != NULL) {
 		tls_new->params.auth_key = malloc(tls->params.auth_key_len,
 		    M_KTLS, M_WAITOK);
 		memcpy(tls_new->params.auth_key, tls->params.auth_key,
 		    tls->params.auth_key_len);
 	}
 
 	tls_new->params.cipher_key = malloc(tls->params.cipher_key_len, M_KTLS,
 	    M_WAITOK);
 	memcpy(tls_new->params.cipher_key, tls->params.cipher_key,
 	    tls->params.cipher_key_len);
 
 	return (tls_new);
 }
 #endif
 
 static void
 ktls_cleanup(struct ktls_session *tls)
 {
 
 	counter_u64_add(ktls_offload_active, -1);
 	switch (tls->mode) {
 	case TCP_TLS_MODE_SW:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_sw_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_sw_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_sw_chacha20, -1);
 			break;
 		}
 		break;
 	case TCP_TLS_MODE_IFNET:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_ifnet_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_ifnet_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_ifnet_chacha20, -1);
 			break;
 		}
 		if (tls->snd_tag != NULL)
 			m_snd_tag_rele(tls->snd_tag);
 		if (tls->rx_ifp != NULL)
 			if_rele(tls->rx_ifp);
 		break;
 #ifdef TCP_OFFLOAD
 	case TCP_TLS_MODE_TOE:
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, -1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, -1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_toe_chacha20, -1);
 			break;
 		}
 		break;
 #endif
 	}
 	if (tls->ocf_session != NULL)
 		ktls_ocf_free(tls);
 	if (tls->params.auth_key != NULL) {
 		zfree(tls->params.auth_key, M_KTLS);
 		tls->params.auth_key = NULL;
 		tls->params.auth_key_len = 0;
 	}
 	if (tls->params.cipher_key != NULL) {
 		zfree(tls->params.cipher_key, M_KTLS);
 		tls->params.cipher_key = NULL;
 		tls->params.cipher_key_len = 0;
 	}
 	explicit_bzero(tls->params.iv, sizeof(tls->params.iv));
 }
 
 #if defined(INET) || defined(INET6)
 
 #ifdef TCP_OFFLOAD
 static int
 ktls_try_toe(struct socket *so, struct ktls_session *tls, int direction)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (!(tp->t_flags & TF_TOE)) {
 		INP_WUNLOCK(inp);
 		return (EOPNOTSUPP);
 	}
 
 	error = tcp_offload_alloc_tls_session(tp, tls, direction);
 	INP_WUNLOCK(inp);
 	if (error == 0) {
 		tls->mode = TCP_TLS_MODE_TOE;
 		switch (tls->params.cipher_algorithm) {
 		case CRYPTO_AES_CBC:
 			counter_u64_add(ktls_toe_cbc, 1);
 			break;
 		case CRYPTO_AES_NIST_GCM_16:
 			counter_u64_add(ktls_toe_gcm, 1);
 			break;
 		case CRYPTO_CHACHA20_POLY1305:
 			counter_u64_add(ktls_toe_chacha20, 1);
 			break;
 		}
 	}
 	return (error);
 }
 #endif
 
 /*
  * Common code used when first enabling ifnet TLS on a connection or
  * when allocating a new ifnet TLS session due to a routing change.
  * This function allocates a new TLS send tag on whatever interface
  * the connection is currently routed over.
  */
 static int
 ktls_alloc_snd_tag(struct inpcb *inp, struct ktls_session *tls, bool force,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	struct tcpcb *tp;
 	int error;
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 *
 	 * - Always permit 'force' requests.
 	 * - ktls_ifnet_permitted == 0: always deny.
 	 */
 	if (!force && ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: Use the cached route in the inpcb to find the
 	 * interface.  This should perhaps instead use
 	 * rtalloc1_fib(dst, 0, 0, fibnum).  Since KTLS is only
 	 * enabled after a connection has completed key negotiation in
 	 * userland, the cached route will be present in practice.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	if_ref(ifp);
 
 	/*
 	 * Allocate a TLS + ratelimit tag if the connection has an
 	 * existing pacing rate.
 	 */
 	if (tp->t_pacing_rate != -1 &&
 	    (ifp->if_capenable & IFCAP_TXTLS_RTLMT) != 0) {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS_RATE_LIMIT;
 		params.tls_rate_limit.inp = inp;
 		params.tls_rate_limit.tls = tls;
 		params.tls_rate_limit.max_rate = tp->t_pacing_rate;
 	} else {
 		params.hdr.type = IF_SND_TAG_TYPE_TLS;
 		params.tls.inp = inp;
 		params.tls.tls = tls;
 	}
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	INP_RUNLOCK(inp);
 
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable & IFCAP_TXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable & IFCAP_TXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = m_snd_tag_alloc(ifp, &params, mstp);
 out:
 	if_rele(ifp);
 	return (error);
 }
 
 /*
  * Allocate an initial TLS receive tag for doing HW decryption of TLS
  * data.
  *
  * This function allocates a new TLS receive tag on whatever interface
  * the connection is currently routed over.  If the connection ends up
  * using a different interface for receive this will get fixed up via
  * ktls_input_ifp_mismatch as future packets arrive.
  */
 static int
 ktls_alloc_rcv_tag(struct inpcb *inp, struct ktls_session *tls,
     struct m_snd_tag **mstp)
 {
 	union if_snd_tag_alloc_params params;
 	struct ifnet *ifp;
 	struct nhop_object *nh;
 	int error;
 
 	if (!ktls_ocf_recrypt_supported(tls))
 		return (ENXIO);
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (inp->inp_socket == NULL) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	/*
 	 * Check administrative controls on ifnet TLS to determine if
 	 * ifnet TLS should be denied.
 	 */
 	if (ktls_ifnet_permitted == 0) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 
 	/*
 	 * XXX: As with ktls_alloc_snd_tag, use the cached route in
 	 * the inpcb to find the interface.
 	 */
 	nh = inp->inp_route.ro_nh;
 	if (nh == NULL) {
 		INP_RUNLOCK(inp);
 		return (ENXIO);
 	}
 	ifp = nh->nh_ifp;
 	if_ref(ifp);
 	tls->rx_ifp = ifp;
 
 	params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	params.tls_rx.inp = inp;
 	params.tls_rx.tls = tls;
 	params.tls_rx.vlan_id = 0;
 
 	INP_RUNLOCK(inp);
 
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	} else {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0) {
 			error = EOPNOTSUPP;
 			goto out;
 		}
 	}
 	error = m_snd_tag_alloc(ifp, &params, mstp);
 
 	/*
 	 * If this connection is over a vlan, vlan_snd_tag_alloc
 	 * rewrites vlan_id with the saved interface.  Save the VLAN
 	 * ID for use in ktls_reset_receive_tag which allocates new
 	 * receive tags directly from the leaf interface bypassing
 	 * if_vlan.
 	 */
 	if (error == 0)
 		tls->rx_vlan_id = params.tls_rx.vlan_id;
 out:
 	return (error);
 }
 
 static int
 ktls_try_ifnet(struct socket *so, struct ktls_session *tls, int direction,
     bool force)
 {
 	struct m_snd_tag *mst;
 	int error;
 
 	switch (direction) {
 	case KTLS_TX:
 		error = ktls_alloc_snd_tag(so->so_pcb, tls, force, &mst);
 		if (__predict_false(error != 0))
 			goto done;
 		break;
 	case KTLS_RX:
 		KASSERT(!force, ("%s: forced receive tag", __func__));
 		error = ktls_alloc_rcv_tag(so->so_pcb, tls, &mst);
 		if (__predict_false(error != 0))
 			goto done;
 		break;
 	default:
 		__assert_unreachable();
 	}
 
 	tls->mode = TCP_TLS_MODE_IFNET;
 	tls->snd_tag = mst;
 
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_ifnet_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_ifnet_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_ifnet_chacha20, 1);
 		break;
 	default:
 		break;
 	}
 done:
 	return (error);
 }
 
 static void
 ktls_use_sw(struct ktls_session *tls)
 {
 	tls->mode = TCP_TLS_MODE_SW;
 	switch (tls->params.cipher_algorithm) {
 	case CRYPTO_AES_CBC:
 		counter_u64_add(ktls_sw_cbc, 1);
 		break;
 	case CRYPTO_AES_NIST_GCM_16:
 		counter_u64_add(ktls_sw_gcm, 1);
 		break;
 	case CRYPTO_CHACHA20_POLY1305:
 		counter_u64_add(ktls_sw_chacha20, 1);
 		break;
 	}
 }
 
 static int
 ktls_try_sw(struct socket *so, struct ktls_session *tls, int direction)
 {
 	int error;
 
 	error = ktls_ocf_try(so, tls, direction);
 	if (error)
 		return (error);
 	ktls_use_sw(tls);
 	return (0);
 }
 
 /*
  * KTLS RX stores data in the socket buffer as a list of TLS records,
  * where each record is stored as a control message containg the TLS
  * header followed by data mbufs containing the decrypted data.  This
  * is different from KTLS TX which always uses an mb_ext_pgs mbuf for
  * both encrypted and decrypted data.  TLS records decrypted by a NIC
  * should be queued to the socket buffer as records, but encrypted
  * data which needs to be decrypted by software arrives as a stream of
  * regular mbufs which need to be converted.  In addition, there may
  * already be pending encrypted data in the socket buffer when KTLS RX
  * is enabled.
  *
  * To manage not-yet-decrypted data for KTLS RX, the following scheme
  * is used:
  *
  * - A single chain of NOTREADY mbufs is hung off of sb_mtls.
  *
  * - ktls_check_rx checks this chain of mbufs reading the TLS header
  *   from the first mbuf.  Once all of the data for that TLS record is
  *   queued, the socket is queued to a worker thread.
  *
  * - The worker thread calls ktls_decrypt to decrypt TLS records in
  *   the TLS chain.  Each TLS record is detached from the TLS chain,
  *   decrypted, and inserted into the regular socket buffer chain as
  *   record starting with a control message holding the TLS header and
  *   a chain of mbufs holding the encrypted data.
  */
 
 static void
 sb_mark_notready(struct sockbuf *sb)
 {
 	struct mbuf *m;
 
 	m = sb->sb_mb;
 	sb->sb_mtls = m;
 	sb->sb_mb = NULL;
 	sb->sb_mbtail = NULL;
 	sb->sb_lastrecord = NULL;
 	for (; m != NULL; m = m->m_next) {
 		KASSERT(m->m_nextpkt == NULL, ("%s: m_nextpkt != NULL",
 		    __func__));
 		KASSERT((m->m_flags & M_NOTAVAIL) == 0, ("%s: mbuf not avail",
 		    __func__));
 		KASSERT(sb->sb_acc >= m->m_len, ("%s: sb_acc < m->m_len",
 		    __func__));
 		m->m_flags |= M_NOTREADY;
 		sb->sb_acc -= m->m_len;
 		sb->sb_tlscc += m->m_len;
 		sb->sb_mtlstail = m;
 	}
 	KASSERT(sb->sb_acc == 0 && sb->sb_tlscc == sb->sb_ccc,
 	    ("%s: acc %u tlscc %u ccc %u", __func__, sb->sb_acc, sb->sb_tlscc,
 	    sb->sb_ccc));
 }
 
 /*
  * Return information about the pending TLS data in a socket
  * buffer.  On return, 'seqno' is set to the sequence number
  * of the next TLS record to be received, 'resid' is set to
  * the amount of bytes still needed for the last pending
  * record.  The function returns 'false' if the last pending
  * record contains a partial TLS header.  In that case, 'resid'
  * is the number of bytes needed to complete the TLS header.
  */
 bool
 ktls_pending_rx_info(struct sockbuf *sb, uint64_t *seqnop, size_t *residp)
 {
 	struct tls_record_layer hdr;
 	struct mbuf *m;
 	uint64_t seqno;
 	size_t resid;
 	u_int offset, record_len;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(sb->sb_flags & SB_TLS_RX);
 	seqno = sb->sb_tls_seqno;
 	resid = sb->sb_tlscc;
 	m = sb->sb_mtls;
 	offset = 0;
 
 	if (resid == 0) {
 		*seqnop = seqno;
 		*residp = 0;
 		return (true);
 	}
 
 	for (;;) {
 		seqno++;
 
 		if (resid < sizeof(hdr)) {
 			*seqnop = seqno;
 			*residp = sizeof(hdr) - resid;
 			return (false);
 		}
 
 		m_copydata(m, offset, sizeof(hdr), (void *)&hdr);
 
 		record_len = sizeof(hdr) + ntohs(hdr.tls_length);
 		if (resid <= record_len) {
 			*seqnop = seqno;
 			*residp = record_len - resid;
 			return (true);
 		}
 		resid -= record_len;
 
 		while (record_len != 0) {
 			if (m->m_len - offset > record_len) {
 				offset += record_len;
 				break;
 			}
 
 			record_len -= (m->m_len - offset);
 			offset = 0;
 			m = m->m_next;
 		}
 	}
 }
 
 int
 ktls_enable_rx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_rcv.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	error = ktls_create_session(so, en, &tls, KTLS_RX);
 	if (error)
 		return (error);
 
 	error = ktls_ocf_try(so, tls, KTLS_RX);
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	/* Mark the socket as using TLS offload. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	so->so_rcv.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_rcv.sb_tls_info = tls;
 	so->so_rcv.sb_flags |= SB_TLS_RX;
 
 	/* Mark existing data as not ready until it can be decrypted. */
 	sb_mark_notready(&so->so_rcv);
 	ktls_check_rx(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_RX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, KTLS_RX, false);
 	if (error)
 		ktls_use_sw(tls);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 int
 ktls_enable_tx(struct socket *so, struct tls_enable *en)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	int error;
 
 	if (!ktls_offload_enable)
 		return (ENOTSUP);
 	if (SOLISTENING(so))
 		return (EINVAL);
 
 	counter_u64_add(ktls_offload_enable_calls, 1);
 
 	/*
 	 * This should always be true since only the TCP socket option
 	 * invokes this function.
 	 */
 	if (so->so_proto->pr_protocol != IPPROTO_TCP)
 		return (EINVAL);
 
 	/*
 	 * XXX: Don't overwrite existing sessions.  We should permit
 	 * this to support rekeying in the future.
 	 */
 	if (so->so_snd.sb_tls_info != NULL)
 		return (EALREADY);
 
 	if (en->cipher_algorithm == CRYPTO_AES_CBC && !ktls_cbc_enable)
 		return (ENOTSUP);
 
 	/* TLS requires ext pgs */
 	if (mb_use_ext_pgs == 0)
 		return (ENXIO);
 
 	error = ktls_create_session(so, en, &tls, KTLS_TX);
 	if (error)
 		return (error);
 
 	/* Prefer TOE -> ifnet TLS -> software TLS. */
 #ifdef TCP_OFFLOAD
 	error = ktls_try_toe(so, tls, KTLS_TX);
 	if (error)
 #endif
 		error = ktls_try_ifnet(so, tls, KTLS_TX, false);
 	if (error)
 		error = ktls_try_sw(so, tls, KTLS_TX);
 
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		ktls_cleanup(tls);
 		return (error);
 	}
 
 	/*
 	 * Write lock the INP when setting sb_tls_info so that
 	 * routines in tcp_ratelimit.c can read sb_tls_info while
 	 * holding the INP lock.
 	 */
 	inp = so->so_pcb;
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_seqno = be64dec(en->rec_seq);
 	so->so_snd.sb_tls_info = tls;
 	if (tls->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	counter_u64_add(ktls_offload_total, 1);
 
 	return (0);
 }
 
 int
 ktls_get_rx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp __diagused;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCK_RECVBUF_LOCK(so);
 	tls = so->so_rcv.sb_tls_info;
 	if (tls == NULL)
 		*modep = TCP_TLS_MODE_NONE;
 	else
 		*modep = tls->mode;
 	SOCK_RECVBUF_UNLOCK(so);
 	return (0);
 }
 
 /*
  * ktls_get_rx_sequence - get the next TCP- and TLS- sequence number.
  *
  * This function gets information about the next TCP- and TLS-
  * sequence number to be processed by the TLS receive worker
  * thread. The information is extracted from the given "inpcb"
  * structure. The values are stored in host endian format at the two
  * given output pointer locations. The TCP sequence number points to
  * the beginning of the TLS header.
  *
  * This function returns zero on success, else a non-zero error code
  * is returned.
  */
 int
 ktls_get_rx_sequence(struct inpcb *inp, uint32_t *tcpseq, uint64_t *tlsseq)
 {
 	struct socket *so;
 	struct tcpcb *tp;
 
 	INP_RLOCK(inp);
 	so = inp->inp_socket;
 	if (__predict_false(so == NULL)) {
 		INP_RUNLOCK(inp);
 		return (EINVAL);
 	}
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(tp != NULL);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	*tcpseq = tp->rcv_nxt - so->so_rcv.sb_tlscc;
 	*tlsseq = so->so_rcv.sb_tls_seqno;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	INP_RUNLOCK(inp);
 
 	return (0);
 }
 
 int
 ktls_get_tx_mode(struct socket *so, int *modep)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp __diagused;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCK_SENDBUF_LOCK(so);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL)
 		*modep = TCP_TLS_MODE_NONE;
 	else
 		*modep = tls->mode;
 	SOCK_SENDBUF_UNLOCK(so);
 	return (0);
 }
 
 /*
  * Switch between SW and ifnet TLS sessions as requested.
  */
 int
 ktls_set_tx_mode(struct socket *so, int mode)
 {
 	struct ktls_session *tls, *tls_new;
 	struct inpcb *inp;
 	int error;
 
 	if (SOLISTENING(so))
 		return (EINVAL);
 	switch (mode) {
 	case TCP_TLS_MODE_SW:
 	case TCP_TLS_MODE_IFNET:
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	inp = so->so_pcb;
 	INP_WLOCK_ASSERT(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	tls = so->so_snd.sb_tls_info;
 	if (tls == NULL) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	if (tls->mode == mode) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		return (0);
 	}
 
 	tls = ktls_hold(tls);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	INP_WUNLOCK(inp);
 
 	tls_new = ktls_clone_session(tls, KTLS_TX);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		error = ktls_try_ifnet(so, tls_new, KTLS_TX, true);
 	else
 		error = ktls_try_sw(so, tls_new, KTLS_TX);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	error = SOCK_IO_SEND_LOCK(so, SBL_WAIT);
 	if (error) {
 		counter_u64_add(ktls_switch_failed, 1);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (error);
 	}
 
 	/*
 	 * If we raced with another session change, keep the existing
 	 * session.
 	 */
 	if (tls != so->so_snd.sb_tls_info) {
 		counter_u64_add(ktls_switch_failed, 1);
 		SOCK_IO_SEND_UNLOCK(so);
 		ktls_free(tls_new);
 		ktls_free(tls);
 		INP_WLOCK(inp);
 		return (EBUSY);
 	}
 
 	INP_WLOCK(inp);
 	SOCKBUF_LOCK(&so->so_snd);
 	so->so_snd.sb_tls_info = tls_new;
 	if (tls_new->mode != TCP_TLS_MODE_SW)
 		so->so_snd.sb_flags |= SB_TLS_IFNET;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	SOCK_IO_SEND_UNLOCK(so);
 
 	/*
 	 * Drop two references on 'tls'.  The first is for the
 	 * ktls_hold() above.  The second drops the reference from the
 	 * socket buffer.
 	 */
 	KASSERT(tls->refcount >= 2, ("too few references on old session"));
 	ktls_free(tls);
 	ktls_free(tls);
 
 	if (mode == TCP_TLS_MODE_IFNET)
 		counter_u64_add(ktls_switch_to_ifnet, 1);
 	else
 		counter_u64_add(ktls_switch_to_sw, 1);
 
 	return (0);
 }
 
 /*
  * Try to allocate a new TLS receive tag.  This task is scheduled when
  * sbappend_ktls_rx detects an input path change.  If a new tag is
  * allocated, replace the tag in the TLS session.  If a new tag cannot
  * be allocated, let the session fall back to software decryption.
  */
 static void
 ktls_reset_receive_tag(void *context, int pending)
 {
 	union if_snd_tag_alloc_params params;
 	struct ktls_session *tls;
 	struct m_snd_tag *mst;
 	struct inpcb *inp;
 	struct ifnet *ifp;
 	struct socket *so;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	so = tls->so;
 	inp = so->so_pcb;
 	ifp = NULL;
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		goto out;
 	}
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	m_snd_tag_rele(tls->snd_tag);
 	tls->snd_tag = NULL;
 
 	ifp = tls->rx_ifp;
 	if_ref(ifp);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	params.hdr.type = IF_SND_TAG_TYPE_TLS_RX;
 	params.hdr.flowid = inp->inp_flowid;
 	params.hdr.flowtype = inp->inp_flowtype;
 	params.hdr.numa_domain = inp->inp_numa_domain;
 	params.tls_rx.inp = inp;
 	params.tls_rx.tls = tls;
 	params.tls_rx.vlan_id = tls->rx_vlan_id;
 	INP_RUNLOCK(inp);
 
 	if (inp->inp_vflag & INP_IPV6) {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS6) == 0)
 			goto out;
 	} else {
 		if ((ifp->if_capenable2 & IFCAP2_RXTLS4) == 0)
 			goto out;
 	}
 
 	error = m_snd_tag_alloc(ifp, &params, &mst);
 	if (error == 0) {
 		SOCKBUF_LOCK(&so->so_rcv);
 		tls->snd_tag = mst;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 	} else {
 		/*
 		 * Just fall back to software decryption if a tag
 		 * cannot be allocated leaving the connection intact.
 		 * If a future input path change switches to another
 		 * interface this connection will resume ifnet TLS.
 		 */
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 	}
 
 out:
 	mtx_pool_lock(mtxpool_sleep, tls);
 	tls->reset_pending = false;
 	mtx_pool_unlock(mtxpool_sleep, tls);
 
 	if (ifp != NULL)
 		if_rele(ifp);
 	sorele(so);
 	ktls_free(tls);
 }
 
 /*
  * Try to allocate a new TLS send tag.  This task is scheduled when
  * ip_output detects a route change while trying to transmit a packet
  * holding a TLS record.  If a new tag is allocated, replace the tag
  * in the TLS session.  Subsequent packets on the connection will use
  * the new tag.  If a new tag cannot be allocated, drop the
  * connection.
  */
 static void
 ktls_reset_send_tag(void *context, int pending)
 {
 	struct epoch_tracker et;
 	struct ktls_session *tls;
 	struct m_snd_tag *old, *new;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	MPASS(pending == 1);
 
 	tls = context;
 	inp = tls->inp;
 
 	/*
 	 * Free the old tag first before allocating a new one.
 	 * ip[6]_output_send() will treat a NULL send tag the same as
 	 * an ifp mismatch and drop packets until a new tag is
 	 * allocated.
 	 *
 	 * Write-lock the INP when changing tls->snd_tag since
 	 * ip[6]_output_send() holds a read-lock when reading the
 	 * pointer.
 	 */
 	INP_WLOCK(inp);
 	old = tls->snd_tag;
 	tls->snd_tag = NULL;
 	INP_WUNLOCK(inp);
 	if (old != NULL)
 		m_snd_tag_rele(old);
 
 	error = ktls_alloc_snd_tag(inp, tls, true, &new);
 
 	if (error == 0) {
 		INP_WLOCK(inp);
 		tls->snd_tag = new;
 		mtx_pool_lock(mtxpool_sleep, tls);
 		tls->reset_pending = false;
 		mtx_pool_unlock(mtxpool_sleep, tls);
 		if (!in_pcbrele_wlocked(inp))
 			INP_WUNLOCK(inp);
 
 		counter_u64_add(ktls_ifnet_reset, 1);
 
 		/*
 		 * XXX: Should we kick tcp_output explicitly now that
 		 * the send tag is fixed or just rely on timers?
 		 */
 	} else {
 		NET_EPOCH_ENTER(et);
 		INP_WLOCK(inp);
 		if (!in_pcbrele_wlocked(inp)) {
 			if (!(inp->inp_flags & INP_TIMEWAIT) &&
 			    !(inp->inp_flags & INP_DROPPED)) {
 				tp = intotcpcb(inp);
 				CURVNET_SET(tp->t_vnet);
 				tp = tcp_drop(tp, ECONNABORTED);
 				CURVNET_RESTORE();
 				if (tp != NULL)
 					INP_WUNLOCK(inp);
 				counter_u64_add(ktls_ifnet_reset_dropped, 1);
 			} else
 				INP_WUNLOCK(inp);
 		}
 		NET_EPOCH_EXIT(et);
 
 		counter_u64_add(ktls_ifnet_reset_failed, 1);
 
 		/*
 		 * Leave reset_pending true to avoid future tasks while
 		 * the socket goes away.
 		 */
 	}
 
 	ktls_free(tls);
 }
 
 void
 ktls_input_ifp_mismatch(struct sockbuf *sb, struct ifnet *ifp)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	tls = sb->sb_tls_info;
 	if_rele(tls->rx_ifp);
 	if_ref(ifp);
 	tls->rx_ifp = ifp;
 
 	/*
 	 * See if we should schedule a task to update the receive tag for
 	 * this session.
 	 */
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (!tls->reset_pending) {
 		(void) ktls_hold(tls);
 		soref(so);
 		tls->so = so;
 		tls->reset_pending = true;
 		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	}
 	mtx_pool_unlock(mtxpool_sleep, tls);
 }
 
 int
 ktls_output_eagain(struct inpcb *inp, struct ktls_session *tls)
 {
 
 	if (inp == NULL)
 		return (ENOBUFS);
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * See if we should schedule a task to update the send tag for
 	 * this session.
 	 */
 	mtx_pool_lock(mtxpool_sleep, tls);
 	if (!tls->reset_pending) {
 		(void) ktls_hold(tls);
 		in_pcbref(inp);
 		tls->inp = inp;
 		tls->reset_pending = true;
 		taskqueue_enqueue(taskqueue_thread, &tls->reset_tag_task);
 	}
 	mtx_pool_unlock(mtxpool_sleep, tls);
 	return (ENOBUFS);
 }
 
 #ifdef RATELIMIT
 int
 ktls_modify_txrtlmt(struct ktls_session *tls, uint64_t max_pacing_rate)
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
 		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
 
 	/* Can't get to the inp, but it should be locked. */
 	/* INP_LOCK_ASSERT(inp); */
 
 	MPASS(tls->mode == TCP_TLS_MODE_IFNET);
 
 	if (tls->snd_tag == NULL) {
 		/*
 		 * Resetting send tag, ignore this change.  The
 		 * pending reset may or may not see this updated rate
 		 * in the tcpcb.  If it doesn't, we will just lose
 		 * this rate change.
 		 */
 		return (0);
 	}
 
 	mst = tls->snd_tag;
 
 	MPASS(mst != NULL);
 	MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RATE_LIMIT);
 
 	return (mst->sw->snd_tag_modify(mst, &params));
 }
 #endif
 #endif
 
 void
 ktls_destroy(struct ktls_session *tls)
 {
 
 	if (tls->sequential_records) {
 		struct mbuf *m, *n;
 		int page_count;
 
 		STAILQ_FOREACH_SAFE(m, &tls->pending_records, m_epg_stailq, n) {
 			page_count = m->m_epg_enc_cnt;
 			while (page_count > 0) {
 				KASSERT(page_count >= m->m_epg_nrdy,
 				    ("%s: too few pages", __func__));
 				page_count -= m->m_epg_nrdy;
 				m = m_free(m);
 			}
 		}
 	}
 	ktls_cleanup(tls);
 	uma_zfree(ktls_session_zone, tls);
 }
 
 void
 ktls_seq(struct sockbuf *sb, struct mbuf *m)
 {
 
 	for (; m != NULL; m = m->m_next) {
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_seq: mapped mbuf %p", m));
 
 		m->m_epg_seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 	}
 }
 
 /*
  * Add TLS framing (headers and trailers) to a chain of mbufs.  Each
  * mbuf in the chain must be an unmapped mbuf.  The payload of the
  * mbuf must be populated with the payload of each TLS record.
  *
  * The record_type argument specifies the TLS record type used when
  * populating the TLS header.
  *
  * The enq_count argument on return is set to the number of pages of
  * payload data for this entire chain that need to be encrypted via SW
  * encryption.  The returned value should be passed to ktls_enqueue
  * when scheduling encryption of this chain of mbufs.  To handle the
  * special case of empty fragments for TLS 1.0 sessions, an empty
  * fragment counts as one page.
  */
 void
 ktls_frame(struct mbuf *top, struct ktls_session *tls, int *enq_cnt,
     uint8_t record_type)
 {
 	struct tls_record_layer *tlshdr;
 	struct mbuf *m;
 	uint64_t *noncep;
 	uint16_t tls_len;
 	int maxlen __diagused;
 
 	maxlen = tls->params.max_frame_len;
 	*enq_cnt = 0;
 	for (m = top; m != NULL; m = m->m_next) {
 		/*
 		 * All mbufs in the chain should be TLS records whose
 		 * payload does not exceed the maximum frame length.
 		 *
 		 * Empty TLS 1.0 records are permitted when using CBC.
 		 */
 		KASSERT(m->m_len <= maxlen && m->m_len >= 0 &&
 		    (m->m_len > 0 || ktls_permit_empty_frames(tls)),
 		    ("ktls_frame: m %p len %d", m, m->m_len));
 
 		/*
 		 * TLS frames require unmapped mbufs to store session
 		 * info.
 		 */
 		KASSERT((m->m_flags & M_EXTPG) != 0,
 		    ("ktls_frame: mapped mbuf %p (top = %p)", m, top));
 
 		tls_len = m->m_len;
 
 		/* Save a reference to the session. */
 		m->m_epg_tls = ktls_hold(tls);
 
 		m->m_epg_hdrlen = tls->params.tls_hlen;
 		m->m_epg_trllen = tls->params.tls_tlen;
 		if (tls->params.cipher_algorithm == CRYPTO_AES_CBC) {
 			int bs, delta;
 
 			/*
 			 * AES-CBC pads messages to a multiple of the
 			 * block size.  Note that the padding is
 			 * applied after the digest and the encryption
 			 * is done on the "plaintext || mac || padding".
 			 * At least one byte of padding is always
 			 * present.
 			 *
 			 * Compute the final trailer length assuming
 			 * at most one block of padding.
 			 * tls->params.tls_tlen is the maximum
 			 * possible trailer length (padding + digest).
 			 * delta holds the number of excess padding
 			 * bytes if the maximum were used.  Those
 			 * extra bytes are removed.
 			 */
 			bs = tls->params.tls_bs;
 			delta = (tls_len + tls->params.tls_tlen) & (bs - 1);
 			m->m_epg_trllen -= delta;
 		}
 		m->m_len += m->m_epg_hdrlen + m->m_epg_trllen;
 
 		/* Populate the TLS header. */
 		tlshdr = (void *)m->m_epg_hdr;
 		tlshdr->tls_vmajor = tls->params.tls_vmajor;
 
 		/*
 		 * TLS 1.3 masquarades as TLS 1.2 with a record type
 		 * of TLS_RLTYPE_APP.
 		 */
 		if (tls->params.tls_vminor == TLS_MINOR_VER_THREE &&
 		    tls->params.tls_vmajor == TLS_MAJOR_VER_ONE) {
 			tlshdr->tls_vminor = TLS_MINOR_VER_TWO;
 			tlshdr->tls_type = TLS_RLTYPE_APP;
 			/* save the real record type for later */
 			m->m_epg_record_type = record_type;
 			m->m_epg_trail[0] = record_type;
 		} else {
 			tlshdr->tls_vminor = tls->params.tls_vminor;
 			tlshdr->tls_type = record_type;
 		}
 		tlshdr->tls_length = htons(m->m_len - sizeof(*tlshdr));
 
 		/*
 		 * Store nonces / explicit IVs after the end of the
 		 * TLS header.
 		 *
 		 * For GCM with TLS 1.2, an 8 byte nonce is copied
 		 * from the end of the IV.  The nonce is then
 		 * incremented for use by the next record.
 		 *
 		 * For CBC, a random nonce is inserted for TLS 1.1+.
 		 */
 		if (tls->params.cipher_algorithm == CRYPTO_AES_NIST_GCM_16 &&
 		    tls->params.tls_vminor == TLS_MINOR_VER_TWO) {
 			noncep = (uint64_t *)(tls->params.iv + 8);
 			be64enc(tlshdr + 1, *noncep);
 			(*noncep)++;
 		} else if (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 		    tls->params.tls_vminor >= TLS_MINOR_VER_ONE)
 			arc4rand(tlshdr + 1, AES_BLOCK_LEN, 0);
 
 		/*
 		 * When using SW encryption, mark the mbuf not ready.
 		 * It will be marked ready via sbready() after the
 		 * record has been encrypted.
 		 *
 		 * When using ifnet TLS, unencrypted TLS records are
 		 * sent down the stack to the NIC.
 		 */
 		if (tls->mode == TCP_TLS_MODE_SW) {
 			m->m_flags |= M_NOTREADY;
 			if (__predict_false(tls_len == 0)) {
 				/* TLS 1.0 empty fragment. */
 				m->m_epg_nrdy = 1;
 			} else
 				m->m_epg_nrdy = m->m_epg_npgs;
 			*enq_cnt += m->m_epg_nrdy;
 		}
 	}
 }
 
 bool
 ktls_permit_empty_frames(struct ktls_session *tls)
 {
 	return (tls->params.cipher_algorithm == CRYPTO_AES_CBC &&
 	    tls->params.tls_vminor == TLS_MINOR_VER_ZERO);
 }
 
 void
 ktls_check_rx(struct sockbuf *sb)
 {
 	struct tls_record_layer hdr;
 	struct ktls_wq *wq;
 	struct socket *so;
 	bool running;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX, ("%s: sockbuf %p isn't TLS RX",
 	    __func__, sb));
 	so = __containerof(sb, struct socket, so_rcv);
 
 	if (sb->sb_flags & SB_TLS_RX_RUNNING)
 		return;
 
 	/* Is there enough queued for a TLS header? */
 	if (sb->sb_tlscc < sizeof(hdr)) {
 		if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	m_copydata(sb->sb_mtls, 0, sizeof(hdr), (void *)&hdr);
 
 	/* Is the entire record queued? */
 	if (sb->sb_tlscc < sizeof(hdr) + ntohs(hdr.tls_length)) {
 		if ((sb->sb_state & SBS_CANTRCVMORE) != 0)
 			so->so_error = EMSGSIZE;
 		return;
 	}
 
 	sb->sb_flags |= SB_TLS_RX_RUNNING;
 
 	soref(so);
 	wq = &ktls_wq[so->so_rcv.sb_tls_info->wq_index];
 	mtx_lock(&wq->mtx);
 	STAILQ_INSERT_TAIL(&wq->so_head, so, so_ktls_rx_list);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_rx_queued, 1);
 }
 
 static struct mbuf *
 ktls_detach_record(struct sockbuf *sb, int len)
 {
 	struct mbuf *m, *n, *top;
 	int remain;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	MPASS(len <= sb->sb_tlscc);
 
 	/*
 	 * If TLS chain is the exact size of the record,
 	 * just grab the whole record.
 	 */
 	top = sb->sb_mtls;
 	if (sb->sb_tlscc == len) {
 		sb->sb_mtls = NULL;
 		sb->sb_mtlstail = NULL;
 		goto out;
 	}
 
 	/*
 	 * While it would be nice to use m_split() here, we need
 	 * to know exactly what m_split() allocates to update the
 	 * accounting, so do it inline instead.
 	 */
 	remain = len;
 	for (m = top; remain > m->m_len; m = m->m_next)
 		remain -= m->m_len;
 
 	/* Easy case: don't have to split 'm'. */
 	if (remain == m->m_len) {
 		sb->sb_mtls = m->m_next;
 		if (sb->sb_mtls == NULL)
 			sb->sb_mtlstail = NULL;
 		m->m_next = NULL;
 		goto out;
 	}
 
 	/*
 	 * Need to allocate an mbuf to hold the remainder of 'm'.  Try
 	 * with M_NOWAIT first.
 	 */
 	n = m_get(M_NOWAIT, MT_DATA);
 	if (n == NULL) {
 		/*
 		 * Use M_WAITOK with socket buffer unlocked.  If
 		 * 'sb_mtls' changes while the lock is dropped, return
 		 * NULL to force the caller to retry.
 		 */
 		SOCKBUF_UNLOCK(sb);
 
 		n = m_get(M_WAITOK, MT_DATA);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_mtls != top) {
 			m_free(n);
 			return (NULL);
 		}
 	}
 	n->m_flags |= (m->m_flags & (M_NOTREADY | M_DECRYPTED));
 
 	/* Store remainder in 'n'. */
 	n->m_len = m->m_len - remain;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data + remain;
 		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + remain, mtod(n, caddr_t), n->m_len);
 	}
 
 	/* Trim 'm' and update accounting. */
 	m->m_len -= n->m_len;
 	sb->sb_tlscc -= n->m_len;
 	sb->sb_ccc -= n->m_len;
 
 	/* Account for 'n'. */
 	sballoc_ktls_rx(sb, n);
 
 	/* Insert 'n' into the TLS chain. */
 	sb->sb_mtls = n;
 	n->m_next = m->m_next;
 	if (sb->sb_mtlstail == m)
 		sb->sb_mtlstail = n;
 
 	/* Detach the record from the TLS chain. */
 	m->m_next = NULL;
 
 out:
 	MPASS(m_length(top, NULL) == len);
 	for (m = top; m != NULL; m = m->m_next)
 		sbfree_ktls_rx(sb, m);
 	sb->sb_tlsdcc = len;
 	sb->sb_ccc += len;
 	SBCHECK(sb);
 	return (top);
 }
 
 /*
  * Determine the length of the trailing zero padding and find the real
  * record type in the byte before the padding.
  *
  * Walking the mbuf chain backwards is clumsy, so another option would
  * be to scan forwards remembering the last non-zero byte before the
  * trailer.  However, it would be expensive to scan the entire record.
  * Instead, find the last non-zero byte of each mbuf in the chain
  * keeping track of the relative offset of that nonzero byte.
  *
  * trail_len is the size of the MAC/tag on input and is set to the
  * size of the full trailer including padding and the record type on
  * return.
  */
 static int
 tls13_find_record_type(struct ktls_session *tls, struct mbuf *m, int tls_len,
     int *trailer_len, uint8_t *record_typep)
 {
 	char *cp;
 	u_int digest_start, last_offset, m_len, offset;
 	uint8_t record_type;
 
 	digest_start = tls_len - *trailer_len;
 	last_offset = 0;
 	offset = 0;
 	for (; m != NULL && offset < digest_start;
 	     offset += m->m_len, m = m->m_next) {
 		/* Don't look for padding in the tag. */
 		m_len = min(digest_start - offset, m->m_len);
 		cp = mtod(m, char *);
 
 		/* Find last non-zero byte in this mbuf. */
 		while (m_len > 0 && cp[m_len - 1] == 0)
 			m_len--;
 		if (m_len > 0) {
 			record_type = cp[m_len - 1];
 			last_offset = offset + m_len;
 		}
 	}
 	if (last_offset < tls->params.tls_hlen)
 		return (EBADMSG);
 
 	*record_typep = record_type;
 	*trailer_len = tls_len - last_offset + 1;
 	return (0);
 }
 
 /*
  * Check if a mbuf chain is fully decrypted at the given offset and
  * length. Returns KTLS_MBUF_CRYPTO_ST_DECRYPTED if all data is
  * decrypted. KTLS_MBUF_CRYPTO_ST_MIXED if there is a mix of encrypted
  * and decrypted data. Else KTLS_MBUF_CRYPTO_ST_ENCRYPTED if all data
  * is encrypted.
  */
 ktls_mbuf_crypto_st_t
 ktls_mbuf_crypto_state(struct mbuf *mb, int offset, int len)
 {
 	int m_flags_ored = 0;
 	int m_flags_anded = -1;
 
 	for (; mb != NULL; mb = mb->m_next) {
 		if (offset < mb->m_len)
 			break;
 		offset -= mb->m_len;
 	}
 	offset += len;
 
 	for (; mb != NULL; mb = mb->m_next) {
 		m_flags_ored |= mb->m_flags;
 		m_flags_anded &= mb->m_flags;
 
 		if (offset <= mb->m_len)
 			break;
 		offset -= mb->m_len;
 	}
 	MPASS(mb != NULL || offset == 0);
 
 	if ((m_flags_ored ^ m_flags_anded) & M_DECRYPTED)
 		return (KTLS_MBUF_CRYPTO_ST_MIXED);
 	else
 		return ((m_flags_ored & M_DECRYPTED) ?
 		    KTLS_MBUF_CRYPTO_ST_DECRYPTED :
 		    KTLS_MBUF_CRYPTO_ST_ENCRYPTED);
 }
 
 /*
  * ktls_resync_ifnet - get HW TLS RX back on track after packet loss
  */
 static int
 ktls_resync_ifnet(struct socket *so, uint32_t tls_len, uint64_t tls_rcd_num)
 {
 	union if_snd_tag_modify_params params;
 	struct m_snd_tag *mst;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	mst = so->so_rcv.sb_tls_info->snd_tag;
 	if (__predict_false(mst == NULL))
 		return (EINVAL);
 
 	inp = sotoinpcb(so);
 	if (__predict_false(inp == NULL))
 		return (EINVAL);
 
 	INP_RLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_RUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	tp = intotcpcb(inp);
 	MPASS(tp != NULL);
 
 	/* Get the TCP sequence number of the next valid TLS header. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	params.tls_rx.tls_hdr_tcp_sn =
 	    tp->rcv_nxt - so->so_rcv.sb_tlscc - tls_len;
 	params.tls_rx.tls_rec_length = tls_len;
 	params.tls_rx.tls_seq_number = tls_rcd_num;
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	INP_RUNLOCK(inp);
 
 	MPASS(mst->sw->type == IF_SND_TAG_TYPE_TLS_RX);
 	return (mst->sw->snd_tag_modify(mst, &params));
 }
 
 static void
 ktls_decrypt(struct socket *so)
 {
 	char tls_header[MBUF_PEXT_HDR_LEN];
 	struct ktls_session *tls;
 	struct sockbuf *sb;
 	struct tls_record_layer *hdr;
 	struct tls_get_record tgr;
 	struct mbuf *control, *data, *m;
 	ktls_mbuf_crypto_st_t state;
 	uint64_t seqno;
 	int error, remain, tls_len, trail_len;
 	bool tls13;
 	uint8_t vminor, record_type;
 
 	hdr = (struct tls_record_layer *)tls_header;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	KASSERT(sb->sb_flags & SB_TLS_RX_RUNNING,
 	    ("%s: socket %p not running", __func__, so));
 
 	tls = sb->sb_tls_info;
 	MPASS(tls != NULL);
 
 	tls13 = (tls->params.tls_vminor == TLS_MINOR_VER_THREE);
 	if (tls13)
 		vminor = TLS_MINOR_VER_TWO;
 	else
 		vminor = tls->params.tls_vminor;
 	for (;;) {
 		/* Is there enough queued for a TLS header? */
 		if (sb->sb_tlscc < tls->params.tls_hlen)
 			break;
 
 		m_copydata(sb->sb_mtls, 0, tls->params.tls_hlen, tls_header);
 		tls_len = sizeof(*hdr) + ntohs(hdr->tls_length);
 
 		if (hdr->tls_vmajor != tls->params.tls_vmajor ||
 		    hdr->tls_vminor != vminor)
 			error = EINVAL;
 		else if (tls13 && hdr->tls_type != TLS_RLTYPE_APP)
 			error = EINVAL;
 		else if (tls_len < tls->params.tls_hlen || tls_len >
 		    tls->params.tls_hlen + TLS_MAX_MSG_SIZE_V10_2 +
 		    tls->params.tls_tlen)
 			error = EMSGSIZE;
 		else
 			error = 0;
 		if (__predict_false(error != 0)) {
 			/*
 			 * We have a corrupted record and are likely
 			 * out of sync.  The connection isn't
 			 * recoverable at this point, so abort it.
 			 */
 			SOCKBUF_UNLOCK(sb);
 			counter_u64_add(ktls_offload_corrupted_records, 1);
 
 			CURVNET_SET(so->so_vnet);
-			so->so_proto->pr_usrreqs->pru_abort(so);
+			so->so_proto->pr_abort(so);
 			so->so_error = error;
 			CURVNET_RESTORE();
 			goto deref;
 		}
 
 		/* Is the entire record queued? */
 		if (sb->sb_tlscc < tls_len)
 			break;
 
 		/*
 		 * Split out the portion of the mbuf chain containing
 		 * this TLS record.
 		 */
 		data = ktls_detach_record(sb, tls_len);
 		if (data == NULL)
 			continue;
 		MPASS(sb->sb_tlsdcc == tls_len);
 
 		seqno = sb->sb_tls_seqno;
 		sb->sb_tls_seqno++;
 		SBCHECK(sb);
 		SOCKBUF_UNLOCK(sb);
 
 		/* get crypto state for this TLS record */
 		state = ktls_mbuf_crypto_state(data, 0, tls_len);
 
 		switch (state) {
 		case KTLS_MBUF_CRYPTO_ST_MIXED:
 			error = ktls_ocf_recrypt(tls, hdr, data, seqno);
 			if (error)
 				break;
 			/* FALLTHROUGH */
 		case KTLS_MBUF_CRYPTO_ST_ENCRYPTED:
 			error = ktls_ocf_decrypt(tls, hdr, data, seqno,
 			    &trail_len);
 			if (__predict_true(error == 0)) {
 				if (tls13) {
 					error = tls13_find_record_type(tls, data,
 					    tls_len, &trail_len, &record_type);
 				} else {
 					record_type = hdr->tls_type;
 				}
 			}
 			break;
 		case KTLS_MBUF_CRYPTO_ST_DECRYPTED:
 			/*
 			 * NIC TLS is only supported for AEAD
 			 * ciphersuites which used a fixed sized
 			 * trailer.
 			 */
 			if (tls13) {
 				trail_len = tls->params.tls_tlen - 1;
 				error = tls13_find_record_type(tls, data,
 				    tls_len, &trail_len, &record_type);
 			} else {
 				trail_len = tls->params.tls_tlen;
 				error = 0;
 				record_type = hdr->tls_type;
 			}
 			break;
 		default:
 			error = EINVAL;
 			break;
 		}
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 
 			SOCKBUF_LOCK(sb);
 			if (sb->sb_tlsdcc == 0) {
 				/*
 				 * sbcut/drop/flush discarded these
 				 * mbufs.
 				 */
 				m_freem(data);
 				break;
 			}
 
 			/*
 			 * Drop this TLS record's data, but keep
 			 * decrypting subsequent records.
 			 */
 			sb->sb_ccc -= tls_len;
 			sb->sb_tlsdcc = 0;
 
 			CURVNET_SET(so->so_vnet);
 			so->so_error = EBADMSG;
 			sorwakeup_locked(so);
 			CURVNET_RESTORE();
 
 			m_freem(data);
 
 			SOCKBUF_LOCK(sb);
 			continue;
 		}
 
 		/* Allocate the control mbuf. */
 		memset(&tgr, 0, sizeof(tgr));
 		tgr.tls_type = record_type;
 		tgr.tls_vmajor = hdr->tls_vmajor;
 		tgr.tls_vminor = hdr->tls_vminor;
 		tgr.tls_length = htobe16(tls_len - tls->params.tls_hlen -
 		    trail_len);
 		control = sbcreatecontrol(&tgr, sizeof(tgr),
 		    TLS_GET_RECORD, IPPROTO_TCP, M_WAITOK);
 
 		SOCKBUF_LOCK(sb);
 		if (sb->sb_tlsdcc == 0) {
 			/* sbcut/drop/flush discarded these mbufs. */
 			MPASS(sb->sb_tlscc == 0);
 			m_freem(data);
 			m_freem(control);
 			break;
 		}
 
 		/*
 		 * Clear the 'dcc' accounting in preparation for
 		 * adding the decrypted record.
 		 */
 		sb->sb_ccc -= tls_len;
 		sb->sb_tlsdcc = 0;
 		SBCHECK(sb);
 
 		/* If there is no payload, drop all of the data. */
 		if (tgr.tls_length == htobe16(0)) {
 			m_freem(data);
 			data = NULL;
 		} else {
 			/* Trim header. */
 			remain = tls->params.tls_hlen;
 			while (remain > 0) {
 				if (data->m_len > remain) {
 					data->m_data += remain;
 					data->m_len -= remain;
 					break;
 				}
 				remain -= data->m_len;
 				data = m_free(data);
 			}
 
 			/* Trim trailer and clear M_NOTREADY. */
 			remain = be16toh(tgr.tls_length);
 			m = data;
 			for (m = data; remain > m->m_len; m = m->m_next) {
 				m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 				remain -= m->m_len;
 			}
 			m->m_len = remain;
 			m_freem(m->m_next);
 			m->m_next = NULL;
 			m->m_flags &= ~(M_NOTREADY | M_DECRYPTED);
 
 			/* Set EOR on the final mbuf. */
 			m->m_flags |= M_EOR;
 		}
 
 		sbappendcontrol_locked(sb, data, control, 0);
 
 		if (__predict_false(state != KTLS_MBUF_CRYPTO_ST_DECRYPTED)) {
 			sb->sb_flags |= SB_TLS_RX_RESYNC;
 			SOCKBUF_UNLOCK(sb);
 			ktls_resync_ifnet(so, tls_len, seqno);
 			SOCKBUF_LOCK(sb);
 		} else if (__predict_false(sb->sb_flags & SB_TLS_RX_RESYNC)) {
 			sb->sb_flags &= ~SB_TLS_RX_RESYNC;
 			SOCKBUF_UNLOCK(sb);
 			ktls_resync_ifnet(so, 0, seqno);
 			SOCKBUF_LOCK(sb);
 		}
 	}
 
 	sb->sb_flags &= ~SB_TLS_RX_RUNNING;
 
 	if ((sb->sb_state & SBS_CANTRCVMORE) != 0 && sb->sb_tlscc > 0)
 		so->so_error = EMSGSIZE;
 
 	sorwakeup_locked(so);
 
 deref:
 	SOCKBUF_UNLOCK_ASSERT(sb);
 
 	CURVNET_SET(so->so_vnet);
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 void
 ktls_enqueue_to_free(struct mbuf *m)
 {
 	struct ktls_wq *wq;
 	bool running;
 
 	/* Mark it for freeing. */
 	m->m_epg_flags |= EPG_FLAG_2FREE;
 	wq = &ktls_wq[m->m_epg_tls->wq_index];
 	mtx_lock(&wq->mtx);
 	STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 }
 
 static void *
 ktls_buffer_alloc(struct ktls_wq *wq, struct mbuf *m)
 {
 	void *buf;
 	int domain, running;
 
 	if (m->m_epg_npgs <= 2)
 		return (NULL);
 	if (ktls_buffer_zone == NULL)
 		return (NULL);
 	if ((u_int)(ticks - wq->lastallocfail) < hz) {
 		/*
 		 * Rate-limit allocation attempts after a failure.
 		 * ktls_buffer_import() will acquire a per-domain mutex to check
 		 * the free page queues and may fail consistently if memory is
 		 * fragmented.
 		 */
 		return (NULL);
 	}
 	buf = uma_zalloc(ktls_buffer_zone, M_NOWAIT | M_NORECLAIM);
 	if (buf == NULL) {
 		domain = PCPU_GET(domain);
 		wq->lastallocfail = ticks;
 
 		/*
 		 * Note that this check is "racy", but the races are
 		 * harmless, and are either a spurious wakeup if
 		 * multiple threads fail allocations before the alloc
 		 * thread wakes, or waiting an extra second in case we
 		 * see an old value of running == true.
 		 */
 		if (!VM_DOMAIN_EMPTY(domain)) {
 			running = atomic_load_int(&ktls_domains[domain].alloc_td.running);
 			if (!running)
 				wakeup(&ktls_domains[domain].alloc_td);
 		}
 	}
 	return (buf);
 }
 
 static int
 ktls_encrypt_record(struct ktls_wq *wq, struct mbuf *m,
     struct ktls_session *tls, struct ktls_ocf_encrypt_state *state)
 {
 	vm_page_t pg;
 	int error, i, len, off;
 
 	KASSERT((m->m_flags & (M_EXTPG | M_NOTREADY)) == (M_EXTPG | M_NOTREADY),
 	    ("%p not unready & nomap mbuf\n", m));
 	KASSERT(ptoa(m->m_epg_npgs) <= ktls_maxlen,
 	    ("page count %d larger than maximum frame length %d", m->m_epg_npgs,
 	    ktls_maxlen));
 
 	/* Anonymous mbufs are encrypted in place. */
 	if ((m->m_epg_flags & EPG_FLAG_ANON) != 0)
 		return (ktls_ocf_encrypt(state, tls, m, NULL, 0));
 
 	/*
 	 * For file-backed mbufs (from sendfile), anonymous wired
 	 * pages are allocated and used as the encryption destination.
 	 */
 	if ((state->cbuf = ktls_buffer_alloc(wq, m)) != NULL) {
 		len = ptoa(m->m_epg_npgs - 1) + m->m_epg_last_len -
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_base = (char *)state->cbuf +
 		    m->m_epg_1st_off;
 		state->dst_iov[0].iov_len = len;
 		state->parray[0] = DMAP_TO_PHYS((vm_offset_t)state->cbuf);
 		i = 1;
 	} else {
 		off = m->m_epg_1st_off;
 		for (i = 0; i < m->m_epg_npgs; i++, off = 0) {
 			pg = vm_page_alloc_noobj(VM_ALLOC_NODUMP |
 			    VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 			len = m_epg_pagelen(m, i, off);
 			state->parray[i] = VM_PAGE_TO_PHYS(pg);
 			state->dst_iov[i].iov_base =
 			    (char *)PHYS_TO_DMAP(state->parray[i]) + off;
 			state->dst_iov[i].iov_len = len;
 		}
 	}
 	KASSERT(i + 1 <= nitems(state->dst_iov), ("dst_iov is too small"));
 	state->dst_iov[i].iov_base = m->m_epg_trail;
 	state->dst_iov[i].iov_len = m->m_epg_trllen;
 
 	error = ktls_ocf_encrypt(state, tls, m, state->dst_iov, i + 1);
 
 	if (__predict_false(error != 0)) {
 		/* Free the anonymous pages. */
 		if (state->cbuf != NULL)
 			uma_zfree(ktls_buffer_zone, state->cbuf);
 		else {
 			for (i = 0; i < m->m_epg_npgs; i++) {
 				pg = PHYS_TO_VM_PAGE(state->parray[i]);
 				(void)vm_page_unwire_noq(pg);
 				vm_page_free(pg);
 			}
 		}
 	}
 	return (error);
 }
 
 /* Number of TLS records in a batch passed to ktls_enqueue(). */
 static u_int
 ktls_batched_records(struct mbuf *m)
 {
 	int page_count, records;
 
 	records = 0;
 	page_count = m->m_epg_enc_cnt;
 	while (page_count > 0) {
 		records++;
 		page_count -= m->m_epg_nrdy;
 		m = m->m_next;
 	}
 	KASSERT(page_count == 0, ("%s: mismatched page count", __func__));
 	return (records);
 }
 
 void
 ktls_enqueue(struct mbuf *m, struct socket *so, int page_count)
 {
 	struct ktls_session *tls;
 	struct ktls_wq *wq;
 	int queued;
 	bool running;
 
 	KASSERT(((m->m_flags & (M_EXTPG | M_NOTREADY)) ==
 	    (M_EXTPG | M_NOTREADY)),
 	    ("ktls_enqueue: %p not unready & nomap mbuf\n", m));
 	KASSERT(page_count != 0, ("enqueueing TLS mbuf with zero page count"));
 
 	KASSERT(m->m_epg_tls->mode == TCP_TLS_MODE_SW, ("!SW TLS mbuf"));
 
 	m->m_epg_enc_cnt = page_count;
 
 	/*
 	 * Save a pointer to the socket.  The caller is responsible
 	 * for taking an additional reference via soref().
 	 */
 	m->m_epg_so = so;
 
 	queued = 1;
 	tls = m->m_epg_tls;
 	wq = &ktls_wq[tls->wq_index];
 	mtx_lock(&wq->mtx);
 	if (__predict_false(tls->sequential_records)) {
 		/*
 		 * For TLS 1.0, records must be encrypted
 		 * sequentially.  For a given connection, all records
 		 * queued to the associated work queue are processed
 		 * sequentially.  However, sendfile(2) might complete
 		 * I/O requests spanning multiple TLS records out of
 		 * order.  Here we ensure TLS records are enqueued to
 		 * the work queue in FIFO order.
 		 *
 		 * tls->next_seqno holds the sequence number of the
 		 * next TLS record that should be enqueued to the work
 		 * queue.  If this next record is not tls->next_seqno,
 		 * it must be a future record, so insert it, sorted by
 		 * TLS sequence number, into tls->pending_records and
 		 * return.
 		 *
 		 * If this TLS record matches tls->next_seqno, place
 		 * it in the work queue and then check
 		 * tls->pending_records to see if any
 		 * previously-queued records are now ready for
 		 * encryption.
 		 */
 		if (m->m_epg_seqno != tls->next_seqno) {
 			struct mbuf *n, *p;
 
 			p = NULL;
 			STAILQ_FOREACH(n, &tls->pending_records, m_epg_stailq) {
 				if (n->m_epg_seqno > m->m_epg_seqno)
 					break;
 				p = n;
 			}
 			if (n == NULL)
 				STAILQ_INSERT_TAIL(&tls->pending_records, m,
 				    m_epg_stailq);
 			else if (p == NULL)
 				STAILQ_INSERT_HEAD(&tls->pending_records, m,
 				    m_epg_stailq);
 			else
 				STAILQ_INSERT_AFTER(&tls->pending_records, p, m,
 				    m_epg_stailq);
 			mtx_unlock(&wq->mtx);
 			counter_u64_add(ktls_cnt_tx_pending, 1);
 			return;
 		}
 
 		tls->next_seqno += ktls_batched_records(m);
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 		while (!STAILQ_EMPTY(&tls->pending_records)) {
 			struct mbuf *n;
 
 			n = STAILQ_FIRST(&tls->pending_records);
 			if (n->m_epg_seqno != tls->next_seqno)
 				break;
 
 			queued++;
 			STAILQ_REMOVE_HEAD(&tls->pending_records, m_epg_stailq);
 			tls->next_seqno += ktls_batched_records(n);
 			STAILQ_INSERT_TAIL(&wq->m_head, n, m_epg_stailq);
 		}
 		counter_u64_add(ktls_cnt_tx_pending, -(queued - 1));
 	} else
 		STAILQ_INSERT_TAIL(&wq->m_head, m, m_epg_stailq);
 
 	running = wq->running;
 	mtx_unlock(&wq->mtx);
 	if (!running)
 		wakeup(wq);
 	counter_u64_add(ktls_cnt_tx_queued, queued);
 }
 
 /*
  * Once a file-backed mbuf (from sendfile) has been encrypted, free
  * the pages from the file and replace them with the anonymous pages
  * allocated in ktls_encrypt_record().
  */
 static void
 ktls_finish_nonanon(struct mbuf *m, struct ktls_ocf_encrypt_state *state)
 {
 	int i;
 
 	MPASS((m->m_epg_flags & EPG_FLAG_ANON) == 0);
 
 	/* Free the old pages. */
 	m->m_ext.ext_free(m);
 
 	/* Replace them with the new pages. */
 	if (state->cbuf != NULL) {
 		for (i = 0; i < m->m_epg_npgs; i++)
 			m->m_epg_pa[i] = state->parray[0] + ptoa(i);
 
 		/* Contig pages should go back to the cache. */
 		m->m_ext.ext_free = ktls_free_mext_contig;
 	} else {
 		for (i = 0; i < m->m_epg_npgs; i++)
 			m->m_epg_pa[i] = state->parray[i];
 
 		/* Use the basic free routine. */
 		m->m_ext.ext_free = mb_free_mext_pgs;
 	}
 
 	/* Pages are now writable. */
 	m->m_epg_flags |= EPG_FLAG_ANON;
 }
 
 static __noinline void
 ktls_encrypt(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int error, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	/*
 	 * Encrypt the TLS records in the chain of mbufs starting with
 	 * 'top'.  'total_pages' gives us a total count of pages and is
 	 * used to know when we have finished encrypting the TLS
 	 * records originally queued with 'top'.
 	 *
 	 * NB: These mbufs are queued in the socket buffer and
 	 * 'm_next' is traversing the mbufs in the socket buffer.  The
 	 * socket buffer lock is not held while traversing this chain.
 	 * Since the mbufs are all marked M_NOTREADY their 'm_next'
 	 * pointers should be stable.  However, the 'm_next' of the
 	 * last mbuf encrypted is not necessarily NULL.  It can point
 	 * to other mbufs appended while 'top' was on the TLS work
 	 * queue.
 	 *
 	 * Each mbuf holds an entire TLS record.
 	 */
 	error = 0;
 	for (m = top; npages != total_pages; m = m->m_next) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		error = ktls_encrypt_record(wq, m, tls, &state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			break;
 		}
 
 		if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 			ktls_finish_nonanon(m, &state);
 
 		npages += m->m_epg_nrdy;
 
 		/*
 		 * Drop a reference to the session now that it is no
 		 * longer needed.  Existing code depends on encrypted
 		 * records having no associated session vs
 		 * yet-to-be-encrypted records having an associated
 		 * session.
 		 */
 		m->m_epg_tls = NULL;
 		ktls_free(tls);
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error == 0) {
-		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, top, npages);
+		(void)so->so_proto->pr_ready(so, top, npages);
 	} else {
-		so->so_proto->pr_usrreqs->pru_abort(so);
+		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(top, total_pages);
 	}
 
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 void
 ktls_encrypt_cb(struct ktls_ocf_encrypt_state *state, int error)
 {
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m;
 	int npages;
 
 	m = state->m;
 
 	if ((m->m_epg_flags & EPG_FLAG_ANON) == 0)
 		ktls_finish_nonanon(m, state);
 
 	so = state->so;
 	free(state, M_KTLS);
 
 	/*
 	 * Drop a reference to the session now that it is no longer
 	 * needed.  Existing code depends on encrypted records having
 	 * no associated session vs yet-to-be-encrypted records having
 	 * an associated session.
 	 */
 	tls = m->m_epg_tls;
 	m->m_epg_tls = NULL;
 	ktls_free(tls);
 
 	if (error != 0)
 		counter_u64_add(ktls_offload_failed_crypto, 1);
 
 	CURVNET_SET(so->so_vnet);
 	npages = m->m_epg_nrdy;
 
 	if (error == 0) {
-		(void)(*so->so_proto->pr_usrreqs->pru_ready)(so, m, npages);
+		(void)so->so_proto->pr_ready(so, m, npages);
 	} else {
-		so->so_proto->pr_usrreqs->pru_abort(so);
+		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, npages);
 	}
 
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 /*
  * Similar to ktls_encrypt, but used with asynchronous OCF backends
  * (coprocessors) where encryption does not use host CPU resources and
  * it can be beneficial to queue more requests than CPUs.
  */
 static __noinline void
 ktls_encrypt_async(struct ktls_wq *wq, struct mbuf *top)
 {
 	struct ktls_ocf_encrypt_state *state;
 	struct ktls_session *tls;
 	struct socket *so;
 	struct mbuf *m, *n;
 	int error, mpages, npages, total_pages;
 
 	so = top->m_epg_so;
 	tls = top->m_epg_tls;
 	KASSERT(tls != NULL, ("tls = NULL, top = %p\n", top));
 	KASSERT(so != NULL, ("so = NULL, top = %p\n", top));
 #ifdef INVARIANTS
 	top->m_epg_so = NULL;
 #endif
 	total_pages = top->m_epg_enc_cnt;
 	npages = 0;
 
 	error = 0;
 	for (m = top; npages != total_pages; m = n) {
 		KASSERT(m->m_epg_tls == tls,
 		    ("different TLS sessions in a single mbuf chain: %p vs %p",
 		    tls, m->m_epg_tls));
 		KASSERT(npages + m->m_epg_npgs <= total_pages,
 		    ("page count mismatch: top %p, total_pages %d, m %p", top,
 		    total_pages, m));
 
 		state = malloc(sizeof(*state), M_KTLS, M_WAITOK | M_ZERO);
 		soref(so);
 		state->so = so;
 		state->m = m;
 
 		mpages = m->m_epg_nrdy;
 		n = m->m_next;
 
 		error = ktls_encrypt_record(wq, m, tls, state);
 		if (error) {
 			counter_u64_add(ktls_offload_failed_crypto, 1);
 			free(state, M_KTLS);
 			CURVNET_SET(so->so_vnet);
 			sorele(so);
 			CURVNET_RESTORE();
 			break;
 		}
 
 		npages += mpages;
 	}
 
 	CURVNET_SET(so->so_vnet);
 	if (error != 0) {
-		so->so_proto->pr_usrreqs->pru_abort(so);
+		so->so_proto->pr_abort(so);
 		so->so_error = EIO;
 		mb_free_notready(m, total_pages - npages);
 	}
 
 	sorele(so);
 	CURVNET_RESTORE();
 }
 
 static int
 ktls_bind_domain(int domain)
 {
 	int error;
 
 	error = cpuset_setthread(curthread->td_tid, &cpuset_domain[domain]);
 	if (error != 0)
 		return (error);
 	curthread->td_domain.dr_policy = DOMAINSET_PREF(domain);
 	return (0);
 }
 
 static void
 ktls_alloc_thread(void *ctx)
 {
 	struct ktls_domain_info *ktls_domain = ctx;
 	struct ktls_alloc_thread *sc = &ktls_domain->alloc_td;
 	void **buf;
 	struct sysctl_oid *oid;
 	char name[80];
 	int domain, error, i, nbufs;
 
 	domain = ktls_domain - ktls_domains;
 	if (bootverbose)
 		printf("Starting KTLS alloc thread for domain %d\n", domain);
 	error = ktls_bind_domain(domain);
 	if (error)
 		printf("Unable to bind KTLS alloc thread for domain %d: error %d\n",
 		    domain, error);
 	snprintf(name, sizeof(name), "domain%d", domain);
 	oid = SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_kern_ipc_tls), OID_AUTO,
 	    name, CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, "");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "allocs",
 	    CTLFLAG_RD,  &sc->allocs, 0, "buffers allocated");
 	SYSCTL_ADD_U64(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "wakeups",
 	    CTLFLAG_RD,  &sc->wakeups, 0, "thread wakeups");
 	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(oid), OID_AUTO, "running",
 	    CTLFLAG_RD,  &sc->running, 0, "thread running");
 
 	buf = NULL;
 	nbufs = 0;
 	for (;;) {
 		atomic_store_int(&sc->running, 0);
 		tsleep(sc, PZERO | PNOLOCK, "-",  0);
 		atomic_store_int(&sc->running, 1);
 		sc->wakeups++;
 		if (nbufs != ktls_max_alloc) {
 			free(buf, M_KTLS);
 			nbufs = atomic_load_int(&ktls_max_alloc);
 			buf = malloc(sizeof(void *) * nbufs, M_KTLS,
 			    M_WAITOK | M_ZERO);
 		}
 		/*
 		 * Below we allocate nbufs with different allocation
 		 * flags than we use when allocating normally during
 		 * encryption in the ktls worker thread.  We specify
 		 * M_NORECLAIM in the worker thread. However, we omit
 		 * that flag here and add M_WAITOK so that the VM
 		 * system is permitted to perform expensive work to
 		 * defragment memory.  We do this here, as it does not
 		 * matter if this thread blocks.  If we block a ktls
 		 * worker thread, we risk developing backlogs of
 		 * buffers to be encrypted, leading to surges of
 		 * traffic and potential NIC output drops.
 		 */
 		for (i = 0; i < nbufs; i++) {
 			buf[i] = uma_zalloc(ktls_buffer_zone, M_WAITOK);
 			sc->allocs++;
 		}
 		for (i = 0; i < nbufs; i++) {
 			uma_zfree(ktls_buffer_zone, buf[i]);
 			buf[i] = NULL;
 		}
 	}
 }
 
 static void
 ktls_work_thread(void *ctx)
 {
 	struct ktls_wq *wq = ctx;
 	struct mbuf *m, *n;
 	struct socket *so, *son;
 	STAILQ_HEAD(, mbuf) local_m_head;
 	STAILQ_HEAD(, socket) local_so_head;
 	int cpu;
 
 	cpu = wq - ktls_wq;
 	if (bootverbose)
 		printf("Starting KTLS worker thread for CPU %d\n", cpu);
 
 	/*
 	 * Bind to a core.  If ktls_bind_threads is > 1, then
 	 * we bind to the NUMA domain instead.
 	 */
 	if (ktls_bind_threads) {
 		int error;
 
 		if (ktls_bind_threads > 1) {
 			struct pcpu *pc = pcpu_find(cpu);
 
 			error = ktls_bind_domain(pc->pc_domain);
 		} else {
 			cpuset_t mask;
 
 			CPU_SETOF(cpu, &mask);
 			error = cpuset_setthread(curthread->td_tid, &mask);
 		}
 		if (error)
 			printf("Unable to bind KTLS worker thread for CPU %d: error %d\n",
 				cpu, error);
 	}
 #if defined(__aarch64__) || defined(__amd64__) || defined(__i386__)
 	fpu_kern_thread(0);
 #endif
 	for (;;) {
 		mtx_lock(&wq->mtx);
 		while (STAILQ_EMPTY(&wq->m_head) &&
 		    STAILQ_EMPTY(&wq->so_head)) {
 			wq->running = false;
 			mtx_sleep(wq, &wq->mtx, 0, "-", 0);
 			wq->running = true;
 		}
 
 		STAILQ_INIT(&local_m_head);
 		STAILQ_CONCAT(&local_m_head, &wq->m_head);
 		STAILQ_INIT(&local_so_head);
 		STAILQ_CONCAT(&local_so_head, &wq->so_head);
 		mtx_unlock(&wq->mtx);
 
 		STAILQ_FOREACH_SAFE(m, &local_m_head, m_epg_stailq, n) {
 			if (m->m_epg_flags & EPG_FLAG_2FREE) {
 				ktls_free(m->m_epg_tls);
 				m_free_raw(m);
 			} else {
 				if (m->m_epg_tls->sync_dispatch)
 					ktls_encrypt(wq, m);
 				else
 					ktls_encrypt_async(wq, m);
 				counter_u64_add(ktls_cnt_tx_queued, -1);
 			}
 		}
 
 		STAILQ_FOREACH_SAFE(so, &local_so_head, so_ktls_rx_list, son) {
 			ktls_decrypt(so);
 			counter_u64_add(ktls_cnt_rx_queued, -1);
 		}
 	}
 }
 
 #if defined(INET) || defined(INET6)
 static void
 ktls_disable_ifnet_help(void *context, int pending __unused)
 {
 	struct ktls_session *tls;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct socket *so;
 	int err;
 
 	tls = context;
 	inp = tls->inp;
 	if (inp == NULL)
 		return;
 	INP_WLOCK(inp);
 	so = inp->inp_socket;
 	MPASS(so != NULL);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		goto out;
 	}
 
 	if (so->so_snd.sb_tls_info != NULL)
 		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
 	else
 		err = ENXIO;
 	if (err == 0) {
 		counter_u64_add(ktls_ifnet_disable_ok, 1);
 		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
 		    (tp = intotcpcb(inp)) != NULL &&
 		    tp->t_fb->tfb_hwtls_change != NULL)
 			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
 	} else {
 		counter_u64_add(ktls_ifnet_disable_fail, 1);
 	}
 
 out:
 	sorele(so);
 	if (!in_pcbrele_wlocked(inp))
 		INP_WUNLOCK(inp);
 	ktls_free(tls);
 }
 
 /*
  * Called when re-transmits are becoming a substantial portion of the
  * sends on this connection.  When this happens, we transition the
  * connection to software TLS.  This is needed because most inline TLS
  * NICs keep crypto state only for in-order transmits.  This means
  * that to handle a TCP rexmit (which is out-of-order), the NIC must
  * re-DMA the entire TLS record up to and including the current
  * segment.  This means that when re-transmitting the last ~1448 byte
  * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
  * of magnitude more data than we are sending.  This can cause the
  * PCIe link to saturate well before the network, which can cause
  * output drops, and a general loss of capacity.
  */
 void
 ktls_disable_ifnet(void *arg)
 {
 	struct tcpcb *tp;
 	struct inpcb *inp;
 	struct socket *so;
 	struct ktls_session *tls;
 
 	tp = arg;
 	inp = tp->t_inpcb;
 	INP_WLOCK_ASSERT(inp);
 	so = inp->inp_socket;
 	SOCK_LOCK(so);
 	tls = so->so_snd.sb_tls_info;
 	if (tls->disable_ifnet_pending) {
 		SOCK_UNLOCK(so);
 		return;
 	}
 
 	/*
 	 * note that disable_ifnet_pending is never cleared; disabling
 	 * ifnet can only be done once per session, so we never want
 	 * to do it again
 	 */
 
 	(void)ktls_hold(tls);
 	in_pcbref(inp);
 	soref(so);
 	tls->disable_ifnet_pending = true;
 	tls->inp = inp;
 	SOCK_UNLOCK(so);
 	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
 	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
 }
 #endif
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index bbca2d8663d7..bf22c0245f24 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -1,4499 +1,4319 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2004 The FreeBSD Foundation
  * Copyright (c) 2004-2008 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
  */
 
 /*
  * Comments on the socket life cycle:
  *
  * soalloc() sets of socket layer state for a socket, called only by
  * socreate() and sonewconn().  Socket layer private.
  *
  * sodealloc() tears down socket layer state for a socket, called only by
  * sofree() and sonewconn().  Socket layer private.
  *
  * pru_attach() associates protocol layer state with an allocated socket;
  * called only once, may fail, aborting socket allocation.  This is called
  * from socreate() and sonewconn().  Socket layer private.
  *
  * pru_detach() disassociates protocol layer state from an attached socket,
  * and will be called exactly once for sockets in which pru_attach() has
  * been successfully called.  If pru_attach() returned an error,
  * pru_detach() will not be called.  Socket layer private.
  *
  * pru_abort() and pru_close() notify the protocol layer that the last
  * consumer of a socket is starting to tear down the socket, and that the
  * protocol should terminate the connection.  Historically, pru_abort() also
  * detached protocol state from the socket state, but this is no longer the
  * case.
  *
  * socreate() creates a socket and attaches protocol state.  This is a public
  * interface that may be used by socket layer consumers to create new
  * sockets.
  *
  * sonewconn() creates a socket and attaches protocol state.  This is a
  * public interface  that may be used by protocols to create new sockets when
  * a new connection is received and will be available for accept() on a
  * listen socket.
  *
  * soclose() destroys a socket after possibly waiting for it to disconnect.
  * This is a public interface that socket consumers should use to close and
  * release a socket when done with it.
  *
  * soabort() destroys a socket without waiting for it to disconnect (used
  * only for incoming connections that are already partially or fully
  * connected).  This is used internally by the socket layer when clearing
  * listen socket queues (due to overflow or close on the listen socket), but
  * is also a public interface protocols may use to abort connections in
  * their incomplete listen queues should they no longer be required.  Sockets
  * placed in completed connection listen queues should not be aborted for
  * reasons described in the comment above the soclose() implementation.  This
  * is not a general purpose close routine, and except in the specific
  * circumstances described here, should not be used.
  *
  * sofree() will free a socket and its protocol state if all references on
  * the socket have been released, and is the public interface to attempt to
  * free a socket when a reference is removed.  This is a socket layer private
  * interface.
  *
  * NOTE: In addition to socreate() and soclose(), which provide a single
  * socket reference to the consumer to be managed as required, there are two
  * calls to explicitly manage socket references, soref(), and sorele().
  * Currently, these are generally required only when transitioning a socket
  * from a listen queue to a file descriptor, in order to prevent garbage
  * collection of the socket at an untimely moment.  For a number of reasons,
  * these interfaces are not preferred, and should be avoided.
  *
  * NOTE: With regard to VNETs the general rule is that callers do not set
  * curvnet. Exceptions to this rule include soabort(), sodisconnect(),
  * sofree() (and with that sorele(), sotryfree()), as well as sonewconn()
  * and sorflush(), which are usually called from a pre-set VNET context.
  * sopoll() currently does not need a VNET context to be set.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_kern_tls.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/fcntl.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/domain.h>
 #include <sys/file.h>			/* for struct knote */
 #include <sys/hhook.h>
 #include <sys/kernel.h>
 #include <sys/khelp.h>
 #include <sys/ktls.h>
 #include <sys/event.h>
 #include <sys/eventhandler.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sbuf.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/resourcevar.h>
 #include <net/route.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/jail.h>
 #include <sys/syslog.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 
 #include <net/vnet.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <sys/sysent.h>
 #include <compat/freebsd32/freebsd32.h>
 #endif
 
 static int	soreceive_rcvoob(struct socket *so, struct uio *uio,
 		    int flags);
 static void	so_rdknl_lock(void *);
 static void	so_rdknl_unlock(void *);
 static void	so_rdknl_assert_lock(void *, int);
 static void	so_wrknl_lock(void *);
 static void	so_wrknl_unlock(void *);
 static void	so_wrknl_assert_lock(void *, int);
 
 static void	filt_sordetach(struct knote *kn);
 static int	filt_soread(struct knote *kn, long hint);
 static void	filt_sowdetach(struct knote *kn);
 static int	filt_sowrite(struct knote *kn, long hint);
 static int	filt_soempty(struct knote *kn, long hint);
 static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
 fo_kqfilter_t	soo_kqfilter;
 
 static struct filterops soread_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sordetach,
 	.f_event = filt_soread,
 };
 static struct filterops sowrite_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sowdetach,
 	.f_event = filt_sowrite,
 };
 static struct filterops soempty_filtops = {
 	.f_isfd = 1,
 	.f_detach = filt_sowdetach,
 	.f_event = filt_soempty,
 };
 
 so_gen_t	so_gencnt;	/* generation count for sockets */
 
 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 
 #define	VNET_SO_ASSERT(so)						\
 	VNET_ASSERT(curvnet != NULL,					\
 	    ("%s:%d curvnet is NULL, so=%p", __func__, __LINE__, (so)));
 
 VNET_DEFINE(struct hhook_head *, socket_hhh[HHOOK_SOCKET_LAST + 1]);
 #define	V_socket_hhh		VNET(socket_hhh)
 
 /*
  * Limit on the number of connections in the listen queue waiting
  * for accept(2).
  * NB: The original sysctl somaxconn is still available but hidden
  * to prevent confusion about the actual purpose of this number.
  */
 static u_int somaxconn = SOMAXCONN;
 
 static int
 sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int val;
 
 	val = somaxconn;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 
 	/*
 	 * The purpose of the UINT_MAX / 3 limit, is so that the formula
 	 *   3 * so_qlimit / 2
 	 * below, will not overflow.
          */
 
 	if (val < 1 || val > UINT_MAX / 3)
 		return (EINVAL);
 
 	somaxconn = val;
 	return (0);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, soacceptqueue,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, sizeof(int),
     sysctl_somaxconn, "I",
     "Maximum listen socket pending connection accept queue size");
 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
     CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_SKIP | CTLFLAG_MPSAFE, 0,
     sizeof(int), sysctl_somaxconn, "I",
     "Maximum listen socket pending connection accept queue size (compat)");
 
 static int numopensockets;
 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
     &numopensockets, 0, "Number of open sockets");
 
 /*
  * accept_mtx locks down per-socket fields relating to accept queues.  See
  * socketvar.h for an annotation of the protected fields of struct socket.
  */
 struct mtx accept_mtx;
 MTX_SYSINIT(accept_mtx, &accept_mtx, "accept", MTX_DEF);
 
 /*
  * so_global_mtx protects so_gencnt, numopensockets, and the per-socket
  * so_gencnt field.
  */
 static struct mtx so_global_mtx;
 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 
 /*
  * General IPC sysctl name space, used by sockets and a variety of other IPC
  * types.
  */
 SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPC");
 
 /*
  * Initialize the socket subsystem and set up the socket
  * memory allocator.
  */
 static uma_zone_t socket_zone;
 int	maxsockets;
 
 static void
 socket_zone_change(void *tag)
 {
 
 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 }
 
 static void
 socket_hhook_register(int subtype)
 {
 
 	if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype,
 	    &V_socket_hhh[subtype],
 	    HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register hook\n", __func__);
 }
 
 static void
 socket_hhook_deregister(int subtype)
 {
 
 	if (hhook_head_deregister(V_socket_hhh[subtype]) != 0)
 		printf("%s: WARNING: unable to deregister hook\n", __func__);
 }
 
 static void
 socket_init(void *tag)
 {
 
 	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	maxsockets = uma_zone_set_max(socket_zone, maxsockets);
 	uma_zone_set_warning(socket_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
 	    EVENTHANDLER_PRI_FIRST);
 }
 SYSINIT(socket, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, socket_init, NULL);
 
 static void
 socket_vnet_init(const void *unused __unused)
 {
 	int i;
 
 	/* We expect a contiguous range */
 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 		socket_hhook_register(i);
 }
 VNET_SYSINIT(socket_vnet_init, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
     socket_vnet_init, NULL);
 
 static void
 socket_vnet_uninit(const void *unused __unused)
 {
 	int i;
 
 	for (i = 0; i <= HHOOK_SOCKET_LAST; i++)
 		socket_hhook_deregister(i);
 }
 VNET_SYSUNINIT(socket_vnet_uninit, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY,
     socket_vnet_uninit, NULL);
 
 /*
  * Initialise maxsockets.  This SYSINIT must be run after
  * tunable_mbinit().
  */
 static void
 init_maxsockets(void *ignored)
 {
 
 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
 	maxsockets = imax(maxsockets, maxfiles);
 }
 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 
 /*
  * Sysctl to get and set the maximum global sockets limit.  Notify protocols
  * of the change so that they can update their dependent limits as required.
  */
 static int
 sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 {
 	int error, newmaxsockets;
 
 	newmaxsockets = maxsockets;
 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 	if (error == 0 && req->newptr && newmaxsockets != maxsockets) {
 		if (newmaxsockets > maxsockets &&
 		    newmaxsockets <= maxfiles) {
 			maxsockets = newmaxsockets;
 			EVENTHANDLER_INVOKE(maxsockets_change);
 		} else
 			error = EINVAL;
 	}
 	return (error);
 }
 SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &maxsockets, 0,
     sysctl_maxsockets, "IU",
     "Maximum number of sockets available");
 
 /*
  * Socket operation routines.  These routines are called by the routines in
  * sys_socket.c or from a system process, and implement the semantics of
  * socket operations by switching out to the protocol specific routines.
  */
 
 /*
  * Get a socket structure from our zone, and initialize it.  Note that it
  * would probably be better to allocate socket and PCB at the same time, but
  * I'm not convinced that all the protocols can be easily modified to do
  * this.
  *
  * soalloc() returns a socket with a ref count of 0.
  */
 static struct socket *
 soalloc(struct vnet *vnet)
 {
 	struct socket *so;
 
 	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
 	if (so == NULL)
 		return (NULL);
 #ifdef MAC
 	if (mac_socket_init(so, M_NOWAIT) != 0) {
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 #endif
 	if (khelp_init_osd(HELPER_CLASS_SOCKET, &so->osd)) {
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 
 	/*
 	 * The socket locking protocol allows to lock 2 sockets at a time,
 	 * however, the first one must be a listening socket.  WITNESS lacks
 	 * a feature to change class of an existing lock, so we use DUPOK.
 	 */
 	mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
 	mtx_init(&so->so_snd_mtx, "so_snd", NULL, MTX_DEF);
 	mtx_init(&so->so_rcv_mtx, "so_rcv", NULL, MTX_DEF);
 	so->so_rcv.sb_sel = &so->so_rdsel;
 	so->so_snd.sb_sel = &so->so_wrsel;
 	sx_init(&so->so_snd_sx, "so_snd_sx");
 	sx_init(&so->so_rcv_sx, "so_rcv_sx");
 	TAILQ_INIT(&so->so_snd.sb_aiojobq);
 	TAILQ_INIT(&so->so_rcv.sb_aiojobq);
 	TASK_INIT(&so->so_snd.sb_aiotask, 0, soaio_snd, so);
 	TASK_INIT(&so->so_rcv.sb_aiotask, 0, soaio_rcv, so);
 #ifdef VIMAGE
 	VNET_ASSERT(vnet != NULL, ("%s:%d vnet is NULL, so=%p",
 	    __func__, __LINE__, so));
 	so->so_vnet = vnet;
 #endif
 	/* We shouldn't need the so_global_mtx */
 	if (hhook_run_socket(so, NULL, HHOOK_SOCKET_CREATE)) {
 		/* Do we need more comprehensive error returns? */
 		uma_zfree(socket_zone, so);
 		return (NULL);
 	}
 	mtx_lock(&so_global_mtx);
 	so->so_gencnt = ++so_gencnt;
 	++numopensockets;
 #ifdef VIMAGE
 	vnet->vnet_sockcnt++;
 #endif
 	mtx_unlock(&so_global_mtx);
 
 	return (so);
 }
 
 /*
  * Free the storage associated with a socket at the socket layer, tear down
  * locks, labels, etc.  All protocol state is assumed already to have been
  * torn down (and possibly never set up) by the caller.
  */
 void
 sodealloc(struct socket *so)
 {
 
 	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
 	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
 
 	mtx_lock(&so_global_mtx);
 	so->so_gencnt = ++so_gencnt;
 	--numopensockets;	/* Could be below, but faster here. */
 #ifdef VIMAGE
 	VNET_ASSERT(so->so_vnet != NULL, ("%s:%d so_vnet is NULL, so=%p",
 	    __func__, __LINE__, so));
 	so->so_vnet->vnet_sockcnt--;
 #endif
 	mtx_unlock(&so_global_mtx);
 #ifdef MAC
 	mac_socket_destroy(so);
 #endif
 	hhook_run_socket(so, NULL, HHOOK_SOCKET_CLOSE);
 
 	khelp_destroy_osd(&so->osd);
 	if (SOLISTENING(so)) {
 		if (so->sol_accept_filter != NULL)
 			accept_filt_setopt(so, NULL);
 	} else {
 		if (so->so_rcv.sb_hiwat)
 			(void)chgsbsize(so->so_cred->cr_uidinfo,
 			    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
 		if (so->so_snd.sb_hiwat)
 			(void)chgsbsize(so->so_cred->cr_uidinfo,
 			    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
 		sx_destroy(&so->so_snd_sx);
 		sx_destroy(&so->so_rcv_sx);
 		mtx_destroy(&so->so_snd_mtx);
 		mtx_destroy(&so->so_rcv_mtx);
 	}
 	crfree(so->so_cred);
 	mtx_destroy(&so->so_lock);
 	uma_zfree(socket_zone, so);
 }
 
 /*
  * socreate returns a socket with a ref count of 1 and a file descriptor
  * reference.  The socket should be closed with soclose().
  */
 int
 socreate(int dom, struct socket **aso, int type, int proto,
     struct ucred *cred, struct thread *td)
 {
 	struct protosw *prp;
 	struct socket *so;
 	int error;
 
 	if (proto)
 		prp = pffindproto(dom, proto, type);
 	else
 		prp = pffindtype(dom, type);
 
 	if (prp == NULL) {
 		/* No support for domain. */
 		if (pffinddomain(dom) == NULL)
 			return (EAFNOSUPPORT);
 		/* No support for socket type. */
 		if (proto == 0 && type != 0)
 			return (EPROTOTYPE);
 		return (EPROTONOSUPPORT);
 	}
-	if (prp->pr_usrreqs->pru_attach == NULL ||
-	    prp->pr_usrreqs->pru_attach == pru_attach_notsupp)
-		return (EPROTONOSUPPORT);
+
+	MPASS(prp->pr_attach);
 
 	if (IN_CAPABILITY_MODE(td) && (prp->pr_flags & PR_CAPATTACH) == 0)
 		return (ECAPMODE);
 
 	if (prison_check_af(cred, prp->pr_domain->dom_family) != 0)
 		return (EPROTONOSUPPORT);
 
 	if (prp->pr_type != type)
 		return (EPROTOTYPE);
 	so = soalloc(CRED_TO_VNET(cred));
 	if (so == NULL)
 		return (ENOBUFS);
 
 	so->so_type = type;
 	so->so_cred = crhold(cred);
 	if ((prp->pr_domain->dom_family == PF_INET) ||
 	    (prp->pr_domain->dom_family == PF_INET6) ||
 	    (prp->pr_domain->dom_family == PF_ROUTE))
 		so->so_fibnum = td->td_proc->p_fibnum;
 	else
 		so->so_fibnum = 0;
 	so->so_proto = prp;
 #ifdef MAC
 	mac_socket_create(cred, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	if ((prp->pr_flags & PR_SOCKBUF) == 0) {
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 	}
 	/*
 	 * Auto-sizing of socket buffers is managed by the protocols and
 	 * the appropriate flags must be set in the pru_attach function.
 	 */
 	CURVNET_SET(so->so_vnet);
-	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
+	error = prp->pr_attach(so, proto, td);
 	CURVNET_RESTORE();
 	if (error) {
 		sodealloc(so);
 		return (error);
 	}
 	soref(so);
 	*aso = so;
 	return (0);
 }
 
 #ifdef REGRESSION
 static int regression_sonewconn_earlytest = 1;
 SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
     &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
 #endif
 
 static struct timeval overinterval = { 60, 0 };
 SYSCTL_TIMEVAL_SEC(_kern_ipc, OID_AUTO, sooverinterval, CTLFLAG_RW,
     &overinterval,
     "Delay in seconds between warnings for listen socket overflows");
 
 /*
  * When an attempt at a new connection is noted on a socket which supports
  * accept(2), the protocol has two options:
  * 1) Call legacy sonewconn() function, which would call protocol attach
  *    method, same as used for socket(2).
  * 2) Call solisten_clone(), do attach that is specific to a cloned connection,
  *    and then call solisten_enqueue().
  *
  * Note: the ref count on the socket is 0 on return.
  */
 struct socket *
 solisten_clone(struct socket *head)
 {
 	struct sbuf descrsb;
 	struct socket *so;
 	int len, overcount;
 	u_int qlen;
 	const char localprefix[] = "local:";
 	char descrbuf[SUNPATHLEN + sizeof(localprefix)];
 #if defined(INET6)
 	char addrbuf[INET6_ADDRSTRLEN];
 #elif defined(INET)
 	char addrbuf[INET_ADDRSTRLEN];
 #endif
 	bool dolog, over;
 
 	SOLISTEN_LOCK(head);
 	over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
 #ifdef REGRESSION
 	if (regression_sonewconn_earlytest && over) {
 #else
 	if (over) {
 #endif
 		head->sol_overcount++;
 		dolog = !!ratecheck(&head->sol_lastover, &overinterval);
 
 		/*
 		 * If we're going to log, copy the overflow count and queue
 		 * length from the listen socket before dropping the lock.
 		 * Also, reset the overflow count.
 		 */
 		if (dolog) {
 			overcount = head->sol_overcount;
 			head->sol_overcount = 0;
 			qlen = head->sol_qlen;
 		}
 		SOLISTEN_UNLOCK(head);
 
 		if (dolog) {
 			/*
 			 * Try to print something descriptive about the
 			 * socket for the error message.
 			 */
 			sbuf_new(&descrsb, descrbuf, sizeof(descrbuf),
 			    SBUF_FIXEDLEN);
 			switch (head->so_proto->pr_domain->dom_family) {
 #if defined(INET) || defined(INET6)
 #ifdef INET
 			case AF_INET:
 #endif
 #ifdef INET6
 			case AF_INET6:
 				if (head->so_proto->pr_domain->dom_family ==
 				    AF_INET6 ||
 				    (sotoinpcb(head)->inp_inc.inc_flags &
 				    INC_ISIPV6)) {
 					ip6_sprintf(addrbuf,
 					    &sotoinpcb(head)->inp_inc.inc6_laddr);
 					sbuf_printf(&descrsb, "[%s]", addrbuf);
 				} else
 #endif
 				{
 #ifdef INET
 					inet_ntoa_r(
 					    sotoinpcb(head)->inp_inc.inc_laddr,
 					    addrbuf);
 					sbuf_cat(&descrsb, addrbuf);
 #endif
 				}
 				sbuf_printf(&descrsb, ":%hu (proto %u)",
 				    ntohs(sotoinpcb(head)->inp_inc.inc_lport),
 				    head->so_proto->pr_protocol);
 				break;
 #endif /* INET || INET6 */
 			case AF_UNIX:
 				sbuf_cat(&descrsb, localprefix);
 				if (sotounpcb(head)->unp_addr != NULL)
 					len =
 					    sotounpcb(head)->unp_addr->sun_len -
 					    offsetof(struct sockaddr_un,
 					    sun_path);
 				else
 					len = 0;
 				if (len > 0)
 					sbuf_bcat(&descrsb,
 					    sotounpcb(head)->unp_addr->sun_path,
 					    len);
 				else
 					sbuf_cat(&descrsb, "(unknown)");
 				break;
 			}
 
 			/*
 			 * If we can't print something more specific, at least
 			 * print the domain name.
 			 */
 			if (sbuf_finish(&descrsb) != 0 ||
 			    sbuf_len(&descrsb) <= 0) {
 				sbuf_clear(&descrsb);
 				sbuf_cat(&descrsb,
 				    head->so_proto->pr_domain->dom_name ?:
 				    "unknown");
 				sbuf_finish(&descrsb);
 			}
 			KASSERT(sbuf_len(&descrsb) > 0,
 			    ("%s: sbuf creation failed", __func__));
 			/*
 			 * Preserve the historic listen queue overflow log
 			 * message, that starts with "sonewconn:".  It has
 			 * been known to sysadmins for years and also test
 			 * sys/kern/sonewconn_overflow checks for it.
 			 */
 			if (head->so_cred == 0) {
 				log(LOG_DEBUG, "sonewconn: pcb %p (%s): "
 				    "Listen queue overflow: %i already in "
 				    "queue awaiting acceptance (%d "
 				    "occurrences)\n", head->so_pcb,
 				    sbuf_data(&descrsb),
 			    	qlen, overcount);
 			} else {
 				log(LOG_DEBUG, "sonewconn: pcb %p (%s): "
 				    "Listen queue overflow: "
 				    "%i already in queue awaiting acceptance "
 				    "(%d occurrences), euid %d, rgid %d, jail %s\n",
 				    head->so_pcb, sbuf_data(&descrsb), qlen,
 				    overcount, head->so_cred->cr_uid,
 				    head->so_cred->cr_rgid,
 				    head->so_cred->cr_prison ?
 					head->so_cred->cr_prison->pr_name :
 					"not_jailed");
 			}
 			sbuf_delete(&descrsb);
 
 			overcount = 0;
 		}
 
 		return (NULL);
 	}
 	SOLISTEN_UNLOCK(head);
 	VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
 	    __func__, head));
 	so = soalloc(head->so_vnet);
 	if (so == NULL) {
 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 		    "limit reached or out of memory\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_listen = head;
 	so->so_type = head->so_type;
 	so->so_options = head->so_options & ~SO_ACCEPTCONN;
 	so->so_linger = head->so_linger;
 	so->so_state = head->so_state;
 	so->so_fibnum = head->so_fibnum;
 	so->so_proto = head->so_proto;
 	so->so_cred = crhold(head->so_cred);
 #ifdef MAC
 	mac_socket_newconn(head, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	VNET_SO_ASSERT(head);
 	if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
 	so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
 	so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
 	so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
 	so->so_rcv.sb_flags = head->sol_sbrcv_flags & SB_AUTOSIZE;
 	so->so_snd.sb_flags = head->sol_sbsnd_flags & SB_AUTOSIZE;
 	if ((so->so_proto->pr_flags & PR_SOCKBUF) == 0) {
 		so->so_snd.sb_mtx = &so->so_snd_mtx;
 		so->so_rcv.sb_mtx = &so->so_rcv_mtx;
 	}
 
 	return (so);
 }
 
 /* Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED. */
 struct socket *
 sonewconn(struct socket *head, int connstatus)
 {
 	struct socket *so;
 
 	if ((so = solisten_clone(head)) == NULL)
 		return (NULL);
 
-	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+	if (so->so_proto->pr_attach(so, 0, NULL) != 0) {
 		sodealloc(so);
-		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+		log(LOG_DEBUG, "%s: pcb %p: pr_attach() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 
 	solisten_enqueue(so, connstatus);
 
 	return (so);
 }
 
 /*
  * Enqueue socket cloned by solisten_clone() to the listen queue of the
  * listener it has been cloned from.
  */
 void
 solisten_enqueue(struct socket *so, int connstatus)
 {
 	struct socket *head = so->so_listen;
 
 	MPASS(refcount_load(&so->so_count) == 0);
 	refcount_init(&so->so_count, 1);
 
 	SOLISTEN_LOCK(head);
 	if (head->sol_accept_filter != NULL)
 		connstatus = 0;
 	so->so_state |= connstatus;
 	soref(head); /* A socket on (in)complete queue refs head. */
 	if (connstatus) {
 		TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 		so->so_qstate = SQ_COMP;
 		head->sol_qlen++;
 		solisten_wakeup(head);	/* unlocks */
 	} else {
 		/*
 		 * Keep removing sockets from the head until there's room for
 		 * us to insert on the tail.  In pre-locking revisions, this
 		 * was a simple if(), but as we could be racing with other
 		 * threads and soabort() requires dropping locks, we must
 		 * loop waiting for the condition to be true.
 		 */
 		while (head->sol_incqlen > head->sol_qlimit) {
 			struct socket *sp;
 
 			sp = TAILQ_FIRST(&head->sol_incomp);
 			TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
 			head->sol_incqlen--;
 			SOCK_LOCK(sp);
 			sp->so_qstate = SQ_NONE;
 			sp->so_listen = NULL;
 			SOCK_UNLOCK(sp);
 			sorele_locked(head);	/* does SOLISTEN_UNLOCK, head stays */
 			soabort(sp);
 			SOLISTEN_LOCK(head);
 		}
 		TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
 		so->so_qstate = SQ_INCOMP;
 		head->sol_incqlen++;
 		SOLISTEN_UNLOCK(head);
 	}
 }
 
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 /*
  * Socket part of sctp_peeloff().  Detach a new socket from an
  * association.  The new socket is returned with a reference.
  *
  * XXXGL: reduce copy-paste with solisten_clone().
  */
 struct socket *
 sopeeloff(struct socket *head)
 {
 	struct socket *so;
 
 	VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
 	    __func__, __LINE__, head));
 	so = soalloc(head->so_vnet);
 	if (so == NULL) {
 		log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
 		    "limit reached or out of memory\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_type = head->so_type;
 	so->so_options = head->so_options;
 	so->so_linger = head->so_linger;
 	so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
 	so->so_fibnum = head->so_fibnum;
 	so->so_proto = head->so_proto;
 	so->so_cred = crhold(head->so_cred);
 #ifdef MAC
 	mac_socket_newconn(head, so);
 #endif
 	knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
 	    so_rdknl_assert_lock);
 	knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
 	    so_wrknl_assert_lock);
 	VNET_SO_ASSERT(head);
 	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
-	if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+	if ((*so->so_proto->pr_attach)(so, 0, NULL)) {
 		sodealloc(so);
 		log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
 		    __func__, head->so_pcb);
 		return (NULL);
 	}
 	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
 	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
 	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 
 	soref(so);
 
 	return (so);
 }
 #endif	/* SCTP */
 
 int
 sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td);
+	error = so->so_proto->pr_bind(so, nam, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 sobindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_bindat)(fd, so, nam, td);
+	error = so->so_proto->pr_bindat(fd, so, nam, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * solisten() transitions a socket from a non-listening state to a listening
  * state, but can also be used to update the listen queue depth on an
  * existing listen socket.  The protocol will call back into the sockets
  * layer using solisten_proto_check() and solisten_proto() to check and set
  * socket-layer listen state.  Call backs are used so that the protocol can
  * acquire both protocol and socket layer locks in whatever order is required
  * by the protocol.
  *
  * Protocol implementors are advised to hold the socket lock across the
  * socket-layer test and set to avoid races at the socket layer.
  */
 int
 solisten(struct socket *so, int backlog, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td);
+	error = so->so_proto->pr_listen(so, backlog, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Prepare for a call to solisten_proto().  Acquire all socket buffer locks in
  * order to interlock with socket I/O.
  */
 int
 solisten_proto_check(struct socket *so)
 {
 	SOCK_LOCK_ASSERT(so);
 
 	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 	    SS_ISDISCONNECTING)) != 0)
 		return (EINVAL);
 
 	/*
 	 * Sleeping is not permitted here, so simply fail if userspace is
 	 * attempting to transmit or receive on the socket.  This kind of
 	 * transient failure is not ideal, but it should occur only if userspace
 	 * is misusing the socket interfaces.
 	 */
 	if (!sx_try_xlock(&so->so_snd_sx))
 		return (EAGAIN);
 	if (!sx_try_xlock(&so->so_rcv_sx)) {
 		sx_xunlock(&so->so_snd_sx);
 		return (EAGAIN);
 	}
 	mtx_lock(&so->so_snd_mtx);
 	mtx_lock(&so->so_rcv_mtx);
 
 	/* Interlock with soo_aio_queue(). */
 	if (!SOLISTENING(so) &&
 	   ((so->so_snd.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0 ||
 	   (so->so_rcv.sb_flags & (SB_AIO | SB_AIO_RUNNING)) != 0)) {
 		solisten_proto_abort(so);
 		return (EINVAL);
 	}
 	return (0);
 }
 
 /*
  * Undo the setup done by solisten_proto_check().
  */
 void
 solisten_proto_abort(struct socket *so)
 {
 	mtx_unlock(&so->so_snd_mtx);
 	mtx_unlock(&so->so_rcv_mtx);
 	sx_xunlock(&so->so_snd_sx);
 	sx_xunlock(&so->so_rcv_sx);
 }
 
 void
 solisten_proto(struct socket *so, int backlog)
 {
 	int sbrcv_lowat, sbsnd_lowat;
 	u_int sbrcv_hiwat, sbsnd_hiwat;
 	short sbrcv_flags, sbsnd_flags;
 	sbintime_t sbrcv_timeo, sbsnd_timeo;
 
 	SOCK_LOCK_ASSERT(so);
 	KASSERT((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
 	    SS_ISDISCONNECTING)) == 0,
 	    ("%s: bad socket state %p", __func__, so));
 
 	if (SOLISTENING(so))
 		goto listening;
 
 	/*
 	 * Change this socket to listening state.
 	 */
 	sbrcv_lowat = so->so_rcv.sb_lowat;
 	sbsnd_lowat = so->so_snd.sb_lowat;
 	sbrcv_hiwat = so->so_rcv.sb_hiwat;
 	sbsnd_hiwat = so->so_snd.sb_hiwat;
 	sbrcv_flags = so->so_rcv.sb_flags;
 	sbsnd_flags = so->so_snd.sb_flags;
 	sbrcv_timeo = so->so_rcv.sb_timeo;
 	sbsnd_timeo = so->so_snd.sb_timeo;
 
 	sbdestroy(so, SO_SND);
 	sbdestroy(so, SO_RCV);
 
 #ifdef INVARIANTS
 	bzero(&so->so_rcv,
 	    sizeof(struct socket) - offsetof(struct socket, so_rcv));
 #endif
 
 	so->sol_sbrcv_lowat = sbrcv_lowat;
 	so->sol_sbsnd_lowat = sbsnd_lowat;
 	so->sol_sbrcv_hiwat = sbrcv_hiwat;
 	so->sol_sbsnd_hiwat = sbsnd_hiwat;
 	so->sol_sbrcv_flags = sbrcv_flags;
 	so->sol_sbsnd_flags = sbsnd_flags;
 	so->sol_sbrcv_timeo = sbrcv_timeo;
 	so->sol_sbsnd_timeo = sbsnd_timeo;
 
 	so->sol_qlen = so->sol_incqlen = 0;
 	TAILQ_INIT(&so->sol_incomp);
 	TAILQ_INIT(&so->sol_comp);
 
 	so->sol_accept_filter = NULL;
 	so->sol_accept_filter_arg = NULL;
 	so->sol_accept_filter_str = NULL;
 
 	so->sol_upcall = NULL;
 	so->sol_upcallarg = NULL;
 
 	so->so_options |= SO_ACCEPTCONN;
 
 listening:
 	if (backlog < 0 || backlog > somaxconn)
 		backlog = somaxconn;
 	so->sol_qlimit = backlog;
 
 	mtx_unlock(&so->so_snd_mtx);
 	mtx_unlock(&so->so_rcv_mtx);
 	sx_xunlock(&so->so_snd_sx);
 	sx_xunlock(&so->so_rcv_sx);
 }
 
 /*
  * Wakeup listeners/subsystems once we have a complete connection.
  * Enters with lock, returns unlocked.
  */
 void
 solisten_wakeup(struct socket *sol)
 {
 
 	if (sol->sol_upcall != NULL)
 		(void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
 	else {
 		selwakeuppri(&sol->so_rdsel, PSOCK);
 		KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
 	}
 	SOLISTEN_UNLOCK(sol);
 	wakeup_one(&sol->sol_comp);
 	if ((sol->so_state & SS_ASYNC) && sol->so_sigio != NULL)
 		pgsigio(&sol->so_sigio, SIGIO, 0);
 }
 
 /*
  * Return single connection off a listening socket queue.  Main consumer of
  * the function is kern_accept4().  Some modules, that do their own accept
  * management also use the function.  The socket reference held by the
  * listen queue is handed to the caller.
  *
  * Listening socket must be locked on entry and is returned unlocked on
  * return.
  * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
  */
 int
 solisten_dequeue(struct socket *head, struct socket **ret, int flags)
 {
 	struct socket *so;
 	int error;
 
 	SOLISTEN_LOCK_ASSERT(head);
 
 	while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
 	    head->so_error == 0) {
 		error = msleep(&head->sol_comp, SOCK_MTX(head), PSOCK | PCATCH,
 		    "accept", 0);
 		if (error != 0) {
 			SOLISTEN_UNLOCK(head);
 			return (error);
 		}
 	}
 	if (head->so_error) {
 		error = head->so_error;
 		head->so_error = 0;
 	} else if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp))
 		error = EWOULDBLOCK;
 	else
 		error = 0;
 	if (error) {
 		SOLISTEN_UNLOCK(head);
 		return (error);
 	}
 	so = TAILQ_FIRST(&head->sol_comp);
 	SOCK_LOCK(so);
 	KASSERT(so->so_qstate == SQ_COMP,
 	    ("%s: so %p not SQ_COMP", __func__, so));
 	head->sol_qlen--;
 	so->so_qstate = SQ_NONE;
 	so->so_listen = NULL;
 	TAILQ_REMOVE(&head->sol_comp, so, so_list);
 	if (flags & ACCEPT4_INHERIT)
 		so->so_state |= (head->so_state & SS_NBIO);
 	else
 		so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
 	SOCK_UNLOCK(so);
 	sorele_locked(head);
 
 	*ret = so;
 	return (0);
 }
 
 /*
  * Free socket upon release of the very last reference.
  */
 static void
 sofree(struct socket *so)
 {
 	struct protosw *pr = so->so_proto;
 
 	SOCK_LOCK_ASSERT(so);
 	KASSERT(refcount_load(&so->so_count) == 0,
 	    ("%s: so %p has references", __func__, so));
 	KASSERT(SOLISTENING(so) || so->so_qstate == SQ_NONE,
 	    ("%s: so %p is on listen queue", __func__, so));
 
 	SOCK_UNLOCK(so);
 
 	if (so->so_dtor != NULL)
 		so->so_dtor(so);
 
 	VNET_SO_ASSERT(so);
 	if ((pr->pr_flags & PR_RIGHTS) && !SOLISTENING(so)) {
 		MPASS(pr->pr_domain->dom_dispose != NULL);
 		(*pr->pr_domain->dom_dispose)(so);
 	}
-	if (pr->pr_usrreqs->pru_detach != NULL)
-		(*pr->pr_usrreqs->pru_detach)(so);
+	if (pr->pr_detach != NULL)
+		pr->pr_detach(so);
 
 	/*
 	 * From this point on, we assume that no other references to this
 	 * socket exist anywhere else in the stack.  Therefore, no locks need
 	 * to be acquired or held.
 	 */
 	if (!(pr->pr_flags & PR_SOCKBUF) && !SOLISTENING(so)) {
 		sbdestroy(so, SO_SND);
 		sbdestroy(so, SO_RCV);
 	}
 	seldrain(&so->so_rdsel);
 	seldrain(&so->so_wrsel);
 	knlist_destroy(&so->so_rdsel.si_note);
 	knlist_destroy(&so->so_wrsel.si_note);
 	sodealloc(so);
 }
 
 /*
  * Release a reference on a socket while holding the socket lock.
  * Unlocks the socket lock before returning.
  */
 void
 sorele_locked(struct socket *so)
 {
 	SOCK_LOCK_ASSERT(so);
 	if (refcount_release(&so->so_count))
 		sofree(so);
 	else
 		SOCK_UNLOCK(so);
 }
 
 /*
  * Close a socket on last file table reference removal.  Initiate disconnect
  * if connected.  Free socket when disconnect complete.
  *
  * This function will sorele() the socket.  Note that soclose() may be called
  * prior to the ref count reaching zero.  The actual socket structure will
  * not be freed until the ref count reaches zero.
  */
 int
 soclose(struct socket *so)
 {
 	struct accept_queue lqueue;
 	int error = 0;
 	bool listening, last __diagused;
 
 	CURVNET_SET(so->so_vnet);
 	funsetown(&so->so_sigio);
 	if (so->so_state & SS_ISCONNECTED) {
 		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
 			error = sodisconnect(so);
 			if (error) {
 				if (error == ENOTCONN)
 					error = 0;
 				goto drop;
 			}
 		}
 
 		if ((so->so_options & SO_LINGER) != 0 && so->so_linger != 0) {
 			if ((so->so_state & SS_ISDISCONNECTING) &&
 			    (so->so_state & SS_NBIO))
 				goto drop;
 			while (so->so_state & SS_ISCONNECTED) {
 				error = tsleep(&so->so_timeo,
 				    PSOCK | PCATCH, "soclos",
 				    so->so_linger * hz);
 				if (error)
 					break;
 			}
 		}
 	}
 
 drop:
-	if (so->so_proto->pr_usrreqs->pru_close != NULL)
-		(*so->so_proto->pr_usrreqs->pru_close)(so);
+	if (so->so_proto->pr_close != NULL)
+		so->so_proto->pr_close(so);
 
 	SOCK_LOCK(so);
 	if ((listening = SOLISTENING(so))) {
 		struct socket *sp;
 
 		TAILQ_INIT(&lqueue);
 		TAILQ_SWAP(&lqueue, &so->sol_incomp, socket, so_list);
 		TAILQ_CONCAT(&lqueue, &so->sol_comp, so_list);
 
 		so->sol_qlen = so->sol_incqlen = 0;
 
 		TAILQ_FOREACH(sp, &lqueue, so_list) {
 			SOCK_LOCK(sp);
 			sp->so_qstate = SQ_NONE;
 			sp->so_listen = NULL;
 			SOCK_UNLOCK(sp);
 			last = refcount_release(&so->so_count);
 			KASSERT(!last, ("%s: released last reference for %p",
 			    __func__, so));
 		}
 	}
 	sorele_locked(so);
 	if (listening) {
 		struct socket *sp, *tsp;
 
 		TAILQ_FOREACH_SAFE(sp, &lqueue, so_list, tsp)
 			soabort(sp);
 	}
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * soabort() is used to abruptly tear down a connection, such as when a
  * resource limit is reached (listen queue depth exceeded), or if a listen
  * socket is closed while there are sockets waiting to be accepted.
  *
  * This interface is tricky, because it is called on an unreferenced socket,
  * and must be called only by a thread that has actually removed the socket
  * from the listen queue it was on.  Likely this thread holds the last
  * reference on the socket and soabort() will proceed with sofree().  But
  * it might be not the last, as the sockets on the listen queues are seen
  * from the protocol side.
  *
  * This interface will call into the protocol code, so must not be called
  * with any socket locks held.  Protocols do call it while holding their own
  * recursible protocol mutexes, but this is something that should be subject
  * to review in the future.
  *
  * Usually socket should have a single reference left, but this is not a
  * requirement.  In the past, when we have had named references for file
  * descriptor and protocol, we asserted that none of them are being held.
  */
 void
 soabort(struct socket *so)
 {
 
 	VNET_SO_ASSERT(so);
 
-	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
-		(*so->so_proto->pr_usrreqs->pru_abort)(so);
+	if (so->so_proto->pr_abort != NULL)
+		so->so_proto->pr_abort(so);
 	SOCK_LOCK(so);
 	sorele_locked(so);
 }
 
 int
 soaccept(struct socket *so, struct sockaddr **nam)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
+	error = so->so_proto->pr_accept(so, nam);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (soconnectat(AT_FDCWD, so, nam, td));
 }
 
 int
 soconnectat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
 	/*
 	 * If protocol is connection-based, can only connect once.
 	 * Otherwise, if connected, try to disconnect first.  This allows
 	 * user to disconnect by connecting to, e.g., a null address.
 	 */
 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
 	    (error = sodisconnect(so)))) {
 		error = EISCONN;
 	} else {
 		/*
 		 * Prevent accumulated error from previous connection from
 		 * biting us.
 		 */
 		so->so_error = 0;
 		if (fd == AT_FDCWD) {
-			error = (*so->so_proto->pr_usrreqs->pru_connect)(so,
-			    nam, td);
+			error = so->so_proto->pr_connect(so, nam, td);
 		} else {
-			error = (*so->so_proto->pr_usrreqs->pru_connectat)(fd,
-			    so, nam, td);
+			error = so->so_proto->pr_connectat(fd, so, nam, td);
 		}
 	}
 	CURVNET_RESTORE();
 
 	return (error);
 }
 
 int
 soconnect2(struct socket *so1, struct socket *so2)
 {
 	int error;
 
 	CURVNET_SET(so1->so_vnet);
-	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
+	error = so1->so_proto->pr_connect2(so1, so2);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 sodisconnect(struct socket *so)
 {
 	int error;
 
 	if ((so->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 	if (so->so_state & SS_ISDISCONNECTING)
 		return (EALREADY);
 	VNET_SO_ASSERT(so);
-	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
+	error = so->so_proto->pr_disconnect(so);
 	return (error);
 }
 
 int
 sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	long space;
 	ssize_t resid;
 	int clen = 0, error, dontroute;
 
 	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
 	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
 	    ("sosend_dgram: !PR_ATOMIC"));
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
 		resid = top->m_pkthdr.len;
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 */
 	if (resid < 0) {
 		error = EINVAL;
 		goto out;
 	}
 
 	dontroute =
 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
 
 	SOCKBUF_LOCK(&so->so_snd);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCKBUF_UNLOCK(&so->so_snd);
 		error = EPIPE;
 		goto out;
 	}
 	if (so->so_error) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCKBUF_UNLOCK(&so->so_snd);
 		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		/*
 		 * `sendto' and `sendmsg' is allowed on a connection-based
 		 * socket if it supports implied connect.  Return ENOTCONN if
 		 * not connected and no address is supplied.
 		 */
 		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 			    !(resid == 0 && clen != 0)) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = ENOTCONN;
 				goto out;
 			}
 		} else if (addr == NULL) {
 			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
 				error = ENOTCONN;
 			else
 				error = EDESTADDRREQ;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto out;
 		}
 	}
 
 	/*
 	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
 	 * problem and need fixing.
 	 */
 	space = sbspace(&so->so_snd);
 	if (flags & MSG_OOB)
 		space += 1024;
 	space -= clen;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (resid > space) {
 		error = EMSGSIZE;
 		goto out;
 	}
 	if (uio == NULL) {
 		resid = 0;
 		if (flags & MSG_EOR)
 			top->m_flags |= M_EOR;
 	} else {
 		/*
 		 * Copy the data from userland into a mbuf chain.
 		 * If no data is to be copied in, a single empty mbuf
 		 * is returned.
 		 */
 		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
 		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
 		if (top == NULL) {
 			error = EFAULT;	/* only possible error */
 			goto out;
 		}
 		space -= resid - uio->uio_resid;
 		resid = uio->uio_resid;
 	}
 	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
 	/*
 	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
 	 * than with.
 	 */
 	if (dontroute) {
 		SOCK_LOCK(so);
 		so->so_options |= SO_DONTROUTE;
 		SOCK_UNLOCK(so);
 	}
 	/*
 	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
 	 * of date.  We could have received a reset packet in an interrupt or
 	 * maybe we slept while doing page faults in uiomove() etc.  We could
 	 * probably recheck again inside the locking protection here, but
 	 * there are probably other places that this also happens.  We must
 	 * rethink this.
 	 */
 	VNET_SO_ASSERT(so);
-	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
-	    (flags & MSG_OOB) ? PRUS_OOB :
+	error = so->so_proto->pr_send(so, (flags & MSG_OOB) ? PRUS_OOB :
 	/*
 	 * If the user set MSG_EOF, the protocol understands this flag and
 	 * nothing left to send then use PRU_SEND_EOF instead of PRU_SEND.
 	 */
 	    ((flags & MSG_EOF) &&
 	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
 	     (resid <= 0)) ?
 		PRUS_EOF :
 		/* If there is more to send set PRUS_MORETOCOME */
 		(flags & MSG_MORETOCOME) ||
 		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
 		top, addr, control, td);
 	if (dontroute) {
 		SOCK_LOCK(so);
 		so->so_options &= ~SO_DONTROUTE;
 		SOCK_UNLOCK(so);
 	}
 	clen = 0;
 	control = NULL;
 	top = NULL;
 out:
 	if (top != NULL)
 		m_freem(top);
 	if (control != NULL)
 		m_freem(control);
 	return (error);
 }
 
 /*
  * Send on a socket.  If send must go all at once and message is larger than
  * send buffering, then hard error.  Lock against other senders.  If must go
  * all at once and not enough room now, then inform user that this would
  * block and do nothing.  Otherwise, if nonblocking, send as much as
  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
  * in mbuf chain must be small enough to send all at once.
  *
  * Returns nonzero on error, timeout or signal; callers must check for short
  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
  * on return.
  */
 int
 sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	long space;
 	ssize_t resid;
 	int clen = 0, error, dontroute;
 	int atomic = sosendallatonce(so) || top;
-	int pru_flag;
+	int pr_send_flag;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
-	int tls_enq_cnt, tls_pruflag;
+	int tls_enq_cnt, tls_send_flag;
 	uint8_t tls_rtype;
 
 	tls = NULL;
 	tls_rtype = TLS_RLTYPE_APP;
 #endif
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else if ((top->m_flags & M_PKTHDR) != 0)
 		resid = top->m_pkthdr.len;
 	else
 		resid = m_length(top, NULL);
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 *
 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 	 * type sockets since that's an error.
 	 */
 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
 		error = EINVAL;
 		goto out;
 	}
 
 	dontroute =
 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
 	    (so->so_proto->pr_flags & PR_ATOMIC);
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
 
 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		goto out;
 
 #ifdef KERN_TLS
-	tls_pruflag = 0;
+	tls_send_flag = 0;
 	tls = ktls_hold(so->so_snd.sb_tls_info);
 	if (tls != NULL) {
 		if (tls->mode == TCP_TLS_MODE_SW)
-			tls_pruflag = PRUS_NOTREADY;
+			tls_send_flag = PRUS_NOTREADY;
 
 		if (control != NULL) {
 			struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 
 			if (clen >= sizeof(*cm) &&
 			    cm->cmsg_type == TLS_SET_RECORD_TYPE) {
 				tls_rtype = *((uint8_t *)CMSG_DATA(cm));
 				clen = 0;
 				m_freem(control);
 				control = NULL;
 				atomic = 1;
 			}
 		}
 
 		if (resid == 0 && !ktls_permit_empty_frames(tls)) {
 			error = EINVAL;
 			goto release;
 		}
 	}
 #endif
 
 restart:
 	do {
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EPIPE;
 			goto release;
 		}
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto release;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			/*
 			 * `sendto' and `sendmsg' is allowed on a connection-
 			 * based socket if it supports implied connect.
 			 * Return ENOTCONN if not connected and no address is
 			 * supplied.
 			 */
 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
 				    !(resid == 0 && clen != 0)) {
 					SOCKBUF_UNLOCK(&so->so_snd);
 					error = ENOTCONN;
 					goto release;
 				}
 			} else if (addr == NULL) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
 					error = ENOTCONN;
 				else
 					error = EDESTADDRREQ;
 				goto release;
 			}
 		}
 		space = sbspace(&so->so_snd);
 		if (flags & MSG_OOB)
 			space += 1024;
 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
 		    clen > so->so_snd.sb_hiwat) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EMSGSIZE;
 			goto release;
 		}
 		if (space < resid + clen &&
 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
 			if ((so->so_state & SS_NBIO) ||
 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EWOULDBLOCK;
 				goto release;
 			}
 			error = sbwait(so, SO_SND);
 			SOCKBUF_UNLOCK(&so->so_snd);
 			if (error)
 				goto release;
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 		space -= clen;
 		do {
 			if (uio == NULL) {
 				resid = 0;
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 #ifdef KERN_TLS
 				if (tls != NULL) {
 					ktls_frame(top, tls, &tls_enq_cnt,
 					    tls_rtype);
 					tls_rtype = TLS_RLTYPE_APP;
 				}
 #endif
 			} else {
 				/*
 				 * Copy the data from userland into a mbuf
 				 * chain.  If resid is 0, which can happen
 				 * only if we have control to send, then
 				 * a single empty mbuf is returned.  This
 				 * is a workaround to prevent protocol send
 				 * methods to panic.
 				 */
 #ifdef KERN_TLS
 				if (tls != NULL) {
 					top = m_uiotombuf(uio, M_WAITOK, space,
 					    tls->params.max_frame_len,
 					    M_EXTPG |
 					    ((flags & MSG_EOR) ? M_EOR : 0));
 					if (top != NULL) {
 						ktls_frame(top, tls,
 						    &tls_enq_cnt, tls_rtype);
 					}
 					tls_rtype = TLS_RLTYPE_APP;
 				} else
 #endif
 					top = m_uiotombuf(uio, M_WAITOK, space,
 					    (atomic ? max_hdr : 0),
 					    (atomic ? M_PKTHDR : 0) |
 					    ((flags & MSG_EOR) ? M_EOR : 0));
 				if (top == NULL) {
 					error = EFAULT; /* only possible error */
 					goto release;
 				}
 				space -= resid - uio->uio_resid;
 				resid = uio->uio_resid;
 			}
 			if (dontroute) {
 				SOCK_LOCK(so);
 				so->so_options |= SO_DONTROUTE;
 				SOCK_UNLOCK(so);
 			}
 			/*
 			 * XXX all the SBS_CANTSENDMORE checks previously
 			 * done could be out of date.  We could have received
 			 * a reset packet in an interrupt or maybe we slept
 			 * while doing page faults in uiomove() etc.  We
 			 * could probably recheck again inside the locking
 			 * protection here, but there are probably other
 			 * places that this also happens.  We must rethink
 			 * this.
 			 */
 			VNET_SO_ASSERT(so);
 
-			pru_flag = (flags & MSG_OOB) ? PRUS_OOB :
+			pr_send_flag = (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * If the user set MSG_EOF, the protocol understands
 			 * this flag and nothing left to send then use
 			 * PRU_SEND_EOF instead of PRU_SEND.
 			 */
 			    ((flags & MSG_EOF) &&
 			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
 			     (resid <= 0)) ?
 				PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME. */
 			    (flags & MSG_MORETOCOME) ||
 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
 
 #ifdef KERN_TLS
-			pru_flag |= tls_pruflag;
+			pr_send_flag |= tls_send_flag;
 #endif
 
-			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
-			    pru_flag, top, addr, control, td);
+			error = so->so_proto->pr_send(so, pr_send_flag, top,
+			    addr, control, td);
 
 			if (dontroute) {
 				SOCK_LOCK(so);
 				so->so_options &= ~SO_DONTROUTE;
 				SOCK_UNLOCK(so);
 			}
 
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
 				if (error != 0) {
 					m_freem(top);
 					top = NULL;
 				} else {
 					soref(so);
 					ktls_enqueue(top, so, tls_enq_cnt);
 				}
 			}
 #endif
 			clen = 0;
 			control = NULL;
 			top = NULL;
 			if (error)
 				goto release;
 		} while (resid && space > 0);
 	} while (resid);
 
 release:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 #ifdef KERN_TLS
 	if (tls != NULL)
 		ktls_free(tls);
 #endif
 	if (top != NULL)
 		m_freem(top);
 	if (control != NULL)
 		m_freem(control);
 	return (error);
 }
 
 int
 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio,
+	error = so->so_proto->pr_sosend(so, addr, uio,
 	    top, control, flags, td);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
  * data from a socket.  For more complete comments, see soreceive(), from
  * which this code originated.
  *
  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  * unable to return an mbuf chain to the caller.
  */
 static int
 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 	struct protosw *pr = so->so_proto;
 	struct mbuf *m;
 	int error;
 
 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 	VNET_SO_ASSERT(so);
 
 	m = m_get(M_WAITOK, MT_DATA);
-	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
 	if (error)
 		goto bad;
 	do {
 		error = uiomove(mtod(m, void *),
 		    (int) min(uio->uio_resid, m->m_len), uio);
 		m = m_free(m);
 	} while (uio->uio_resid && error == 0 && m);
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Following replacement or removal of the first mbuf on the first mbuf chain
  * of a socket buffer, push necessary state changes back into the socket
  * buffer so that other consumers see the values consistently.  'nextrecord'
  * is the callers locally stored value of the original value of
  * sb->sb_mb->m_nextpkt which must be restored when the lead mbuf changes.
  * NOTE: 'nextrecord' may be NULL.
  */
 static __inline void
 sockbuf_pushsync(struct sockbuf *sb, struct mbuf *nextrecord)
 {
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	/*
 	 * First, update for the new value of nextrecord.  If necessary, make
 	 * it the first record.
 	 */
 	if (sb->sb_mb != NULL)
 		sb->sb_mb->m_nextpkt = nextrecord;
 	else
 		sb->sb_mb = nextrecord;
 
 	/*
 	 * Now update any dependent socket buffer fields to reflect the new
 	 * state.  This is an expanded inline of SB_EMPTY_FIXUP(), with the
 	 * addition of a second clause that takes care of the case where
 	 * sb_mb has been updated, but remains the last record.
 	 */
 	if (sb->sb_mb == NULL) {
 		sb->sb_mbtail = NULL;
 		sb->sb_lastrecord = NULL;
 	} else if (sb->sb_mb->m_nextpkt == NULL)
 		sb->sb_lastrecord = sb->sb_mb;
 }
 
 /*
  * Implement receive operations on a socket.  We depend on the way that
  * records are added to the sockbuf by sbappend.  In particular, each record
  * (mbufs linked through m_next) must begin with an address if the protocol
  * so specifies, followed by an optional mbuf or mbufs containing ancillary
  * data, and then zero or more mbufs of data.  In order to allow parallelism
  * between network receive and copying to user space, as well as avoid
  * sleeping with a mutex held, we release the socket buffer mutex during the
  * user space copy.  Although the sockbuf is locked, new data may still be
  * appended, and thus we must maintain consistency of the sockbuf during that
  * time.
  *
  * The caller may receive the data as a single mbuf chain by supplying an
  * mbuf **mp0 for use in returning the chain.  The uio is then used only for
  * the count in uio_resid.
  */
 int
 soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct mbuf *m, **mp;
 	int flags, error, offset;
 	ssize_t len;
 	struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 	int moff, type = 0;
 	ssize_t orig_resid = uio->uio_resid;
 	bool report_real_len = false;
 
 	mp = mp0;
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flagsp != NULL) {
 		report_real_len = *flagsp & MSG_TRUNC;
 		*flagsp &= ~MSG_TRUNC;
 		flags = *flagsp &~ MSG_EOR;
 	} else
 		flags = 0;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp != NULL)
 		*mp = NULL;
 	if ((pr->pr_flags & PR_WANTRCVD) && (so->so_state & SS_ISCONFIRMING)
 	    && uio->uio_resid) {
 		VNET_SO_ASSERT(so);
-		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
+		pr->pr_rcvd(so, 0);
 	}
 
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 
 restart:
 	SOCKBUF_LOCK(&so->so_rcv);
 	m = so->so_rcv.sb_mb;
 	/*
 	 * If we have less data than requested, block awaiting more (subject
 	 * to any timeout) if:
 	 *   1. the current count is less than the low water mark, or
 	 *   2. MSG_DONTWAIT is not set
 	 */
 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 	    sbavail(&so->so_rcv) < uio->uio_resid) &&
 	    sbavail(&so->so_rcv) < so->so_rcv.sb_lowat &&
 	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
 		KASSERT(m != NULL || !sbavail(&so->so_rcv),
 		    ("receive: m == %p sbavail == %u",
 		    m, sbavail(&so->so_rcv)));
 		if (so->so_error || so->so_rerror) {
 			if (m != NULL)
 				goto dontblock;
 			if (so->so_error)
 				error = so->so_error;
 			else
 				error = so->so_rerror;
 			if ((flags & MSG_PEEK) == 0) {
 				if (so->so_error)
 					so->so_error = 0;
 				else
 					so->so_rerror = 0;
 			}
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 			if (m != NULL)
 				goto dontblock;
 #ifdef KERN_TLS
 			else if (so->so_rcv.sb_tlsdcc == 0 &&
 			    so->so_rcv.sb_tlscc == 0) {
 #else
 			else {
 #endif
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
 			}
 		}
 		for (; m != NULL; m = m->m_next)
 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
 				m = so->so_rcv.sb_mb;
 				goto dontblock;
 			}
 		if ((so->so_state & (SS_ISCONNECTING | SS_ISCONNECTED |
 		    SS_ISDISCONNECTING | SS_ISDISCONNECTED)) == 0 &&
 		    (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = ENOTCONN;
 			goto release;
 		}
 		if (uio->uio_resid == 0 && !report_real_len) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
 		}
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = EWOULDBLOCK;
 			goto release;
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		error = sbwait(so, SO_RCV);
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		if (error)
 			goto release;
 		goto restart;
 	}
 dontblock:
 	/*
 	 * From this point onward, we maintain 'nextrecord' as a cache of the
 	 * pointer to the next record in the socket buffer.  We must keep the
 	 * various socket buffer pointers and local stack versions of the
 	 * pointers in sync, pushing out modifications before dropping the
 	 * socket buffer mutex, and re-reading them when picking it up.
 	 *
 	 * Otherwise, we will race with the network stack appending new data
 	 * or records onto the socket buffer by using inconsistent/stale
 	 * versions of the field, possibly resulting in socket buffer
 	 * corruption.
 	 *
 	 * By holding the high-level sblock(), we prevent simultaneous
 	 * readers from pulling off the front of the socket buffer.
 	 */
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	nextrecord = m->m_nextpkt;
 	if (pr->pr_flags & PR_ADDR) {
 		KASSERT(m->m_type == MT_SONAME,
 		    ("m->m_type == %d", m->m_type));
 		orig_resid = 0;
 		if (psa != NULL)
 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
 			    M_NOWAIT);
 		if (flags & MSG_PEEK) {
 			m = m->m_next;
 		} else {
 			sbfree(&so->so_rcv, m);
 			so->so_rcv.sb_mb = m_free(m);
 			m = so->so_rcv.sb_mb;
 			sockbuf_pushsync(&so->so_rcv, nextrecord);
 		}
 	}
 
 	/*
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 	 * just copy the data; if !MSG_PEEK, we call into the protocol to
 	 * perform externalization (or freeing if controlp == NULL).
 	 */
 	if (m != NULL && m->m_type == MT_CONTROL) {
 		struct mbuf *cm = NULL, *cmn;
 		struct mbuf **cme = &cm;
 #ifdef KERN_TLS
 		struct cmsghdr *cmsg;
 		struct tls_get_record tgr;
 
 		/*
 		 * For MSG_TLSAPPDATA, check for an alert record.
 		 * If found, return ENXIO without removing
 		 * it from the receive queue.  This allows a subsequent
 		 * call without MSG_TLSAPPDATA to receive it.
 		 * Note that, for TLS, there should only be a single
 		 * control mbuf with the TLS_GET_RECORD message in it.
 		 */
 		if (flags & MSG_TLSAPPDATA) {
 			cmsg = mtod(m, struct cmsghdr *);
 			if (cmsg->cmsg_type == TLS_GET_RECORD &&
 			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
 				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 				if (__predict_false(tgr.tls_type ==
 				    TLS_RLTYPE_ALERT)) {
 					SOCKBUF_UNLOCK(&so->so_rcv);
 					error = ENXIO;
 					goto release;
 				}
 			}
 		}
 #endif
 
 		do {
 			if (flags & MSG_PEEK) {
 				if (controlp != NULL) {
 					*controlp = m_copym(m, 0, m->m_len,
 					    M_NOWAIT);
 					controlp = &(*controlp)->m_next;
 				}
 				m = m->m_next;
 			} else {
 				sbfree(&so->so_rcv, m);
 				so->so_rcv.sb_mb = m->m_next;
 				m->m_next = NULL;
 				*cme = m;
 				cme = &(*cme)->m_next;
 				m = so->so_rcv.sb_mb;
 			}
 		} while (m != NULL && m->m_type == MT_CONTROL);
 		if ((flags & MSG_PEEK) == 0)
 			sockbuf_pushsync(&so->so_rcv, nextrecord);
 		while (cm != NULL) {
 			cmn = cm->m_next;
 			cm->m_next = NULL;
 			if (pr->pr_domain->dom_externalize != NULL) {
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				VNET_SO_ASSERT(so);
 				error = (*pr->pr_domain->dom_externalize)
 				    (cm, controlp, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
 			} else if (controlp != NULL)
 				*controlp = cm;
 			else
 				m_freem(cm);
 			if (controlp != NULL) {
 				while (*controlp != NULL)
 					controlp = &(*controlp)->m_next;
 			}
 			cm = cmn;
 		}
 		if (m != NULL)
 			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
 		else
 			nextrecord = so->so_rcv.sb_mb;
 		orig_resid = 0;
 	}
 	if (m != NULL) {
 		if ((flags & MSG_PEEK) == 0) {
 			KASSERT(m->m_nextpkt == nextrecord,
 			    ("soreceive: post-control, nextrecord !sync"));
 			if (nextrecord == NULL) {
 				KASSERT(so->so_rcv.sb_mb == m,
 				    ("soreceive: post-control, sb_mb!=m"));
 				KASSERT(so->so_rcv.sb_lastrecord == m,
 				    ("soreceive: post-control, lastrecord!=m"));
 			}
 		}
 		type = m->m_type;
 		if (type == MT_OOBDATA)
 			flags |= MSG_OOB;
 	} else {
 		if ((flags & MSG_PEEK) == 0) {
 			KASSERT(so->so_rcv.sb_mb == nextrecord,
 			    ("soreceive: sb_mb != nextrecord"));
 			if (so->so_rcv.sb_mb == NULL) {
 				KASSERT(so->so_rcv.sb_lastrecord == NULL,
 				    ("soreceive: sb_lastercord != NULL"));
 			}
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 
 	/*
 	 * Now continue to read any data mbufs off of the head of the socket
 	 * buffer until the read request is satisfied.  Note that 'type' is
 	 * used to store the type of any mbuf reads that have happened so far
 	 * such that soreceive() can stop reading if the type changes, which
 	 * causes soreceive() to return only one of regular data and inline
 	 * out-of-band data in a single socket receive operation.
 	 */
 	moff = 0;
 	offset = 0;
 	while (m != NULL && !(m->m_flags & M_NOTAVAIL) && uio->uio_resid > 0
 	    && error == 0) {
 		/*
 		 * If the type of mbuf has changed since the last mbuf
 		 * examined ('type'), end the receive operation.
 		 */
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (m->m_type == MT_OOBDATA || m->m_type == MT_CONTROL) {
 			if (type != m->m_type)
 				break;
 		} else if (type == MT_OOBDATA)
 			break;
 		else
 		    KASSERT(m->m_type == MT_DATA,
 			("m->m_type == %d", m->m_type));
 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 		len = uio->uio_resid;
 		if (so->so_oobmark && len > so->so_oobmark - offset)
 			len = so->so_oobmark - offset;
 		if (len > m->m_len - moff)
 			len = m->m_len - moff;
 		/*
 		 * If mp is set, just pass back the mbufs.  Otherwise copy
 		 * them out via the uio, then free.  Sockbuf must be
 		 * consistent here (points to current mbuf, it points to next
 		 * record) when we drop priority; we must note any additions
 		 * to the sockbuf when we block interrupts again.
 		 */
 		if (mp == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			if ((m->m_flags & M_EXTPG) != 0)
 				error = m_unmapped_uiomove(m, moff, uio,
 				    (int)len);
 			else
 				error = uiomove(mtod(m, char *) + moff,
 				    (int)len, uio);
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (error) {
 				/*
 				 * The MT_SONAME mbuf has already been removed
 				 * from the record, so it is necessary to
 				 * remove the data mbufs, if any, to preserve
 				 * the invariant in the case of PR_ADDR that
 				 * requires MT_SONAME mbufs at the head of
 				 * each record.
 				 */
 				if (pr->pr_flags & PR_ATOMIC &&
 				    ((flags & MSG_PEEK) == 0))
 					(void)sbdroprecord_locked(&so->so_rcv);
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
 			}
 		} else
 			uio->uio_resid -= len;
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (len == m->m_len - moff) {
 			if (m->m_flags & M_EOR)
 				flags |= MSG_EOR;
 			if (flags & MSG_PEEK) {
 				m = m->m_next;
 				moff = 0;
 			} else {
 				nextrecord = m->m_nextpkt;
 				sbfree(&so->so_rcv, m);
 				if (mp != NULL) {
 					m->m_nextpkt = NULL;
 					*mp = m;
 					mp = &m->m_next;
 					so->so_rcv.sb_mb = m = m->m_next;
 					*mp = NULL;
 				} else {
 					so->so_rcv.sb_mb = m_free(m);
 					m = so->so_rcv.sb_mb;
 				}
 				sockbuf_pushsync(&so->so_rcv, nextrecord);
 				SBLASTRECORDCHK(&so->so_rcv);
 				SBLASTMBUFCHK(&so->so_rcv);
 			}
 		} else {
 			if (flags & MSG_PEEK)
 				moff += len;
 			else {
 				if (mp != NULL) {
 					if (flags & MSG_DONTWAIT) {
 						*mp = m_copym(m, 0, len,
 						    M_NOWAIT);
 						if (*mp == NULL) {
 							/*
 							 * m_copym() couldn't
 							 * allocate an mbuf.
 							 * Adjust uio_resid back
 							 * (it was adjusted
 							 * down by len bytes,
 							 * which we didn't end
 							 * up "copying" over).
 							 */
 							uio->uio_resid += len;
 							break;
 						}
 					} else {
 						SOCKBUF_UNLOCK(&so->so_rcv);
 						*mp = m_copym(m, 0, len,
 						    M_WAITOK);
 						SOCKBUF_LOCK(&so->so_rcv);
 					}
 				}
 				sbcut_locked(&so->so_rcv, len);
 			}
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (so->so_oobmark) {
 			if ((flags & MSG_PEEK) == 0) {
 				so->so_oobmark -= len;
 				if (so->so_oobmark == 0) {
 					so->so_rcv.sb_state |= SBS_RCVATMARK;
 					break;
 				}
 			} else {
 				offset += len;
 				if (offset == so->so_oobmark)
 					break;
 			}
 		}
 		if (flags & MSG_EOR)
 			break;
 		/*
 		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
 		 * must not quit until "uio->uio_resid == 0" or an error
 		 * termination.  If a signal/timeout occurs, return with a
 		 * short count but without error.  Keep sockbuf locked
 		 * against other readers.
 		 */
 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 		    !sosendallatonce(so) && nextrecord == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 			if (so->so_error || so->so_rerror ||
 			    so->so_rcv.sb_state & SBS_CANTRCVMORE)
 				break;
 			/*
 			 * Notify the protocol that some data has been
 			 * drained before blocking.
 			 */
 			if (pr->pr_flags & PR_WANTRCVD) {
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				VNET_SO_ASSERT(so);
-				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
+				pr->pr_rcvd(so, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
 			}
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			/*
 			 * We could receive some data while was notifying
 			 * the protocol. Skip blocking in this case.
 			 */
 			if (so->so_rcv.sb_mb == NULL) {
 				error = sbwait(so, SO_RCV);
 				if (error) {
 					SOCKBUF_UNLOCK(&so->so_rcv);
 					goto release;
 				}
 			}
 			m = so->so_rcv.sb_mb;
 			if (m != NULL)
 				nextrecord = m->m_nextpkt;
 		}
 	}
 
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
 		if (report_real_len)
 			uio->uio_resid -= m_length(m, NULL) - moff;
 		flags |= MSG_TRUNC;
 		if ((flags & MSG_PEEK) == 0)
 			(void) sbdroprecord_locked(&so->so_rcv);
 	}
 	if ((flags & MSG_PEEK) == 0) {
 		if (m == NULL) {
 			/*
 			 * First part is an inline SB_EMPTY_FIXUP().  Second
 			 * part makes sure sb_lastrecord is up-to-date if
 			 * there is still data in the socket buffer.
 			 */
 			so->so_rcv.sb_mb = nextrecord;
 			if (so->so_rcv.sb_mb == NULL) {
 				so->so_rcv.sb_mbtail = NULL;
 				so->so_rcv.sb_lastrecord = NULL;
 			} else if (nextrecord->m_nextpkt == NULL)
 				so->so_rcv.sb_lastrecord = nextrecord;
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		/*
 		 * If soreceive() is being done from the socket callback,
 		 * then don't need to generate ACK to peer to update window,
 		 * since ACK will be generated on return to TCP.
 		 */
 		if (!(flags & MSG_SOCALLBCK) &&
 		    (pr->pr_flags & PR_WANTRCVD)) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			VNET_SO_ASSERT(so);
-			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
+			pr->pr_rcvd(so, flags);
 			SOCKBUF_LOCK(&so->so_rcv);
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (orig_resid == uio->uio_resid && orig_resid &&
 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		goto restart;
 	}
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	if (flagsp != NULL)
 		*flagsp |= flags;
 release:
 	SOCK_IO_RECV_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for stream (TCP) sockets.
  */
 int
 soreceive_stream(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int len = 0, error = 0, flags, oresid;
 	struct sockbuf *sb;
 	struct mbuf *m, *n = NULL;
 
 	/* We only do stream sockets. */
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (psa != NULL)
 		*psa = NULL;
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp0 != NULL)
 		*mp0 = NULL;
 
 	sb = &so->so_rcv;
 
 #ifdef KERN_TLS
 	/*
 	 * KTLS store TLS records as records with a control message to
 	 * describe the framing.
 	 *
 	 * We check once here before acquiring locks to optimize the
 	 * common case.
 	 */
 	if (sb->sb_tls_info != NULL)
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 #endif
 
 	/* Prevent other readers from entering the socket. */
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 	SOCKBUF_LOCK(sb);
 
 #ifdef KERN_TLS
 	if (sb->sb_tls_info != NULL) {
 		SOCKBUF_UNLOCK(sb);
 		SOCK_IO_RECV_UNLOCK(so);
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 	}
 #endif
 
 	/* Easy one, no space to copyout anything. */
 	if (uio->uio_resid == 0) {
 		error = EINVAL;
 		goto out;
 	}
 	oresid = uio->uio_resid;
 
 	/* We will never ever get anything unless we are or were connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		error = ENOTCONN;
 		goto out;
 	}
 
 restart:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
 		if (sbavail(sb) > 0)
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
 		error = so->so_error;
 		if (!(flags & MSG_PEEK))
 			so->so_error = 0;
 		goto out;
 	}
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
 		if (sbavail(sb) > 0)
 			goto deliver;
 		else
 			goto out;
 	}
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sbavail(sb) > 0 && !(flags & MSG_WAITALL) &&
 	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sbavail(sb) >= sb->sb_lowat ||
 	     sbavail(sb) >= uio->uio_resid ||
 	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_hiwat))
 		goto deliver;
 
 	/*
 	 * Wait and block until (more) data comes in.
 	 * NB: Drops the sockbuf lock during wait.
 	 */
 	error = sbwait(so, SO_RCV);
 	if (error)
 		goto out;
 	goto restart;
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	KASSERT(sbavail(sb) > 0, ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
 	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
 			if (*mp0 == NULL)
 				*mp0 = sb->sb_mb;
 			else
 				m_cat(*mp0, sb->sb_mb);
 			for (m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				KASSERT(!(m->m_flags & M_NOTAVAIL),
 				    ("%s: m %p not available", __func__, m));
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
 				n = m;
 			}
 			n->m_next = NULL;
 			sb->sb_mb = m;
 			sb->sb_lastrecord = sb->sb_mb;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
 				uio->uio_resid -= len;
 			if (*mp0 != NULL)
 				m_cat(*mp0, m);
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 	} else {
 		/* NB: Must unlock socket buffer as uiomove may sleep. */
 		SOCKBUF_UNLOCK(sb);
 		error = m_mbuftouio(uio, sb->sb_mb, len);
 		SOCKBUF_LOCK(sb);
 		if (error)
 			goto out;
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 
 	/*
 	 * Remove the delivered data from the socket buffer unless we
 	 * were only peeking.
 	 */
 	if (!(flags & MSG_PEEK)) {
 		if (len > 0)
 			sbdrop_locked(sb, len);
 
 		/* Notify protocol that we drained some data. */
 		if ((so->so_proto->pr_flags & PR_WANTRCVD) &&
 		    (((flags & MSG_WAITALL) && uio->uio_resid > 0) ||
 		     !(flags & MSG_SOCALLBCK))) {
 			SOCKBUF_UNLOCK(sb);
 			VNET_SO_ASSERT(so);
-			(*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
+			so->so_proto->pr_rcvd(so, flags);
 			SOCKBUF_LOCK(sb);
 		}
 	}
 
 	/*
 	 * For MSG_WAITALL we may have to loop again and wait for
 	 * more data to come in.
 	 */
 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
 		goto restart;
 out:
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 	SOCKBUF_UNLOCK(sb);
 	SOCK_IO_RECV_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for simple datagram cases from userspace.
  * Unlike in the stream case, we're able to drop a datagram if copyout()
  * fails, and because we handle datagrams atomically, we don't need to use a
  * sleep lock to prevent I/O interlacing.
  */
 int
 soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct mbuf *m, *m2;
 	int flags, error;
 	ssize_t len;
 	struct protosw *pr = so->so_proto;
 	struct mbuf *nextrecord;
 
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		*controlp = NULL;
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 
 	/*
 	 * For any complicated cases, fall back to the full
 	 * soreceive_generic().
 	 */
 	if (mp0 != NULL || (flags & (MSG_PEEK | MSG_OOB | MSG_TRUNC)))
 		return (soreceive_generic(so, psa, uio, mp0, controlp,
 		    flagsp));
 
 	/*
 	 * Enforce restrictions on use.
 	 */
 	KASSERT((pr->pr_flags & PR_WANTRCVD) == 0,
 	    ("soreceive_dgram: wantrcvd"));
 	KASSERT(pr->pr_flags & PR_ATOMIC, ("soreceive_dgram: !atomic"));
 	KASSERT((so->so_rcv.sb_state & SBS_RCVATMARK) == 0,
 	    ("soreceive_dgram: SBS_RCVATMARK"));
 	KASSERT((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0,
 	    ("soreceive_dgram: P_CONNREQUIRED"));
 
 	/*
 	 * Loop blocking while waiting for a datagram.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	while ((m = so->so_rcv.sb_mb) == NULL) {
 		KASSERT(sbavail(&so->so_rcv) == 0,
 		    ("soreceive_dgram: sb_mb NULL but sbavail %u",
 		    sbavail(&so->so_rcv)));
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (error);
 		}
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
 		    uio->uio_resid == 0) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (0);
 		}
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (EWOULDBLOCK);
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		error = sbwait(so, SO_RCV);
 		if (error) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			return (error);
 		}
 	}
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	nextrecord = m->m_nextpkt;
 	if (nextrecord == NULL) {
 		KASSERT(so->so_rcv.sb_lastrecord == m,
 		    ("soreceive_dgram: lastrecord != m"));
 	}
 
 	KASSERT(so->so_rcv.sb_mb->m_nextpkt == nextrecord,
 	    ("soreceive_dgram: m_nextpkt != nextrecord"));
 
 	/*
 	 * Pull 'm' and its chain off the front of the packet queue.
 	 */
 	so->so_rcv.sb_mb = NULL;
 	sockbuf_pushsync(&so->so_rcv, nextrecord);
 
 	/*
 	 * Walk 'm's chain and free that many bytes from the socket buffer.
 	 */
 	for (m2 = m; m2 != NULL; m2 = m2->m_next)
 		sbfree(&so->so_rcv, m2);
 
 	/*
 	 * Do a few last checks before we let go of the lock.
 	 */
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	if (pr->pr_flags & PR_ADDR) {
 		KASSERT(m->m_type == MT_SONAME,
 		    ("m->m_type == %d", m->m_type));
 		if (psa != NULL)
 			*psa = sodupsockaddr(mtod(m, struct sockaddr *),
 			    M_NOWAIT);
 		m = m_free(m);
 	}
 	if (m == NULL) {
 		/* XXXRW: Can this happen? */
 		return (0);
 	}
 
 	/*
 	 * Packet to copyout() is now in 'm' and it is disconnected from the
 	 * queue.
 	 *
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  We call into the
 	 * protocol to perform externalization (or freeing if controlp ==
 	 * NULL). In some cases there can be only MT_CONTROL mbufs without
 	 * MT_DATA mbufs.
 	 */
 	if (m->m_type == MT_CONTROL) {
 		struct mbuf *cm = NULL, *cmn;
 		struct mbuf **cme = &cm;
 
 		do {
 			m2 = m->m_next;
 			m->m_next = NULL;
 			*cme = m;
 			cme = &(*cme)->m_next;
 			m = m2;
 		} while (m != NULL && m->m_type == MT_CONTROL);
 		while (cm != NULL) {
 			cmn = cm->m_next;
 			cm->m_next = NULL;
 			if (pr->pr_domain->dom_externalize != NULL) {
 				error = (*pr->pr_domain->dom_externalize)
 				    (cm, controlp, flags);
 			} else if (controlp != NULL)
 				*controlp = cm;
 			else
 				m_freem(cm);
 			if (controlp != NULL) {
 				while (*controlp != NULL)
 					controlp = &(*controlp)->m_next;
 			}
 			cm = cmn;
 		}
 	}
 	KASSERT(m == NULL || m->m_type == MT_DATA,
 	    ("soreceive_dgram: !data"));
 	while (m != NULL && uio->uio_resid > 0) {
 		len = uio->uio_resid;
 		if (len > m->m_len)
 			len = m->m_len;
 		error = uiomove(mtod(m, char *), (int)len, uio);
 		if (error) {
 			m_freem(m);
 			return (error);
 		}
 		if (len == m->m_len)
 			m = m_free(m);
 		else {
 			m->m_data += len;
 			m->m_len -= len;
 		}
 	}
 	if (m != NULL) {
 		flags |= MSG_TRUNC;
 		m_freem(m);
 	}
 	if (flagsp != NULL)
 		*flagsp |= flags;
 	return (0);
 }
 
 int
 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio,
-	    mp0, controlp, flagsp));
+	error = so->so_proto->pr_soreceive(so, psa, uio, mp0, controlp, flagsp);
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soshutdown(struct socket *so, int how)
 {
 	struct protosw *pr;
 	int error, soerror_enotconn;
 
 	if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
 		return (EINVAL);
 
 	soerror_enotconn = 0;
 	SOCK_LOCK(so);
 	if ((so->so_state &
 	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
 		/*
 		 * POSIX mandates us to return ENOTCONN when shutdown(2) is
 		 * invoked on a datagram sockets, however historically we would
 		 * actually tear socket down. This is known to be leveraged by
 		 * some applications to unblock process waiting in recvXXX(2)
 		 * by other process that it shares that socket with. Try to meet
 		 * both backward-compatibility and POSIX requirements by forcing
 		 * ENOTCONN but still asking protocol to perform pru_shutdown().
 		 */
 		if (so->so_type != SOCK_DGRAM && !SOLISTENING(so)) {
 			SOCK_UNLOCK(so);
 			return (ENOTCONN);
 		}
 		soerror_enotconn = 1;
 	}
 
 	if (SOLISTENING(so)) {
 		if (how != SHUT_WR) {
 			so->so_error = ECONNABORTED;
 			solisten_wakeup(so);	/* unlocks so */
 		} else {
 			SOCK_UNLOCK(so);
 		}
 		goto done;
 	}
 	SOCK_UNLOCK(so);
 
 	CURVNET_SET(so->so_vnet);
 	pr = so->so_proto;
-	if (pr->pr_usrreqs->pru_flush != NULL)
-		(*pr->pr_usrreqs->pru_flush)(so, how);
+	if (pr->pr_flush != NULL)
+		pr->pr_flush(so, how);
 	if (how != SHUT_WR)
 		sorflush(so);
 	if (how != SHUT_RD) {
-		error = (*pr->pr_usrreqs->pru_shutdown)(so);
+		error = pr->pr_shutdown(so);
 		wakeup(&so->so_timeo);
 		CURVNET_RESTORE();
 		return ((error == 0 && soerror_enotconn) ? ENOTCONN : error);
 	}
 	wakeup(&so->so_timeo);
 	CURVNET_RESTORE();
 
 done:
 	return (soerror_enotconn ? ENOTCONN : 0);
 }
 
 void
 sorflush(struct socket *so)
 {
 	struct protosw *pr;
 	int error;
 
 	VNET_SO_ASSERT(so);
 
 	/*
 	 * Dislodge threads currently blocked in receive and wait to acquire
 	 * a lock against other simultaneous readers before clearing the
 	 * socket buffer.  Don't let our acquire be interrupted by a signal
 	 * despite any existing socket disposition on interruptable waiting.
 	 */
 	socantrcvmore(so);
 
 	error = SOCK_IO_RECV_LOCK(so, SBL_WAIT | SBL_NOINTR);
 	if (error != 0) {
 		KASSERT(SOLISTENING(so),
 		    ("%s: soiolock(%p) failed", __func__, so));
 		return;
 	}
 
 	pr = so->so_proto;
 	if (pr->pr_flags & PR_RIGHTS) {
 		MPASS(pr->pr_domain->dom_dispose != NULL);
 		(*pr->pr_domain->dom_dispose)(so);
 	} else {
 		sbrelease(so, SO_RCV);
 		SOCK_IO_RECV_UNLOCK(so);
 	}
 
 }
 
 /*
  * Wrapper for Socket established helper hook.
  * Parameters: socket, context of the hook point, hook id.
  */
 static int inline
 hhook_run_socket(struct socket *so, void *hctx, int32_t h_id)
 {
 	struct socket_hhook_data hhook_data = {
 		.so = so,
 		.hctx = hctx,
 		.m = NULL,
 		.status = 0
 	};
 
 	CURVNET_SET(so->so_vnet);
 	HHOOKS_RUN_IF(V_socket_hhh[h_id], &hhook_data, &so->osd);
 	CURVNET_RESTORE();
 
 	/* Ugly but needed, since hhooks return void for now */
 	return (hhook_data.status);
 }
 
 /*
  * Perhaps this routine, and sooptcopyout(), below, ought to come in an
  * additional variant to handle the case where the option value needs to be
  * some kind of integer, but not a specific size.  In addition to their use
  * here, these functions are also called by the protocol-level pr_ctloutput()
  * routines.
  */
 int
 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
 {
 	size_t	valsize;
 
 	/*
 	 * If the user gives us more than we wanted, we ignore it, but if we
 	 * don't get the minimum length the caller wants, we return EINVAL.
 	 * On success, sopt->sopt_valsize is set to however much we actually
 	 * retrieved.
 	 */
 	if ((valsize = sopt->sopt_valsize) < minlen)
 		return EINVAL;
 	if (valsize > len)
 		sopt->sopt_valsize = valsize = len;
 
 	if (sopt->sopt_td != NULL)
 		return (copyin(sopt->sopt_val, buf, valsize));
 
 	bcopy(sopt->sopt_val, buf, valsize);
 	return (0);
 }
 
 /*
  * Kernel version of setsockopt(2).
  *
  * XXX: optlen is size_t, not socklen_t
  */
 int
 so_setsockopt(struct socket *so, int level, int optname, void *optval,
     size_t optlen)
 {
 	struct sockopt sopt;
 
 	sopt.sopt_level = level;
 	sopt.sopt_name = optname;
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_val = optval;
 	sopt.sopt_valsize = optlen;
 	sopt.sopt_td = NULL;
 	return (sosetopt(so, &sopt));
 }
 
 int
 sosetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
 	struct	timeval tv;
 	sbintime_t val, *valp;
 	uint32_t val32;
 #ifdef MAC
 	struct mac extmac;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
 	if (sopt->sopt_level != SOL_SOCKET) {
 		if (so->so_proto->pr_ctloutput != NULL)
 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
 		else
 			error = ENOPROTOOPT;
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
 			error = accept_filt_setopt(so, sopt);
 			if (error)
 				goto bad;
 			break;
 
 		case SO_LINGER:
 			error = sooptcopyin(sopt, &l, sizeof l, sizeof l);
 			if (error)
 				goto bad;
 			if (l.l_linger < 0 ||
 			    l.l_linger > USHRT_MAX ||
 			    l.l_linger > (INT_MAX / hz)) {
 				error = EDOM;
 				goto bad;
 			}
 			SOCK_LOCK(so);
 			so->so_linger = l.l_linger;
 			if (l.l_onoff)
 				so->so_options |= SO_LINGER;
 			else
 				so->so_options &= ~SO_LINGER;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_DEBUG:
 		case SO_KEEPALIVE:
 		case SO_DONTROUTE:
 		case SO_USELOOPBACK:
 		case SO_BROADCAST:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
 		case SO_REUSEPORT_LB:
 		case SO_OOBINLINE:
 		case SO_TIMESTAMP:
 		case SO_BINTIME:
 		case SO_NOSIGPIPE:
 		case SO_NO_DDP:
 		case SO_NO_OFFLOAD:
 		case SO_RERROR:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 			SOCK_LOCK(so);
 			if (optval)
 				so->so_options |= sopt->sopt_name;
 			else
 				so->so_options &= ~sopt->sopt_name;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_SETFIB:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 
 			if (optval < 0 || optval >= rt_numfibs) {
 				error = EINVAL;
 				goto bad;
 			}
 			if (((so->so_proto->pr_domain->dom_family == PF_INET) ||
 			   (so->so_proto->pr_domain->dom_family == PF_INET6) ||
 			   (so->so_proto->pr_domain->dom_family == PF_ROUTE)))
 				so->so_fibnum = optval;
 			else
 				so->so_fibnum = 0;
 			break;
 
 		case SO_USER_COOKIE:
 			error = sooptcopyin(sopt, &val32, sizeof val32,
 			    sizeof val32);
 			if (error)
 				goto bad;
 			so->so_user_cookie = val32;
 			break;
 
 		case SO_SNDBUF:
 		case SO_RCVBUF:
 		case SO_SNDLOWAT:
 		case SO_RCVLOWAT:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 
 			/*
 			 * Values < 1 make no sense for any of these options,
 			 * so disallow them.
 			 */
 			if (optval < 1) {
 				error = EINVAL;
 				goto bad;
 			}
 
 			error = sbsetopt(so, sopt->sopt_name, optval);
 			break;
 
 		case SO_SNDTIMEO:
 		case SO_RCVTIMEO:
 #ifdef COMPAT_FREEBSD32
 			if (SV_CURPROC_FLAG(SV_ILP32)) {
 				struct timeval32 tv32;
 
 				error = sooptcopyin(sopt, &tv32, sizeof tv32,
 				    sizeof tv32);
 				CP(tv32, tv, tv_sec);
 				CP(tv32, tv, tv_usec);
 			} else
 #endif
 				error = sooptcopyin(sopt, &tv, sizeof tv,
 				    sizeof tv);
 			if (error)
 				goto bad;
 			if (tv.tv_sec < 0 || tv.tv_usec < 0 ||
 			    tv.tv_usec >= 1000000) {
 				error = EDOM;
 				goto bad;
 			}
 			if (tv.tv_sec > INT32_MAX)
 				val = SBT_MAX;
 			else
 				val = tvtosbt(tv);
 			SOCK_LOCK(so);
 			valp = sopt->sopt_name == SO_SNDTIMEO ?
 			    (SOLISTENING(so) ? &so->sol_sbsnd_timeo :
 			    &so->so_snd.sb_timeo) :
 			    (SOLISTENING(so) ? &so->sol_sbrcv_timeo :
 			    &so->so_rcv.sb_timeo);
 			*valp = val;
 			SOCK_UNLOCK(so);
 			break;
 
 		case SO_LABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof extmac,
 			    sizeof extmac);
 			if (error)
 				goto bad;
 			error = mac_setsockopt_label(sopt->sopt_td->td_ucred,
 			    so, &extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_TS_CLOCK:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				goto bad;
 			if (optval < 0 || optval > SO_TS_CLOCK_MAX) {
 				error = EINVAL;
 				goto bad;
 			}
 			so->so_ts_clock = optval;
 			break;
 
 		case SO_MAX_PACING_RATE:
 			error = sooptcopyin(sopt, &val32, sizeof(val32),
 			    sizeof(val32));
 			if (error)
 				goto bad;
 			so->so_max_pacing_rate = val32;
 			break;
 
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
 				    HHOOK_SOCKET_OPT);
 			else
 				error = ENOPROTOOPT;
 			break;
 		}
 		if (error == 0 && so->so_proto->pr_ctloutput != NULL)
 			(void)(*so->so_proto->pr_ctloutput)(so, sopt);
 	}
 bad:
 	CURVNET_RESTORE();
 	return (error);
 }
 
 /*
  * Helper routine for getsockopt.
  */
 int
 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 {
 	int	error;
 	size_t	valsize;
 
 	error = 0;
 
 	/*
 	 * Documented get behavior is that we always return a value, possibly
 	 * truncated to fit in the user's buffer.  Traditional behavior is
 	 * that we always tell the user precisely how much we copied, rather
 	 * than something useful like the total amount we had available for
 	 * her.  Note that this interface is not idempotent; the entire
 	 * answer must be generated ahead of time.
 	 */
 	valsize = min(len, sopt->sopt_valsize);
 	sopt->sopt_valsize = valsize;
 	if (sopt->sopt_val != NULL) {
 		if (sopt->sopt_td != NULL)
 			error = copyout(buf, sopt->sopt_val, valsize);
 		else
 			bcopy(buf, sopt->sopt_val, valsize);
 	}
 	return (error);
 }
 
 int
 sogetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
 	struct	timeval tv;
 #ifdef MAC
 	struct mac extmac;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 	error = 0;
 	if (sopt->sopt_level != SOL_SOCKET) {
 		if (so->so_proto->pr_ctloutput != NULL)
 			error = (*so->so_proto->pr_ctloutput)(so, sopt);
 		else
 			error = ENOPROTOOPT;
 		CURVNET_RESTORE();
 		return (error);
 	} else {
 		switch (sopt->sopt_name) {
 		case SO_ACCEPTFILTER:
 			error = accept_filt_getopt(so, sopt);
 			break;
 
 		case SO_LINGER:
 			SOCK_LOCK(so);
 			l.l_onoff = so->so_options & SO_LINGER;
 			l.l_linger = so->so_linger;
 			SOCK_UNLOCK(so);
 			error = sooptcopyout(sopt, &l, sizeof l);
 			break;
 
 		case SO_USELOOPBACK:
 		case SO_DONTROUTE:
 		case SO_DEBUG:
 		case SO_KEEPALIVE:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
 		case SO_REUSEPORT_LB:
 		case SO_BROADCAST:
 		case SO_OOBINLINE:
 		case SO_ACCEPTCONN:
 		case SO_TIMESTAMP:
 		case SO_BINTIME:
 		case SO_NOSIGPIPE:
 		case SO_NO_DDP:
 		case SO_NO_OFFLOAD:
 		case SO_RERROR:
 			optval = so->so_options & sopt->sopt_name;
 integer:
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case SO_DOMAIN:
 			optval = so->so_proto->pr_domain->dom_family;
 			goto integer;
 
 		case SO_TYPE:
 			optval = so->so_type;
 			goto integer;
 
 		case SO_PROTOCOL:
 			optval = so->so_proto->pr_protocol;
 			goto integer;
 
 		case SO_ERROR:
 			SOCK_LOCK(so);
 			if (so->so_error) {
 				optval = so->so_error;
 				so->so_error = 0;
 			} else {
 				optval = so->so_rerror;
 				so->so_rerror = 0;
 			}
 			SOCK_UNLOCK(so);
 			goto integer;
 
 		case SO_SNDBUF:
 			optval = SOLISTENING(so) ? so->sol_sbsnd_hiwat :
 			    so->so_snd.sb_hiwat;
 			goto integer;
 
 		case SO_RCVBUF:
 			optval = SOLISTENING(so) ? so->sol_sbrcv_hiwat :
 			    so->so_rcv.sb_hiwat;
 			goto integer;
 
 		case SO_SNDLOWAT:
 			optval = SOLISTENING(so) ? so->sol_sbsnd_lowat :
 			    so->so_snd.sb_lowat;
 			goto integer;
 
 		case SO_RCVLOWAT:
 			optval = SOLISTENING(so) ? so->sol_sbrcv_lowat :
 			    so->so_rcv.sb_lowat;
 			goto integer;
 
 		case SO_SNDTIMEO:
 		case SO_RCVTIMEO:
 			SOCK_LOCK(so);
 			tv = sbttotv(sopt->sopt_name == SO_SNDTIMEO ?
 			    (SOLISTENING(so) ? so->sol_sbsnd_timeo :
 			    so->so_snd.sb_timeo) :
 			    (SOLISTENING(so) ? so->sol_sbrcv_timeo :
 			    so->so_rcv.sb_timeo));
 			SOCK_UNLOCK(so);
 #ifdef COMPAT_FREEBSD32
 			if (SV_CURPROC_FLAG(SV_ILP32)) {
 				struct timeval32 tv32;
 
 				CP(tv, tv32, tv_sec);
 				CP(tv, tv32, tv_usec);
 				error = sooptcopyout(sopt, &tv32, sizeof tv32);
 			} else
 #endif
 				error = sooptcopyout(sopt, &tv, sizeof tv);
 			break;
 
 		case SO_LABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 			    sizeof(extmac));
 			if (error)
 				goto bad;
 			error = mac_getsockopt_label(sopt->sopt_td->td_ucred,
 			    so, &extmac);
 			if (error)
 				goto bad;
 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_PEERLABEL:
 #ifdef MAC
 			error = sooptcopyin(sopt, &extmac, sizeof(extmac),
 			    sizeof(extmac));
 			if (error)
 				goto bad;
 			error = mac_getsockopt_peerlabel(
 			    sopt->sopt_td->td_ucred, so, &extmac);
 			if (error)
 				goto bad;
 			error = sooptcopyout(sopt, &extmac, sizeof extmac);
 #else
 			error = EOPNOTSUPP;
 #endif
 			break;
 
 		case SO_LISTENQLIMIT:
 			optval = SOLISTENING(so) ? so->sol_qlimit : 0;
 			goto integer;
 
 		case SO_LISTENQLEN:
 			optval = SOLISTENING(so) ? so->sol_qlen : 0;
 			goto integer;
 
 		case SO_LISTENINCQLEN:
 			optval = SOLISTENING(so) ? so->sol_incqlen : 0;
 			goto integer;
 
 		case SO_TS_CLOCK:
 			optval = so->so_ts_clock;
 			goto integer;
 
 		case SO_MAX_PACING_RATE:
 			optval = so->so_max_pacing_rate;
 			goto integer;
 
 		default:
 			if (V_socket_hhh[HHOOK_SOCKET_OPT]->hhh_nhooks > 0)
 				error = hhook_run_socket(so, sopt,
 				    HHOOK_SOCKET_OPT);
 			else
 				error = ENOPROTOOPT;
 			break;
 		}
 	}
 #ifdef MAC
 bad:
 #endif
 	CURVNET_RESTORE();
 	return (error);
 }
 
 int
 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 {
 	struct mbuf *m, *m_prev;
 	int sopt_size = sopt->sopt_valsize;
 
 	MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return ENOBUFS;
 	if (sopt_size > MLEN) {
 		MCLGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT);
 		if ((m->m_flags & M_EXT) == 0) {
 			m_free(m);
 			return ENOBUFS;
 		}
 		m->m_len = min(MCLBYTES, sopt_size);
 	} else {
 		m->m_len = min(MLEN, sopt_size);
 	}
 	sopt_size -= m->m_len;
 	*mp = m;
 	m_prev = m;
 
 	while (sopt_size) {
 		MGET(m, sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			m_freem(*mp);
 			return ENOBUFS;
 		}
 		if (sopt_size > MLEN) {
 			MCLGET(m, sopt->sopt_td != NULL ? M_WAITOK :
 			    M_NOWAIT);
 			if ((m->m_flags & M_EXT) == 0) {
 				m_freem(m);
 				m_freem(*mp);
 				return ENOBUFS;
 			}
 			m->m_len = min(MCLBYTES, sopt_size);
 		} else {
 			m->m_len = min(MLEN, sopt_size);
 		}
 		sopt_size -= m->m_len;
 		m_prev->m_next = m;
 		m_prev = m;
 	}
 	return (0);
 }
 
 int
 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 {
 	struct mbuf *m0 = m;
 
 	if (sopt->sopt_val == NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
 
 			error = copyin(sopt->sopt_val, mtod(m, char *),
 			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
 			}
 		} else
 			bcopy(sopt->sopt_val, mtod(m, char *), m->m_len);
 		sopt->sopt_valsize -= m->m_len;
 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 		m = m->m_next;
 	}
 	if (m != NULL) /* should be allocated enoughly at ip6_sooptmcopyin() */
 		panic("ip6_sooptmcopyin");
 	return (0);
 }
 
 int
 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 {
 	struct mbuf *m0 = m;
 	size_t valsize = 0;
 
 	if (sopt->sopt_val == NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
 
 			error = copyout(mtod(m, char *), sopt->sopt_val,
 			    m->m_len);
 			if (error != 0) {
 				m_freem(m0);
 				return(error);
 			}
 		} else
 			bcopy(mtod(m, char *), sopt->sopt_val, m->m_len);
 		sopt->sopt_valsize -= m->m_len;
 		sopt->sopt_val = (char *)sopt->sopt_val + m->m_len;
 		valsize += m->m_len;
 		m = m->m_next;
 	}
 	if (m != NULL) {
 		/* enough soopt buffer should be given from user-land */
 		m_freem(m0);
 		return(EINVAL);
 	}
 	sopt->sopt_valsize = valsize;
 	return (0);
 }
 
 /*
  * sohasoutofband(): protocol notifies socket layer of the arrival of new
  * out-of-band data, which will then notify socket consumers.
  */
 void
 sohasoutofband(struct socket *so)
 {
 
 	if (so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGURG, 0);
 	selwakeuppri(&so->so_rdsel, PSOCK);
 }
 
 int
 sopoll(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
 
 	/*
 	 * We do not need to set or assert curvnet as long as everyone uses
 	 * sopoll_generic().
 	 */
-	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
-	    td));
+	return (so->so_proto->pr_sopoll(so, events, active_cred, td));
 }
 
 int
 sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
 	int revents;
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		if (!(events & (POLLIN | POLLRDNORM)))
 			revents = 0;
 		else if (!TAILQ_EMPTY(&so->sol_comp))
 			revents = events & (POLLIN | POLLRDNORM);
 		else if ((events & POLLINIGNEOF) == 0 && so->so_error)
 			revents = (events & (POLLIN | POLLRDNORM)) | POLLHUP;
 		else {
 			selrecord(td, &so->so_rdsel);
 			revents = 0;
 		}
 	} else {
 		revents = 0;
 		SOCK_SENDBUF_LOCK(so);
 		SOCK_RECVBUF_LOCK(so);
 		if (events & (POLLIN | POLLRDNORM))
 			if (soreadabledata(so))
 				revents |= events & (POLLIN | POLLRDNORM);
 		if (events & (POLLOUT | POLLWRNORM))
 			if (sowriteable(so))
 				revents |= events & (POLLOUT | POLLWRNORM);
 		if (events & (POLLPRI | POLLRDBAND))
 			if (so->so_oobmark ||
 			    (so->so_rcv.sb_state & SBS_RCVATMARK))
 				revents |= events & (POLLPRI | POLLRDBAND);
 		if ((events & POLLINIGNEOF) == 0) {
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				revents |= events & (POLLIN | POLLRDNORM);
 				if (so->so_snd.sb_state & SBS_CANTSENDMORE)
 					revents |= POLLHUP;
 			}
 		}
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE)
 			revents |= events & POLLRDHUP;
 		if (revents == 0) {
 			if (events &
 			    (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND | POLLRDHUP)) {
 				selrecord(td, &so->so_rdsel);
 				so->so_rcv.sb_flags |= SB_SEL;
 			}
 			if (events & (POLLOUT | POLLWRNORM)) {
 				selrecord(td, &so->so_wrsel);
 				so->so_snd.sb_flags |= SB_SEL;
 			}
 		}
 		SOCK_RECVBUF_UNLOCK(so);
 		SOCK_SENDBUF_UNLOCK(so);
 	}
 	SOCK_UNLOCK(so);
 	return (revents);
 }
 
 int
 soo_kqfilter(struct file *fp, struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 	struct sockbuf *sb;
 	sb_which which;
 	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
 		kn->kn_fop = &soread_filtops;
 		knl = &so->so_rdsel.si_note;
 		sb = &so->so_rcv;
 		which = SO_RCV;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &sowrite_filtops;
 		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		which = SO_SND;
 		break;
 	case EVFILT_EMPTY:
 		kn->kn_fop = &soempty_filtops;
 		knl = &so->so_wrsel.si_note;
 		sb = &so->so_snd;
 		which = SO_SND;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		knlist_add(knl, kn, 1);
 	} else {
 		SOCK_BUF_LOCK(so, which);
 		knlist_add(knl, kn, 1);
 		sb->sb_flags |= SB_KNOTE;
 		SOCK_BUF_UNLOCK(so, which);
 	}
 	SOCK_UNLOCK(so);
 	return (0);
 }
 
-/*
- * Some routines that return EOPNOTSUPP for entry points that are not
- * supported by a protocol.  Fill in as needed.
- */
-int
-pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
-    struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
-    struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_connect2_notsupp(struct socket *so1, struct socket *so2)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_control_notsupp(struct socket *so, u_long cmd, void *data,
-    struct ifnet *ifp, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_disconnect_notsupp(struct socket *so)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_rcvd_notsupp(struct socket *so, int flags)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
-    struct sockaddr *addr, struct mbuf *control, struct thread *td)
-{
-
-	if (control != NULL)
-		m_freem(control);
-	if ((flags & PRUS_NOTREADY) == 0)
-		m_freem(m);
-	return (EOPNOTSUPP);
-}
-
-int
-pru_ready_notsupp(struct socket *so, struct mbuf *m, int count)
-{
-
-	return (EOPNOTSUPP);
-}
-
-/*
- * This isn't really a ``null'' operation, but it's the default one and
- * doesn't do anything destructive.
- */
-int
-pru_sense_null(struct socket *so, struct stat *sb)
-{
-
-	sb->st_blksize = so->so_snd.sb_hiwat;
-	return 0;
-}
-
-int
-pru_shutdown_notsupp(struct socket *so)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
-    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
-    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
-{
-
-	return EOPNOTSUPP;
-}
-
-int
-pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
-    struct thread *td)
-{
-
-	return EOPNOTSUPP;
-}
-
 static void
 filt_sordetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
 	so_rdknl_lock(so);
 	knlist_remove(&so->so_rdsel.si_note, kn, 1);
 	if (!SOLISTENING(so) && knlist_empty(&so->so_rdsel.si_note))
 		so->so_rcv.sb_flags &= ~SB_KNOTE;
 	so_rdknl_unlock(so);
 }
 
 /*ARGSUSED*/
 static int
 filt_soread(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so)) {
 		SOCK_LOCK_ASSERT(so);
 		kn->kn_data = so->sol_qlen;
 		if (so->so_error) {
 			kn->kn_flags |= EV_EOF;
 			kn->kn_fflags = so->so_error;
 			return (1);
 		}
 		return (!TAILQ_EMPTY(&so->sol_comp));
 	}
 
 	SOCK_RECVBUF_LOCK_ASSERT(so);
 
 	kn->kn_data = sbavail(&so->so_rcv) - so->so_rcv.sb_ctl;
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		kn->kn_flags |= EV_EOF;
 		kn->kn_fflags = so->so_error;
 		return (1);
 	} else if (so->so_error || so->so_rerror)
 		return (1);
 
 	if (kn->kn_sfflags & NOTE_LOWAT) {
 		if (kn->kn_data >= kn->kn_sdata)
 			return (1);
 	} else if (sbavail(&so->so_rcv) >= so->so_rcv.sb_lowat)
 		return (1);
 
 	/* This hook returning non-zero indicates an event, not error */
 	return (hhook_run_socket(so, NULL, HHOOK_FILT_SOREAD));
 }
 
 static void
 filt_sowdetach(struct knote *kn)
 {
 	struct socket *so = kn->kn_fp->f_data;
 
 	so_wrknl_lock(so);
 	knlist_remove(&so->so_wrsel.si_note, kn, 1);
 	if (!SOLISTENING(so) && knlist_empty(&so->so_wrsel.si_note))
 		so->so_snd.sb_flags &= ~SB_KNOTE;
 	so_wrknl_unlock(so);
 }
 
 /*ARGSUSED*/
 static int
 filt_sowrite(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so))
 		return (0);
 
 	SOCK_SENDBUF_LOCK_ASSERT(so);
 	kn->kn_data = sbspace(&so->so_snd);
 
 	hhook_run_socket(so, kn, HHOOK_FILT_SOWRITE);
 
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		kn->kn_flags |= EV_EOF;
 		kn->kn_fflags = so->so_error;
 		return (1);
 	} else if (so->so_error)	/* temporary udp error */
 		return (1);
 	else if (((so->so_state & SS_ISCONNECTED) == 0) &&
 	    (so->so_proto->pr_flags & PR_CONNREQUIRED))
 		return (0);
 	else if (kn->kn_sfflags & NOTE_LOWAT)
 		return (kn->kn_data >= kn->kn_sdata);
 	else
 		return (kn->kn_data >= so->so_snd.sb_lowat);
 }
 
 static int
 filt_soempty(struct knote *kn, long hint)
 {
 	struct socket *so;
 
 	so = kn->kn_fp->f_data;
 
 	if (SOLISTENING(so))
 		return (1);
 
 	SOCK_SENDBUF_LOCK_ASSERT(so);
 	kn->kn_data = sbused(&so->so_snd);
 
 	if (kn->kn_data == 0)
 		return (1);
 	else
 		return (0);
 }
 
 int
 socheckuid(struct socket *so, uid_t uid)
 {
 
 	if (so == NULL)
 		return (EPERM);
 	if (so->so_cred->cr_uid != uid)
 		return (EPERM);
 	return (0);
 }
 
 /*
  * These functions are used by protocols to notify the socket layer (and its
  * consumers) of state changes in the sockets driven by protocol-side events.
  */
 
 /*
  * Procedures to manipulate state flags of socket and do appropriate wakeups.
  *
  * Normal sequence from the active (originating) side is that
  * soisconnecting() is called during processing of connect() call, resulting
  * in an eventual call to soisconnected() if/when the connection is
  * established.  When the connection is torn down soisdisconnecting() is
  * called during processing of disconnect() call, and soisdisconnected() is
  * called when the connection to the peer is totally severed.  The semantics
  * of these routines are such that connectionless protocols can call
  * soisconnected() and soisdisconnected() only, bypassing the in-progress
  * calls when setting up a ``connection'' takes no time.
  *
  * From the passive side, a socket is created with two queues of sockets:
  * so_incomp for connections in progress and so_comp for connections already
  * made and awaiting user acceptance.  As a protocol is preparing incoming
  * connections, it creates a socket structure queued on so_incomp by calling
  * sonewconn().  When the connection is established, soisconnected() is
  * called, and transfers the socket structure to so_comp, making it available
  * to accept().
  *
  * If a socket is closed with sockets on either so_incomp or so_comp, these
  * sockets are dropped.
  *
  * If higher-level protocols are implemented in the kernel, the wakeups done
  * here will sometimes cause software-interrupt process scheduling.
  */
 void
 soisconnecting(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
 	so->so_state |= SS_ISCONNECTING;
 	SOCK_UNLOCK(so);
 }
 
 void
 soisconnected(struct socket *so)
 {
 	bool last __diagused;
 
 	SOCK_LOCK(so);
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
 	so->so_state |= SS_ISCONNECTED;
 
 	if (so->so_qstate == SQ_INCOMP) {
 		struct socket *head = so->so_listen;
 		int ret;
 
 		KASSERT(head, ("%s: so %p on incomp of NULL", __func__, so));
 		/*
 		 * Promoting a socket from incomplete queue to complete, we
 		 * need to go through reverse order of locking.  We first do
 		 * trylock, and if that doesn't succeed, we go the hard way
 		 * leaving a reference and rechecking consistency after proper
 		 * locking.
 		 */
 		if (__predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
 			soref(head);
 			SOCK_UNLOCK(so);
 			SOLISTEN_LOCK(head);
 			SOCK_LOCK(so);
 			if (__predict_false(head != so->so_listen)) {
 				/*
 				 * The socket went off the listen queue,
 				 * should be lost race to close(2) of sol.
 				 * The socket is about to soabort().
 				 */
 				SOCK_UNLOCK(so);
 				sorele_locked(head);
 				return;
 			}
 			last = refcount_release(&head->so_count);
 			KASSERT(!last, ("%s: released last reference for %p",
 			    __func__, head));
 		}
 again:
 		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
 			TAILQ_REMOVE(&head->sol_incomp, so, so_list);
 			head->sol_incqlen--;
 			TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
 			head->sol_qlen++;
 			so->so_qstate = SQ_COMP;
 			SOCK_UNLOCK(so);
 			solisten_wakeup(head);	/* unlocks */
 		} else {
 			SOCK_RECVBUF_LOCK(so);
 			soupcall_set(so, SO_RCV,
 			    head->sol_accept_filter->accf_callback,
 			    head->sol_accept_filter_arg);
 			so->so_options &= ~SO_ACCEPTFILTER;
 			ret = head->sol_accept_filter->accf_callback(so,
 			    head->sol_accept_filter_arg, M_NOWAIT);
 			if (ret == SU_ISCONNECTED) {
 				soupcall_clear(so, SO_RCV);
 				SOCK_RECVBUF_UNLOCK(so);
 				goto again;
 			}
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_UNLOCK(so);
 			SOLISTEN_UNLOCK(head);
 		}
 		return;
 	}
 	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 }
 
 void
 soisdisconnecting(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTING;
 	so->so_state |= SS_ISDISCONNECTING;
 
 	if (!SOLISTENING(so)) {
 		SOCK_RECVBUF_LOCK(so);
 		socantrcvmore_locked(so);
 		SOCK_SENDBUF_LOCK(so);
 		socantsendmore_locked(so);
 	}
 	SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
 void
 soisdisconnected(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 
 	/*
 	 * There is at least one reader of so_state that does not
 	 * acquire socket lock, namely soreceive_generic().  Ensure
 	 * that it never sees all flags that track connection status
 	 * cleared, by ordering the update with a barrier semantic of
 	 * our release thread fence.
 	 */
 	so->so_state |= SS_ISDISCONNECTED;
 	atomic_thread_fence_rel();
 	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
 
 	if (!SOLISTENING(so)) {
 		SOCK_UNLOCK(so);
 		SOCK_RECVBUF_LOCK(so);
 		socantrcvmore_locked(so);
 		SOCK_SENDBUF_LOCK(so);
 		sbdrop_locked(&so->so_snd, sbused(&so->so_snd));
 		socantsendmore_locked(so);
 	} else
 		SOCK_UNLOCK(so);
 	wakeup(&so->so_timeo);
 }
 
 int
 soiolock(struct socket *so, struct sx *sx, int flags)
 {
 	int error;
 
 	KASSERT((flags & SBL_VALID) == flags,
 	    ("soiolock: invalid flags %#x", flags));
 
 	if ((flags & SBL_WAIT) != 0) {
 		if ((flags & SBL_NOINTR) != 0) {
 			sx_xlock(sx);
 		} else {
 			error = sx_xlock_sig(sx);
 			if (error != 0)
 				return (error);
 		}
 	} else if (!sx_try_xlock(sx)) {
 		return (EWOULDBLOCK);
 	}
 
 	if (__predict_false(SOLISTENING(so))) {
 		sx_xunlock(sx);
 		return (ENOTCONN);
 	}
 	return (0);
 }
 
 void
 soiounlock(struct sx *sx)
 {
 	sx_xunlock(sx);
 }
 
 /*
  * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
  */
 struct sockaddr *
 sodupsockaddr(const struct sockaddr *sa, int mflags)
 {
 	struct sockaddr *sa2;
 
 	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
 	if (sa2)
 		bcopy(sa, sa2, sa->sa_len);
 	return sa2;
 }
 
 /*
  * Register per-socket destructor.
  */
 void
 sodtor_set(struct socket *so, so_dtor_t *func)
 {
 
 	SOCK_LOCK_ASSERT(so);
 	so->so_dtor = func;
 }
 
 /*
  * Register per-socket buffer upcalls.
  */
 void
 soupcall_set(struct socket *so, sb_which which, so_upcall_t func, void *arg)
 {
 	struct sockbuf *sb;
 
 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
 
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
 		break;
 	case SO_SND:
 		sb = &so->so_snd;
 		break;
 	}
 	SOCK_BUF_LOCK_ASSERT(so, which);
 	sb->sb_upcall = func;
 	sb->sb_upcallarg = arg;
 	sb->sb_flags |= SB_UPCALL;
 }
 
 void
 soupcall_clear(struct socket *so, sb_which which)
 {
 	struct sockbuf *sb;
 
 	KASSERT(!SOLISTENING(so), ("%s: so %p listening", __func__, so));
 
 	switch (which) {
 	case SO_RCV:
 		sb = &so->so_rcv;
 		break;
 	case SO_SND:
 		sb = &so->so_snd;
 		break;
 	}
 	SOCK_BUF_LOCK_ASSERT(so, which);
 	KASSERT(sb->sb_upcall != NULL,
 	    ("%s: so %p no upcall to clear", __func__, so));
 	sb->sb_upcall = NULL;
 	sb->sb_upcallarg = NULL;
 	sb->sb_flags &= ~SB_UPCALL;
 }
 
 void
 solisten_upcall_set(struct socket *so, so_upcall_t func, void *arg)
 {
 
 	SOLISTEN_LOCK_ASSERT(so);
 	so->sol_upcall = func;
 	so->sol_upcallarg = arg;
 }
 
 static void
 so_rdknl_lock(void *arg)
 {
 	struct socket *so = arg;
 
 retry:
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 	} else {
 		SOCK_RECVBUF_LOCK(so);
 		if (__predict_false(SOLISTENING(so))) {
 			SOCK_RECVBUF_UNLOCK(so);
 			goto retry;
 		}
 	}
 }
 
 static void
 so_rdknl_unlock(void *arg)
 {
 	struct socket *so = arg;
 
 	if (SOLISTENING(so))
 		SOLISTEN_UNLOCK(so);
 	else
 		SOCK_RECVBUF_UNLOCK(so);
 }
 
 static void
 so_rdknl_assert_lock(void *arg, int what)
 {
 	struct socket *so = arg;
 
 	if (what == LA_LOCKED) {
 		if (SOLISTENING(so))
 			SOLISTEN_LOCK_ASSERT(so);
 		else
 			SOCK_RECVBUF_LOCK_ASSERT(so);
 	} else {
 		if (SOLISTENING(so))
 			SOLISTEN_UNLOCK_ASSERT(so);
 		else
 			SOCK_RECVBUF_UNLOCK_ASSERT(so);
 	}
 }
 
 static void
 so_wrknl_lock(void *arg)
 {
 	struct socket *so = arg;
 
 retry:
 	if (SOLISTENING(so)) {
 		SOLISTEN_LOCK(so);
 	} else {
 		SOCK_SENDBUF_LOCK(so);
 		if (__predict_false(SOLISTENING(so))) {
 			SOCK_SENDBUF_UNLOCK(so);
 			goto retry;
 		}
 	}
 }
 
 static void
 so_wrknl_unlock(void *arg)
 {
 	struct socket *so = arg;
 
 	if (SOLISTENING(so))
 		SOLISTEN_UNLOCK(so);
 	else
 		SOCK_SENDBUF_UNLOCK(so);
 }
 
 static void
 so_wrknl_assert_lock(void *arg, int what)
 {
 	struct socket *so = arg;
 
 	if (what == LA_LOCKED) {
 		if (SOLISTENING(so))
 			SOLISTEN_LOCK_ASSERT(so);
 		else
 			SOCK_SENDBUF_LOCK_ASSERT(so);
 	} else {
 		if (SOLISTENING(so))
 			SOLISTEN_UNLOCK_ASSERT(so);
 		else
 			SOCK_SENDBUF_UNLOCK_ASSERT(so);
 	}
 }
 
 /*
  * Create an external-format (``xsocket'') structure using the information in
  * the kernel-format socket structure pointed to by so.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 sotoxsocket(struct socket *so, struct xsocket *xso)
 {
 
 	bzero(xso, sizeof(*xso));
 	xso->xso_len = sizeof *xso;
 	xso->xso_so = (uintptr_t)so;
 	xso->so_type = so->so_type;
 	xso->so_options = so->so_options;
 	xso->so_linger = so->so_linger;
 	xso->so_state = so->so_state;
 	xso->so_pcb = (uintptr_t)so->so_pcb;
 	xso->xso_protocol = so->so_proto->pr_protocol;
 	xso->xso_family = so->so_proto->pr_domain->dom_family;
 	xso->so_timeo = so->so_timeo;
 	xso->so_error = so->so_error;
 	xso->so_uid = so->so_cred->cr_uid;
 	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
 	if (SOLISTENING(so)) {
 		xso->so_qlen = so->sol_qlen;
 		xso->so_incqlen = so->sol_incqlen;
 		xso->so_qlimit = so->sol_qlimit;
 		xso->so_oobmark = 0;
 	} else {
 		xso->so_state |= so->so_qstate;
 		xso->so_qlen = xso->so_incqlen = xso->so_qlimit = 0;
 		xso->so_oobmark = so->so_oobmark;
 		sbtoxsockbuf(&so->so_snd, &xso->so_snd);
 		sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
 	}
 }
 
 struct sockbuf *
 so_sockbuf_rcv(struct socket *so)
 {
 
 	return (&so->so_rcv);
 }
 
 struct sockbuf *
 so_sockbuf_snd(struct socket *so)
 {
 
 	return (&so->so_snd);
 }
 
 int
 so_state_get(const struct socket *so)
 {
 
 	return (so->so_state);
 }
 
 void
 so_state_set(struct socket *so, int val)
 {
 
 	so->so_state = val;
 }
 
 int
 so_options_get(const struct socket *so)
 {
 
 	return (so->so_options);
 }
 
 void
 so_options_set(struct socket *so, int val)
 {
 
 	so->so_options = val;
 }
 
 int
 so_error_get(const struct socket *so)
 {
 
 	return (so->so_error);
 }
 
 void
 so_error_set(struct socket *so, int val)
 {
 
 	so->so_error = val;
 }
 
 int
 so_linger_get(const struct socket *so)
 {
 
 	return (so->so_linger);
 }
 
 void
 so_linger_set(struct socket *so, int val)
 {
 
 	KASSERT(val >= 0 && val <= USHRT_MAX && val <= (INT_MAX / hz),
 	    ("%s: val %d out of range", __func__, val));
 
 	so->so_linger = val;
 }
 
 struct protosw *
 so_protosw_get(const struct socket *so)
 {
 
 	return (so->so_proto);
 }
 
 void
 so_protosw_set(struct socket *so, struct protosw *val)
 {
 
 	so->so_proto = val;
 }
 
 void
 so_sorwakeup(struct socket *so)
 {
 
 	sorwakeup(so);
 }
 
 void
 so_sowwakeup(struct socket *so)
 {
 
 	sowwakeup(so);
 }
 
 void
 so_sorwakeup_locked(struct socket *so)
 {
 
 	sorwakeup_locked(so);
 }
 
 void
 so_sowwakeup_locked(struct socket *so)
 {
 
 	sowwakeup_locked(so);
 }
 
 void
 so_lock(struct socket *so)
 {
 
 	SOCK_LOCK(so);
 }
 
 void
 so_unlock(struct socket *so)
 {
 
 	SOCK_UNLOCK(so);
 }
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index d108795efcaf..4be1c9aa6a59 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -1,1623 +1,1623 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)uipc_syscalls.c	8.4 (Berkeley) 2/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_capsicum.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/capsicum.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
 #include <sys/filedesc.h>
 #include <sys/proc.h>
 #include <sys/filio.h>
 #include <sys/jail.h>
 #include <sys/mbuf.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #ifdef COMPAT_43
 #include <sys/sysent.h>
 #endif
 #include <sys/uio.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 #ifdef COMPAT_FREEBSD32
 #include <compat/freebsd32/freebsd32_util.h>
 #endif
 
 #include <net/vnet.h>
 
 #include <security/audit/audit.h>
 #include <security/mac/mac_framework.h>
 
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
 static int accept1(struct thread *td, int s, struct sockaddr *uname,
 		   socklen_t *anamelen, int flags);
 static int sockargs(struct mbuf **, char *, socklen_t, int);
 
 /*
  * Convert a user file descriptor to a kernel file entry and check if required
  * capability rights are present.
  * If required copy of current set of capability rights is returned.
  * A reference on the file entry is held upon returning.
  */
 int
 getsock_cap(struct thread *td, int fd, cap_rights_t *rightsp,
     struct file **fpp, u_int *fflagp, struct filecaps *havecapsp)
 {
 	struct file *fp;
 	int error;
 
 	error = fget_cap(td, fd, rightsp, &fp, havecapsp);
 	if (error != 0)
 		return (error);
 	if (fp->f_type != DTYPE_SOCKET) {
 		fdrop(fp, td);
 		if (havecapsp != NULL)
 			filecaps_free(havecapsp);
 		return (ENOTSOCK);
 	}
 	if (fflagp != NULL)
 		*fflagp = fp->f_flag;
 	*fpp = fp;
 	return (0);
 }
 
 /*
  * System call interface to the socket abstraction.
  */
 #if defined(COMPAT_43)
 #define COMPAT_OLDSOCK
 #endif
 
 int
 sys_socket(struct thread *td, struct socket_args *uap)
 {
 
 	return (kern_socket(td, uap->domain, uap->type, uap->protocol));
 }
 
 int
 kern_socket(struct thread *td, int domain, int type, int protocol)
 {
 	struct socket *so;
 	struct file *fp;
 	int fd, error, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(domain, type, protocol);
 
 	oflag = 0;
 	fflag = 0;
 	if ((type & SOCK_CLOEXEC) != 0) {
 		type &= ~SOCK_CLOEXEC;
 		oflag |= O_CLOEXEC;
 	}
 	if ((type & SOCK_NONBLOCK) != 0) {
 		type &= ~SOCK_NONBLOCK;
 		fflag |= FNONBLOCK;
 	}
 
 #ifdef MAC
 	error = mac_socket_check_create(td->td_ucred, domain, type, protocol);
 	if (error != 0)
 		return (error);
 #endif
 	error = falloc(td, &fp, &fd, oflag);
 	if (error != 0)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	error = socreate(domain, &so, type, protocol, td->td_ucred, td);
 	if (error != 0) {
 		fdclose(td, fp, fd);
 	} else {
 		finit(fp, FREAD | FWRITE | fflag, DTYPE_SOCKET, so, &socketops);
 		if ((fflag & FNONBLOCK) != 0)
 			(void) fo_ioctl(fp, FIONBIO, &fflag, td->td_ucred, td);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_bind(struct thread *td, struct bind_args *uap)
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_bindat(td, AT_FDCWD, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 int
 kern_bindat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 #ifdef CAPABILITY_MODE
 	if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD))
 		return (ECAPMODE);
 #endif
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 	error = getsock_cap(td, fd, &cap_bind_rights,
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(sa);
 #endif
 #ifdef MAC
 	error = mac_socket_check_bind(td->td_ucred, so, sa);
 	if (error == 0) {
 #endif
 		if (dirfd == AT_FDCWD)
 			error = sobind(so, sa, td);
 		else
 			error = sobindat(dirfd, so, sa, td);
 #ifdef MAC
 	}
 #endif
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_bindat(struct thread *td, struct bindat_args *uap)
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_bindat(td, uap->fd, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 int
 sys_listen(struct thread *td, struct listen_args *uap)
 {
 
 	return (kern_listen(td, uap->s, uap->backlog));
 }
 
 int
 kern_listen(struct thread *td, int s, int backlog)
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_listen_rights,
 	    &fp, NULL, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
 		error = mac_socket_check_listen(td->td_ucred, so);
 		if (error == 0)
 #endif
 			error = solisten(so, backlog, td);
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 /*
  * accept1()
  */
 static int
 accept1(td, s, uname, anamelen, flags)
 	struct thread *td;
 	int s;
 	struct sockaddr *uname;
 	socklen_t *anamelen;
 	int flags;
 {
 	struct sockaddr *name;
 	socklen_t namelen;
 	struct file *fp;
 	int error;
 
 	if (uname == NULL)
 		return (kern_accept4(td, s, NULL, NULL, flags, NULL));
 
 	error = copyin(anamelen, &namelen, sizeof (namelen));
 	if (error != 0)
 		return (error);
 
 	error = kern_accept4(td, s, &name, &namelen, flags, &fp);
 
 	if (error != 0)
 		return (error);
 
 	if (error == 0 && uname != NULL) {
 #ifdef COMPAT_OLDSOCK
 		if (SV_PROC_FLAG(td->td_proc, SV_AOUT) &&
 		    (flags & ACCEPT4_COMPAT) != 0)
 			((struct osockaddr *)name)->sa_family =
 			    name->sa_family;
 #endif
 		error = copyout(name, uname, namelen);
 	}
 	if (error == 0)
 		error = copyout(&namelen, anamelen,
 		    sizeof(namelen));
 	if (error != 0)
 		fdclose(td, fp, td->td_retval[0]);
 	fdrop(fp, td);
 	free(name, M_SONAME);
 	return (error);
 }
 
 int
 kern_accept(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, struct file **fp)
 {
 	return (kern_accept4(td, s, name, namelen, ACCEPT4_INHERIT, fp));
 }
 
 int
 kern_accept4(struct thread *td, int s, struct sockaddr **name,
     socklen_t *namelen, int flags, struct file **fp)
 {
 	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
 	struct socket *head, *so;
 	struct filecaps fcaps;
 	u_int fflag;
 	pid_t pgid;
 	int error, fd, tmp;
 
 	if (name != NULL)
 		*name = NULL;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_accept_rights,
 	    &headfp, &fflag, &fcaps);
 	if (error != 0)
 		return (error);
 	head = headfp->f_data;
 	if (!SOLISTENING(head)) {
 		error = EINVAL;
 		goto done;
 	}
 #ifdef MAC
 	error = mac_socket_check_accept(td->td_ucred, head);
 	if (error != 0)
 		goto done;
 #endif
 	error = falloc_caps(td, &nfp, &fd,
 	    (flags & SOCK_CLOEXEC) ? O_CLOEXEC : 0, &fcaps);
 	if (error != 0)
 		goto done;
 	SOCK_LOCK(head);
 	if (!SOLISTENING(head)) {
 		SOCK_UNLOCK(head);
 		error = EINVAL;
 		goto noconnection;
 	}
 
 	error = solisten_dequeue(head, &so, flags);
 	if (error != 0)
 		goto noconnection;
 
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	td->td_retval[0] = fd;
 
 	/* Connection has been removed from the listen queue. */
 	KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
 
 	if (flags & ACCEPT4_INHERIT) {
 		pgid = fgetown(&head->so_sigio);
 		if (pgid != 0)
 			fsetown(pgid, &so->so_sigio);
 	} else {
 		fflag &= ~(FNONBLOCK | FASYNC);
 		if (flags & SOCK_NONBLOCK)
 			fflag |= FNONBLOCK;
 	}
 
 	finit(nfp, fflag, DTYPE_SOCKET, so, &socketops);
 	/* Sync socket nonblocking/async state with file flags */
 	tmp = fflag & FNONBLOCK;
 	(void) fo_ioctl(nfp, FIONBIO, &tmp, td->td_ucred, td);
 	tmp = fflag & FASYNC;
 	(void) fo_ioctl(nfp, FIOASYNC, &tmp, td->td_ucred, td);
 	error = soaccept(so, &sa);
 	if (error != 0)
 		goto noconnection;
 	if (sa == NULL) {
 		if (name)
 			*namelen = 0;
 		goto done;
 	}
 	AUDIT_ARG_SOCKADDR(td, AT_FDCWD, sa);
 	if (name) {
 		/* check sa_len before it is destroyed */
 		if (*namelen > sa->sa_len)
 			*namelen = sa->sa_len;
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_STRUCT))
 			ktrsockaddr(sa);
 #endif
 		*name = sa;
 		sa = NULL;
 	}
 noconnection:
 	free(sa, M_SONAME);
 
 	/*
 	 * close the new descriptor, assuming someone hasn't ripped it
 	 * out from under us.
 	 */
 	if (error != 0)
 		fdclose(td, nfp, fd);
 
 	/*
 	 * Release explicitly held references before returning.  We return
 	 * a reference on nfp to the caller on success if they request it.
 	 */
 done:
 	if (nfp == NULL)
 		filecaps_free(&fcaps);
 	if (fp != NULL) {
 		if (error == 0) {
 			*fp = nfp;
 			nfp = NULL;
 		} else
 			*fp = NULL;
 	}
 	if (nfp != NULL)
 		fdrop(nfp, td);
 	fdrop(headfp, td);
 	return (error);
 }
 
 int
 sys_accept(td, uap)
 	struct thread *td;
 	struct accept_args *uap;
 {
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen, ACCEPT4_INHERIT));
 }
 
 int
 sys_accept4(td, uap)
 	struct thread *td;
 	struct accept4_args *uap;
 {
 
 	if (uap->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
 		return (EINVAL);
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen, uap->flags));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 oaccept(struct thread *td, struct oaccept_args *uap)
 {
 
 	return (accept1(td, uap->s, uap->name, uap->anamelen,
 	    ACCEPT4_INHERIT | ACCEPT4_COMPAT));
 }
 #endif /* COMPAT_OLDSOCK */
 
 int
 sys_connect(struct thread *td, struct connect_args *uap)
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_connectat(td, AT_FDCWD, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 int
 kern_connectat(struct thread *td, int dirfd, int fd, struct sockaddr *sa)
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 #ifdef CAPABILITY_MODE
 	if (IN_CAPABILITY_MODE(td) && (dirfd == AT_FDCWD))
 		return (ECAPMODE);
 #endif
 
 	AUDIT_ARG_FD(fd);
 	AUDIT_ARG_SOCKADDR(td, dirfd, sa);
 	error = getsock_cap(td, fd, &cap_connect_rights,
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	if (so->so_state & SS_ISCONNECTING) {
 		error = EALREADY;
 		goto done1;
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(sa);
 #endif
 #ifdef MAC
 	error = mac_socket_check_connect(td->td_ucred, so, sa);
 	if (error != 0)
 		goto bad;
 #endif
 	error = soconnectat(dirfd, so, sa, td);
 	if (error != 0)
 		goto bad;
 	if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) {
 		error = EINPROGRESS;
 		goto done1;
 	}
 	SOCK_LOCK(so);
 	while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
 		error = msleep(&so->so_timeo, &so->so_lock, PSOCK | PCATCH,
 		    "connec", 0);
 		if (error != 0)
 			break;
 	}
 	if (error == 0) {
 		error = so->so_error;
 		so->so_error = 0;
 	}
 	SOCK_UNLOCK(so);
 bad:
 	if (error == ERESTART)
 		error = EINTR;
 done1:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_connectat(struct thread *td, struct connectat_args *uap)
 {
 	struct sockaddr *sa;
 	int error;
 
 	error = getsockaddr(&sa, uap->name, uap->namelen);
 	if (error == 0) {
 		error = kern_connectat(td, uap->fd, uap->s, sa);
 		free(sa, M_SONAME);
 	}
 	return (error);
 }
 
 int
 kern_socketpair(struct thread *td, int domain, int type, int protocol,
     int *rsv)
 {
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
 	int fd, error, oflag, fflag;
 
 	AUDIT_ARG_SOCKET(domain, type, protocol);
 
 	oflag = 0;
 	fflag = 0;
 	if ((type & SOCK_CLOEXEC) != 0) {
 		type &= ~SOCK_CLOEXEC;
 		oflag |= O_CLOEXEC;
 	}
 	if ((type & SOCK_NONBLOCK) != 0) {
 		type &= ~SOCK_NONBLOCK;
 		fflag |= FNONBLOCK;
 	}
 #ifdef MAC
 	/* We might want to have a separate check for socket pairs. */
 	error = mac_socket_check_create(td->td_ucred, domain, type,
 	    protocol);
 	if (error != 0)
 		return (error);
 #endif
 	error = socreate(domain, &so1, type, protocol, td->td_ucred, td);
 	if (error != 0)
 		return (error);
 	error = socreate(domain, &so2, type, protocol, td->td_ucred, td);
 	if (error != 0)
 		goto free1;
 	/* On success extra reference to `fp1' and 'fp2' is set by falloc. */
 	error = falloc(td, &fp1, &fd, oflag);
 	if (error != 0)
 		goto free2;
 	rsv[0] = fd;
 	fp1->f_data = so1;	/* so1 already has ref count */
 	error = falloc(td, &fp2, &fd, oflag);
 	if (error != 0)
 		goto free3;
 	fp2->f_data = so2;	/* so2 already has ref count */
 	rsv[1] = fd;
 	error = soconnect2(so1, so2);
 	if (error != 0)
 		goto free4;
 	if (type == SOCK_DGRAM) {
 		/*
 		 * Datagram socket connection is asymmetric.
 		 */
 		 error = soconnect2(so2, so1);
 		 if (error != 0)
 			goto free4;
 	} else if (so1->so_proto->pr_flags & PR_CONNREQUIRED) {
 		struct unpcb *unp, *unp2;
 		unp = sotounpcb(so1);
 		unp2 = sotounpcb(so2);
 		/* 
 		 * No need to lock the unps, because the sockets are brand-new.
 		 * No other threads can be using them yet
 		 */
 		unp_copy_peercred(td, unp, unp2, unp);
 	}
 	finit(fp1, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp1->f_data,
 	    &socketops);
 	finit(fp2, FREAD | FWRITE | fflag, DTYPE_SOCKET, fp2->f_data,
 	    &socketops);
 	if ((fflag & FNONBLOCK) != 0) {
 		(void) fo_ioctl(fp1, FIONBIO, &fflag, td->td_ucred, td);
 		(void) fo_ioctl(fp2, FIONBIO, &fflag, td->td_ucred, td);
 	}
 	fdrop(fp1, td);
 	fdrop(fp2, td);
 	return (0);
 free4:
 	fdclose(td, fp2, rsv[1]);
 	fdrop(fp2, td);
 free3:
 	fdclose(td, fp1, rsv[0]);
 	fdrop(fp1, td);
 free2:
 	if (so2 != NULL)
 		(void)soclose(so2);
 free1:
 	if (so1 != NULL)
 		(void)soclose(so1);
 	return (error);
 }
 
 int
 sys_socketpair(struct thread *td, struct socketpair_args *uap)
 {
 	int error, sv[2];
 
 	error = kern_socketpair(td, uap->domain, uap->type,
 	    uap->protocol, sv);
 	if (error != 0)
 		return (error);
 	error = copyout(sv, uap->rsv, 2 * sizeof(int));
 	if (error != 0) {
 		(void)kern_close(td, sv[0]);
 		(void)kern_close(td, sv[1]);
 	}
 	return (error);
 }
 
 static int
 sendit(struct thread *td, int s, struct msghdr *mp, int flags)
 {
 	struct mbuf *control;
 	struct sockaddr *to;
 	int error;
 
 #ifdef CAPABILITY_MODE
 	if (IN_CAPABILITY_MODE(td) && (mp->msg_name != NULL))
 		return (ECAPMODE);
 #endif
 
 	if (mp->msg_name != NULL) {
 		error = getsockaddr(&to, mp->msg_name, mp->msg_namelen);
 		if (error != 0) {
 			to = NULL;
 			goto bad;
 		}
 		mp->msg_name = to;
 	} else {
 		to = NULL;
 	}
 
 	if (mp->msg_control) {
 		if (mp->msg_controllen < sizeof(struct cmsghdr)
 #ifdef COMPAT_OLDSOCK
 		    && (mp->msg_flags != MSG_COMPAT ||
 		    !SV_PROC_FLAG(td->td_proc, SV_AOUT))
 #endif
 		) {
 			error = EINVAL;
 			goto bad;
 		}
 		error = sockargs(&control, mp->msg_control,
 		    mp->msg_controllen, MT_CONTROL);
 		if (error != 0)
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT &&
 		    SV_PROC_FLAG(td->td_proc, SV_AOUT)) {
 			struct cmsghdr *cm;
 
 			M_PREPEND(control, sizeof(*cm), M_WAITOK);
 			cm = mtod(control, struct cmsghdr *);
 			cm->cmsg_len = control->m_len;
 			cm->cmsg_level = SOL_SOCKET;
 			cm->cmsg_type = SCM_RIGHTS;
 		}
 #endif
 	} else {
 		control = NULL;
 	}
 
 	error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
 
 bad:
 	free(to, M_SONAME);
 	return (error);
 }
 
 int
 kern_sendit(struct thread *td, int s, struct msghdr *mp, int flags,
     struct mbuf *control, enum uio_seg segflg)
 {
 	struct file *fp;
 	struct uio auio;
 	struct iovec *iov;
 	struct socket *so;
 	cap_rights_t *rights;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	int i, error;
 
 	AUDIT_ARG_FD(s);
 	rights = &cap_send_rights;
 	if (mp->msg_name != NULL) {
 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, mp->msg_name);
 		rights = &cap_send_connect_rights;
 	}
 	error = getsock_cap(td, s, rights, &fp, NULL, NULL);
 	if (error != 0) {
 		m_freem(control);
 		return (error);
 	}
 	so = (struct socket *)fp->f_data;
 
 #ifdef KTRACE
 	if (mp->msg_name != NULL && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(mp->msg_name);
 #endif
 #ifdef MAC
 	if (mp->msg_name != NULL) {
 		error = mac_socket_check_connect(td->td_ucred, so,
 		    mp->msg_name);
 		if (error != 0) {
 			m_freem(control);
 			goto bad;
 		}
 	}
 	error = mac_socket_check_send(td->td_ucred, so);
 	if (error != 0) {
 		m_freem(control);
 		goto bad;
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = segflg;
 	auio.uio_rw = UIO_WRITE;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			error = EINVAL;
 			m_freem(control);
 			goto bad;
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
 	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 	if (error != 0) {
 		if (auio.uio_resid != len &&
 		    (so->so_proto->pr_flags & PR_ATOMIC) == 0 &&
 		    (error == ERESTART || error == EINTR ||
 		    error == EWOULDBLOCK))
 			error = 0;
 		/* Generation of SIGPIPE can be controlled per socket */
 		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
 		    !(flags & MSG_NOSIGNAL)) {
 			PROC_LOCK(td->td_proc);
 			tdsignal(td, SIGPIPE);
 			PROC_UNLOCK(td->td_proc);
 		}
 	}
 	if (error == 0)
 		td->td_retval[0] = len - auio.uio_resid;
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = td->td_retval[0];
 		ktrgenio(s, UIO_WRITE, ktruio, error);
 	}
 #endif
 bad:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_sendto(struct thread *td, struct sendto_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	msg.msg_name = __DECONST(void *, uap->to);
 	msg.msg_namelen = uap->tolen;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	msg.msg_control = 0;
 #ifdef COMPAT_OLDSOCK
 	if (SV_PROC_FLAG(td->td_proc, SV_AOUT))
 		msg.msg_flags = 0;
 #endif
 	aiov.iov_base = __DECONST(void *, uap->buf);
 	aiov.iov_len = uap->len;
 	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 osend(struct thread *td, struct osend_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = __DECONST(void *, uap->buf);
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = 0;
 	return (sendit(td, uap->s, &msg, uap->flags));
 }
 
 int
 osendmsg(struct thread *td, struct osendmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 	msg.msg_flags = MSG_COMPAT;
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sys_sendmsg(struct thread *td, struct sendmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_iov = iov;
 #ifdef COMPAT_OLDSOCK
 	if (SV_PROC_FLAG(td->td_proc, SV_AOUT))
 		msg.msg_flags = 0;
 #endif
 	error = sendit(td, uap->s, &msg, uap->flags);
 	free(iov, M_IOV);
 	return (error);
 }
 
 int
 kern_recvit(struct thread *td, int s, struct msghdr *mp, enum uio_seg fromseg,
     struct mbuf **controlp)
 {
 	struct uio auio;
 	struct iovec *iov;
 	struct mbuf *control, *m;
 	caddr_t ctlbuf;
 	struct file *fp;
 	struct socket *so;
 	struct sockaddr *fromsa = NULL;
 #ifdef KTRACE
 	struct uio *ktruio = NULL;
 #endif
 	ssize_t len;
 	int error, i;
 
 	if (controlp != NULL)
 		*controlp = NULL;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_recv_rights,
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 
 #ifdef MAC
 	error = mac_socket_check_receive(td->td_ucred, so);
 	if (error != 0) {
 		fdrop(fp, td);
 		return (error);
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
 	auio.uio_resid = 0;
 	iov = mp->msg_iov;
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			fdrop(fp, td);
 			return (EINVAL);
 		}
 	}
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_GENIO))
 		ktruio = cloneuio(&auio);
 #endif
 	control = NULL;
 	len = auio.uio_resid;
 	error = soreceive(so, &fromsa, &auio, NULL,
 	    (mp->msg_control || controlp) ? &control : NULL,
 	    &mp->msg_flags);
 	if (error != 0) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
 			error = 0;
 	}
 	if (fromsa != NULL)
 		AUDIT_ARG_SOCKADDR(td, AT_FDCWD, fromsa);
 #ifdef KTRACE
 	if (ktruio != NULL) {
 		ktruio->uio_resid = len - auio.uio_resid;
 		ktrgenio(s, UIO_READ, ktruio, error);
 	}
 #endif
 	if (error != 0)
 		goto out;
 	td->td_retval[0] = len - auio.uio_resid;
 	if (mp->msg_name) {
 		len = mp->msg_namelen;
 		if (len <= 0 || fromsa == NULL)
 			len = 0;
 		else {
 			/* save sa_len before it is destroyed by MSG_COMPAT */
 			len = MIN(len, fromsa->sa_len);
 #ifdef COMPAT_OLDSOCK
 			if ((mp->msg_flags & MSG_COMPAT) != 0 &&
 			    SV_PROC_FLAG(td->td_proc, SV_AOUT))
 				((struct osockaddr *)fromsa)->sa_family =
 				    fromsa->sa_family;
 #endif
 			if (fromseg == UIO_USERSPACE) {
 				error = copyout(fromsa, mp->msg_name,
 				    (unsigned)len);
 				if (error != 0)
 					goto out;
 			} else
 				bcopy(fromsa, mp->msg_name, len);
 		}
 		mp->msg_namelen = len;
 	}
 	if (mp->msg_control && controlp == NULL) {
 #ifdef COMPAT_OLDSOCK
 		/*
 		 * We assume that old recvmsg calls won't receive access
 		 * rights and other control info, esp. as control info
 		 * is always optional and those options didn't exist in 4.3.
 		 * If we receive rights, trim the cmsghdr; anything else
 		 * is tossed.
 		 */
 		if (control && (mp->msg_flags & MSG_COMPAT) != 0 &&
 		    SV_PROC_FLAG(td->td_proc, SV_AOUT)) {
 			if (mtod(control, struct cmsghdr *)->cmsg_level !=
 			    SOL_SOCKET ||
 			    mtod(control, struct cmsghdr *)->cmsg_type !=
 			    SCM_RIGHTS) {
 				mp->msg_controllen = 0;
 				goto out;
 			}
 			control->m_len -= sizeof (struct cmsghdr);
 			control->m_data += sizeof (struct cmsghdr);
 		}
 #endif
 		ctlbuf = mp->msg_control;
 		len = mp->msg_controllen;
 		mp->msg_controllen = 0;
 		for (m = control; m != NULL && len >= m->m_len; m = m->m_next) {
 			if ((error = copyout(mtod(m, caddr_t), ctlbuf,
 			    m->m_len)) != 0)
 				goto out;
 
 			ctlbuf += m->m_len;
 			len -= m->m_len;
 			mp->msg_controllen += m->m_len;
 		}
 		if (m != NULL) {
 			mp->msg_flags |= MSG_CTRUNC;
 			m_dispose_extcontrolm(m);
 		}
 	}
 out:
 	fdrop(fp, td);
 #ifdef KTRACE
 	if (fromsa && KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(fromsa);
 #endif
 	free(fromsa, M_SONAME);
 
 	if (error == 0 && controlp != NULL)
 		*controlp = control;
 	else if (control != NULL) {
 		if (error != 0)
 			m_dispose_extcontrolm(control);
 		m_freem(control);
 	}
 
 	return (error);
 }
 
 static int
 recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp)
 {
 	int error;
 
 	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
 	if (error != 0)
 		return (error);
 	if (namelenp != NULL) {
 		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
 #ifdef COMPAT_OLDSOCK
 		if ((mp->msg_flags & MSG_COMPAT) != 0 &&
 		    SV_PROC_FLAG(td->td_proc, SV_AOUT))
 			error = 0;	/* old recvfrom didn't check */
 #endif
 	}
 	return (error);
 }
 
 static int
 kern_recvfrom(struct thread *td, int s, void *buf, size_t len, int flags,
     struct sockaddr *from, socklen_t *fromlenaddr)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 	int error;
 
 	if (fromlenaddr != NULL) {
 		error = copyin(fromlenaddr, &msg.msg_namelen,
 		    sizeof (msg.msg_namelen));
 		if (error != 0)
 			goto done2;
 	} else {
 		msg.msg_namelen = 0;
 	}
 	msg.msg_name = from;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = buf;
 	aiov.iov_len = len;
 	msg.msg_control = 0;
 	msg.msg_flags = flags;
 	error = recvit(td, s, &msg, fromlenaddr);
 done2:
 	return (error);
 }
 
 int
 sys_recvfrom(struct thread *td, struct recvfrom_args *uap)
 {
 	return (kern_recvfrom(td, uap->s, uap->buf, uap->len,
 	    uap->flags, uap->from, uap->fromlenaddr));
 }
 
 
 #ifdef COMPAT_OLDSOCK
 int
 orecvfrom(struct thread *td, struct orecvfrom_args *uap)
 {
 	return (kern_recvfrom(td, uap->s, uap->buf, uap->len,
 	    uap->flags | MSG_COMPAT, uap->from, uap->fromlenaddr));
 }
 #endif
 
 #ifdef COMPAT_OLDSOCK
 int
 orecv(struct thread *td, struct orecv_args *uap)
 {
 	struct msghdr msg;
 	struct iovec aiov;
 
 	msg.msg_name = 0;
 	msg.msg_namelen = 0;
 	msg.msg_iov = &aiov;
 	msg.msg_iovlen = 1;
 	aiov.iov_base = uap->buf;
 	aiov.iov_len = uap->len;
 	msg.msg_control = 0;
 	msg.msg_flags = uap->flags;
 	return (recvit(td, uap->s, &msg, NULL));
 }
 
 /*
  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
  * overlays the new one, missing only the flags, and with the (old) access
  * rights where the control fields are now.
  */
 int
 orecvmsg(struct thread *td, struct orecvmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (struct omsghdr));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags | MSG_COMPAT;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, &uap->msg->msg_namelen);
 	if (msg.msg_controllen && error == 0)
 		error = copyout(&msg.msg_controllen,
 		    &uap->msg->msg_accrightslen, sizeof (int));
 	free(iov, M_IOV);
 	return (error);
 }
 #endif
 
 int
 sys_recvmsg(struct thread *td, struct recvmsg_args *uap)
 {
 	struct msghdr msg;
 	struct iovec *uiov, *iov;
 	int error;
 
 	error = copyin(uap->msg, &msg, sizeof (msg));
 	if (error != 0)
 		return (error);
 	error = copyiniov(msg.msg_iov, msg.msg_iovlen, &iov, EMSGSIZE);
 	if (error != 0)
 		return (error);
 	msg.msg_flags = uap->flags;
 #ifdef COMPAT_OLDSOCK
 	if (SV_PROC_FLAG(td->td_proc, SV_AOUT))
 		msg.msg_flags &= ~MSG_COMPAT;
 #endif
 	uiov = msg.msg_iov;
 	msg.msg_iov = iov;
 	error = recvit(td, uap->s, &msg, NULL);
 	if (error == 0) {
 		msg.msg_iov = uiov;
 		error = copyout(&msg, uap->msg, sizeof(msg));
 	}
 	free(iov, M_IOV);
 	return (error);
 }
 
 int
 sys_shutdown(struct thread *td, struct shutdown_args *uap)
 {
 
 	return (kern_shutdown(td, uap->s, uap->how));
 }
 
 int
 kern_shutdown(struct thread *td, int s, int how)
 {
 	struct socket *so;
 	struct file *fp;
 	int error;
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_shutdown_rights,
 	    &fp, NULL, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, how);
 		/*
 		 * Previous versions did not return ENOTCONN, but 0 in
 		 * case the socket was not connected. Some important
 		 * programs like syslogd up to r279016, 2015-02-19,
 		 * still depend on this behavior.
 		 */
 		if (error == ENOTCONN &&
 		    td->td_proc->p_osrel < P_OSREL_SHUTDOWN_ENOTCONN)
 			error = 0;
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 int
 sys_setsockopt(struct thread *td, struct setsockopt_args *uap)
 {
 
 	return (kern_setsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, uap->valsize));
 }
 
 int
 kern_setsockopt(struct thread *td, int s, int level, int name, const void *val,
     enum uio_seg valseg, socklen_t valsize)
 {
 	struct socket *so;
 	struct file *fp;
 	struct sockopt sopt;
 	int error;
 
 	if (val == NULL && valsize != 0)
 		return (EFAULT);
 	if ((int)valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_SET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = __DECONST(void *, val);
 	sopt.sopt_valsize = valsize;
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_setsockopt called with bad valseg");
 	}
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_setsockopt_rights,
 	    &fp, NULL, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
 		fdrop(fp, td);
 	}
 	return(error);
 }
 
 int
 sys_getsockopt(struct thread *td, struct getsockopt_args *uap)
 {
 	socklen_t valsize;
 	int error;
 
 	if (uap->val) {
 		error = copyin(uap->avalsize, &valsize, sizeof (valsize));
 		if (error != 0)
 			return (error);
 	}
 
 	error = kern_getsockopt(td, uap->s, uap->level, uap->name,
 	    uap->val, UIO_USERSPACE, &valsize);
 
 	if (error == 0)
 		error = copyout(&valsize, uap->avalsize, sizeof (valsize));
 	return (error);
 }
 
 /*
  * Kernel version of getsockopt.
  * optval can be a userland or userspace. optlen is always a kernel pointer.
  */
 int
 kern_getsockopt(struct thread *td, int s, int level, int name, void *val,
     enum uio_seg valseg, socklen_t *valsize)
 {
 	struct socket *so;
 	struct file *fp;
 	struct sockopt sopt;
 	int error;
 
 	if (val == NULL)
 		*valsize = 0;
 	if ((int)*valsize < 0)
 		return (EINVAL);
 
 	sopt.sopt_dir = SOPT_GET;
 	sopt.sopt_level = level;
 	sopt.sopt_name = name;
 	sopt.sopt_val = val;
 	sopt.sopt_valsize = (size_t)*valsize; /* checked non-negative above */
 	switch (valseg) {
 	case UIO_USERSPACE:
 		sopt.sopt_td = td;
 		break;
 	case UIO_SYSSPACE:
 		sopt.sopt_td = NULL;
 		break;
 	default:
 		panic("kern_getsockopt called with bad valseg");
 	}
 
 	AUDIT_ARG_FD(s);
 	error = getsock_cap(td, s, &cap_getsockopt_rights,
 	    &fp, NULL, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
 		*valsize = sopt.sopt_valsize;
 		fdrop(fp, td);
 	}
 	return (error);
 }
 
 static int
 user_getsockname(struct thread *td, int fdes, struct sockaddr *asa,
     socklen_t *alen, bool compat)
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(alen, &len, sizeof(len));
 	if (error != 0)
 		return (error);
 
 	error = kern_getsockname(td, fdes, &sa, &len);
 	if (error != 0)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT))
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, asa, len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, alen, sizeof(len));
 	return (error);
 }
 
 int
 kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getsock_cap(td, fd, &cap_getsockname_rights,
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	*sa = NULL;
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
+	error = so->so_proto->pr_sockaddr(so, sa);
 	CURVNET_RESTORE();
 	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(*sa);
 #endif
 bad:
 	fdrop(fp, td);
 	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 	return (error);
 }
 
 int
 sys_getsockname(struct thread *td, struct getsockname_args *uap)
 {
 	return (user_getsockname(td, uap->fdes, uap->asa, uap->alen, false));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 ogetsockname(struct thread *td, struct ogetsockname_args *uap)
 {
 	return (user_getsockname(td, uap->fdes, uap->asa, uap->alen, true));
 }
 #endif /* COMPAT_OLDSOCK */
 
 static int
 user_getpeername(struct thread *td, int fdes, struct sockaddr *asa,
     socklen_t *alen, bool compat)
 {
 	struct sockaddr *sa;
 	socklen_t len;
 	int error;
 
 	error = copyin(alen, &len, sizeof (len));
 	if (error != 0)
 		return (error);
 
 	error = kern_getpeername(td, fdes, &sa, &len);
 	if (error != 0)
 		return (error);
 
 	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
 		if (compat && SV_PROC_FLAG(td->td_proc, SV_AOUT))
 			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
 		error = copyout(sa, asa, len);
 	}
 	free(sa, M_SONAME);
 	if (error == 0)
 		error = copyout(&len, alen, sizeof(len));
 	return (error);
 }
 
 int
 kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
     socklen_t *alen)
 {
 	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
 	AUDIT_ARG_FD(fd);
 	error = getsock_cap(td, fd, &cap_getpeername_rights,
 	    &fp, NULL, NULL);
 	if (error != 0)
 		return (error);
 	so = fp->f_data;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 		error = ENOTCONN;
 		goto done;
 	}
 	*sa = NULL;
 	CURVNET_SET(so->so_vnet);
-	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
+	error = so->so_proto->pr_peeraddr(so, sa);
 	CURVNET_RESTORE();
 	if (error != 0)
 		goto bad;
 	if (*sa == NULL)
 		len = 0;
 	else
 		len = MIN(*alen, (*sa)->sa_len);
 	*alen = len;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_STRUCT))
 		ktrsockaddr(*sa);
 #endif
 bad:
 	if (error != 0 && *sa != NULL) {
 		free(*sa, M_SONAME);
 		*sa = NULL;
 	}
 done:
 	fdrop(fp, td);
 	return (error);
 }
 
 int
 sys_getpeername(struct thread *td, struct getpeername_args *uap)
 {
 	return (user_getpeername(td, uap->fdes, uap->asa, uap->alen, false));
 }
 
 #ifdef COMPAT_OLDSOCK
 int
 ogetpeername(struct thread *td, struct ogetpeername_args *uap)
 {
 	return (user_getpeername(td, uap->fdes, uap->asa, uap->alen, true));
 }
 #endif /* COMPAT_OLDSOCK */
 
 static int
 sockargs(struct mbuf **mp, char *buf, socklen_t buflen, int type)
 {
 	struct sockaddr *sa;
 	struct mbuf *m;
 	int error;
 
 	if (buflen > MLEN) {
 #ifdef COMPAT_OLDSOCK
 		if (type == MT_SONAME && buflen <= 112 &&
 		    SV_CURPROC_FLAG(SV_AOUT))
 			buflen = MLEN;		/* unix domain compat. hack */
 		else
 #endif
 			if (buflen > MCLBYTES)
 				return (EMSGSIZE);
 	}
 	m = m_get2(buflen, M_WAITOK, type, 0);
 	m->m_len = buflen;
 	error = copyin(buf, mtod(m, void *), buflen);
 	if (error != 0)
 		(void) m_free(m);
 	else {
 		*mp = m;
 		if (type == MT_SONAME) {
 			sa = mtod(m, struct sockaddr *);
 
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 			if (sa->sa_family == 0 && sa->sa_len < AF_MAX &&
 			    SV_CURPROC_FLAG(SV_AOUT))
 				sa->sa_family = sa->sa_len;
 #endif
 			sa->sa_len = buflen;
 		}
 	}
 	return (error);
 }
 
 int
 getsockaddr(struct sockaddr **namp, const struct sockaddr *uaddr, size_t len)
 {
 	struct sockaddr *sa;
 	int error;
 
 	if (len > SOCK_MAXADDRLEN)
 		return (ENAMETOOLONG);
 	if (len < offsetof(struct sockaddr, sa_data[0]))
 		return (EINVAL);
 	sa = malloc(len, M_SONAME, M_WAITOK);
 	error = copyin(uaddr, sa, len);
 	if (error != 0) {
 		free(sa, M_SONAME);
 	} else {
 #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
 		if (sa->sa_family == 0 && sa->sa_len < AF_MAX &&
 		    SV_CURPROC_FLAG(SV_AOUT))
 			sa->sa_family = sa->sa_len;
 #endif
 		sa->sa_len = len;
 		*namp = sa;
 	}
 	return (error);
 }
 
 /*
  * Dispose of externalized rights from an SCM_RIGHTS message.  This function
  * should be used in error or truncation cases to avoid leaking file descriptors
  * into the recipient's (the current thread's) table.
  */
 void
 m_dispose_extcontrolm(struct mbuf *m)
 {
 	struct cmsghdr *cm;
 	struct file *fp;
 	struct thread *td;
 	socklen_t clen, datalen;
 	int error, fd, *fds, nfd;
 
 	td = curthread;
 	for (; m != NULL; m = m->m_next) {
 		if (m->m_type != MT_EXTCONTROL)
 			continue;
 		cm = mtod(m, struct cmsghdr *);
 		clen = m->m_len;
 		while (clen > 0) {
 			if (clen < sizeof(*cm))
 				panic("%s: truncated mbuf %p", __func__, m);
 			datalen = CMSG_SPACE(cm->cmsg_len - CMSG_SPACE(0));
 			if (clen < datalen)
 				panic("%s: truncated mbuf %p", __func__, m);
 
 			if (cm->cmsg_level == SOL_SOCKET &&
 			    cm->cmsg_type == SCM_RIGHTS) {
 				fds = (int *)CMSG_DATA(cm);
 				nfd = (cm->cmsg_len - CMSG_SPACE(0)) /
 				    sizeof(int);
 
 				while (nfd-- > 0) {
 					fd = *fds++;
 					error = fget(td, fd, &cap_no_rights,
 					    &fp);
 					if (error == 0) {
 						fdclose(td, fp, fd);
 						fdrop(fp, td);
 					}
 				}
 			}
 			clen -= datalen;
 			cm = (struct cmsghdr *)((uint8_t *)cm + datalen);
 		}
 		m_chtype(m, MT_CONTROL);
 	}
 }
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index eb6404c86320..1f2d8a6647b9 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1,3577 +1,3560 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California. All Rights Reserved.
  * Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
  * Copyright (c) 2018 Matthew Macy
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
  */
 
 /*
  * UNIX Domain (Local) Sockets
  *
  * This is an implementation of UNIX (local) domain sockets.  Each socket has
  * an associated struct unpcb (UNIX protocol control block).  Stream sockets
  * may be connected to 0 or 1 other socket.  Datagram sockets may be
  * connected to 0, 1, or many other sockets.  Sockets may be created and
  * connected in pairs (socketpair(2)), or bound/connected to using the file
  * system name space.  For most purposes, only the receive socket buffer is
  * used, as sending on one socket delivers directly to the receive socket
  * buffer of a second socket.
  *
  * The implementation is substantially complicated by the fact that
  * "ancillary data", such as file descriptors or credentials, may be passed
  * across UNIX domain sockets.  The potential for passing UNIX domain sockets
  * over other UNIX domain sockets requires the implementation of a simple
  * garbage collector to find and tear down cycles of disconnected sockets.
  *
  * TODO:
  *	RDM
  *	rethink name space problems
  *	need a proper out-of-band
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/resourcevar.h>
 #include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/taskqueue.h>
 #include <sys/un.h>
 #include <sys/unpcb.h>
 #include <sys/vnode.h>
 
 #include <net/vnet.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 
 #include <vm/uma.h>
 
 MALLOC_DECLARE(M_FILECAPS);
 
-/*
- * See unpcb.h for the locking key.
- */
+static struct domain localdomain;
 
 static uma_zone_t	unp_zone;
 static unp_gen_t	unp_gencnt;	/* (l) */
 static u_int		unp_count;	/* (l) Count of local sockets. */
 static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
 static int		unp_rights;	/* (g) File descriptors in flight. */
 static struct unp_head	unp_shead;	/* (l) List of stream sockets. */
 static struct unp_head	unp_dhead;	/* (l) List of datagram sockets. */
 static struct unp_head	unp_sphead;	/* (l) List of seqpacket sockets. */
 
 struct unp_defer {
 	SLIST_ENTRY(unp_defer) ud_link;
 	struct file *ud_fp;
 };
 static SLIST_HEAD(, unp_defer) unp_defers;
 static int unp_defers_count;
 
 static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
 
 /*
  * Garbage collection of cyclic file descriptor/socket references occurs
  * asynchronously in a taskqueue context in order to avoid recursion and
  * reentrance in the UNIX domain socket, file descriptor, and socket layer
  * code.  See unp_gc() for a full description.
  */
 static struct timeout_task unp_gc_task;
 
 /*
  * The close of unix domain sockets attached as SCM_RIGHTS is
  * postponed to the taskqueue, to avoid arbitrary recursion depth.
  * The attached sockets might have another sockets attached.
  */
 static struct task	unp_defer_task;
 
 /*
  * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
  * stream sockets, although the total for sender and receiver is actually
  * only PIPSIZ.
  *
  * Datagram sockets really use the sendspace as the maximum datagram size,
  * and don't really want to reserve the sendspace.  Their recvspace should be
  * large enough for at least one max-size datagram plus address.
  */
 #ifndef PIPSIZ
 #define	PIPSIZ	8192
 #endif
 static u_long	unpst_sendspace = PIPSIZ;
 static u_long	unpst_recvspace = PIPSIZ;
 static u_long	unpdg_maxdgram = 2*1024;
 static u_long	unpdg_recvspace = 16*1024;	/* support 8KB syslog msgs */
 static u_long	unpsp_sendspace = PIPSIZ;	/* really max datagram size */
 static u_long	unpsp_recvspace = PIPSIZ;
 
 static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Local domain");
 static SYSCTL_NODE(_net_local, SOCK_STREAM, stream,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_STREAM");
 static SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_DGRAM");
 static SYSCTL_NODE(_net_local, SOCK_SEQPACKET, seqpacket,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SOCK_SEQPACKET");
 
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
 	   &unpst_sendspace, 0, "Default stream send space.");
 SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpst_recvspace, 0, "Default stream receive space.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
 	   &unpdg_maxdgram, 0, "Maximum datagram size.");
 SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpdg_recvspace, 0, "Default datagram receive space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, maxseqpacket, CTLFLAG_RW,
 	   &unpsp_sendspace, 0, "Default seqpacket send space.");
 SYSCTL_ULONG(_net_local_seqpacket, OID_AUTO, recvspace, CTLFLAG_RW,
 	   &unpsp_recvspace, 0, "Default seqpacket receive space.");
 SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0,
     "File descriptors in flight.");
 SYSCTL_INT(_net_local, OID_AUTO, deferred, CTLFLAG_RD,
     &unp_defers_count, 0,
     "File descriptors deferred to taskqueue for close.");
 
 /*
  * Locking and synchronization:
  *
  * Several types of locks exist in the local domain socket implementation:
  * - a global linkage lock
  * - a global connection list lock
  * - the mtxpool lock
  * - per-unpcb mutexes
  *
  * The linkage lock protects the global socket lists, the generation number
  * counter and garbage collector state.
  *
  * The connection list lock protects the list of referring sockets in a datagram
  * socket PCB.  This lock is also overloaded to protect a global list of
  * sockets whose buffers contain socket references in the form of SCM_RIGHTS
  * messages.  To avoid recursion, such references are released by a dedicated
  * thread.
  *
  * The mtxpool lock protects the vnode from being modified while referenced.
  * Lock ordering rules require that it be acquired before any PCB locks.
  *
  * The unpcb lock (unp_mtx) protects the most commonly referenced fields in the
  * unpcb.  This includes the unp_conn field, which either links two connected
  * PCBs together (for connected socket types) or points at the destination
  * socket (for connectionless socket types).  The operations of creating or
  * destroying a connection therefore involve locking multiple PCBs.  To avoid
  * lock order reversals, in some cases this involves dropping a PCB lock and
  * using a reference counter to maintain liveness.
  *
  * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
- * allocated in pru_attach() and freed in pru_detach().  The validity of that
+ * allocated in pr_attach() and freed in pr_detach().  The validity of that
  * pointer is an invariant, so no lock is required to dereference the so_pcb
  * pointer if a valid socket reference is held by the caller.  In practice,
  * this is always true during operations performed on a socket.  Each unpcb
  * has a back-pointer to its socket, unp_socket, which will be stable under
  * the same circumstances.
  *
  * This pointer may only be safely dereferenced as long as a valid reference
  * to the unpcb is held.  Typically, this reference will be from the socket,
  * or from another unpcb when the referring unpcb's lock is held (in order
  * that the reference not be invalidated during use).  For example, to follow
  * unp->unp_conn->unp_socket, you need to hold a lock on unp_conn to guarantee
  * that detach is not run clearing unp_socket.
  *
  * Blocking with UNIX domain sockets is a tricky issue: unlike most network
  * protocols, bind() is a non-atomic operation, and connect() requires
  * potential sleeping in the protocol, due to potentially waiting on local or
  * distributed file systems.  We try to separate "lookup" operations, which
  * may sleep, and the IPC operations themselves, which typically can occur
  * with relative atomicity as locks can be held over the entire operation.
  *
  * Another tricky issue is simultaneous multi-threaded or multi-process
  * access to a single UNIX domain socket.  These are handled by the flags
  * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
  * binding, both of which involve dropping UNIX domain socket locks in order
  * to perform namei() and other file system operations.
  */
 static struct rwlock	unp_link_rwlock;
 static struct mtx	unp_defers_lock;
 
 #define	UNP_LINK_LOCK_INIT()		rw_init(&unp_link_rwlock,	\
 					    "unp_link_rwlock")
 
 #define	UNP_LINK_LOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_LOCKED)
 #define	UNP_LINK_UNLOCK_ASSERT()	rw_assert(&unp_link_rwlock,	\
 					    RA_UNLOCKED)
 
 #define	UNP_LINK_RLOCK()		rw_rlock(&unp_link_rwlock)
 #define	UNP_LINK_RUNLOCK()		rw_runlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK()		rw_wlock(&unp_link_rwlock)
 #define	UNP_LINK_WUNLOCK()		rw_wunlock(&unp_link_rwlock)
 #define	UNP_LINK_WLOCK_ASSERT()		rw_assert(&unp_link_rwlock,	\
 					    RA_WLOCKED)
 #define	UNP_LINK_WOWNED()		rw_wowned(&unp_link_rwlock)
 
 #define	UNP_DEFERRED_LOCK_INIT()	mtx_init(&unp_defers_lock, \
 					    "unp_defer", NULL, MTX_DEF)
 #define	UNP_DEFERRED_LOCK()		mtx_lock(&unp_defers_lock)
 #define	UNP_DEFERRED_UNLOCK()		mtx_unlock(&unp_defers_lock)
 
 #define UNP_REF_LIST_LOCK()		UNP_DEFERRED_LOCK();
 #define UNP_REF_LIST_UNLOCK()		UNP_DEFERRED_UNLOCK();
 
 #define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
 					    "unp", "unp",	\
 					    MTX_DUPOK|MTX_DEF)
 #define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCKPTR(unp)		(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
 #define	UNP_PCB_TRYLOCK(unp)		mtx_trylock(&(unp)->unp_mtx)
 #define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
 #define	UNP_PCB_OWNED(unp)		mtx_owned(&(unp)->unp_mtx)
 #define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
 #define	UNP_PCB_UNLOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_NOTOWNED)
 
 static int	uipc_connect2(struct socket *, struct socket *);
 static int	uipc_ctloutput(struct socket *, struct sockopt *);
 static int	unp_connect(struct socket *, struct sockaddr *,
 		    struct thread *);
 static int	unp_connectat(int, struct socket *, struct sockaddr *,
 		    struct thread *, bool);
 typedef enum { PRU_CONNECT, PRU_CONNECT2 } conn2_how;
 static void	unp_connect2(struct socket *so, struct socket *so2, conn2_how);
 static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
 static void	unp_dispose(struct socket *so);
 static void	unp_shutdown(struct unpcb *);
 static void	unp_drop(struct unpcb *);
 static void	unp_gc(__unused void *, int);
 static void	unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
 static void	unp_discard(struct file *);
 static void	unp_freerights(struct filedescent **, int);
 static int	unp_internalize(struct mbuf **, struct thread *,
 		    struct mbuf **, u_int *, u_int *);
 static void	unp_internalize_fp(struct file *);
 static int	unp_externalize(struct mbuf *, struct mbuf **, int);
 static int	unp_externalize_fp(struct file *);
 static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *,
 		    int, struct mbuf **, u_int *, u_int *);
 static void	unp_process_defers(void * __unused, int);
 
 static void
 unp_pcb_hold(struct unpcb *unp)
 {
 	u_int old __unused;
 
 	old = refcount_acquire(&unp->unp_refcount);
 	KASSERT(old > 0, ("%s: unpcb %p has no references", __func__, unp));
 }
 
 static __result_use_check bool
 unp_pcb_rele(struct unpcb *unp)
 {
 	bool ret;
 
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	if ((ret = refcount_release(&unp->unp_refcount))) {
 		UNP_PCB_UNLOCK(unp);
 		UNP_PCB_LOCK_DESTROY(unp);
 		uma_zfree(unp_zone, unp);
 	}
 	return (ret);
 }
 
 static void
 unp_pcb_rele_notlast(struct unpcb *unp)
 {
 	bool ret __unused;
 
 	ret = refcount_release(&unp->unp_refcount);
 	KASSERT(!ret, ("%s: unpcb %p has no references", __func__, unp));
 }
 
 static void
 unp_pcb_lock_pair(struct unpcb *unp, struct unpcb *unp2)
 {
 	UNP_PCB_UNLOCK_ASSERT(unp);
 	UNP_PCB_UNLOCK_ASSERT(unp2);
 
 	if (unp == unp2) {
 		UNP_PCB_LOCK(unp);
 	} else if ((uintptr_t)unp2 > (uintptr_t)unp) {
 		UNP_PCB_LOCK(unp);
 		UNP_PCB_LOCK(unp2);
 	} else {
 		UNP_PCB_LOCK(unp2);
 		UNP_PCB_LOCK(unp);
 	}
 }
 
 static void
 unp_pcb_unlock_pair(struct unpcb *unp, struct unpcb *unp2)
 {
 	UNP_PCB_UNLOCK(unp);
 	if (unp != unp2)
 		UNP_PCB_UNLOCK(unp2);
 }
 
 /*
  * Try to lock the connected peer of an already locked socket.  In some cases
  * this requires that we unlock the current socket.  The pairbusy counter is
  * used to block concurrent connection attempts while the lock is dropped.  The
  * caller must be careful to revalidate PCB state.
  */
 static struct unpcb *
 unp_pcb_lock_peer(struct unpcb *unp)
 {
 	struct unpcb *unp2;
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL)
 		return (NULL);
 	if (__predict_false(unp == unp2))
 		return (unp);
 
 	UNP_PCB_UNLOCK_ASSERT(unp2);
 
 	if (__predict_true(UNP_PCB_TRYLOCK(unp2)))
 		return (unp2);
 	if ((uintptr_t)unp2 > (uintptr_t)unp) {
 		UNP_PCB_LOCK(unp2);
 		return (unp2);
 	}
 	unp->unp_pairbusy++;
 	unp_pcb_hold(unp2);
 	UNP_PCB_UNLOCK(unp);
 
 	UNP_PCB_LOCK(unp2);
 	UNP_PCB_LOCK(unp);
 	KASSERT(unp->unp_conn == unp2 || unp->unp_conn == NULL,
 	    ("%s: socket %p was reconnected", __func__, unp));
 	if (--unp->unp_pairbusy == 0 && (unp->unp_flags & UNP_WAITING) != 0) {
 		unp->unp_flags &= ~UNP_WAITING;
 		wakeup(unp);
 	}
 	if (unp_pcb_rele(unp2)) {
 		/* unp2 is unlocked. */
 		return (NULL);
 	}
 	if (unp->unp_conn == NULL) {
 		UNP_PCB_UNLOCK(unp2);
 		return (NULL);
 	}
 	return (unp2);
 }
 
-/*
- * Definitions of protocols supported in the LOCAL domain.
- */
-static struct domain localdomain;
-static struct pr_usrreqs uipc_usrreqs_dgram, uipc_usrreqs_stream;
-static struct pr_usrreqs uipc_usrreqs_seqpacket;
-static struct protosw localsw[] = {
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&localdomain,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS|
-				    PR_CAPATTACH,
-	.pr_ctloutput =		&uipc_ctloutput,
-	.pr_usrreqs =		&uipc_usrreqs_stream
-},
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&localdomain,
-	.pr_flags =		PR_ATOMIC | PR_ADDR |PR_RIGHTS | PR_CAPATTACH |
-				    PR_SOCKBUF,
-	.pr_ctloutput =		&uipc_ctloutput,
-	.pr_usrreqs =		&uipc_usrreqs_dgram
-},
-{
-	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&localdomain,
-
-	/*
-	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
-	 * due to our use of sbappendaddr.  A new sbappend variants is needed
-	 * that supports both atomic record writes and control data.
-	 */
-	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|
-				    PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH,
-	.pr_ctloutput =		&uipc_ctloutput,
-	.pr_usrreqs =		&uipc_usrreqs_seqpacket,
-},
-};
-
-static struct domain localdomain = {
-	.dom_family =		AF_LOCAL,
-	.dom_name =		"local",
-	.dom_externalize =	unp_externalize,
-	.dom_dispose =		unp_dispose,
-	.dom_protosw =		localsw,
-	.dom_protoswNPROTOSW =	&localsw[nitems(localsw)]
-};
-DOMAIN_SET(local);
-
 static void
 uipc_abort(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
 	UNP_PCB_UNLOCK_ASSERT(unp);
 
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		unp_pcb_hold(unp2);
 		UNP_PCB_UNLOCK(unp);
 		unp_drop(unp2);
 	} else
 		UNP_PCB_UNLOCK(unp);
 }
 
 static int
 uipc_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
 	/*
 	 * Pass back name of connected socket, if it was bound and we are
 	 * still connected (our peer may have closed already!).
 	 */
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_PCB_LOCK(unp);
 	unp2 = unp_pcb_lock_peer(unp);
 	if (unp2 != NULL && unp2->unp_addr != NULL)
 		sa = (struct sockaddr *)unp2->unp_addr;
 	else
 		sa = &sun_noname;
 	bcopy(sa, *nam, sa->sa_len);
 	if (unp2 != NULL)
 		unp_pcb_unlock_pair(unp, unp2);
 	else
 		UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 static int
 uipc_attach(struct socket *so, int proto, struct thread *td)
 {
 	u_long sendspace, recvspace;
 	struct unpcb *unp;
 	int error;
 	bool locked;
 
 	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		switch (so->so_type) {
 		case SOCK_STREAM:
 			sendspace = unpst_sendspace;
 			recvspace = unpst_recvspace;
 			break;
 
 		case SOCK_DGRAM:
 			STAILQ_INIT(&so->so_rcv.uxdg_mb);
 			STAILQ_INIT(&so->so_snd.uxdg_mb);
 			TAILQ_INIT(&so->so_rcv.uxdg_conns);
 			/*
 			 * Since send buffer is either bypassed or is a part
 			 * of one-to-many receive buffer, we assign both space
 			 * limits to unpdg_recvspace.
 			 */
 			sendspace = recvspace = unpdg_recvspace;
 			break;
 
 		case SOCK_SEQPACKET:
 			sendspace = unpsp_sendspace;
 			recvspace = unpsp_recvspace;
 			break;
 
 		default:
 			panic("uipc_attach");
 		}
 		error = soreserve(so, sendspace, recvspace);
 		if (error)
 			return (error);
 	}
 	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
 	if (unp == NULL)
 		return (ENOBUFS);
 	LIST_INIT(&unp->unp_refs);
 	UNP_PCB_LOCK_INIT(unp);
 	unp->unp_socket = so;
 	so->so_pcb = unp;
 	refcount_init(&unp->unp_refcount, 1);
 
 	if ((locked = UNP_LINK_WOWNED()) == false)
 		UNP_LINK_WLOCK();
 
 	unp->unp_gencnt = ++unp_gencnt;
 	unp->unp_ino = ++unp_ino;
 	unp_count++;
 	switch (so->so_type) {
 	case SOCK_STREAM:
 		LIST_INSERT_HEAD(&unp_shead, unp, unp_link);
 		break;
 
 	case SOCK_DGRAM:
 		LIST_INSERT_HEAD(&unp_dhead, unp, unp_link);
 		break;
 
 	case SOCK_SEQPACKET:
 		LIST_INSERT_HEAD(&unp_sphead, unp, unp_link);
 		break;
 
 	default:
 		panic("uipc_attach");
 	}
 
 	if (locked == false)
 		UNP_LINK_WUNLOCK();
 
 	return (0);
 }
 
 static int
 uipc_bindat(int fd, struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
 	struct vattr vattr;
 	int error, namelen;
 	struct nameidata nd;
 	struct unpcb *unp;
 	struct vnode *vp;
 	struct mount *mp;
 	cap_rights_t rights;
 	char *buf;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
 
 	if (soun->sun_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
 	if (namelen <= 0)
 		return (EINVAL);
 
 	/*
 	 * We don't allow simultaneous bind() calls on a single UNIX domain
 	 * socket, so flag in-progress operations, and return an error if an
 	 * operation is already in progress.
 	 *
 	 * Historically, we have not allowed a socket to be rebound, so this
 	 * also returns an error.  Not allowing re-binding simplifies the
 	 * implementation and avoids a great many possible failure modes.
 	 */
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode != NULL) {
 		UNP_PCB_UNLOCK(unp);
 		return (EINVAL);
 	}
 	if (unp->unp_flags & UNP_BINDING) {
 		UNP_PCB_UNLOCK(unp);
 		return (EALREADY);
 	}
 	unp->unp_flags |= UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 
 	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
 	bcopy(soun->sun_path, buf, namelen);
 	buf[namelen] = 0;
 
 restart:
 	NDINIT_ATRIGHTS(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME | NOCACHE,
 	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_BINDAT));
 /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
 	error = namei(&nd);
 	if (error)
 		goto error;
 	vp = nd.ni_vp;
 	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
 		NDFREE_PNBUF(&nd);
 		if (nd.ni_dvp == vp)
 			vrele(nd.ni_dvp);
 		else
 			vput(nd.ni_dvp);
 		if (vp != NULL) {
 			vrele(vp);
 			error = EADDRINUSE;
 			goto error;
 		}
 		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
 		if (error)
 			goto error;
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VSOCK;
 	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_pd->pd_cmask);
 #ifdef MAC
 	error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
 #endif
 	if (error == 0)
 		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
 	NDFREE_PNBUF(&nd);
 	if (error) {
 		VOP_VPUT_PAIR(nd.ni_dvp, NULL, true);
 		vn_finished_write(mp);
 		if (error == ERELOOKUP)
 			goto restart;
 		goto error;
 	}
 	vp = nd.ni_vp;
 	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
 	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
 
 	UNP_PCB_LOCK(unp);
 	VOP_UNP_BIND(vp, unp);
 	unp->unp_vnode = vp;
 	unp->unp_addr = soun;
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	vref(vp);
 	VOP_VPUT_PAIR(nd.ni_dvp, &vp, true);
 	vn_finished_write(mp);
 	free(buf, M_TEMP);
 	return (0);
 
 error:
 	UNP_PCB_LOCK(unp);
 	unp->unp_flags &= ~UNP_BINDING;
 	UNP_PCB_UNLOCK(unp);
 	free(buf, M_TEMP);
 	return (error);
 }
 
 static int
 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (uipc_bindat(AT_FDCWD, so, nam, td));
 }
 
 static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
 	error = unp_connect(so, nam, td);
 	return (error);
 }
 
 static int
 uipc_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td)
 {
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connectat: td != curthread"));
 	error = unp_connectat(fd, so, nam, td, false);
 	return (error);
 }
 
 static void
 uipc_close(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct vnode *vp = NULL;
 	struct mtx *vplock;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
 
 	vplock = NULL;
 	if ((vp = unp->unp_vnode) != NULL) {
 		vplock = mtx_pool_find(mtxpool_sleep, vp);
 		mtx_lock(vplock);
 	}
 	UNP_PCB_LOCK(unp);
 	if (vp && unp->unp_vnode == NULL) {
 		mtx_unlock(vplock);
 		vp = NULL;
 	}
 	if (vp != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
 		unp_disconnect(unp, unp2);
 	else
 		UNP_PCB_UNLOCK(unp);
 	if (vp) {
 		mtx_unlock(vplock);
 		vrele(vp);
 	}
 }
 
 static int
 uipc_connect2(struct socket *so1, struct socket *so2)
 {
 	struct unpcb *unp, *unp2;
 
 	if (so1->so_type != so2->so_type)
 		return (EPROTOTYPE);
 
 	unp = so1->so_pcb;
 	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
 	unp2 = so2->so_pcb;
 	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
 	unp_pcb_lock_pair(unp, unp2);
 	unp_connect2(so1, so2, PRU_CONNECT2);
 	unp_pcb_unlock_pair(unp, unp2);
 
 	return (0);
 }
 
 static void
 uipc_detach(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct mtx *vplock;
 	struct vnode *vp;
 	int local_unp_rights;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
 
 	vp = NULL;
 	vplock = NULL;
 
 	UNP_LINK_WLOCK();
 	LIST_REMOVE(unp, unp_link);
 	if (unp->unp_gcflag & UNPGC_DEAD)
 		LIST_REMOVE(unp, unp_dead);
 	unp->unp_gencnt = ++unp_gencnt;
 	--unp_count;
 	UNP_LINK_WUNLOCK();
 
 	UNP_PCB_UNLOCK_ASSERT(unp);
  restart:
 	if ((vp = unp->unp_vnode) != NULL) {
 		vplock = mtx_pool_find(mtxpool_sleep, vp);
 		mtx_lock(vplock);
 	}
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode != vp && unp->unp_vnode != NULL) {
 		if (vplock)
 			mtx_unlock(vplock);
 		UNP_PCB_UNLOCK(unp);
 		goto restart;
 	}
 	if ((vp = unp->unp_vnode) != NULL) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
 		unp_disconnect(unp, unp2);
 	else
 		UNP_PCB_UNLOCK(unp);
 
 	UNP_REF_LIST_LOCK();
 	while (!LIST_EMPTY(&unp->unp_refs)) {
 		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
 
 		unp_pcb_hold(ref);
 		UNP_REF_LIST_UNLOCK();
 
 		MPASS(ref != unp);
 		UNP_PCB_UNLOCK_ASSERT(ref);
 		unp_drop(ref);
 		UNP_REF_LIST_LOCK();
 	}
 	UNP_REF_LIST_UNLOCK();
 
 	UNP_PCB_LOCK(unp);
 	local_unp_rights = unp_rights;
 	unp->unp_socket->so_pcb = NULL;
 	unp->unp_socket = NULL;
 	free(unp->unp_addr, M_SONAME);
 	unp->unp_addr = NULL;
 	if (!unp_pcb_rele(unp))
 		UNP_PCB_UNLOCK(unp);
 	if (vp) {
 		mtx_unlock(vplock);
 		vrele(vp);
 	}
 	if (local_unp_rights)
 		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
 
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 		/*
 		 * Everything should have been unlinked/freed by unp_dispose()
 		 * and/or unp_disconnect().
 		 */
 		MPASS(so->so_rcv.uxdg_peeked == NULL);
 		MPASS(STAILQ_EMPTY(&so->so_rcv.uxdg_mb));
 		MPASS(TAILQ_EMPTY(&so->so_rcv.uxdg_conns));
 		MPASS(STAILQ_EMPTY(&so->so_snd.uxdg_mb));
 	}
 }
 
 static int
 uipc_disconnect(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
 
 	UNP_PCB_LOCK(unp);
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL)
 		unp_disconnect(unp, unp2);
 	else
 		UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 static int
 uipc_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct unpcb *unp;
 	int error;
 
 	MPASS(so->so_type != SOCK_DGRAM);
 
 	/*
 	 * Synchronize with concurrent connection attempts.
 	 */
 	error = 0;
 	unp = sotounpcb(so);
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_conn != NULL || (unp->unp_flags & UNP_CONNECTING) != 0)
 		error = EINVAL;
 	else if (unp->unp_vnode == NULL)
 		error = EDESTADDRREQ;
 	if (error != 0) {
 		UNP_PCB_UNLOCK(unp);
 		return (error);
 	}
 
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error == 0) {
 		cru2xt(td, &unp->unp_peercred);
 		solisten_proto(so, backlog);
 	}
 	SOCK_UNLOCK(so);
 	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 static int
 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_LINK_RLOCK();
 	/*
 	 * XXX: It seems that this test always fails even when connection is
 	 * established.  So, this else clause is added as workaround to
 	 * return PF_LOCAL sockaddr.
 	 */
 	unp2 = unp->unp_conn;
 	if (unp2 != NULL) {
 		UNP_PCB_LOCK(unp2);
 		if (unp2->unp_addr != NULL)
 			sa = (struct sockaddr *) unp2->unp_addr;
 		else
 			sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 		UNP_PCB_UNLOCK(unp2);
 	} else {
 		sa = &sun_noname;
 		bcopy(sa, *nam, sa->sa_len);
 	}
 	UNP_LINK_RUNLOCK();
 	return (0);
 }
 
 static int
 uipc_rcvd(struct socket *so, int flags)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	/*
 	 * Adjust backpressure on sender and wakeup any waiting to write.
 	 *
 	 * The unp lock is acquired to maintain the validity of the unp_conn
 	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
 	 * static as long as we don't permit unp2 to disconnect from unp,
 	 * which is prevented by the lock on unp.  We cache values from
 	 * so_rcv to avoid holding the so_rcv lock over the entire
 	 * transaction on the remote so_snd.
 	 */
 	SOCKBUF_LOCK(&so->so_rcv);
 	mbcnt = so->so_rcv.sb_mbcnt;
 	sbcc = sbavail(&so->so_rcv);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	/*
 	 * There is a benign race condition at this point.  If we're planning to
 	 * clear SB_STOP, but uipc_send is called on the connected socket at
 	 * this instant, it might add data to the sockbuf and set SB_STOP.  Then
 	 * we would erroneously clear SB_STOP below, even though the sockbuf is
 	 * full.  The race is benign because the only ill effect is to allow the
 	 * sockbuf to exceed its size limit, and the size limits are not
 	 * strictly guaranteed anyway.
 	 */
 	UNP_PCB_LOCK(unp);
 	unp2 = unp->unp_conn;
 	if (unp2 == NULL) {
 		UNP_PCB_UNLOCK(unp);
 		return (0);
 	}
 	so2 = unp2->unp_socket;
 	SOCKBUF_LOCK(&so2->so_snd);
 	if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
 		so2->so_snd.sb_flags &= ~SB_STOP;
 	sowwakeup_locked(so2);
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 static int
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	u_int mbcnt, sbcc;
 	int error;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
 	KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
 	    ("%s: socktype %d", __func__, so->so_type));
 
 	error = 0;
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
 	}
 	if (control != NULL &&
 	    (error = unp_internalize(&control, td, NULL, NULL, NULL)))
 		goto release;
 
 	unp2 = NULL;
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		if (nam != NULL) {
 			if ((error = unp_connect(so, nam, td)) != 0)
 				goto out;
 		} else {
 			error = ENOTCONN;
 			goto out;
 		}
 	}
 
 	UNP_PCB_LOCK(unp);
 	if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
 		UNP_PCB_UNLOCK(unp);
 		error = ENOTCONN;
 		goto out;
 	} else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		unp_pcb_unlock_pair(unp, unp2);
 		error = EPIPE;
 		goto out;
 	}
 	UNP_PCB_UNLOCK(unp);
 	if ((so2 = unp2->unp_socket) == NULL) {
 		UNP_PCB_UNLOCK(unp2);
 		error = ENOTCONN;
 		goto out;
 	}
 	SOCKBUF_LOCK(&so2->so_rcv);
 	if (unp2->unp_flags & UNP_WANTCRED_MASK) {
 		/*
 		 * Credentials are passed only once on SOCK_STREAM and
 		 * SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
 		 * forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
 		 */
 		control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
 		    NULL, NULL);
 		unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
 	}
 
 	/*
 	 * Send to paired receive port and wake up readers.  Don't
 	 * check for space available in the receive buffer if we're
 	 * attaching ancillary data; Unix domain sockets only check
 	 * for space in the sending sockbuf, and that check is
 	 * performed one level up the stack.  At that level we cannot
 	 * precisely account for the amount of buffer space used
 	 * (e.g., because control messages are not yet internalized).
 	 */
 	switch (so->so_type) {
 	case SOCK_STREAM:
 		if (control != NULL) {
 			sbappendcontrol_locked(&so2->so_rcv, m,
 			    control, flags);
 			control = NULL;
 		} else
 			sbappend_locked(&so2->so_rcv, m, flags);
 		break;
 
 	case SOCK_SEQPACKET:
 		if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
 		    &sun_noname, m, control))
 			control = NULL;
 		break;
 	}
 
 	mbcnt = so2->so_rcv.sb_mbcnt;
 	sbcc = sbavail(&so2->so_rcv);
 	if (sbcc)
 		sorwakeup_locked(so2);
 	else
 		SOCKBUF_UNLOCK(&so2->so_rcv);
 
 	/*
 	 * The PCB lock on unp2 protects the SB_STOP flag.  Without it,
 	 * it would be possible for uipc_rcvd to be called at this
 	 * point, drain the receiving sockbuf, clear SB_STOP, and then
 	 * we would set SB_STOP below.  That could lead to an empty
 	 * sockbuf having SB_STOP set
 	 */
 	SOCKBUF_LOCK(&so->so_snd);
 	if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
 		so->so_snd.sb_flags |= SB_STOP;
 	SOCKBUF_UNLOCK(&so->so_snd);
 	UNP_PCB_UNLOCK(unp2);
 	m = NULL;
 out:
 	/*
-	 * PRUS_EOF is equivalent to pru_send followed by pru_shutdown.
+	 * PRUS_EOF is equivalent to pr_send followed by pr_shutdown.
 	 */
 	if (flags & PRUS_EOF) {
 		UNP_PCB_LOCK(unp);
 		socantsendmore(so);
 		unp_shutdown(unp);
 		UNP_PCB_UNLOCK(unp);
 	}
 	if (control != NULL && error != 0)
 		unp_scan(control, unp_freerights);
 
 release:
 	if (control != NULL)
 		m_freem(control);
 	/*
 	 * In case of PRUS_NOTREADY, uipc_ready() is responsible
 	 * for freeing memory.
 	 */   
 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 		m_freem(m);
 	return (error);
 }
 
 /* PF_UNIX/SOCK_DGRAM version of sbspace() */
 static inline bool
 uipc_dgram_sbspace(struct sockbuf *sb, u_int cc, u_int mbcnt)
 {
 	u_int bleft, mleft;
 
 	MPASS(sb->sb_hiwat >= sb->uxdg_cc);
 	MPASS(sb->sb_mbmax >= sb->uxdg_mbcnt);
 
 	if (__predict_false(sb->sb_state & SBS_CANTRCVMORE))
 		return (false);
 
 	bleft = sb->sb_hiwat - sb->uxdg_cc;
 	mleft = sb->sb_mbmax - sb->uxdg_mbcnt;
 
 	return (bleft >= cc && mleft >= mbcnt);
 }
 
 /*
  * PF_UNIX/SOCK_DGRAM send
  *
  * Allocate a record consisting of 3 mbufs in the sequence of
  * from -> control -> data and append it to the socket buffer.
  *
  * The first mbuf carries sender's name and is a pkthdr that stores
  * overall length of datagram, its memory consumption and control length.
  */
 #define	ctllen	PH_loc.thirtytwo[1]
 _Static_assert(offsetof(struct pkthdr, memlen) + sizeof(u_int) <=
     offsetof(struct pkthdr, ctllen), "unix/dgram can not store ctllen");
 static int
 uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *m, struct mbuf *c, int flags, struct thread *td)
 {
 	struct unpcb *unp, *unp2;
 	const struct sockaddr *from;
 	struct socket *so2;
 	struct sockbuf *sb;
 	struct mbuf *f, *clast;
 	u_int cc, ctl, mbcnt;
 	u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
 	int error;
 
 	MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
 
 	error = 0;
 	f = NULL;
 	ctl = 0;
 
 	if (__predict_false(flags & MSG_OOB)) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	if (m == NULL) {
 		if (__predict_false(uio->uio_resid > unpdg_maxdgram)) {
 			error = EMSGSIZE;
 			goto out;
 		}
 		m = m_uiotombuf(uio, M_WAITOK, 0, max_hdr, M_PKTHDR);
 		if (__predict_false(m == NULL)) {
 			error = EFAULT;
 			goto out;
 		}
 		f = m_gethdr(M_WAITOK, MT_SONAME);
 		cc = m->m_pkthdr.len;
 		mbcnt = MSIZE + m->m_pkthdr.memlen;
 		if (c != NULL &&
 		    (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
 			goto out;
 	} else {
-		/* pru_sosend() with mbuf usually is a kernel thread. */
+		/* pr_sosend() with mbuf usually is a kernel thread. */
 
 		M_ASSERTPKTHDR(m);
 		if (__predict_false(c != NULL))
 			panic("%s: control from a kernel thread", __func__);
 
 		if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
 			error = EMSGSIZE;
 			goto out;
 		}
 		if ((f = m_gethdr(M_NOWAIT, MT_SONAME)) == NULL) {
 			error = ENOBUFS;
 			goto out;
 		}
 		/* Condition the foreign mbuf to our standards. */
 		m_clrprotoflags(m);
 		m_tag_delete_chain(m, NULL);
 		m->m_pkthdr.rcvif = NULL;
 		m->m_pkthdr.flowid = 0;
 		m->m_pkthdr.csum_flags = 0;
 		m->m_pkthdr.fibnum = 0;
 		m->m_pkthdr.rsstype = 0;
 
 		cc = m->m_pkthdr.len;
 		mbcnt = MSIZE;
 		for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
 			mbcnt += MSIZE;
 			if (mb->m_flags & M_EXT)
 				mbcnt += mb->m_ext.ext_size;
 		}
 	}
 
 	unp = sotounpcb(so);
 	MPASS(unp);
 
 	/*
 	 * XXXGL: would be cool to fully remove so_snd out of the equation
 	 * and avoid this lock, which is not only extraneous, but also being
 	 * released, thus still leaving possibility for a race.  We can easily
 	 * handle SBS_CANTSENDMORE/SS_ISCONNECTED complement in unpcb, but it
 	 * is more difficult to invent something to handle so_error.
 	 */
 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		goto out2;
 	SOCK_SENDBUF_LOCK(so);
 	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 		SOCK_SENDBUF_UNLOCK(so);
 		error = EPIPE;
 		goto out3;
 	}
 	if (so->so_error != 0) {
 		error = so->so_error;
 		so->so_error = 0;
 		SOCK_SENDBUF_UNLOCK(so);
 		goto out3;
 	}
 	if (((so->so_state & SS_ISCONNECTED) == 0) && addr == NULL) {
 		SOCK_SENDBUF_UNLOCK(so);
 		error = EDESTADDRREQ;
 		goto out3;
 	}
 	SOCK_SENDBUF_UNLOCK(so);
 
 	if (addr != NULL) {
 		if ((error = unp_connectat(AT_FDCWD, so, addr, td, true)))
 			goto out3;
 		UNP_PCB_LOCK_ASSERT(unp);
 		unp2 = unp->unp_conn;
 		UNP_PCB_LOCK_ASSERT(unp2);
 	} else {
 		UNP_PCB_LOCK(unp);
 		unp2 = unp_pcb_lock_peer(unp);
 		if (unp2 == NULL) {
 			UNP_PCB_UNLOCK(unp);
 			error = ENOTCONN;
 			goto out3;
 		}
 	}
 
 	if (unp2->unp_flags & UNP_WANTCRED_MASK)
 		c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
 		    &mbcnt);
 	if (unp->unp_addr != NULL)
 		from = (struct sockaddr *)unp->unp_addr;
 	else
 		from = &sun_noname;
 	f->m_len = from->sa_len;
 	MPASS(from->sa_len <= MLEN);
 	bcopy(from, mtod(f, void *), from->sa_len);
 	ctl += f->m_len;
 
 	/*
 	 * Concatenate mbufs: from -> control -> data.
 	 * Save overall cc and mbcnt in "from" mbuf.
 	 */
 	if (c != NULL) {
 #ifdef INVARIANTS
 		struct mbuf *mc;
 
 		for (mc = c; mc->m_next != NULL; mc = mc->m_next);
 		MPASS(mc == clast);
 #endif
 		f->m_next = c;
 		clast->m_next = m;
 		c = NULL;
 	} else
 		f->m_next = m;
 	m = NULL;
 #ifdef INVARIANTS
 	dcc = dctl = dmbcnt = 0;
 	for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
 		if (mb->m_type == MT_DATA)
 			dcc += mb->m_len;
 		else
 			dctl += mb->m_len;
 		dmbcnt += MSIZE;
 		if (mb->m_flags & M_EXT)
 			dmbcnt += mb->m_ext.ext_size;
 	}
 	MPASS(dcc == cc);
 	MPASS(dctl == ctl);
 	MPASS(dmbcnt == mbcnt);
 #endif
 	f->m_pkthdr.len = cc + ctl;
 	f->m_pkthdr.memlen = mbcnt;
 	f->m_pkthdr.ctllen = ctl;
 
 	/*
 	 * Destination socket buffer selection.
 	 *
 	 * Unconnected sends, when !(so->so_state & SS_ISCONNECTED) and the
 	 * destination address is supplied, create a temporary connection for
 	 * the run time of the function (see call to unp_connectat() above and
 	 * to unp_disconnect() below).  We distinguish them by condition of
 	 * (addr != NULL).  We intentionally avoid adding 'bool connected' for
 	 * that condition, since, again, through the run time of this code we
 	 * are always connected.  For such "unconnected" sends, the destination
 	 * buffer would be the receive buffer of destination socket so2.
 	 *
 	 * For connected sends, data lands on the send buffer of the sender's
 	 * socket "so".  Then, if we just added the very first datagram
 	 * on this send buffer, we need to add the send buffer on to the
 	 * receiving socket's buffer list.  We put ourselves on top of the
 	 * list.  Such logic gives infrequent senders priority over frequent
 	 * senders.
 	 *
 	 * Note on byte count management. As long as event methods kevent(2),
 	 * select(2) are not protocol specific (yet), we need to maintain
 	 * meaningful values on the receive buffer.  So, the receive buffer
 	 * would accumulate counters from all connected buffers potentially
 	 * having sb_ccc > sb_hiwat or sb_mbcnt > sb_mbmax.
 	 */
 	so2 = unp2->unp_socket;
 	sb = (addr == NULL) ? &so->so_snd : &so2->so_rcv;
 	SOCK_RECVBUF_LOCK(so2);
 	if (uipc_dgram_sbspace(sb, cc + ctl, mbcnt)) {
 		if (addr == NULL && STAILQ_EMPTY(&sb->uxdg_mb))
 			TAILQ_INSERT_HEAD(&so2->so_rcv.uxdg_conns, &so->so_snd,
 			    uxdg_clist);
 		STAILQ_INSERT_TAIL(&sb->uxdg_mb, f, m_stailqpkt);
 		sb->uxdg_cc += cc + ctl;
 		sb->uxdg_ctl += ctl;
 		sb->uxdg_mbcnt += mbcnt;
 		so2->so_rcv.sb_acc += cc + ctl;
 		so2->so_rcv.sb_ccc += cc + ctl;
 		so2->so_rcv.sb_ctl += ctl;
 		so2->so_rcv.sb_mbcnt += mbcnt;
 		sorwakeup_locked(so2);
 		f = NULL;
 	} else {
 		soroverflow_locked(so2);
 		error = (so->so_state & SS_NBIO) ? EAGAIN : ENOBUFS;
 	}
 
 	if (addr != NULL)
 		unp_disconnect(unp, unp2);
 	else
 		unp_pcb_unlock_pair(unp, unp2);
 
 	td->td_ru.ru_msgsnd++;
 
 out3:
 	SOCK_IO_SEND_UNLOCK(so);
 out2:
 	if (c)
 		unp_scan(c, unp_freerights);
 out:
 	if (f)
 		m_freem(f);
 	if (c)
 		m_freem(c);
 	if (m)
 		m_freem(m);
 
 	return (error);
 }
 
 /*
  * PF_UNIX/SOCK_DGRAM receive with MSG_PEEK.
  * The mbuf has already been unlinked from the uxdg_mb of socket buffer
  * and needs to be linked onto uxdg_peeked of receive socket buffer.
  */
 static int
 uipc_peek_dgram(struct socket *so, struct mbuf *m, struct sockaddr **psa,
     struct uio *uio, struct mbuf **controlp, int *flagsp)
 {
 	ssize_t len = 0;
 	int error;
 
 	so->so_rcv.uxdg_peeked = m;
 	so->so_rcv.uxdg_cc += m->m_pkthdr.len;
 	so->so_rcv.uxdg_ctl += m->m_pkthdr.ctllen;
 	so->so_rcv.uxdg_mbcnt += m->m_pkthdr.memlen;
 	SOCK_RECVBUF_UNLOCK(so);
 
 	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
 	if (psa != NULL)
 		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
 
 	m = m->m_next;
 	KASSERT(m, ("%s: no data or control after soname", __func__));
 
 	/*
 	 * With MSG_PEEK the control isn't executed, just copied.
 	 */
 	while (m != NULL && m->m_type == MT_CONTROL) {
 		if (controlp != NULL) {
 			*controlp = m_copym(m, 0, m->m_len, M_WAITOK);
 			controlp = &(*controlp)->m_next;
 		}
 		m = m->m_next;
 	}
 	KASSERT(m == NULL || m->m_type == MT_DATA,
 	    ("%s: not MT_DATA mbuf %p", __func__, m));
 	while (m != NULL && uio->uio_resid > 0) {
 		len = uio->uio_resid;
 		if (len > m->m_len)
 			len = m->m_len;
 		error = uiomove(mtod(m, char *), (int)len, uio);
 		if (error) {
 			SOCK_IO_RECV_UNLOCK(so);
 			return (error);
 		}
 		if (len == m->m_len)
 			m = m->m_next;
 	}
 	SOCK_IO_RECV_UNLOCK(so);
 
 	if (flagsp != NULL) {
 		if (m != NULL) {
 			if (*flagsp & MSG_TRUNC) {
 				/* Report real length of the packet */
 				uio->uio_resid -= m_length(m, NULL) - len;
 			}
 			*flagsp |= MSG_TRUNC;
 		} else
 			*flagsp &= ~MSG_TRUNC;
 	}
 
 	return (0);
 }
 
 /*
  * PF_UNIX/SOCK_DGRAM receive
  */
 static int
 uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct sockbuf *sb = NULL;
 	struct mbuf *m;
 	int flags, error;
 	ssize_t len = 0;
 	bool nonblock;
 
 	MPASS(mp0 == NULL);
 
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		*controlp = NULL;
 
 	flags = flagsp != NULL ? *flagsp : 0;
 	nonblock = (so->so_state & SS_NBIO) ||
 	    (flags & (MSG_DONTWAIT | MSG_NBIO));
 
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (__predict_false(error))
 		return (error);
 
 	/*
 	 * Loop blocking while waiting for a datagram.  Prioritize connected
 	 * peers over unconnected sends.  Set sb to selected socket buffer
 	 * containing an mbuf on exit from the wait loop.  A datagram that
 	 * had already been peeked at has top priority.
 	 */
 	SOCK_RECVBUF_LOCK(so);
 	while ((m = so->so_rcv.uxdg_peeked) == NULL &&
 	    (sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) == NULL &&
 	    (m = STAILQ_FIRST(&so->so_rcv.uxdg_mb)) == NULL) {
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_IO_RECV_UNLOCK(so);
 			return (error);
 		}
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE ||
 		    uio->uio_resid == 0) {
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_IO_RECV_UNLOCK(so);
 			return (0);
 		}
 		if (nonblock) {
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_IO_RECV_UNLOCK(so);
 			return (EWOULDBLOCK);
 		}
 		error = sbwait(so, SO_RCV);
 		if (error) {
 			SOCK_RECVBUF_UNLOCK(so);
 			SOCK_IO_RECV_UNLOCK(so);
 			return (error);
 		}
 	}
 
 	if (sb == NULL)
 		sb = &so->so_rcv;
 	else if (m == NULL)
 		m = STAILQ_FIRST(&sb->uxdg_mb);
 	else
 		MPASS(m == so->so_rcv.uxdg_peeked);
 
 	MPASS(sb->uxdg_cc > 0);
 	M_ASSERTPKTHDR(m);
 	KASSERT(m->m_type == MT_SONAME, ("m->m_type == %d", m->m_type));
 
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	if (__predict_true(m != so->so_rcv.uxdg_peeked)) {
 		STAILQ_REMOVE_HEAD(&sb->uxdg_mb, m_stailqpkt);
 		if (STAILQ_EMPTY(&sb->uxdg_mb) && sb != &so->so_rcv)
 			TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
 	} else
 		so->so_rcv.uxdg_peeked = NULL;
 
 	sb->uxdg_cc -= m->m_pkthdr.len;
 	sb->uxdg_ctl -= m->m_pkthdr.ctllen;
 	sb->uxdg_mbcnt -= m->m_pkthdr.memlen;
 
 	if (__predict_false(flags & MSG_PEEK))
 		return (uipc_peek_dgram(so, m, psa, uio, controlp, flagsp));
 
 	so->so_rcv.sb_acc -= m->m_pkthdr.len;
 	so->so_rcv.sb_ccc -= m->m_pkthdr.len;
 	so->so_rcv.sb_ctl -= m->m_pkthdr.ctllen;
 	so->so_rcv.sb_mbcnt -= m->m_pkthdr.memlen;
 	SOCK_RECVBUF_UNLOCK(so);
 
 	if (psa != NULL)
 		*psa = sodupsockaddr(mtod(m, struct sockaddr *), M_WAITOK);
 	m = m_free(m);
 	KASSERT(m, ("%s: no data or control after soname", __func__));
 
 	/*
 	 * Packet to copyout() is now in 'm' and it is disconnected from the
 	 * queue.
 	 *
 	 * Process one or more MT_CONTROL mbufs present before any data mbufs
 	 * in the first mbuf chain on the socket buffer.  We call into the
 	 * unp_externalize() to perform externalization (or freeing if
 	 * controlp == NULL). In some cases there can be only MT_CONTROL mbufs
 	 * without MT_DATA mbufs.
 	 */
 	while (m != NULL && m->m_type == MT_CONTROL) {
 		struct mbuf *cm;
 
 		/* XXXGL: unp_externalize() is also dom_externalize() KBI and
 		 * it frees whole chain, so we must disconnect the mbuf.
 		 */
 		cm = m; m = m->m_next; cm->m_next = NULL;
 		error = unp_externalize(cm, controlp, flags);
 		if (error != 0) {
 			SOCK_IO_RECV_UNLOCK(so);
 			unp_scan(m, unp_freerights);
 			m_freem(m);
 			return (error);
 		}
 		if (controlp != NULL) {
 			while (*controlp != NULL)
 				controlp = &(*controlp)->m_next;
 		}
 	}
 	KASSERT(m == NULL || m->m_type == MT_DATA,
 	    ("%s: not MT_DATA mbuf %p", __func__, m));
 	while (m != NULL && uio->uio_resid > 0) {
 		len = uio->uio_resid;
 		if (len > m->m_len)
 			len = m->m_len;
 		error = uiomove(mtod(m, char *), (int)len, uio);
 		if (error) {
 			SOCK_IO_RECV_UNLOCK(so);
 			m_freem(m);
 			return (error);
 		}
 		if (len == m->m_len)
 			m = m_free(m);
 		else {
 			m->m_data += len;
 			m->m_len -= len;
 		}
 	}
 	SOCK_IO_RECV_UNLOCK(so);
 
 	if (m != NULL) {
 		if (flagsp != NULL) {
 			if (flags & MSG_TRUNC) {
 				/* Report real length of the packet */
 				uio->uio_resid -= m_length(m, NULL);
 			}
 			*flagsp |= MSG_TRUNC;
 		}
 		m_freem(m);
 	} else if (flagsp != NULL)
 		*flagsp &= ~MSG_TRUNC;
 
 	return (0);
 }
 
 static bool
 uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
 {
 	struct mbuf *mb, *n;
 	struct sockbuf *sb;
 
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		SOCK_UNLOCK(so);
 		return (false);
 	}
 	mb = NULL;
 	sb = &so->so_rcv;
 	SOCKBUF_LOCK(sb);
 	if (sb->sb_fnrdy != NULL) {
 		for (mb = sb->sb_mb, n = mb->m_nextpkt; mb != NULL;) {
 			if (mb == m) {
 				*errorp = sbready(sb, m, count);
 				break;
 			}
 			mb = mb->m_next;
 			if (mb == NULL) {
 				mb = n;
 				if (mb != NULL)
 					n = mb->m_nextpkt;
 			}
 		}
 	}
 	SOCKBUF_UNLOCK(sb);
 	SOCK_UNLOCK(so);
 	return (mb != NULL);
 }
 
 static int
 uipc_ready(struct socket *so, struct mbuf *m, int count)
 {
 	struct unpcb *unp, *unp2;
 	struct socket *so2;
 	int error, i;
 
 	unp = sotounpcb(so);
 
 	KASSERT(so->so_type == SOCK_STREAM,
 	    ("%s: unexpected socket type for %p", __func__, so));
 
 	UNP_PCB_LOCK(unp);
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
 		UNP_PCB_UNLOCK(unp);
 		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if ((error = sbready(&so2->so_rcv, m, count)) == 0)
 			sorwakeup_locked(so2);
 		else
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 		UNP_PCB_UNLOCK(unp2);
 		return (error);
 	}
 	UNP_PCB_UNLOCK(unp);
 
 	/*
 	 * The receiving socket has been disconnected, but may still be valid.
 	 * In this case, the now-ready mbufs are still present in its socket
 	 * buffer, so perform an exhaustive search before giving up and freeing
 	 * the mbufs.
 	 */
 	UNP_LINK_RLOCK();
 	LIST_FOREACH(unp, &unp_shead, unp_link) {
 		if (uipc_ready_scan(unp->unp_socket, m, count, &error))
 			break;
 	}
 	UNP_LINK_RUNLOCK();
 
 	if (unp == NULL) {
 		for (i = 0; i < count; i++)
 			m = m_free(m);
 		error = ECONNRESET;
 	}
 	return (error);
 }
 
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
 	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
 
 	sb->st_blksize = so->so_snd.sb_hiwat;
 	sb->st_dev = NODEV;
 	sb->st_ino = unp->unp_ino;
 	return (0);
 }
 
 static int
 uipc_shutdown(struct socket *so)
 {
 	struct unpcb *unp;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
 
 	UNP_PCB_LOCK(unp);
 	socantsendmore(so);
 	unp_shutdown(unp);
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 static int
 uipc_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct unpcb *unp;
 	const struct sockaddr *sa;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
 
 	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_addr != NULL)
 		sa = (struct sockaddr *) unp->unp_addr;
 	else
 		sa = &sun_noname;
 	bcopy(sa, *nam, sa->sa_len);
 	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
-static struct pr_usrreqs uipc_usrreqs_dgram = {
-	.pru_abort = 		uipc_abort,
-	.pru_accept =		uipc_accept,
-	.pru_attach =		uipc_attach,
-	.pru_bind =		uipc_bind,
-	.pru_bindat =		uipc_bindat,
-	.pru_connect =		uipc_connect,
-	.pru_connectat =	uipc_connectat,
-	.pru_connect2 =		uipc_connect2,
-	.pru_detach =		uipc_detach,
-	.pru_disconnect =	uipc_disconnect,
-	.pru_peeraddr =		uipc_peeraddr,
-	.pru_sosend =		uipc_sosend_dgram,
-	.pru_sense =		uipc_sense,
-	.pru_shutdown =		uipc_shutdown,
-	.pru_sockaddr =		uipc_sockaddr,
-	.pru_soreceive =	uipc_soreceive_dgram,
-	.pru_close =		uipc_close,
-};
-
-static struct pr_usrreqs uipc_usrreqs_seqpacket = {
-	.pru_abort =		uipc_abort,
-	.pru_accept =		uipc_accept,
-	.pru_attach =		uipc_attach,
-	.pru_bind =		uipc_bind,
-	.pru_bindat =		uipc_bindat,
-	.pru_connect =		uipc_connect,
-	.pru_connectat =	uipc_connectat,
-	.pru_connect2 =		uipc_connect2,
-	.pru_detach =		uipc_detach,
-	.pru_disconnect =	uipc_disconnect,
-	.pru_listen =		uipc_listen,
-	.pru_peeraddr =		uipc_peeraddr,
-	.pru_rcvd =		uipc_rcvd,
-	.pru_send =		uipc_send,
-	.pru_sense =		uipc_sense,
-	.pru_shutdown =		uipc_shutdown,
-	.pru_sockaddr =		uipc_sockaddr,
-	.pru_soreceive =	soreceive_generic,	/* XXX: or...? */
-	.pru_close =		uipc_close,
-};
-
-static struct pr_usrreqs uipc_usrreqs_stream = {
-	.pru_abort = 		uipc_abort,
-	.pru_accept =		uipc_accept,
-	.pru_attach =		uipc_attach,
-	.pru_bind =		uipc_bind,
-	.pru_bindat =		uipc_bindat,
-	.pru_connect =		uipc_connect,
-	.pru_connectat =	uipc_connectat,
-	.pru_connect2 =		uipc_connect2,
-	.pru_detach =		uipc_detach,
-	.pru_disconnect =	uipc_disconnect,
-	.pru_listen =		uipc_listen,
-	.pru_peeraddr =		uipc_peeraddr,
-	.pru_rcvd =		uipc_rcvd,
-	.pru_send =		uipc_send,
-	.pru_ready =		uipc_ready,
-	.pru_sense =		uipc_sense,
-	.pru_shutdown =		uipc_shutdown,
-	.pru_sockaddr =		uipc_sockaddr,
-	.pru_soreceive =	soreceive_generic,
-	.pru_close =		uipc_close,
-};
-
 static int
 uipc_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct unpcb *unp;
 	struct xucred xu;
 	int error, optval;
 
 	if (sopt->sopt_level != SOL_LOCAL)
 		return (EINVAL);
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case LOCAL_PEERCRED:
 			UNP_PCB_LOCK(unp);
 			if (unp->unp_flags & UNP_HAVEPC)
 				xu = unp->unp_peercred;
 			else {
 				if (so->so_type == SOCK_STREAM)
 					error = ENOTCONN;
 				else
 					error = EINVAL;
 			}
 			UNP_PCB_UNLOCK(unp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &xu, sizeof(xu));
 			break;
 
 		case LOCAL_CREDS:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_WANTCRED_ONESHOT ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		case LOCAL_CREDS_PERSISTENT:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_WANTCRED_ALWAYS ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		case LOCAL_CONNWAIT:
 			/* Unlocked read. */
 			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 
 		default:
 			error = EOPNOTSUPP;
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case LOCAL_CREDS:
 		case LOCAL_CREDS_PERSISTENT:
 		case LOCAL_CONNWAIT:
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 					    sizeof(optval));
 			if (error)
 				break;
 
 #define	OPTSET(bit, exclusive) do {					\
 	UNP_PCB_LOCK(unp);						\
 	if (optval) {							\
 		if ((unp->unp_flags & (exclusive)) != 0) {		\
 			UNP_PCB_UNLOCK(unp);				\
 			error = EINVAL;					\
 			break;						\
 		}							\
 		unp->unp_flags |= (bit);				\
 	} else								\
 		unp->unp_flags &= ~(bit);				\
 	UNP_PCB_UNLOCK(unp);						\
 } while (0)
 
 			switch (sopt->sopt_name) {
 			case LOCAL_CREDS:
 				OPTSET(UNP_WANTCRED_ONESHOT, UNP_WANTCRED_ALWAYS);
 				break;
 
 			case LOCAL_CREDS_PERSISTENT:
 				OPTSET(UNP_WANTCRED_ALWAYS, UNP_WANTCRED_ONESHOT);
 				break;
 
 			case LOCAL_CONNWAIT:
 				OPTSET(UNP_CONNWAIT, 0);
 				break;
 
 			default:
 				break;
 			}
 			break;
 #undef	OPTSET
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 static int
 unp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 
 	return (unp_connectat(AT_FDCWD, so, nam, td, false));
 }
 
 static int
 unp_connectat(int fd, struct socket *so, struct sockaddr *nam,
     struct thread *td, bool return_locked)
 {
 	struct mtx *vplock;
 	struct sockaddr_un *soun;
 	struct vnode *vp;
 	struct socket *so2;
 	struct unpcb *unp, *unp2, *unp3;
 	struct nameidata nd;
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
 	cap_rights_t rights;
 	int error, len;
 	bool connreq;
 
 	if (nam->sa_family != AF_UNIX)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len > sizeof(struct sockaddr_un))
 		return (EINVAL);
 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
 	if (len <= 0)
 		return (EINVAL);
 	soun = (struct sockaddr_un *)nam;
 	bcopy(soun->sun_path, buf, len);
 	buf[len] = 0;
 
 	error = 0;
 	unp = sotounpcb(so);
 	UNP_PCB_LOCK(unp);
 	for (;;) {
 		/*
 		 * Wait for connection state to stabilize.  If a connection
 		 * already exists, give up.  For datagram sockets, which permit
 		 * multiple consecutive connect(2) calls, upper layers are
 		 * responsible for disconnecting in advance of a subsequent
 		 * connect(2), but this is not synchronized with PCB connection
 		 * state.
 		 *
 		 * Also make sure that no threads are currently attempting to
 		 * lock the peer socket, to ensure that unp_conn cannot
 		 * transition between two valid sockets while locks are dropped.
 		 */
 		if (SOLISTENING(so))
 			error = EOPNOTSUPP;
 		else if (unp->unp_conn != NULL)
 			error = EISCONN;
 		else if ((unp->unp_flags & UNP_CONNECTING) != 0) {
 			error = EALREADY;
 		}
 		if (error != 0) {
 			UNP_PCB_UNLOCK(unp);
 			return (error);
 		}
 		if (unp->unp_pairbusy > 0) {
 			unp->unp_flags |= UNP_WAITING;
 			mtx_sleep(unp, UNP_PCB_LOCKPTR(unp), 0, "unpeer", 0);
 			continue;
 		}
 		break;
 	}
 	unp->unp_flags |= UNP_CONNECTING;
 	UNP_PCB_UNLOCK(unp);
 
 	connreq = (so->so_proto->pr_flags & PR_CONNREQUIRED) != 0;
 	if (connreq)
 		sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
 	else
 		sa = NULL;
 	NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
 	    UIO_SYSSPACE, buf, fd, cap_rights_init_one(&rights, CAP_CONNECTAT));
 	error = namei(&nd);
 	if (error)
 		vp = NULL;
 	else
 		vp = nd.ni_vp;
 	ASSERT_VOP_LOCKED(vp, "unp_connect");
 	NDFREE_NOTHING(&nd);
 	if (error)
 		goto bad;
 
 	if (vp->v_type != VSOCK) {
 		error = ENOTSOCK;
 		goto bad;
 	}
 #ifdef MAC
 	error = mac_vnode_check_open(td->td_ucred, vp, VWRITE | VREAD);
 	if (error)
 		goto bad;
 #endif
 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
 	if (error)
 		goto bad;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
 	vplock = mtx_pool_find(mtxpool_sleep, vp);
 	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp2);
 	if (unp2 == NULL) {
 		error = ECONNREFUSED;
 		goto bad2;
 	}
 	so2 = unp2->unp_socket;
 	if (so->so_type != so2->so_type) {
 		error = EPROTOTYPE;
 		goto bad2;
 	}
 	if (connreq) {
 		if (SOLISTENING(so2)) {
 			CURVNET_SET(so2->so_vnet);
 			so2 = sonewconn(so2, 0);
 			CURVNET_RESTORE();
 		} else
 			so2 = NULL;
 		if (so2 == NULL) {
 			error = ECONNREFUSED;
 			goto bad2;
 		}
 		unp3 = sotounpcb(so2);
 		unp_pcb_lock_pair(unp2, unp3);
 		if (unp2->unp_addr != NULL) {
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
 			unp3->unp_addr = (struct sockaddr_un *) sa;
 			sa = NULL;
 		}
 
 		unp_copy_peercred(td, unp3, unp, unp2);
 
 		UNP_PCB_UNLOCK(unp2);
 		unp2 = unp3;
 
 		/*
 		 * It is safe to block on the PCB lock here since unp2 is
 		 * nascent and cannot be connected to any other sockets.
 		 */
 		UNP_PCB_LOCK(unp);
 #ifdef MAC
 		mac_socketpeer_set_from_socket(so, so2);
 		mac_socketpeer_set_from_socket(so2, so);
 #endif
 	} else {
 		unp_pcb_lock_pair(unp, unp2);
 	}
 	KASSERT(unp2 != NULL && so2 != NULL && unp2->unp_socket == so2 &&
 	    sotounpcb(so2) == unp2,
 	    ("%s: unp2 %p so2 %p", __func__, unp2, so2));
 	unp_connect2(so, so2, PRU_CONNECT);
 	KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
 	    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
 	unp->unp_flags &= ~UNP_CONNECTING;
 	if (!return_locked)
 		unp_pcb_unlock_pair(unp, unp2);
 bad2:
 	mtx_unlock(vplock);
 bad:
 	if (vp != NULL) {
 		/*
 		 * If we are returning locked (called via uipc_sosend_dgram()),
 		 * we need to be sure that vput() won't sleep.  This is
 		 * guaranteed by VOP_UNP_CONNECT() call above and unp2 lock.
 		 * SOCK_STREAM/SEQPACKET can't request return_locked (yet).
 		 */
 		MPASS(!(return_locked && connreq));
 		vput(vp);
 	}
 	free(sa, M_SONAME);
 	if (__predict_false(error)) {
 		UNP_PCB_LOCK(unp);
 		KASSERT((unp->unp_flags & UNP_CONNECTING) != 0,
 		    ("%s: unp %p has UNP_CONNECTING clear", __func__, unp));
 		unp->unp_flags &= ~UNP_CONNECTING;
 		UNP_PCB_UNLOCK(unp);
 	}
 	return (error);
 }
 
 /*
  * Set socket peer credentials at connection time.
  *
  * The client's PCB credentials are copied from its process structure.  The
  * server's PCB credentials are copied from the socket on which it called
  * listen(2).  uipc_listen cached that process's credentials at the time.
  */
 void
 unp_copy_peercred(struct thread *td, struct unpcb *client_unp,
     struct unpcb *server_unp, struct unpcb *listen_unp)
 {
 	cru2xt(td, &client_unp->unp_peercred);
 	client_unp->unp_flags |= UNP_HAVEPC;
 
 	memcpy(&server_unp->unp_peercred, &listen_unp->unp_peercred,
 	    sizeof(server_unp->unp_peercred));
 	server_unp->unp_flags |= UNP_HAVEPC;
 	client_unp->unp_flags |= (listen_unp->unp_flags & UNP_WANTCRED_MASK);
 }
 
 static void
 unp_connect2(struct socket *so, struct socket *so2, conn2_how req)
 {
 	struct unpcb *unp;
 	struct unpcb *unp2;
 
 	MPASS(so2->so_type == so->so_type);
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
 	unp2 = sotounpcb(so2);
 	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 	KASSERT(unp->unp_conn == NULL,
 	    ("%s: socket %p is already connected", __func__, unp));
 
 	unp->unp_conn = unp2;
 	unp_pcb_hold(unp2);
 	unp_pcb_hold(unp);
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 		UNP_REF_LIST_LOCK();
 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
 		UNP_REF_LIST_UNLOCK();
 		soisconnected(so);
 		break;
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		KASSERT(unp2->unp_conn == NULL,
 		    ("%s: socket %p is already connected", __func__, unp2));
 		unp2->unp_conn = unp;
 		if (req == PRU_CONNECT &&
 		    ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT))
 			soisconnecting(so);
 		else
 			soisconnected(so);
 		soisconnected(so2);
 		break;
 
 	default:
 		panic("unp_connect2");
 	}
 }
 
 static void
 unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
 	struct socket *so, *so2;
 	struct mbuf *m = NULL;
 #ifdef INVARIANTS
 	struct unpcb *unptmp;
 #endif
 
 	UNP_PCB_LOCK_ASSERT(unp);
 	UNP_PCB_LOCK_ASSERT(unp2);
 	KASSERT(unp->unp_conn == unp2,
 	    ("%s: unpcb %p is not connected to %p", __func__, unp, unp2));
 
 	unp->unp_conn = NULL;
 	so = unp->unp_socket;
 	so2 = unp2->unp_socket;
 	switch (unp->unp_socket->so_type) {
 	case SOCK_DGRAM:
 		/*
 		 * Remove our send socket buffer from the peer's receive buffer.
 		 * Move the data to the receive buffer only if it is empty.
 		 * This is a protection against a scenario where a peer
 		 * connects, floods and disconnects, effectively blocking
 		 * sendto() from unconnected sockets.
 		 */
 		SOCK_RECVBUF_LOCK(so2);
 		if (!STAILQ_EMPTY(&so->so_snd.uxdg_mb)) {
 			TAILQ_REMOVE(&so2->so_rcv.uxdg_conns, &so->so_snd,
 			    uxdg_clist);
 			if (__predict_true((so2->so_rcv.sb_state &
 			    SBS_CANTRCVMORE) == 0) &&
 			    STAILQ_EMPTY(&so2->so_rcv.uxdg_mb)) {
 				STAILQ_CONCAT(&so2->so_rcv.uxdg_mb,
 				    &so->so_snd.uxdg_mb);
 				so2->so_rcv.uxdg_cc += so->so_snd.uxdg_cc;
 				so2->so_rcv.uxdg_ctl += so->so_snd.uxdg_ctl;
 				so2->so_rcv.uxdg_mbcnt += so->so_snd.uxdg_mbcnt;
 			} else {
 				m = STAILQ_FIRST(&so->so_snd.uxdg_mb);
 				STAILQ_INIT(&so->so_snd.uxdg_mb);
 				so2->so_rcv.sb_acc -= so->so_snd.uxdg_cc;
 				so2->so_rcv.sb_ccc -= so->so_snd.uxdg_cc;
 				so2->so_rcv.sb_ctl -= so->so_snd.uxdg_ctl;
 				so2->so_rcv.sb_mbcnt -= so->so_snd.uxdg_mbcnt;
 			}
 			/* Note: so may reconnect. */
 			so->so_snd.uxdg_cc = 0;
 			so->so_snd.uxdg_ctl = 0;
 			so->so_snd.uxdg_mbcnt = 0;
 		}
 		SOCK_RECVBUF_UNLOCK(so2);
 		UNP_REF_LIST_LOCK();
 #ifdef INVARIANTS
 		LIST_FOREACH(unptmp, &unp2->unp_refs, unp_reflink) {
 			if (unptmp == unp)
 				break;
 		}
 		KASSERT(unptmp != NULL,
 		    ("%s: %p not found in reflist of %p", __func__, unp, unp2));
 #endif
 		LIST_REMOVE(unp, unp_reflink);
 		UNP_REF_LIST_UNLOCK();
 		if (so) {
 			SOCK_LOCK(so);
 			so->so_state &= ~SS_ISCONNECTED;
 			SOCK_UNLOCK(so);
 		}
 		break;
 
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		if (so)
 			soisdisconnected(so);
 		MPASS(unp2->unp_conn == unp);
 		unp2->unp_conn = NULL;
 		if (so2)
 			soisdisconnected(so2);
 		break;
 	}
 
 	if (unp == unp2) {
 		unp_pcb_rele_notlast(unp);
 		if (!unp_pcb_rele(unp))
 			UNP_PCB_UNLOCK(unp);
 	} else {
 		if (!unp_pcb_rele(unp))
 			UNP_PCB_UNLOCK(unp);
 		if (!unp_pcb_rele(unp2))
 			UNP_PCB_UNLOCK(unp2);
 	}
 
 	if (m != NULL) {
 		unp_scan(m, unp_freerights);
 		m_freem(m);
 	}
 }
 
 /*
  * unp_pcblist() walks the global list of struct unpcb's to generate a
  * pointer list, bumping the refcount on each unpcb.  It then copies them out
  * sequentially, validating the generation number on each to see if it has
  * been detached.  All of this is necessary because copyout() may sleep on
  * disk I/O.
  */
 static int
 unp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct unpcb *unp, **unp_list;
 	unp_gen_t gencnt;
 	struct xunpgen *xug;
 	struct unp_head *head;
 	struct xunpcb *xu;
 	u_int i;
 	int error, n;
 
 	switch ((intptr_t)arg1) {
 	case SOCK_STREAM:
 		head = &unp_shead;
 		break;
 
 	case SOCK_DGRAM:
 		head = &unp_dhead;
 		break;
 
 	case SOCK_SEQPACKET:
 		head = &unp_sphead;
 		break;
 
 	default:
 		panic("unp_pcblist: arg1 %d", (int)(intptr_t)arg1);
 	}
 
 	/*
 	 * The process of preparing the PCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		n = unp_count;
 		req->oldidx = 2 * (sizeof *xug)
 			+ (n + n/8) * sizeof(struct xunpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK | M_ZERO);
 	UNP_LINK_RLOCK();
 	gencnt = unp_gencnt;
 	n = unp_count;
 	UNP_LINK_RUNLOCK();
 
 	xug->xug_len = sizeof *xug;
 	xug->xug_count = n;
 	xug->xug_gen = gencnt;
 	xug->xug_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, xug, sizeof *xug);
 	if (error) {
 		free(xug, M_TEMP);
 		return (error);
 	}
 
 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
 
 	UNP_LINK_RLOCK();
 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
 	     unp = LIST_NEXT(unp, unp_link)) {
 		UNP_PCB_LOCK(unp);
 		if (unp->unp_gencnt <= gencnt) {
 			if (cr_cansee(req->td->td_ucred,
 			    unp->unp_socket->so_cred)) {
 				UNP_PCB_UNLOCK(unp);
 				continue;
 			}
 			unp_list[i++] = unp;
 			unp_pcb_hold(unp);
 		}
 		UNP_PCB_UNLOCK(unp);
 	}
 	UNP_LINK_RUNLOCK();
 	n = i;			/* In case we lost some during malloc. */
 
 	error = 0;
 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
 	for (i = 0; i < n; i++) {
 		unp = unp_list[i];
 		UNP_PCB_LOCK(unp);
 		if (unp_pcb_rele(unp))
 			continue;
 
 		if (unp->unp_gencnt <= gencnt) {
 			xu->xu_len = sizeof *xu;
 			xu->xu_unpp = (uintptr_t)unp;
 			/*
 			 * XXX - need more locking here to protect against
 			 * connect/disconnect races for SMP.
 			 */
 			if (unp->unp_addr != NULL)
 				bcopy(unp->unp_addr, &xu->xu_addr,
 				      unp->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_addr, sizeof(xu->xu_addr));
 			if (unp->unp_conn != NULL &&
 			    unp->unp_conn->unp_addr != NULL)
 				bcopy(unp->unp_conn->unp_addr,
 				      &xu->xu_caddr,
 				      unp->unp_conn->unp_addr->sun_len);
 			else
 				bzero(&xu->xu_caddr, sizeof(xu->xu_caddr));
 			xu->unp_vnode = (uintptr_t)unp->unp_vnode;
 			xu->unp_conn = (uintptr_t)unp->unp_conn;
 			xu->xu_firstref = (uintptr_t)LIST_FIRST(&unp->unp_refs);
 			xu->xu_nextref = (uintptr_t)LIST_NEXT(unp, unp_reflink);
 			xu->unp_gencnt = unp->unp_gencnt;
 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
 			UNP_PCB_UNLOCK(unp);
 			error = SYSCTL_OUT(req, xu, sizeof *xu);
 		} else {
 			UNP_PCB_UNLOCK(unp);
 		}
 	}
 	free(xu, M_TEMP);
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		xug->xug_gen = unp_gencnt;
 		xug->xug_sogen = so_gencnt;
 		xug->xug_count = unp_count;
 		error = SYSCTL_OUT(req, xug, sizeof *xug);
 	}
 	free(unp_list, M_TEMP);
 	free(xug, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local datagram sockets");
 SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb",
     "List of active local stream sockets");
 SYSCTL_PROC(_net_local_seqpacket, OID_AUTO, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     (void *)(intptr_t)SOCK_SEQPACKET, 0, unp_pcblist, "S,xunpcb",
     "List of active local seqpacket sockets");
 
 static void
 unp_shutdown(struct unpcb *unp)
 {
 	struct unpcb *unp2;
 	struct socket *so;
 
 	UNP_PCB_LOCK_ASSERT(unp);
 
 	unp2 = unp->unp_conn;
 	if ((unp->unp_socket->so_type == SOCK_STREAM ||
 	    (unp->unp_socket->so_type == SOCK_SEQPACKET)) && unp2 != NULL) {
 		so = unp2->unp_socket;
 		if (so != NULL)
 			socantrcvmore(so);
 	}
 }
 
 static void
 unp_drop(struct unpcb *unp)
 {
 	struct socket *so;
 	struct unpcb *unp2;
 
 	/*
 	 * Regardless of whether the socket's peer dropped the connection
 	 * with this socket by aborting or disconnecting, POSIX requires
 	 * that ECONNRESET is returned.
 	 */
 
 	UNP_PCB_LOCK(unp);
 	so = unp->unp_socket;
 	if (so)
 		so->so_error = ECONNRESET;
 	if ((unp2 = unp_pcb_lock_peer(unp)) != NULL) {
 		/* Last reference dropped in unp_disconnect(). */
 		unp_pcb_rele_notlast(unp);
 		unp_disconnect(unp, unp2);
 	} else if (!unp_pcb_rele(unp)) {
 		UNP_PCB_UNLOCK(unp);
 	}
 }
 
 static void
 unp_freerights(struct filedescent **fdep, int fdcount)
 {
 	struct file *fp;
 	int i;
 
 	KASSERT(fdcount > 0, ("%s: fdcount %d", __func__, fdcount));
 
 	for (i = 0; i < fdcount; i++) {
 		fp = fdep[i]->fde_file;
 		filecaps_free(&fdep[i]->fde_caps);
 		unp_discard(fp);
 	}
 	free(fdep[0], M_FILECAPS);
 }
 
 static int
 unp_externalize(struct mbuf *control, struct mbuf **controlp, int flags)
 {
 	struct thread *td = curthread;		/* XXX */
 	struct cmsghdr *cm = mtod(control, struct cmsghdr *);
 	int i;
 	int *fdp;
 	struct filedesc *fdesc = td->td_proc->p_fd;
 	struct filedescent **fdep;
 	void *data;
 	socklen_t clen = control->m_len, datalen;
 	int error, newfds;
 	u_int newlen;
 
 	UNP_LINK_UNLOCK_ASSERT();
 
 	error = 0;
 	if (controlp != NULL) /* controlp == NULL => free control messages */
 		*controlp = NULL;
 	while (cm != NULL) {
 		MPASS(clen >= sizeof(*cm) && clen >= cm->cmsg_len);
 
 		data = CMSG_DATA(cm);
 		datalen = (caddr_t)cm + cm->cmsg_len - (caddr_t)data;
 		if (cm->cmsg_level == SOL_SOCKET
 		    && cm->cmsg_type == SCM_RIGHTS) {
 			newfds = datalen / sizeof(*fdep);
 			if (newfds == 0)
 				goto next;
 			fdep = data;
 
 			/* If we're not outputting the descriptors free them. */
 			if (error || controlp == NULL) {
 				unp_freerights(fdep, newfds);
 				goto next;
 			}
 			FILEDESC_XLOCK(fdesc);
 
 			/*
 			 * Now change each pointer to an fd in the global
 			 * table to an integer that is the index to the local
 			 * fd table entry that we set up to point to the
 			 * global one we are transferring.
 			 */
 			newlen = newfds * sizeof(int);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
 
 			fdp = (int *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			if ((error = fdallocn(td, 0, fdp, newfds))) {
 				FILEDESC_XUNLOCK(fdesc);
 				unp_freerights(fdep, newfds);
 				m_freem(*controlp);
 				*controlp = NULL;
 				goto next;
 			}
 			for (i = 0; i < newfds; i++, fdp++) {
 				_finstall(fdesc, fdep[i]->fde_file, *fdp,
 				    (flags & MSG_CMSG_CLOEXEC) != 0 ? O_CLOEXEC : 0,
 				    &fdep[i]->fde_caps);
 				unp_externalize_fp(fdep[i]->fde_file);
 			}
 
 			/*
 			 * The new type indicates that the mbuf data refers to
 			 * kernel resources that may need to be released before
 			 * the mbuf is freed.
 			 */
 			m_chtype(*controlp, MT_EXTCONTROL);
 			FILEDESC_XUNLOCK(fdesc);
 			free(fdep[0], M_FILECAPS);
 		} else {
 			/* We can just copy anything else across. */
 			if (error || controlp == NULL)
 				goto next;
 			*controlp = sbcreatecontrol(NULL, datalen,
 			    cm->cmsg_type, cm->cmsg_level, M_WAITOK);
 			bcopy(data,
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *)),
 			    datalen);
 		}
 		controlp = &(*controlp)->m_next;
 
 next:
 		if (CMSG_SPACE(datalen) < clen) {
 			clen -= CMSG_SPACE(datalen);
 			cm = (struct cmsghdr *)
 			    ((caddr_t)cm + CMSG_SPACE(datalen));
 		} else {
 			clen = 0;
 			cm = NULL;
 		}
 	}
 
 	m_freem(control);
 	return (error);
 }
 
 static void
 unp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(unp_zone, maxsockets);
 }
 
 #ifdef INVARIANTS
 static void
 unp_zdtor(void *mem, int size __unused, void *arg __unused)
 {
 	struct unpcb *unp;
 
 	unp = mem;
 
 	KASSERT(LIST_EMPTY(&unp->unp_refs),
 	    ("%s: unpcb %p has lingering refs", __func__, unp));
 	KASSERT(unp->unp_socket == NULL,
 	    ("%s: unpcb %p has socket backpointer", __func__, unp));
 	KASSERT(unp->unp_vnode == NULL,
 	    ("%s: unpcb %p has vnode references", __func__, unp));
 	KASSERT(unp->unp_conn == NULL,
 	    ("%s: unpcb %p is still connected", __func__, unp));
 	KASSERT(unp->unp_addr == NULL,
 	    ("%s: unpcb %p has leaked addr", __func__, unp));
 }
 #endif
 
 static void
 unp_init(void *arg __unused)
 {
 	uma_dtor dtor;
 
 #ifdef INVARIANTS
 	dtor = unp_zdtor;
 #else
 	dtor = NULL;
 #endif
 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, dtor,
 	    NULL, NULL, UMA_ALIGN_CACHE, 0);
 	uma_zone_set_max(unp_zone, maxsockets);
 	uma_zone_set_warning(unp_zone, "kern.ipc.maxsockets limit reached");
 	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
 	    NULL, EVENTHANDLER_PRI_ANY);
 	LIST_INIT(&unp_dhead);
 	LIST_INIT(&unp_shead);
 	LIST_INIT(&unp_sphead);
 	SLIST_INIT(&unp_defers);
 	TIMEOUT_TASK_INIT(taskqueue_thread, &unp_gc_task, 0, unp_gc, NULL);
 	TASK_INIT(&unp_defer_task, 0, unp_process_defers, NULL);
 	UNP_LINK_LOCK_INIT();
 	UNP_DEFERRED_LOCK_INIT();
 }
 SYSINIT(unp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, unp_init, NULL);
 
 static void
 unp_internalize_cleanup_rights(struct mbuf *control)
 {
 	struct cmsghdr *cp;
 	struct mbuf *m;
 	void *data;
 	socklen_t datalen;
 
 	for (m = control; m != NULL; m = m->m_next) {
 		cp = mtod(m, struct cmsghdr *);
 		if (cp->cmsg_level != SOL_SOCKET ||
 		    cp->cmsg_type != SCM_RIGHTS)
 			continue;
 		data = CMSG_DATA(cp);
 		datalen = (caddr_t)cp + cp->cmsg_len - (caddr_t)data;
 		unp_freerights(data, datalen / sizeof(struct filedesc *));
 	}
 }
 
 static int
 unp_internalize(struct mbuf **controlp, struct thread *td,
     struct mbuf **clast, u_int *space, u_int *mbcnt)
 {
 	struct mbuf *control, **initial_controlp;
 	struct proc *p;
 	struct filedesc *fdesc;
 	struct bintime *bt;
 	struct cmsghdr *cm;
 	struct cmsgcred *cmcred;
 	struct filedescent *fde, **fdep, *fdev;
 	struct file *fp;
 	struct timeval *tv;
 	struct timespec *ts;
 	void *data;
 	socklen_t clen, datalen;
 	int i, j, error, *fdp, oldfds;
 	u_int newlen;
 
 	MPASS((*controlp)->m_next == NULL); /* COMPAT_OLDSOCK may violate */
 	UNP_LINK_UNLOCK_ASSERT();
 
 	p = td->td_proc;
 	fdesc = p->p_fd;
 	error = 0;
 	control = *controlp;
 	*controlp = NULL;
 	initial_controlp = controlp;
 	for (clen = control->m_len, cm = mtod(control, struct cmsghdr *),
 	    data = CMSG_DATA(cm);
 
 	    clen >= sizeof(*cm) && cm->cmsg_level == SOL_SOCKET &&
 	    clen >= cm->cmsg_len && cm->cmsg_len >= sizeof(*cm) &&
 	    (char *)cm + cm->cmsg_len >= (char *)data;
 
 	    clen -= min(CMSG_SPACE(datalen), clen),
 	    cm = (struct cmsghdr *) ((char *)cm + CMSG_SPACE(datalen)),
 	    data = CMSG_DATA(cm)) {
 		datalen = (char *)cm + cm->cmsg_len - (char *)data;
 		switch (cm->cmsg_type) {
 		case SCM_CREDS:
 			*controlp = sbcreatecontrol(NULL, sizeof(*cmcred),
 			    SCM_CREDS, SOL_SOCKET, M_WAITOK);
 			cmcred = (struct cmsgcred *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			cmcred->cmcred_pid = p->p_pid;
 			cmcred->cmcred_uid = td->td_ucred->cr_ruid;
 			cmcred->cmcred_gid = td->td_ucred->cr_rgid;
 			cmcred->cmcred_euid = td->td_ucred->cr_uid;
 			cmcred->cmcred_ngroups = MIN(td->td_ucred->cr_ngroups,
 			    CMGROUP_MAX);
 			for (i = 0; i < cmcred->cmcred_ngroups; i++)
 				cmcred->cmcred_groups[i] =
 				    td->td_ucred->cr_groups[i];
 			break;
 
 		case SCM_RIGHTS:
 			oldfds = datalen / sizeof (int);
 			if (oldfds == 0)
 				continue;
 			/* On some machines sizeof pointer is bigger than
 			 * sizeof int, so we need to check if data fits into
 			 * single mbuf.  We could allocate several mbufs, and
 			 * unp_externalize() should even properly handle that.
 			 * But it is not worth to complicate the code for an
 			 * insane scenario of passing over 200 file descriptors
 			 * at once.
 			 */
 			newlen = oldfds * sizeof(fdep[0]);
 			if (CMSG_SPACE(newlen) > MCLBYTES) {
 				error = EMSGSIZE;
 				goto out;
 			}
 			/*
 			 * Check that all the FDs passed in refer to legal
 			 * files.  If not, reject the entire operation.
 			 */
 			fdp = data;
 			FILEDESC_SLOCK(fdesc);
 			for (i = 0; i < oldfds; i++, fdp++) {
 				fp = fget_noref(fdesc, *fdp);
 				if (fp == NULL) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
 					FILEDESC_SUNLOCK(fdesc);
 					error = EOPNOTSUPP;
 					goto out;
 				}
 			}
 
 			/*
 			 * Now replace the integer FDs with pointers to the
 			 * file structure and capability rights.
 			 */
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET, M_WAITOK);
 			fdp = data;
 			for (i = 0; i < oldfds; i++, fdp++) {
 				if (!fhold(fdesc->fd_ofiles[*fdp].fde_file)) {
 					fdp = data;
 					for (j = 0; j < i; j++, fdp++) {
 						fdrop(fdesc->fd_ofiles[*fdp].
 						    fde_file, td);
 					}
 					FILEDESC_SUNLOCK(fdesc);
 					error = EBADF;
 					goto out;
 				}
 			}
 			fdp = data;
 			fdep = (struct filedescent **)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			fdev = malloc(sizeof(*fdev) * oldfds, M_FILECAPS,
 			    M_WAITOK);
 			for (i = 0; i < oldfds; i++, fdev++, fdp++) {
 				fde = &fdesc->fd_ofiles[*fdp];
 				fdep[i] = fdev;
 				fdep[i]->fde_file = fde->fde_file;
 				filecaps_copy(&fde->fde_caps,
 				    &fdep[i]->fde_caps, true);
 				unp_internalize_fp(fdep[i]->fde_file);
 			}
 			FILEDESC_SUNLOCK(fdesc);
 			break;
 
 		case SCM_TIMESTAMP:
 			*controlp = sbcreatecontrol(NULL, sizeof(*tv),
 			    SCM_TIMESTAMP, SOL_SOCKET, M_WAITOK);
 			tv = (struct timeval *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			microtime(tv);
 			break;
 
 		case SCM_BINTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*bt),
 			    SCM_BINTIME, SOL_SOCKET, M_WAITOK);
 			bt = (struct bintime *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			bintime(bt);
 			break;
 
 		case SCM_REALTIME:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_REALTIME, SOL_SOCKET, M_WAITOK);
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanotime(ts);
 			break;
 
 		case SCM_MONOTONIC:
 			*controlp = sbcreatecontrol(NULL, sizeof(*ts),
 			    SCM_MONOTONIC, SOL_SOCKET, M_WAITOK);
 			ts = (struct timespec *)
 			    CMSG_DATA(mtod(*controlp, struct cmsghdr *));
 			nanouptime(ts);
 			break;
 
 		default:
 			error = EINVAL;
 			goto out;
 		}
 
 		if (space != NULL) {
 			*space += (*controlp)->m_len;
 			*mbcnt += MSIZE;
 			if ((*controlp)->m_flags & M_EXT)
 				*mbcnt += (*controlp)->m_ext.ext_size;
 			*clast = *controlp;
 		}
 		controlp = &(*controlp)->m_next;
 	}
 	if (clen > 0)
 		error = EINVAL;
 
 out:
 	if (error != 0 && initial_controlp != NULL)
 		unp_internalize_cleanup_rights(*initial_controlp);
 	m_freem(control);
 	return (error);
 }
 
 static struct mbuf *
 unp_addsockcred(struct thread *td, struct mbuf *control, int mode,
     struct mbuf **clast, u_int *space, u_int *mbcnt)
 {
 	struct mbuf *m, *n, *n_prev;
 	const struct cmsghdr *cm;
 	int ngroups, i, cmsgtype;
 	size_t ctrlsz;
 
 	ngroups = MIN(td->td_ucred->cr_ngroups, CMGROUP_MAX);
 	if (mode & UNP_WANTCRED_ALWAYS) {
 		ctrlsz = SOCKCRED2SIZE(ngroups);
 		cmsgtype = SCM_CREDS2;
 	} else {
 		ctrlsz = SOCKCREDSIZE(ngroups);
 		cmsgtype = SCM_CREDS;
 	}
 
 	m = sbcreatecontrol(NULL, ctrlsz, cmsgtype, SOL_SOCKET, M_NOWAIT);
 	if (m == NULL)
 		return (control);
 	MPASS((m->m_flags & M_EXT) == 0 && m->m_next == NULL);
 
 	if (mode & UNP_WANTCRED_ALWAYS) {
 		struct sockcred2 *sc;
 
 		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
 		sc->sc_version = 0;
 		sc->sc_pid = td->td_proc->p_pid;
 		sc->sc_uid = td->td_ucred->cr_ruid;
 		sc->sc_euid = td->td_ucred->cr_uid;
 		sc->sc_gid = td->td_ucred->cr_rgid;
 		sc->sc_egid = td->td_ucred->cr_gid;
 		sc->sc_ngroups = ngroups;
 		for (i = 0; i < sc->sc_ngroups; i++)
 			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
 	} else {
 		struct sockcred *sc;
 
 		sc = (void *)CMSG_DATA(mtod(m, struct cmsghdr *));
 		sc->sc_uid = td->td_ucred->cr_ruid;
 		sc->sc_euid = td->td_ucred->cr_uid;
 		sc->sc_gid = td->td_ucred->cr_rgid;
 		sc->sc_egid = td->td_ucred->cr_gid;
 		sc->sc_ngroups = ngroups;
 		for (i = 0; i < sc->sc_ngroups; i++)
 			sc->sc_groups[i] = td->td_ucred->cr_groups[i];
 	}
 
 	/*
 	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
 	 * created SCM_CREDS control message (struct sockcred) has another
 	 * format.
 	 */
 	if (control != NULL && cmsgtype == SCM_CREDS)
 		for (n = control, n_prev = NULL; n != NULL;) {
 			cm = mtod(n, struct cmsghdr *);
     			if (cm->cmsg_level == SOL_SOCKET &&
 			    cm->cmsg_type == SCM_CREDS) {
     				if (n_prev == NULL)
 					control = n->m_next;
 				else
 					n_prev->m_next = n->m_next;
 				if (space != NULL) {
 					MPASS(*space >= n->m_len);
 					*space -= n->m_len;
 					MPASS(*mbcnt >= MSIZE);
 					*mbcnt -= MSIZE;
 					if (n->m_flags & M_EXT) {
 						MPASS(*mbcnt >=
 						    n->m_ext.ext_size);
 						*mbcnt -= n->m_ext.ext_size;
 					}
 					MPASS(clast);
 					if (*clast == n) {
 						MPASS(n->m_next == NULL);
 						if (n_prev == NULL)
 							*clast = m;
 						else
 							*clast = n_prev;
 					}
 				}
 				n = m_free(n);
 			} else {
 				n_prev = n;
 				n = n->m_next;
 			}
 		}
 
 	/* Prepend it to the head. */
 	m->m_next = control;
 	if (space != NULL) {
 		*space += m->m_len;
 		*mbcnt += MSIZE;
 		if (control == NULL)
 			*clast = m;
 	}
 	return (m);
 }
 
 static struct unpcb *
 fptounp(struct file *fp)
 {
 	struct socket *so;
 
 	if (fp->f_type != DTYPE_SOCKET)
 		return (NULL);
 	if ((so = fp->f_data) == NULL)
 		return (NULL);
 	if (so->so_proto->pr_domain != &localdomain)
 		return (NULL);
 	return sotounpcb(so);
 }
 
 static void
 unp_discard(struct file *fp)
 {
 	struct unp_defer *dr;
 
 	if (unp_externalize_fp(fp)) {
 		dr = malloc(sizeof(*dr), M_TEMP, M_WAITOK);
 		dr->ud_fp = fp;
 		UNP_DEFERRED_LOCK();
 		SLIST_INSERT_HEAD(&unp_defers, dr, ud_link);
 		UNP_DEFERRED_UNLOCK();
 		atomic_add_int(&unp_defers_count, 1);
 		taskqueue_enqueue(taskqueue_thread, &unp_defer_task);
 	} else
 		closef_nothread(fp);
 }
 
 static void
 unp_process_defers(void *arg __unused, int pending)
 {
 	struct unp_defer *dr;
 	SLIST_HEAD(, unp_defer) drl;
 	int count;
 
 	SLIST_INIT(&drl);
 	for (;;) {
 		UNP_DEFERRED_LOCK();
 		if (SLIST_FIRST(&unp_defers) == NULL) {
 			UNP_DEFERRED_UNLOCK();
 			break;
 		}
 		SLIST_SWAP(&unp_defers, &drl, unp_defer);
 		UNP_DEFERRED_UNLOCK();
 		count = 0;
 		while ((dr = SLIST_FIRST(&drl)) != NULL) {
 			SLIST_REMOVE_HEAD(&drl, ud_link);
 			closef_nothread(dr->ud_fp);
 			free(dr, M_TEMP);
 			count++;
 		}
 		atomic_add_int(&unp_defers_count, -count);
 	}
 }
 
 static void
 unp_internalize_fp(struct file *fp)
 {
 	struct unpcb *unp;
 
 	UNP_LINK_WLOCK();
 	if ((unp = fptounp(fp)) != NULL) {
 		unp->unp_file = fp;
 		unp->unp_msgcount++;
 	}
 	unp_rights++;
 	UNP_LINK_WUNLOCK();
 }
 
 static int
 unp_externalize_fp(struct file *fp)
 {
 	struct unpcb *unp;
 	int ret;
 
 	UNP_LINK_WLOCK();
 	if ((unp = fptounp(fp)) != NULL) {
 		unp->unp_msgcount--;
 		ret = 1;
 	} else
 		ret = 0;
 	unp_rights--;
 	UNP_LINK_WUNLOCK();
 	return (ret);
 }
 
 /*
  * unp_defer indicates whether additional work has been defered for a future
  * pass through unp_gc().  It is thread local and does not require explicit
  * synchronization.
  */
 static int	unp_marked;
 
 static void
 unp_remove_dead_ref(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *unp;
 	struct file *fp;
 	int i;
 
 	/*
 	 * This function can only be called from the gc task.
 	 */
 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
 	    ("%s: not on gc callout", __func__));
 	UNP_LINK_LOCK_ASSERT();
 
 	for (i = 0; i < fdcount; i++) {
 		fp = fdep[i]->fde_file;
 		if ((unp = fptounp(fp)) == NULL)
 			continue;
 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
 			continue;
 		unp->unp_gcrefs--;
 	}
 }
 
 static void
 unp_restore_undead_ref(struct filedescent **fdep, int fdcount)
 {
 	struct unpcb *unp;
 	struct file *fp;
 	int i;
 
 	/*
 	 * This function can only be called from the gc task.
 	 */
 	KASSERT(taskqueue_member(taskqueue_thread, curthread) != 0,
 	    ("%s: not on gc callout", __func__));
 	UNP_LINK_LOCK_ASSERT();
 
 	for (i = 0; i < fdcount; i++) {
 		fp = fdep[i]->fde_file;
 		if ((unp = fptounp(fp)) == NULL)
 			continue;
 		if ((unp->unp_gcflag & UNPGC_DEAD) == 0)
 			continue;
 		unp->unp_gcrefs++;
 		unp_marked++;
 	}
 }
 
 static void
 unp_scan_socket(struct socket *so, void (*op)(struct filedescent **, int))
 {
 	struct sockbuf *sb;
 
 	SOCK_LOCK_ASSERT(so);
 
 	if (sotounpcb(so)->unp_gcflag & UNPGC_IGNORE_RIGHTS)
 		return;
 
 	SOCK_RECVBUF_LOCK(so);
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 		unp_scan(STAILQ_FIRST(&so->so_rcv.uxdg_mb), op);
 		unp_scan(so->so_rcv.uxdg_peeked, op);
 		TAILQ_FOREACH(sb, &so->so_rcv.uxdg_conns, uxdg_clist)
 			unp_scan(STAILQ_FIRST(&sb->uxdg_mb), op);
 		break;
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		unp_scan(so->so_rcv.sb_mb, op);
 		break;
 	}
 	SOCK_RECVBUF_UNLOCK(so);
 }
 
 static void
 unp_gc_scan(struct unpcb *unp, void (*op)(struct filedescent **, int))
 {
 	struct socket *so, *soa;
 
 	so = unp->unp_socket;
 	SOCK_LOCK(so);
 	if (SOLISTENING(so)) {
 		/*
 		 * Mark all sockets in our accept queue.
 		 */
 		TAILQ_FOREACH(soa, &so->sol_comp, so_list)
 			unp_scan_socket(soa, op);
 	} else {
 		/*
 		 * Mark all sockets we reference with RIGHTS.
 		 */
 		unp_scan_socket(so, op);
 	}
 	SOCK_UNLOCK(so);
 }
 
 static int unp_recycled;
 SYSCTL_INT(_net_local, OID_AUTO, recycled, CTLFLAG_RD, &unp_recycled, 0, 
     "Number of unreachable sockets claimed by the garbage collector.");
 
 static int unp_taskcount;
 SYSCTL_INT(_net_local, OID_AUTO, taskcount, CTLFLAG_RD, &unp_taskcount, 0, 
     "Number of times the garbage collector has run.");
 
 SYSCTL_UINT(_net_local, OID_AUTO, sockcount, CTLFLAG_RD, &unp_count, 0, 
     "Number of active local sockets.");
 
 static void
 unp_gc(__unused void *arg, int pending)
 {
 	struct unp_head *heads[] = { &unp_dhead, &unp_shead, &unp_sphead,
 				    NULL };
 	struct unp_head **head;
 	struct unp_head unp_deadhead;	/* List of potentially-dead sockets. */
 	struct file *f, **unref;
 	struct unpcb *unp, *unptmp;
 	int i, total, unp_unreachable;
 
 	LIST_INIT(&unp_deadhead);
 	unp_taskcount++;
 	UNP_LINK_RLOCK();
 	/*
 	 * First determine which sockets may be in cycles.
 	 */
 	unp_unreachable = 0;
 
 	for (head = heads; *head != NULL; head++)
 		LIST_FOREACH(unp, *head, unp_link) {
 			KASSERT((unp->unp_gcflag & ~UNPGC_IGNORE_RIGHTS) == 0,
 			    ("%s: unp %p has unexpected gc flags 0x%x",
 			    __func__, unp, (unsigned int)unp->unp_gcflag));
 
 			f = unp->unp_file;
 
 			/*
 			 * Check for an unreachable socket potentially in a
 			 * cycle.  It must be in a queue as indicated by
 			 * msgcount, and this must equal the file reference
 			 * count.  Note that when msgcount is 0 the file is
 			 * NULL.
 			 */
 			if (f != NULL && unp->unp_msgcount != 0 &&
 			    refcount_load(&f->f_count) == unp->unp_msgcount) {
 				LIST_INSERT_HEAD(&unp_deadhead, unp, unp_dead);
 				unp->unp_gcflag |= UNPGC_DEAD;
 				unp->unp_gcrefs = unp->unp_msgcount;
 				unp_unreachable++;
 			}
 		}
 
 	/*
 	 * Scan all sockets previously marked as potentially being in a cycle
 	 * and remove the references each socket holds on any UNPGC_DEAD
 	 * sockets in its queue.  After this step, all remaining references on
 	 * sockets marked UNPGC_DEAD should not be part of any cycle.
 	 */
 	LIST_FOREACH(unp, &unp_deadhead, unp_dead)
 		unp_gc_scan(unp, unp_remove_dead_ref);
 
 	/*
 	 * If a socket still has a non-negative refcount, it cannot be in a
 	 * cycle.  In this case increment refcount of all children iteratively.
 	 * Stop the scan once we do a complete loop without discovering
 	 * a new reachable socket.
 	 */
 	do {
 		unp_marked = 0;
 		LIST_FOREACH_SAFE(unp, &unp_deadhead, unp_dead, unptmp)
 			if (unp->unp_gcrefs > 0) {
 				unp->unp_gcflag &= ~UNPGC_DEAD;
 				LIST_REMOVE(unp, unp_dead);
 				KASSERT(unp_unreachable > 0,
 				    ("%s: unp_unreachable underflow.",
 				    __func__));
 				unp_unreachable--;
 				unp_gc_scan(unp, unp_restore_undead_ref);
 			}
 	} while (unp_marked);
 
 	UNP_LINK_RUNLOCK();
 
 	if (unp_unreachable == 0)
 		return;
 
 	/*
 	 * Allocate space for a local array of dead unpcbs.
 	 * TODO: can this path be simplified by instead using the local
 	 * dead list at unp_deadhead, after taking out references
 	 * on the file object and/or unpcb and dropping the link lock?
 	 */
 	unref = malloc(unp_unreachable * sizeof(struct file *),
 	    M_TEMP, M_WAITOK);
 
 	/*
 	 * Iterate looking for sockets which have been specifically marked
 	 * as unreachable and store them locally.
 	 */
 	UNP_LINK_RLOCK();
 	total = 0;
 	LIST_FOREACH(unp, &unp_deadhead, unp_dead) {
 		KASSERT((unp->unp_gcflag & UNPGC_DEAD) != 0,
 		    ("%s: unp %p not marked UNPGC_DEAD", __func__, unp));
 		unp->unp_gcflag &= ~UNPGC_DEAD;
 		f = unp->unp_file;
 		if (unp->unp_msgcount == 0 || f == NULL ||
 		    refcount_load(&f->f_count) != unp->unp_msgcount ||
 		    !fhold(f))
 			continue;
 		unref[total++] = f;
 		KASSERT(total <= unp_unreachable,
 		    ("%s: incorrect unreachable count.", __func__));
 	}
 	UNP_LINK_RUNLOCK();
 
 	/*
 	 * Now flush all sockets, free'ing rights.  This will free the
 	 * struct files associated with these sockets but leave each socket
 	 * with one remaining ref.
 	 */
 	for (i = 0; i < total; i++) {
 		struct socket *so;
 
 		so = unref[i]->f_data;
 		CURVNET_SET(so->so_vnet);
 		sorflush(so);
 		CURVNET_RESTORE();
 	}
 
 	/*
 	 * And finally release the sockets so they can be reclaimed.
 	 */
 	for (i = 0; i < total; i++)
 		fdrop(unref[i], NULL);
 	unp_recycled += total;
 	free(unref, M_TEMP);
 }
 
 /*
  * Synchronize against unp_gc, which can trip over data as we are freeing it.
  */
 static void
 unp_dispose(struct socket *so)
 {
 	struct sockbuf *sb;
 	struct unpcb *unp;
 	struct mbuf *m;
 
 	MPASS(!SOLISTENING(so));
 
 	unp = sotounpcb(so);
 	UNP_LINK_WLOCK();
 	unp->unp_gcflag |= UNPGC_IGNORE_RIGHTS;
 	UNP_LINK_WUNLOCK();
 
 	/*
 	 * Grab our special mbufs before calling sbrelease().
 	 */
 	SOCK_RECVBUF_LOCK(so);
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 		while ((sb = TAILQ_FIRST(&so->so_rcv.uxdg_conns)) != NULL) {
 			STAILQ_CONCAT(&so->so_rcv.uxdg_mb, &sb->uxdg_mb);
 			TAILQ_REMOVE(&so->so_rcv.uxdg_conns, sb, uxdg_clist);
 			/* Note: socket of sb may reconnect. */
 			sb->uxdg_cc = sb->uxdg_ctl = sb->uxdg_mbcnt = 0;
 		}
 		sb = &so->so_rcv;
 		if (sb->uxdg_peeked != NULL) {
 			STAILQ_INSERT_HEAD(&sb->uxdg_mb, sb->uxdg_peeked,
 			    m_stailqpkt);
 			sb->uxdg_peeked = NULL;
 		}
 		m = STAILQ_FIRST(&sb->uxdg_mb);
 		STAILQ_INIT(&sb->uxdg_mb);
 		/* XXX: our shortened sbrelease() */
 		(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
 		    RLIM_INFINITY);
 		/*
 		 * XXXGL Mark sb with SBS_CANTRCVMORE.  This is needed to
 		 * prevent uipc_sosend_dgram() or unp_disconnect() adding more
 		 * data to the socket.
 		 * We are now in dom_dispose and it could be a call from
 		 * soshutdown() or from the final sofree().  The sofree() case
 		 * is simple as it guarantees that no more sends will happen,
 		 * however we can race with unp_disconnect() from our peer.
 		 * The shutdown(2) case is more exotic.  It would call into
 		 * dom_dispose() only if socket is SS_ISCONNECTED.  This is
 		 * possible if we did connect(2) on this socket and we also
 		 * had it bound with bind(2) and receive connections from other
 		 * sockets.  Because soshutdown() violates POSIX (see comment
 		 * there) we will end up here shutting down our receive side.
 		 * Of course this will have affect not only on the peer we
 		 * connect(2)ed to, but also on all of the peers who had
 		 * connect(2)ed to us.  Their sends would end up with ENOBUFS.
 		 */
 		sb->sb_state |= SBS_CANTRCVMORE;
 		break;
 	case SOCK_STREAM:
 	case SOCK_SEQPACKET:
 		sb = &so->so_rcv;
 		m = sbcut_locked(sb, sb->sb_ccc);
 		KASSERT(sb->sb_ccc == 0 && sb->sb_mb == 0 && sb->sb_mbcnt == 0,
 		    ("%s: ccc %u mb %p mbcnt %u", __func__,
 		    sb->sb_ccc, (void *)sb->sb_mb, sb->sb_mbcnt));
 		sbrelease_locked(so, SO_RCV);
 		break;
 	}
 	SOCK_RECVBUF_UNLOCK(so);
 	if (SOCK_IO_RECV_OWNED(so))
 		SOCK_IO_RECV_UNLOCK(so);
 
 	if (m != NULL) {
 		unp_scan(m, unp_freerights);
 		m_freem(m);
 	}
 }
 
 static void
 unp_scan(struct mbuf *m0, void (*op)(struct filedescent **, int))
 {
 	struct mbuf *m;
 	struct cmsghdr *cm;
 	void *data;
 	socklen_t clen, datalen;
 
 	while (m0 != NULL) {
 		for (m = m0; m; m = m->m_next) {
 			if (m->m_type != MT_CONTROL)
 				continue;
 
 			cm = mtod(m, struct cmsghdr *);
 			clen = m->m_len;
 
 			while (cm != NULL) {
 				if (sizeof(*cm) > clen || cm->cmsg_len > clen)
 					break;
 
 				data = CMSG_DATA(cm);
 				datalen = (caddr_t)cm + cm->cmsg_len
 				    - (caddr_t)data;
 
 				if (cm->cmsg_level == SOL_SOCKET &&
 				    cm->cmsg_type == SCM_RIGHTS) {
 					(*op)(data, datalen /
 					    sizeof(struct filedescent *));
 				}
 
 				if (CMSG_SPACE(datalen) < clen) {
 					clen -= CMSG_SPACE(datalen);
 					cm = (struct cmsghdr *)
 					    ((caddr_t)cm + CMSG_SPACE(datalen));
 				} else {
 					clen = 0;
 					cm = NULL;
 				}
 			}
 		}
 		m0 = m0->m_nextpkt;
 	}
 }
 
+/*
+ * Definitions of protocols supported in the LOCAL domain.
+ */
+static struct protosw streamproto = {
+	.pr_type =		SOCK_STREAM,
+	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS|
+				    PR_CAPATTACH,
+	.pr_ctloutput =		&uipc_ctloutput,
+	.pr_abort = 		uipc_abort,
+	.pr_accept =		uipc_accept,
+	.pr_attach =		uipc_attach,
+	.pr_bind =		uipc_bind,
+	.pr_bindat =		uipc_bindat,
+	.pr_connect =		uipc_connect,
+	.pr_connectat =		uipc_connectat,
+	.pr_connect2 =		uipc_connect2,
+	.pr_detach =		uipc_detach,
+	.pr_disconnect =	uipc_disconnect,
+	.pr_listen =		uipc_listen,
+	.pr_peeraddr =		uipc_peeraddr,
+	.pr_rcvd =		uipc_rcvd,
+	.pr_send =		uipc_send,
+	.pr_ready =		uipc_ready,
+	.pr_sense =		uipc_sense,
+	.pr_shutdown =		uipc_shutdown,
+	.pr_sockaddr =		uipc_sockaddr,
+	.pr_soreceive =		soreceive_generic,
+	.pr_close =		uipc_close,
+};
+
+static struct protosw dgramproto = {
+	.pr_type =		SOCK_DGRAM,
+	.pr_flags =		PR_ATOMIC | PR_ADDR |PR_RIGHTS | PR_CAPATTACH |
+				    PR_SOCKBUF,
+	.pr_ctloutput =		&uipc_ctloutput,
+	.pr_abort = 		uipc_abort,
+	.pr_accept =		uipc_accept,
+	.pr_attach =		uipc_attach,
+	.pr_bind =		uipc_bind,
+	.pr_bindat =		uipc_bindat,
+	.pr_connect =		uipc_connect,
+	.pr_connectat =		uipc_connectat,
+	.pr_connect2 =		uipc_connect2,
+	.pr_detach =		uipc_detach,
+	.pr_disconnect =	uipc_disconnect,
+	.pr_peeraddr =		uipc_peeraddr,
+	.pr_sosend =		uipc_sosend_dgram,
+	.pr_sense =		uipc_sense,
+	.pr_shutdown =		uipc_shutdown,
+	.pr_sockaddr =		uipc_sockaddr,
+	.pr_soreceive =		uipc_soreceive_dgram,
+	.pr_close =		uipc_close,
+};
+
+static struct protosw seqpacketproto = {
+	.pr_type =		SOCK_SEQPACKET,
+	/*
+	 * XXXRW: For now, PR_ADDR because soreceive will bump into them
+	 * due to our use of sbappendaddr.  A new sbappend variants is needed
+	 * that supports both atomic record writes and control data.
+	 */
+	.pr_flags =		PR_ADDR|PR_ATOMIC|PR_CONNREQUIRED|
+				    PR_WANTRCVD|PR_RIGHTS|PR_CAPATTACH,
+	.pr_ctloutput =		&uipc_ctloutput,
+	.pr_abort =		uipc_abort,
+	.pr_accept =		uipc_accept,
+	.pr_attach =		uipc_attach,
+	.pr_bind =		uipc_bind,
+	.pr_bindat =		uipc_bindat,
+	.pr_connect =		uipc_connect,
+	.pr_connectat =		uipc_connectat,
+	.pr_connect2 =		uipc_connect2,
+	.pr_detach =		uipc_detach,
+	.pr_disconnect =	uipc_disconnect,
+	.pr_listen =		uipc_listen,
+	.pr_peeraddr =		uipc_peeraddr,
+	.pr_rcvd =		uipc_rcvd,
+	.pr_send =		uipc_send,
+	.pr_sense =		uipc_sense,
+	.pr_shutdown =		uipc_shutdown,
+	.pr_sockaddr =		uipc_sockaddr,
+	.pr_soreceive =		soreceive_generic,	/* XXX: or...? */
+	.pr_close =		uipc_close,
+};
+
+static struct domain localdomain = {
+	.dom_family =		AF_LOCAL,
+	.dom_name =		"local",
+	.dom_externalize =	unp_externalize,
+	.dom_dispose =		unp_dispose,
+	.dom_nprotosw =		3,
+	.dom_protosw =		{
+		&streamproto,
+		&dgramproto,
+		&seqpacketproto,
+	}
+};
+DOMAIN_SET(local);
+
 /*
  * A helper function called by VFS before socket-type vnode reclamation.
  * For an active vnode it clears unp_vnode pointer and decrements unp_vnode
  * use count.
  */
 void
 vfs_unp_reclaim(struct vnode *vp)
 {
 	struct unpcb *unp;
 	int active;
 	struct mtx *vplock;
 
 	ASSERT_VOP_ELOCKED(vp, "vfs_unp_reclaim");
 	KASSERT(vp->v_type == VSOCK,
 	    ("vfs_unp_reclaim: vp->v_type != VSOCK"));
 
 	active = 0;
 	vplock = mtx_pool_find(mtxpool_sleep, vp);
 	mtx_lock(vplock);
 	VOP_UNP_CONNECT(vp, &unp);
 	if (unp == NULL)
 		goto done;
 	UNP_PCB_LOCK(unp);
 	if (unp->unp_vnode == vp) {
 		VOP_UNP_DETACH(vp);
 		unp->unp_vnode = NULL;
 		active = 1;
 	}
 	UNP_PCB_UNLOCK(unp);
  done:
 	mtx_unlock(vplock);
 	if (active)
 		vunref(vp);
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_unpflags(int unp_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (unp_flags & UNP_HAVEPC) {
 		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
 		comma = 1;
 	}
 	if (unp_flags & UNP_WANTCRED_ALWAYS) {
 		db_printf("%sUNP_WANTCRED_ALWAYS", comma ? ", " : "");
 		comma = 1;
 	}
 	if (unp_flags & UNP_WANTCRED_ONESHOT) {
 		db_printf("%sUNP_WANTCRED_ONESHOT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (unp_flags & UNP_CONNWAIT) {
 		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (unp_flags & UNP_CONNECTING) {
 		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
 		comma = 1;
 	}
 	if (unp_flags & UNP_BINDING) {
 		db_printf("%sUNP_BINDING", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_xucred(int indent, struct xucred *xu)
 {
 	int comma, i;
 
 	db_print_indent(indent);
 	db_printf("cr_version: %u   cr_uid: %u   cr_pid: %d   cr_ngroups: %d\n",
 	    xu->cr_version, xu->cr_uid, xu->cr_pid, xu->cr_ngroups);
 	db_print_indent(indent);
 	db_printf("cr_groups: ");
 	comma = 0;
 	for (i = 0; i < xu->cr_ngroups; i++) {
 		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
 		comma = 1;
 	}
 	db_printf("\n");
 }
 
 static void
 db_print_unprefs(int indent, struct unp_head *uh)
 {
 	struct unpcb *unp;
 	int counter;
 
 	counter = 0;
 	LIST_FOREACH(unp, uh, unp_reflink) {
 		if (counter % 4 == 0)
 			db_print_indent(indent);
 		db_printf("%p  ", unp);
 		if (counter % 4 == 3)
 			db_printf("\n");
 		counter++;
 	}
 	if (counter != 0 && counter % 4 != 0)
 		db_printf("\n");
 }
 
 DB_SHOW_COMMAND(unpcb, db_show_unpcb)
 {
 	struct unpcb *unp;
 
         if (!have_addr) {
                 db_printf("usage: show unpcb <addr>\n");
                 return;
         }
         unp = (struct unpcb *)addr;
 
 	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
 	    unp->unp_vnode);
 
 	db_printf("unp_ino: %ju   unp_conn: %p\n", (uintmax_t)unp->unp_ino,
 	    unp->unp_conn);
 
 	db_printf("unp_refs:\n");
 	db_print_unprefs(2, &unp->unp_refs);
 
 	/* XXXRW: Would be nice to print the full address, if any. */
 	db_printf("unp_addr: %p\n", unp->unp_addr);
 
 	db_printf("unp_gencnt: %llu\n",
 	    (unsigned long long)unp->unp_gencnt);
 
 	db_printf("unp_flags: %x (", unp->unp_flags);
 	db_print_unpflags(unp->unp_flags);
 	db_printf(")\n");
 
 	db_printf("unp_peercred:\n");
 	db_print_xucred(2, &unp->unp_peercred);
 
 	db_printf("unp_refcount: %u\n", unp->unp_refcount);
 }
 #endif
diff --git a/sys/net/if.c b/sys/net/if.c
index 79995e3b9ea4..886d3f7833c0 100644
--- a/sys/net/if.c
+++ b/sys/net/if.c
@@ -1,4763 +1,4762 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2010 Bjoern A. Zeeb <bz@FreeBSD.org>
  * Copyright (c) 1980, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)if.c	8.5 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #include "opt_bpf.h"
 #include "opt_inet6.h"
 #include "opt_inet.h"
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
 #include <sys/conf.h>
 #include <sys/eventhandler.h>
 #include <sys/malloc.h>
 #include <sys/domainset.h>
 #include <sys/sbuf.h>
 #include <sys/bus.h>
 #include <sys/epoch.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/nv.h>
 #include <sys/rwlock.h>
 #include <sys/sockio.h>
 #include <sys/syslog.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/taskqueue.h>
 #include <sys/domain.h>
 #include <sys/jail.h>
 #include <sys/priv.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <machine/stdarg.h>
 #include <vm/uma.h>
 
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_arp.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/if_media.h>
 #include <net/if_mib.h>
 #include <net/if_vlan_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/vnet.h>
 
 #if defined(INET) || defined(INET6)
 #include <net/ethernet.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_carp.h>
 #ifdef INET
 #include <net/debugnet.h>
 #include <netinet/if_ether.h>
 #endif /* INET */
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/in6_ifattach.h>
 #endif /* INET6 */
 #endif /* INET || INET6 */
 
 #include <security/mac/mac_framework.h>
 
 /*
  * Consumers of struct ifreq such as tcpdump assume no pad between ifr_name
  * and ifr_ifru when it is used in SIOCGIFCONF.
  */
 _Static_assert(sizeof(((struct ifreq *)0)->ifr_name) ==
     offsetof(struct ifreq, ifr_ifru), "gap between ifr_name and ifr_ifru");
 
 __read_mostly epoch_t net_epoch_preempt;
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct ifreq_buffer32 {
 	uint32_t	length;		/* (size_t) */
 	uint32_t	buffer;		/* (void *) */
 };
 
 /*
  * Interface request structure used for socket
  * ioctl's.  All interface ioctl's must have parameter
  * definitions which begin with ifr_name.  The
  * remainder may be interface specific.
  */
 struct ifreq32 {
 	char	ifr_name[IFNAMSIZ];		/* if name, e.g. "en0" */
 	union {
 		struct sockaddr	ifru_addr;
 		struct sockaddr	ifru_dstaddr;
 		struct sockaddr	ifru_broadaddr;
 		struct ifreq_buffer32 ifru_buffer;
 		short		ifru_flags[2];
 		short		ifru_index;
 		int		ifru_jid;
 		int		ifru_metric;
 		int		ifru_mtu;
 		int		ifru_phys;
 		int		ifru_media;
 		uint32_t	ifru_data;
 		int		ifru_cap[2];
 		u_int		ifru_fib;
 		u_char		ifru_vlan_pcp;
 	} ifr_ifru;
 };
 CTASSERT(sizeof(struct ifreq) == sizeof(struct ifreq32));
 CTASSERT(__offsetof(struct ifreq, ifr_ifru) ==
     __offsetof(struct ifreq32, ifr_ifru));
 
 struct ifconf32 {
 	int32_t	ifc_len;
 	union {
 		uint32_t	ifcu_buf;
 		uint32_t	ifcu_req;
 	} ifc_ifcu;
 };
 #define	SIOCGIFCONF32	_IOWR('i', 36, struct ifconf32)
 
 struct ifdrv32 {
 	char		ifd_name[IFNAMSIZ];
 	uint32_t	ifd_cmd;
 	uint32_t	ifd_len;
 	uint32_t	ifd_data;
 };
 #define SIOCSDRVSPEC32	_IOC_NEWTYPE(SIOCSDRVSPEC, struct ifdrv32)
 #define SIOCGDRVSPEC32	_IOC_NEWTYPE(SIOCGDRVSPEC, struct ifdrv32)
 
 struct ifgroupreq32 {
 	char	ifgr_name[IFNAMSIZ];
 	u_int	ifgr_len;
 	union {
 		char		ifgru_group[IFNAMSIZ];
 		uint32_t	ifgru_groups;
 	} ifgr_ifgru;
 };
 #define	SIOCAIFGROUP32	_IOC_NEWTYPE(SIOCAIFGROUP, struct ifgroupreq32)
 #define	SIOCGIFGROUP32	_IOC_NEWTYPE(SIOCGIFGROUP, struct ifgroupreq32)
 #define	SIOCDIFGROUP32	_IOC_NEWTYPE(SIOCDIFGROUP, struct ifgroupreq32)
 #define	SIOCGIFGMEMB32	_IOC_NEWTYPE(SIOCGIFGMEMB, struct ifgroupreq32)
 
 struct ifmediareq32 {
 	char		ifm_name[IFNAMSIZ];
 	int		ifm_current;
 	int		ifm_mask;
 	int		ifm_status;
 	int		ifm_active;
 	int		ifm_count;
 	uint32_t	ifm_ulist;	/* (int *) */
 };
 #define	SIOCGIFMEDIA32	_IOC_NEWTYPE(SIOCGIFMEDIA, struct ifmediareq32)
 #define	SIOCGIFXMEDIA32	_IOC_NEWTYPE(SIOCGIFXMEDIA, struct ifmediareq32)
 #endif /* COMPAT_FREEBSD32 */
 
 union ifreq_union {
 	struct ifreq	ifr;
 #ifdef COMPAT_FREEBSD32
 	struct ifreq32	ifr32;
 #endif
 };
 
 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Link layers");
 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Generic link-management");
 
 SYSCTL_INT(_net_link, OID_AUTO, ifqmaxlen, CTLFLAG_RDTUN,
     &ifqmaxlen, 0, "max send queue size");
 
 /* Log link state change events */
 static int log_link_state_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_link_state_change, CTLFLAG_RW,
 	&log_link_state_change, 0,
 	"log interface link state change events");
 
 /* Log promiscuous mode change events */
 static int log_promisc_mode_change = 1;
 
 SYSCTL_INT(_net_link, OID_AUTO, log_promisc_mode_change, CTLFLAG_RDTUN,
 	&log_promisc_mode_change, 1,
 	"log promiscuous mode change events");
 
 /* Interface description */
 static unsigned int ifdescr_maxlen = 1024;
 SYSCTL_UINT(_net, OID_AUTO, ifdescr_maxlen, CTLFLAG_RW,
 	&ifdescr_maxlen, 0,
 	"administrative maximum length for interface description");
 
 static MALLOC_DEFINE(M_IFDESCR, "ifdescr", "ifnet descriptions");
 
 /* global sx for non-critical path ifdescr */
 static struct sx ifdescr_sx;
 SX_SYSINIT(ifdescr_sx, &ifdescr_sx, "ifnet descr");
 
 void	(*ng_ether_link_state_p)(struct ifnet *ifp, int state);
 void	(*lagg_linkstate_p)(struct ifnet *ifp, int state);
 /* These are external hooks for CARP. */
 void	(*carp_linkstate_p)(struct ifnet *ifp);
 void	(*carp_demote_adj_p)(int, char *);
 int	(*carp_master_p)(struct ifaddr *);
 #if defined(INET) || defined(INET6)
 int	(*carp_forus_p)(struct ifnet *ifp, u_char *dhost);
 int	(*carp_output_p)(struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *sa);
 int	(*carp_ioctl_p)(struct ifreq *, u_long, struct thread *);   
 int	(*carp_attach_p)(struct ifaddr *, int);
 void	(*carp_detach_p)(struct ifaddr *, bool);
 #endif
 #ifdef INET
 int	(*carp_iamatch_p)(struct ifaddr *, uint8_t **);
 #endif
 #ifdef INET6
 struct ifaddr *(*carp_iamatch6_p)(struct ifnet *ifp, struct in6_addr *taddr6);
 caddr_t	(*carp_macmatch6_p)(struct ifnet *ifp, struct mbuf *m,
     const struct in6_addr *taddr);
 #endif
 
 struct mbuf *(*tbr_dequeue_ptr)(struct ifaltq *, int) = NULL;
 
 /*
  * XXX: Style; these should be sorted alphabetically, and unprototyped
  * static functions should be prototyped. Currently they are sorted by
  * declaration order.
  */
 static void	if_attachdomain(void *);
 static void	if_attachdomain1(struct ifnet *);
 static int	ifconf(u_long, caddr_t);
 static void	if_input_default(struct ifnet *, struct mbuf *);
 static int	if_requestencap_default(struct ifnet *, struct if_encap_req *);
 static void	if_route(struct ifnet *, int flag, int fam);
 static int	if_setflag(struct ifnet *, int, int, int *, int);
 static int	if_transmit(struct ifnet *ifp, struct mbuf *m);
 static void	if_unroute(struct ifnet *, int flag, int fam);
 static int	if_delmulti_locked(struct ifnet *, struct ifmultiaddr *, int);
 static void	do_link_state_change(void *, int);
 static int	if_getgroup(struct ifgroupreq *, struct ifnet *);
 static int	if_getgroupmembers(struct ifgroupreq *);
 static void	if_delgroups(struct ifnet *);
 static void	if_attach_internal(struct ifnet *, bool);
 static int	if_detach_internal(struct ifnet *, bool);
 static void	if_siocaddmulti(void *, int);
 static void	if_link_ifnet(struct ifnet *);
 static bool	if_unlink_ifnet(struct ifnet *, bool);
 #ifdef VIMAGE
 static int	if_vmove(struct ifnet *, struct vnet *);
 #endif
 
 #ifdef INET6
 /*
  * XXX: declare here to avoid to include many inet6 related files..
  * should be more generalized?
  */
 extern void	nd6_setmtu(struct ifnet *);
 #endif
 
 /* ipsec helper hooks */
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_in[HHOOK_IPSEC_COUNT]);
 VNET_DEFINE(struct hhook_head *, ipsec_hhh_out[HHOOK_IPSEC_COUNT]);
 
 int	ifqmaxlen = IFQ_MAXLEN;
 VNET_DEFINE(struct ifnethead, ifnet);	/* depend on static init XXX */
 VNET_DEFINE(struct ifgrouphead, ifg_head);
 
 /* Table of ifnet by index. */
 static int if_index;
 static int if_indexlim = 8;
 static struct ifindex_entry {
 	struct ifnet	*ife_ifnet;
 	uint16_t	ife_gencnt;
 } *ifindex_table;
 
 SYSCTL_NODE(_net_link_generic, IFMIB_SYSTEM, system,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Variables global to all interfaces");
 static int
 sysctl_ifcount(SYSCTL_HANDLER_ARGS)
 {
 	int rv = 0;
 
 	IFNET_RLOCK();
 	for (int i = 1; i <= if_index; i++)
 		if (ifindex_table[i].ife_ifnet != NULL &&
 		    ifindex_table[i].ife_ifnet->if_vnet == curvnet)
 			rv = i;
 	IFNET_RUNLOCK();
 
 	return (sysctl_handle_int(oidp, &rv, 0, req));
 }
 SYSCTL_PROC(_net_link_generic_system, IFMIB_IFCOUNT, ifcount,
     CTLTYPE_INT | CTLFLAG_VNET | CTLFLAG_RD, NULL, 0, sysctl_ifcount, "I",
     "Maximum known interface index");
 
 /*
  * The global network interface list (V_ifnet) and related state (such as
  * if_index, if_indexlim, and ifindex_table) are protected by an sxlock.
  * This may be acquired to stabilise the list, or we may rely on NET_EPOCH.
  */
 struct sx ifnet_sxlock;
 SX_SYSINIT_FLAGS(ifnet_sx, &ifnet_sxlock, "ifnet_sx", SX_RECURSE);
 
 struct sx ifnet_detach_sxlock;
 SX_SYSINIT_FLAGS(ifnet_detach, &ifnet_detach_sxlock, "ifnet_detach_sx",
     SX_RECURSE);
 
 #ifdef VIMAGE
 #define	VNET_IS_SHUTTING_DOWN(_vnet)					\
     ((_vnet)->vnet_shutdown && (_vnet)->vnet_state < SI_SUB_VNET_DONE)
 #endif
 
 static	if_com_alloc_t *if_com_alloc[256];
 static	if_com_free_t *if_com_free[256];
 
 static MALLOC_DEFINE(M_IFNET, "ifnet", "interface internals");
 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
 
 struct ifnet *
 ifnet_byindex(u_int idx)
 {
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 
 	if (__predict_false(idx > if_index))
 		return (NULL);
 
 	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
 
 	if (curvnet != NULL && ifp != NULL && ifp->if_vnet != curvnet)
 		ifp = NULL;
 
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindex_ref(u_int idx)
 {
 	struct ifnet *ifp;
 
 	ifp = ifnet_byindex(idx);
 	if (ifp == NULL || (ifp->if_flags & IFF_DYING))
 		return (NULL);
 	if (!if_try_ref(ifp))
 		return (NULL);
 	return (ifp);
 }
 
 struct ifnet *
 ifnet_byindexgen(uint16_t idx, uint16_t gen)
 {
 	struct ifnet *ifp;
 
 	NET_EPOCH_ASSERT();
 
 	if (__predict_false(idx > if_index))
 		return (NULL);
 
 	ifp = ck_pr_load_ptr(&ifindex_table[idx].ife_ifnet);
 
 	if (ifindex_table[idx].ife_gencnt == gen)
 		return (ifp);
 	else
 		return (NULL);
 }
 
 /*
  * Network interface utility routines.
  *
  * Routines with ifa_ifwith* names take sockaddr *'s as
  * parameters.
  */
 
 static void
 if_init(void *arg __unused)
 {
 
 	ifindex_table = malloc(if_indexlim * sizeof(*ifindex_table),
 	    M_IFNET, M_WAITOK | M_ZERO);
 }
 SYSINIT(if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, if_init, NULL);
 
 static void
 vnet_if_init(const void *unused __unused)
 {
 
 	CK_STAILQ_INIT(&V_ifnet);
 	CK_STAILQ_INIT(&V_ifg_head);
 	vnet_if_clone_init();
 }
 VNET_SYSINIT(vnet_if_init, SI_SUB_INIT_IF, SI_ORDER_SECOND, vnet_if_init,
     NULL);
 
 static void
 if_link_ifnet(struct ifnet *ifp)
 {
 
 	IFNET_WLOCK();
 	CK_STAILQ_INSERT_TAIL(&V_ifnet, ifp, if_link);
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt++;
 #endif
 	IFNET_WUNLOCK();
 }
 
 static bool
 if_unlink_ifnet(struct ifnet *ifp, bool vmove)
 {
 	struct ifnet *iter;
 	int found = 0;
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(iter, &V_ifnet, if_link)
 		if (iter == ifp) {
 			CK_STAILQ_REMOVE(&V_ifnet, ifp, ifnet, if_link);
 			if (!vmove)
 				ifp->if_flags |= IFF_DYING;
 			found = 1;
 			break;
 		}
 #ifdef VIMAGE
 	curvnet->vnet_ifcnt--;
 #endif
 	IFNET_WUNLOCK();
 
 	return (found);
 }
 
 #ifdef VIMAGE
 static void
 vnet_if_return(const void *unused __unused)
 {
 	struct ifnet *ifp, *nifp;
 	struct ifnet **pending;
 	int found __diagused;
 	int i;
 
 	i = 0;
 
 	/*
 	 * We need to protect our access to the V_ifnet tailq. Ordinarily we'd
 	 * enter NET_EPOCH, but that's not possible, because if_vmove() calls
 	 * if_detach_internal(), which waits for NET_EPOCH callbacks to
 	 * complete. We can't do that from within NET_EPOCH.
 	 *
 	 * However, we can also use the IFNET_xLOCK, which is the V_ifnet
 	 * read/write lock. We cannot hold the lock as we call if_vmove()
 	 * though, as that presents LOR w.r.t ifnet_sx, in_multi_sx and iflib
 	 * ctx lock.
 	 */
 	IFNET_WLOCK();
 
 	pending = malloc(sizeof(struct ifnet *) * curvnet->vnet_ifcnt,
 	    M_IFNET, M_WAITOK | M_ZERO);
 
 	/* Return all inherited interfaces to their parent vnets. */
 	CK_STAILQ_FOREACH_SAFE(ifp, &V_ifnet, if_link, nifp) {
 		if (ifp->if_home_vnet != ifp->if_vnet) {
 			found = if_unlink_ifnet(ifp, true);
 			MPASS(found);
 
 			pending[i++] = ifp;
 		}
 	}
 	IFNET_WUNLOCK();
 
 	for (int j = 0; j < i; j++) {
 		sx_xlock(&ifnet_detach_sxlock);
 		if_vmove(pending[j], pending[j]->if_home_vnet);
 		sx_xunlock(&ifnet_detach_sxlock);
 	}
 
 	free(pending, M_IFNET);
 }
 VNET_SYSUNINIT(vnet_if_return, SI_SUB_VNET_DONE, SI_ORDER_ANY,
     vnet_if_return, NULL);
 #endif
 
 /*
  * Allocate a struct ifnet and an index for an interface.  A layer 2
  * common structure will also be allocated if an allocation routine is
  * registered for the passed type.
  */
 static struct ifnet *
 if_alloc_domain(u_char type, int numa_domain)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	KASSERT(numa_domain <= IF_NODOM, ("numa_domain too large"));
 	if (numa_domain == IF_NODOM)
 		ifp = malloc(sizeof(struct ifnet), M_IFNET,
 		    M_WAITOK | M_ZERO);
 	else
 		ifp = malloc_domainset(sizeof(struct ifnet), M_IFNET,
 		    DOMAINSET_PREF(numa_domain), M_WAITOK | M_ZERO);
 	ifp->if_type = type;
 	ifp->if_alloctype = type;
 	ifp->if_numa_domain = numa_domain;
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 #endif
 	if (if_com_alloc[type] != NULL) {
 		ifp->if_l2com = if_com_alloc[type](type, ifp);
 		KASSERT(ifp->if_l2com, ("%s: if_com_alloc[%u] failed", __func__,
 		    type));
 	}
 
 	IF_ADDR_LOCK_INIT(ifp);
 	TASK_INIT(&ifp->if_linktask, 0, do_link_state_change, ifp);
 	TASK_INIT(&ifp->if_addmultitask, 0, if_siocaddmulti, ifp);
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_LOCK_INIT(ifp);
 	CK_STAILQ_INIT(&ifp->if_addrhead);
 	CK_STAILQ_INIT(&ifp->if_multiaddrs);
 	CK_STAILQ_INIT(&ifp->if_groups);
 #ifdef MAC
 	mac_ifnet_init(ifp);
 #endif
 	ifq_init(&ifp->if_snd, ifp);
 
 	refcount_init(&ifp->if_refcount, 1);	/* Index reference. */
 	for (int i = 0; i < IFCOUNTERS; i++)
 		ifp->if_counters[i] = counter_u64_alloc(M_WAITOK);
 	ifp->if_get_counter = if_get_counter_default;
 	ifp->if_pcp = IFNET_PCP_NONE;
 
 	/* Allocate an ifindex array entry. */
 	IFNET_WLOCK();
 	/*
 	 * Try to find an empty slot below if_index.  If we fail, take the
 	 * next slot.
 	 */
 	for (idx = 1; idx <= if_index; idx++) {
 		if (ifindex_table[idx].ife_ifnet == NULL)
 			break;
 	}
 
 	/* Catch if_index overflow. */
 	if (idx >= if_indexlim) {
 		struct ifindex_entry *new, *old;
 		int newlim;
 
 		newlim = if_indexlim * 2;
 		new = malloc(newlim * sizeof(*new), M_IFNET, M_WAITOK | M_ZERO);
 		memcpy(new, ifindex_table, if_indexlim * sizeof(*new));
 		old = ifindex_table;
 		ck_pr_store_ptr(&ifindex_table, new);
 		if_indexlim = newlim;
 		epoch_wait_preempt(net_epoch_preempt);
 		free(old, M_IFNET);
 	}
 	if (idx > if_index)
 		if_index = idx;
 
 	ifp->if_index = idx;
 	ifp->if_idxgen = ifindex_table[idx].ife_gencnt;
 	ck_pr_store_ptr(&ifindex_table[idx].ife_ifnet, ifp);
 	IFNET_WUNLOCK();
 
 	return (ifp);
 }
 
 struct ifnet *
 if_alloc_dev(u_char type, device_t dev)
 {
 	int numa_domain;
 
 	if (dev == NULL || bus_get_domain(dev, &numa_domain) != 0)
 		return (if_alloc_domain(type, IF_NODOM));
 	return (if_alloc_domain(type, numa_domain));
 }
 
 struct ifnet *
 if_alloc(u_char type)
 {
 
 	return (if_alloc_domain(type, IF_NODOM));
 }
 /*
  * Do the actual work of freeing a struct ifnet, and layer 2 common
  * structure.  This call is made when the network epoch guarantees
  * us that nobody holds a pointer to the interface.
  */
 static void
 if_free_deferred(epoch_context_t ctx)
 {
 	struct ifnet *ifp = __containerof(ctx, struct ifnet, if_epoch_ctx);
 
 	KASSERT((ifp->if_flags & IFF_DYING),
 	    ("%s: interface not dying", __func__));
 
 	if (if_com_free[ifp->if_alloctype] != NULL)
 		if_com_free[ifp->if_alloctype](ifp->if_l2com,
 		    ifp->if_alloctype);
 
 #ifdef MAC
 	mac_ifnet_destroy(ifp);
 #endif /* MAC */
 	IF_AFDATA_DESTROY(ifp);
 	IF_ADDR_LOCK_DESTROY(ifp);
 	ifq_delete(&ifp->if_snd);
 
 	for (int i = 0; i < IFCOUNTERS; i++)
 		counter_u64_free(ifp->if_counters[i]);
 
 	free(ifp->if_description, M_IFDESCR);
 	free(ifp->if_hw_addr, M_IFADDR);
 	free(ifp, M_IFNET);
 }
 
 /*
  * Deregister an interface and free the associated storage.
  */
 void
 if_free(struct ifnet *ifp)
 {
 
 	ifp->if_flags |= IFF_DYING;			/* XXX: Locking */
 
 	/*
 	 * XXXGL: An interface index is really an alias to ifp pointer.
 	 * Why would we clear the alias now, and not in the deferred
 	 * context?  Indeed there is nothing wrong with some network
 	 * thread obtaining ifp via ifnet_byindex() inside the network
 	 * epoch and then dereferencing ifp while we perform if_free(),
 	 * and after if_free() finished, too.
 	 *
 	 * This early index freeing was important back when ifindex was
 	 * virtualized and interface would outlive the vnet.
 	 */
 	IFNET_WLOCK();
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 	ck_pr_store_ptr(&ifindex_table[ifp->if_index].ife_ifnet, NULL);
 	ifindex_table[ifp->if_index].ife_gencnt++;
 	while (if_index > 0 && ifindex_table[if_index].ife_ifnet == NULL)
 		if_index--;
 	IFNET_WUNLOCK();
 
 	if (refcount_release(&ifp->if_refcount))
 		NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx);
 }
 
 /*
  * Interfaces to keep an ifnet type-stable despite the possibility of the
  * driver calling if_free().  If there are additional references, we defer
  * freeing the underlying data structure.
  */
 void
 if_ref(struct ifnet *ifp)
 {
 	u_int old __diagused;
 
 	/* We don't assert the ifnet list lock here, but arguably should. */
 	old = refcount_acquire(&ifp->if_refcount);
 	KASSERT(old > 0, ("%s: ifp %p has 0 refs", __func__, ifp));
 }
 
 bool
 if_try_ref(struct ifnet *ifp)
 {
 	NET_EPOCH_ASSERT();
 	return (refcount_acquire_if_not_zero(&ifp->if_refcount));
 }
 
 void
 if_rele(struct ifnet *ifp)
 {
 
 	if (!refcount_release(&ifp->if_refcount))
 		return;
 	NET_EPOCH_CALL(if_free_deferred, &ifp->if_epoch_ctx);
 }
 
 void
 ifq_init(struct ifaltq *ifq, struct ifnet *ifp)
 {
 
 	mtx_init(&ifq->ifq_mtx, ifp->if_xname, "if send queue", MTX_DEF);
 
 	if (ifq->ifq_maxlen == 0) 
 		ifq->ifq_maxlen = ifqmaxlen;
 
 	ifq->altq_type = 0;
 	ifq->altq_disc = NULL;
 	ifq->altq_flags &= ALTQF_CANTCHANGE;
 	ifq->altq_tbr  = NULL;
 	ifq->altq_ifp  = ifp;
 }
 
 void
 ifq_delete(struct ifaltq *ifq)
 {
 	mtx_destroy(&ifq->ifq_mtx);
 }
 
 /*
  * Perform generic interface initialization tasks and attach the interface
  * to the list of "active" interfaces.  If vmove flag is set on entry
  * to if_attach_internal(), perform only a limited subset of initialization
  * tasks, given that we are moving from one vnet to another an ifnet which
  * has already been fully initialized.
  *
  * Note that if_detach_internal() removes group membership unconditionally
  * even when vmove flag is set, and if_attach_internal() adds only IFG_ALL.
  * Thus, when if_vmove() is applied to a cloned interface, group membership
  * is lost while a cloned one always joins a group whose name is
  * ifc->ifc_name.  To recover this after if_detach_internal() and
  * if_attach_internal(), the cloner should be specified to
  * if_attach_internal() via ifc.  If it is non-NULL, if_attach_internal()
  * attempts to join a group whose name is ifc->ifc_name.
  *
  * XXX:
  *  - The decision to return void and thus require this function to
  *    succeed is questionable.
  *  - We should probably do more sanity checking.  For instance we don't
  *    do anything to insure if_xname is unique or non-empty.
  */
 void
 if_attach(struct ifnet *ifp)
 {
 
 	if_attach_internal(ifp, false);
 }
 
 /*
  * Compute the least common TSO limit.
  */
 void
 if_hw_tsomax_common(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	/*
 	 * 1) If there is no limit currently, take the limit from
 	 * the network adapter.
 	 *
 	 * 2) If the network adapter has a limit below the current
 	 * limit, apply it.
 	 */
 	if (pmax->tsomaxbytes == 0 || (ifp->if_hw_tsomax != 0 &&
 	    ifp->if_hw_tsomax < pmax->tsomaxbytes)) {
 		pmax->tsomaxbytes = ifp->if_hw_tsomax;
 	}
 	if (pmax->tsomaxsegcount == 0 || (ifp->if_hw_tsomaxsegcount != 0 &&
 	    ifp->if_hw_tsomaxsegcount < pmax->tsomaxsegcount)) {
 		pmax->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 	}
 	if (pmax->tsomaxsegsize == 0 || (ifp->if_hw_tsomaxsegsize != 0 &&
 	    ifp->if_hw_tsomaxsegsize < pmax->tsomaxsegsize)) {
 		pmax->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 	}
 }
 
 /*
  * Update TSO limit of a network adapter.
  *
  * Returns zero if no change. Else non-zero.
  */
 int
 if_hw_tsomax_update(if_t ifp, struct ifnet_hw_tsomax *pmax)
 {
 	int retval = 0;
 	if (ifp->if_hw_tsomax != pmax->tsomaxbytes) {
 		ifp->if_hw_tsomax = pmax->tsomaxbytes;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegsize != pmax->tsomaxsegsize) {
 		ifp->if_hw_tsomaxsegsize = pmax->tsomaxsegsize;
 		retval++;
 	}
 	if (ifp->if_hw_tsomaxsegcount != pmax->tsomaxsegcount) {
 		ifp->if_hw_tsomaxsegcount = pmax->tsomaxsegcount;
 		retval++;
 	}
 	return (retval);
 }
 
 static void
 if_attach_internal(struct ifnet *ifp, bool vmove)
 {
 	unsigned socksize, ifasize;
 	int namelen, masklen;
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 
 #ifdef VIMAGE
 	ifp->if_vnet = curvnet;
 	if (ifp->if_home_vnet == NULL)
 		ifp->if_home_vnet = curvnet;
 #endif
 
 	if_addgroup(ifp, IFG_ALL);
 
 #ifdef VIMAGE
 	/* Restore group membership for cloned interface. */
 	if (vmove)
 		if_clone_restoregroup(ifp);
 #endif
 
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_epoch = time_uptime;
 
 	KASSERT((ifp->if_transmit == NULL && ifp->if_qflush == NULL) ||
 	    (ifp->if_transmit != NULL && ifp->if_qflush != NULL),
 	    ("transmit and qflush must both either be set or both be NULL"));
 	if (ifp->if_transmit == NULL) {
 		ifp->if_transmit = if_transmit;
 		ifp->if_qflush = if_qflush;
 	}
 	if (ifp->if_input == NULL)
 		ifp->if_input = if_input_default;
 
 	if (ifp->if_requestencap == NULL)
 		ifp->if_requestencap = if_requestencap_default;
 
 	if (!vmove) {
 #ifdef MAC
 		mac_ifnet_create(ifp);
 #endif
 
 		/*
 		 * Create a Link Level name for this device.
 		 */
 		namelen = strlen(ifp->if_xname);
 		/*
 		 * Always save enough space for any possiable name so we
 		 * can do a rename in place later.
 		 */
 		masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + IFNAMSIZ;
 		socksize = masklen + ifp->if_addrlen;
 		if (socksize < sizeof(*sdl))
 			socksize = sizeof(*sdl);
 		socksize = roundup2(socksize, sizeof(long));
 		ifasize = sizeof(*ifa) + 2 * socksize;
 		ifa = ifa_alloc(ifasize, M_WAITOK);
 		sdl = (struct sockaddr_dl *)(ifa + 1);
 		sdl->sdl_len = socksize;
 		sdl->sdl_family = AF_LINK;
 		bcopy(ifp->if_xname, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl->sdl_index = ifp->if_index;
 		sdl->sdl_type = ifp->if_type;
 		ifp->if_addr = ifa;
 		ifa->ifa_ifp = ifp;
 		ifa->ifa_addr = (struct sockaddr *)sdl;
 		sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
 		ifa->ifa_netmask = (struct sockaddr *)sdl;
 		sdl->sdl_len = masklen;
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		CK_STAILQ_INSERT_HEAD(&ifp->if_addrhead, ifa, ifa_link);
 		/* Reliably crash if used uninitialized. */
 		ifp->if_broadcastaddr = NULL;
 
 		if (ifp->if_type == IFT_ETHER) {
 			ifp->if_hw_addr = malloc(ifp->if_addrlen, M_IFADDR,
 			    M_WAITOK | M_ZERO);
 		}
 
 #if defined(INET) || defined(INET6)
 		/* Use defaults for TSO, if nothing is set */
 		if (ifp->if_hw_tsomax == 0 &&
 		    ifp->if_hw_tsomaxsegcount == 0 &&
 		    ifp->if_hw_tsomaxsegsize == 0) {
 			/*
 			 * The TSO defaults needs to be such that an
 			 * NFS mbuf list of 35 mbufs totalling just
 			 * below 64K works and that a chain of mbufs
 			 * can be defragged into at most 32 segments:
 			 */
 			ifp->if_hw_tsomax = min(IP_MAXPACKET, (32 * MCLBYTES) -
 			    (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN));
 			ifp->if_hw_tsomaxsegcount = 35;
 			ifp->if_hw_tsomaxsegsize = 2048;	/* 2K */
 
 			/* XXX some drivers set IFCAP_TSO after ethernet attach */
 			if (ifp->if_capabilities & IFCAP_TSO) {
 				if_printf(ifp, "Using defaults for TSO: %u/%u/%u\n",
 				    ifp->if_hw_tsomax,
 				    ifp->if_hw_tsomaxsegcount,
 				    ifp->if_hw_tsomaxsegsize);
 			}
 		}
 #endif
 	}
 #ifdef VIMAGE
 	else {
 		/*
 		 * Update the interface index in the link layer address
 		 * of the interface.
 		 */
 		for (ifa = ifp->if_addr; ifa != NULL;
 		    ifa = CK_STAILQ_NEXT(ifa, ifa_link)) {
 			if (ifa->ifa_addr->sa_family == AF_LINK) {
 				sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 				sdl->sdl_index = ifp->if_index;
 			}
 		}
 	}
 #endif
 
 	if_link_ifnet(ifp);
 
 	if (domain_init_status >= 2)
 		if_attachdomain1(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
 }
 
 static void
 if_epochalloc(void *dummy __unused)
 {
 
 	net_epoch_preempt = epoch_alloc("Net preemptible", EPOCH_PREEMPT);
 }
 SYSINIT(ifepochalloc, SI_SUB_EPOCH, SI_ORDER_ANY, if_epochalloc, NULL);
 
 static void
 if_attachdomain(void *dummy)
 {
 	struct ifnet *ifp;
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link)
 		if_attachdomain1(ifp);
 }
 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_SECOND,
     if_attachdomain, NULL);
 
 static void
 if_attachdomain1(struct ifnet *ifp)
 {
 	struct domain *dp;
 
 	/*
 	 * Since dp->dom_ifattach calls malloc() with M_WAITOK, we
 	 * cannot lock ifp->if_afdata initialization, entirely.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	if (ifp->if_afdata_initialized >= domain_init_status) {
 		IF_AFDATA_UNLOCK(ifp);
 		log(LOG_WARNING, "%s called more than once on %s\n",
 		    __func__, ifp->if_xname);
 		return;
 	}
 	ifp->if_afdata_initialized = domain_init_status;
 	IF_AFDATA_UNLOCK(ifp);
 
 	/* address family dependent data region */
 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_ifattach)
 			ifp->if_afdata[dp->dom_family] =
 			    (*dp->dom_ifattach)(ifp);
 	}
 }
 
 /*
  * Remove any unicast or broadcast network addresses from an interface.
  */
 void
 if_purgeaddrs(struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 
 #ifdef INET6
 	/*
 	 * Need to leave multicast addresses of proxy NDP llentries
 	 * before in6_purgeifaddr() because the llentries are keys
 	 * for in6_multi objects of proxy NDP entries.
 	 * in6_purgeifaddr()s clean up llentries including proxy NDPs
 	 * then we would lose the keys if they are called earlier.
 	 */
 	in6_purge_proxy_ndp(ifp);
 #endif
 	while (1) {
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_LINK)
 				break;
 		}
 		NET_EPOCH_EXIT(et);
 
 		if (ifa == NULL)
 			break;
 #ifdef INET
 		/* XXX: Ugly!! ad hoc just for INET */
 		if (ifa->ifa_addr->sa_family == AF_INET) {
 			struct ifaliasreq ifr;
 
 			bzero(&ifr, sizeof(ifr));
 			ifr.ifra_addr = *ifa->ifa_addr;
 			if (ifa->ifa_dstaddr)
 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
 			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
 			    NULL) == 0)
 				continue;
 		}
 #endif /* INET */
 #ifdef INET6
 		if (ifa->ifa_addr->sa_family == AF_INET6) {
 			in6_purgeifaddr((struct in6_ifaddr *)ifa);
 			/* ifp_addrhead is already updated */
 			continue;
 		}
 #endif /* INET6 */
 		IF_ADDR_WLOCK(ifp);
 		CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 		IF_ADDR_WUNLOCK(ifp);
 		ifa_free(ifa);
 	}
 }
 
 /*
  * Remove any multicast network addresses from an interface when an ifnet
  * is going away.
  */
 static void
 if_purgemaddrs(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_WLOCK(ifp);
 	while (!CK_STAILQ_EMPTY(&ifp->if_multiaddrs)) {
 		ifma = CK_STAILQ_FIRST(&ifp->if_multiaddrs);
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		if_delmulti_locked(ifp, ifma, 1);
 	}
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 /*
  * Detach an interface, removing it from the list of "active" interfaces.
  * If vmove flag is set on entry to if_detach_internal(), perform only a
  * limited subset of cleanup tasks, given that we are moving an ifnet from
  * one vnet to another, where it must be fully operational.
  *
  * XXXRW: There are some significant questions about event ordering, and
  * how to prevent things from starting to use the interface during detach.
  */
 void
 if_detach(struct ifnet *ifp)
 {
 	bool found;
 
 	CURVNET_SET_QUIET(ifp->if_vnet);
 	found = if_unlink_ifnet(ifp, false);
 	if (found) {
 		sx_xlock(&ifnet_detach_sxlock);
 		if_detach_internal(ifp, false);
 		sx_xunlock(&ifnet_detach_sxlock);
 	}
 	CURVNET_RESTORE();
 }
 
 /*
  * The vmove flag, if set, indicates that we are called from a callpath
  * that is moving an interface to a different vnet instance.
  *
  * The shutdown flag, if set, indicates that we are called in the
  * process of shutting down a vnet instance.  Currently only the
  * vnet_if_return SYSUNINIT function sets it.  Note: we can be called
  * on a vnet instance shutdown without this flag being set, e.g., when
  * the cloned interfaces are destoyed as first thing of teardown.
  */
 static int
 if_detach_internal(struct ifnet *ifp, bool vmove)
 {
 	struct ifaddr *ifa;
 	int i;
 	struct domain *dp;
 #ifdef VIMAGE
 	bool shutdown;
 
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 #endif
 
 	/*
 	 * At this point we know the interface still was on the ifnet list
 	 * and we removed it so we are in a stable state.
 	 */
 	epoch_wait_preempt(net_epoch_preempt);
 
 	/*
 	 * Ensure all pending EPOCH(9) callbacks have been executed. This
 	 * fixes issues about late destruction of multicast options
 	 * which lead to leave group calls, which in turn access the
 	 * belonging ifnet structure:
 	 */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	/*
 	 * In any case (destroy or vmove) detach us from the groups
 	 * and remove/wait for pending events on the taskq.
 	 * XXX-BZ in theory an interface could still enqueue a taskq change?
 	 */
 	if_delgroups(ifp);
 
 	taskqueue_drain(taskqueue_swi, &ifp->if_linktask);
 	taskqueue_drain(taskqueue_swi, &ifp->if_addmultitask);
 
 	if_down(ifp);
 
 #ifdef VIMAGE
 	/*
 	 * On VNET shutdown abort here as the stack teardown will do all
 	 * the work top-down for us.
 	 */
 	if (shutdown) {
 		/* Give interface users the chance to clean up. */
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		/*
 		 * In case of a vmove we are done here without error.
 		 * If we would signal an error it would lead to the same
 		 * abort as if we did not find the ifnet anymore.
 		 * if_detach() calls us in void context and does not care
 		 * about an early abort notification, so life is splendid :)
 		 */
 		goto finish_vnet_shutdown;
 	}
 #endif
 
 	/*
 	 * At this point we are not tearing down a VNET and are either
 	 * going to destroy or vmove the interface and have to cleanup
 	 * accordingly.
 	 */
 
 	/*
 	 * Remove routes and flush queues.
 	 */
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		altq_disable(&ifp->if_snd);
 	if (ALTQ_IS_ATTACHED(&ifp->if_snd))
 		altq_detach(&ifp->if_snd);
 #endif
 
 	if_purgeaddrs(ifp);
 
 #ifdef INET
 	in_ifdetach(ifp);
 #endif
 
 #ifdef INET6
 	/*
 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
 	 * before removing routing entries below, since IPv6 interface direct
 	 * routes are expected to be removed by the IPv6-specific kernel API.
 	 * Otherwise, the kernel will detect some inconsistency and bark it.
 	 */
 	in6_ifdetach(ifp);
 #endif
 	if_purgemaddrs(ifp);
 
 	EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
 
 	if (!vmove) {
 		/*
 		 * Prevent further calls into the device driver via ifnet.
 		 */
 		if_dead(ifp);
 
 		/*
 		 * Clean up all addresses.
 		 */
 		IF_ADDR_WLOCK(ifp);
 		if (!CK_STAILQ_EMPTY(&ifp->if_addrhead)) {
 			ifa = CK_STAILQ_FIRST(&ifp->if_addrhead);
 			CK_STAILQ_REMOVE(&ifp->if_addrhead, ifa, ifaddr, ifa_link);
 			IF_ADDR_WUNLOCK(ifp);
 			ifa_free(ifa);
 		} else
 			IF_ADDR_WUNLOCK(ifp);
 	}
 
 	rt_flushifroutes(ifp);
 
 #ifdef VIMAGE
 finish_vnet_shutdown:
 #endif
 	/*
 	 * We cannot hold the lock over dom_ifdetach calls as they might
 	 * sleep, for example trying to drain a callout, thus open up the
 	 * theoretical race with re-attaching.
 	 */
 	IF_AFDATA_LOCK(ifp);
 	i = ifp->if_afdata_initialized;
 	ifp->if_afdata_initialized = 0;
 	IF_AFDATA_UNLOCK(ifp);
 	for (dp = domains; i > 0 && dp; dp = dp->dom_next) {
 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family]) {
 			(*dp->dom_ifdetach)(ifp,
 			    ifp->if_afdata[dp->dom_family]);
 			ifp->if_afdata[dp->dom_family] = NULL;
 		}
 	}
 
 	return (0);
 }
 
 #ifdef VIMAGE
 /*
  * if_vmove() performs a limited version of if_detach() in current
  * vnet and if_attach()es the ifnet to the vnet specified as 2nd arg.
  */
 static int
 if_vmove(struct ifnet *ifp, struct vnet *new_vnet)
 {
 #ifdef DEV_BPF
 	u_int bif_dlt, bif_hdrlen;
 #endif
 	int rc;
 
 #ifdef DEV_BPF
  	/*
 	 * if_detach_internal() will call the eventhandler to notify
 	 * interface departure.  That will detach if_bpf.  We need to
 	 * safe the dlt and hdrlen so we can re-attach it later.
 	 */
 	bpf_get_bp_params(ifp->if_bpf, &bif_dlt, &bif_hdrlen);
 #endif
 
 	/*
 	 * Detach from current vnet, but preserve LLADDR info, do not
 	 * mark as dead etc. so that the ifnet can be reattached later.
 	 * If we cannot find it, we lost the race to someone else.
 	 */
 	rc = if_detach_internal(ifp, true);
 	if (rc != 0)
 		return (rc);
 
 	/*
 	 * Perform interface-specific reassignment tasks, if provided by
 	 * the driver.
 	 */
 	if (ifp->if_reassign != NULL)
 		ifp->if_reassign(ifp, new_vnet, NULL);
 
 	/*
 	 * Switch to the context of the target vnet.
 	 */
 	CURVNET_SET_QUIET(new_vnet);
 	if_attach_internal(ifp, true);
 
 #ifdef DEV_BPF
 	if (ifp->if_bpf == NULL)
 		bpfattach(ifp, bif_dlt, bif_hdrlen);
 #endif
 
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * Move an ifnet to or from another child prison/vnet, specified by the jail id.
  */
 static int
 if_vmove_loan(struct thread *td, struct ifnet *ifp, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct ifnet *difp;
 	int error;
 	bool found __diagused;
 	bool shutdown;
 
 	MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Do not try to move the iface from and to the same prison. */
 	if (pr->pr_vnet == ifp->if_vnet) {
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the named iface does not exists in the dst. prison/vnet. */
 	/* XXX Lock interfaces to avoid races. */
 	CURVNET_SET_QUIET(pr->pr_vnet);
 	difp = ifunit(ifname);
 	if (difp != NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 	sx_xlock(&ifnet_detach_sxlock);
 
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 	if (shutdown) {
 		sx_xunlock(&ifnet_detach_sxlock);
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 	CURVNET_RESTORE();
 
 	found = if_unlink_ifnet(ifp, true);
 	if (! found) {
 		sx_xunlock(&ifnet_detach_sxlock);
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENODEV);
 	}
 
 	/* Move the interface into the child jail/vnet. */
 	error = if_vmove(ifp, pr->pr_vnet);
 
 	/* Report the new if_xname back to the userland on success. */
 	if (error == 0)
 		sprintf(ifname, "%s", ifp->if_xname);
 
 	sx_xunlock(&ifnet_detach_sxlock);
 
 	prison_free(pr);
 	return (error);
 }
 
 static int
 if_vmove_reclaim(struct thread *td, char *ifname, int jid)
 {
 	struct prison *pr;
 	struct vnet *vnet_dst;
 	struct ifnet *ifp;
 	int error, found __diagused;
  	bool shutdown;
 
 	/* Try to find the prison within our visibility. */
 	sx_slock(&allprison_lock);
 	pr = prison_find_child(td->td_ucred->cr_prison, jid);
 	sx_sunlock(&allprison_lock);
 	if (pr == NULL)
 		return (ENXIO);
 	prison_hold_locked(pr);
 	mtx_unlock(&pr->pr_mtx);
 
 	/* Make sure the named iface exists in the source prison/vnet. */
 	CURVNET_SET(pr->pr_vnet);
 	ifp = ifunit(ifname);		/* XXX Lock to avoid races. */
 	if (ifp == NULL) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (ENXIO);
 	}
 
 	/* Do not try to move the iface from and to the same prison. */
 	vnet_dst = TD_TO_VNET(td);
 	if (vnet_dst == ifp->if_vnet) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EEXIST);
 	}
 
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(ifp->if_vnet);
 	if (shutdown) {
 		CURVNET_RESTORE();
 		prison_free(pr);
 		return (EBUSY);
 	}
 
 	/* Get interface back from child jail/vnet. */
 	found = if_unlink_ifnet(ifp, true);
 	MPASS(found);
 	sx_xlock(&ifnet_detach_sxlock);
 	error = if_vmove(ifp, vnet_dst);
 	sx_xunlock(&ifnet_detach_sxlock);
 	CURVNET_RESTORE();
 
 	/* Report the new if_xname back to the userland on success. */
 	if (error == 0)
 		sprintf(ifname, "%s", ifp->if_xname);
 
 	prison_free(pr);
 	return (error);
 }
 #endif /* VIMAGE */
 
 /*
  * Add a group to an interface
  */
 int
 if_addgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list		*ifgl;
 	struct ifg_group	*ifg = NULL;
 	struct ifg_member	*ifgm;
 	int 			 new = 0;
 
 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
 	    groupname[strlen(groupname) - 1] <= '9')
 		return (EINVAL);
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname)) {
 			IFNET_WUNLOCK();
 			return (EEXIST);
 		}
 
 	if ((ifgl = malloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL) {
 	    	IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	if ((ifgm = malloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
 		free(ifgl, M_TEMP);
 		IFNET_WUNLOCK();
 		return (ENOMEM);
 	}
 
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (!strcmp(ifg->ifg_group, groupname))
 			break;
 
 	if (ifg == NULL) {
 		if ((ifg = malloc(sizeof(*ifg), M_TEMP, M_NOWAIT)) == NULL) {
 			free(ifgl, M_TEMP);
 			free(ifgm, M_TEMP);
 			IFNET_WUNLOCK();
 			return (ENOMEM);
 		}
 		strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
 		ifg->ifg_refcnt = 0;
 		CK_STAILQ_INIT(&ifg->ifg_members);
 		CK_STAILQ_INSERT_TAIL(&V_ifg_head, ifg, ifg_next);
 		new = 1;
 	}
 
 	ifg->ifg_refcnt++;
 	ifgl->ifgl_group = ifg;
 	ifgm->ifgm_ifp = ifp;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
 	CK_STAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	IFNET_WUNLOCK();
 
 	if (new)
 		EVENTHANDLER_INVOKE(group_attach_event, ifg);
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 
 	return (0);
 }
 
 /*
  * Helper function to remove a group out of an interface.  Expects the global
  * ifnet lock to be write-locked, and drops it before returning.
  */
 static void
 _if_delgroup_locked(struct ifnet *ifp, struct ifg_list *ifgl,
     const char *groupname)
 {
 	struct ifg_member *ifgm;
 	bool freeifgl;
 
 	IFNET_WLOCK_ASSERT();
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_REMOVE(&ifp->if_groups, ifgl, ifg_list, ifgl_next);
 	IF_ADDR_WUNLOCK(ifp);
 
 	CK_STAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next) {
 		if (ifgm->ifgm_ifp == ifp) {
 			CK_STAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm,
 			    ifg_member, ifgm_next);
 			break;
 		}
 	}
 
 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
 		CK_STAILQ_REMOVE(&V_ifg_head, ifgl->ifgl_group, ifg_group,
 		    ifg_next);
 		freeifgl = true;
 	} else {
 		freeifgl = false;
 	}
 	IFNET_WUNLOCK();
 
 	epoch_wait_preempt(net_epoch_preempt);
 	if (freeifgl) {
 		EVENTHANDLER_INVOKE(group_detach_event, ifgl->ifgl_group);
 		free(ifgl->ifgl_group, M_TEMP);
 	}
 	free(ifgm, M_TEMP);
 	free(ifgl, M_TEMP);
 
 	EVENTHANDLER_INVOKE(group_change_event, groupname);
 }
 
 /*
  * Remove a group from an interface
  */
 int
 if_delgroup(struct ifnet *ifp, const char *groupname)
 {
 	struct ifg_list *ifgl;
 
 	IFNET_WLOCK();
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 		if (strcmp(ifgl->ifgl_group->ifg_group, groupname) == 0)
 			break;
 	if (ifgl == NULL) {
 		IFNET_WUNLOCK();
 		return (ENOENT);
 	}
 
 	_if_delgroup_locked(ifp, ifgl, groupname);
 
 	return (0);
 }
 
 /*
  * Remove an interface from all groups
  */
 static void
 if_delgroups(struct ifnet *ifp)
 {
 	struct ifg_list *ifgl;
 	char groupname[IFNAMSIZ];
 
 	IFNET_WLOCK();
 	while ((ifgl = CK_STAILQ_FIRST(&ifp->if_groups)) != NULL) {
 		strlcpy(groupname, ifgl->ifgl_group->ifg_group, IFNAMSIZ);
 		_if_delgroup_locked(ifp, ifgl, groupname);
 		IFNET_WLOCK();
 	}
 	IFNET_WUNLOCK();
 }
 
 /*
  * Stores all groups from an interface in memory pointed to by ifgr.
  */
 static int
 if_getgroup(struct ifgroupreq *ifgr, struct ifnet *ifp)
 {
 	int			 len, error;
 	struct ifg_list		*ifgl;
 	struct ifg_req		 ifgrq, *ifgp;
 
 	NET_EPOCH_ASSERT();
 
 	if (ifgr->ifgr_len == 0) {
 		CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
 			ifgr->ifgr_len += sizeof(struct ifg_req);
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	/* XXX: wire */
 	CK_STAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
 		if (len < sizeof(ifgrq))
 			return (EINVAL);
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
 		    sizeof(ifgrq.ifgrq_group));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req))))
 			return (error);
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 
 	return (0);
 }
 
 /*
  * Stores all members of a group in memory pointed to by igfr
  */
 static int
 if_getgroupmembers(struct ifgroupreq *ifgr)
 {
 	struct ifg_group	*ifg;
 	struct ifg_member	*ifgm;
 	struct ifg_req		 ifgrq, *ifgp;
 	int			 len, error;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifg, &V_ifg_head, ifg_next)
 		if (strcmp(ifg->ifg_group, ifgr->ifgr_name) == 0)
 			break;
 	if (ifg == NULL) {
 		IFNET_RUNLOCK();
 		return (ENOENT);
 	}
 
 	if (ifgr->ifgr_len == 0) {
 		CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
 			ifgr->ifgr_len += sizeof(ifgrq);
 		IFNET_RUNLOCK();
 		return (0);
 	}
 
 	len = ifgr->ifgr_len;
 	ifgp = ifgr->ifgr_groups;
 	CK_STAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
 		if (len < sizeof(ifgrq)) {
 			IFNET_RUNLOCK();
 			return (EINVAL);
 		}
 		bzero(&ifgrq, sizeof ifgrq);
 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
 		    sizeof(ifgrq.ifgrq_member));
 		if ((error = copyout(&ifgrq, ifgp, sizeof(struct ifg_req)))) {
 			IFNET_RUNLOCK();
 			return (error);
 		}
 		len -= sizeof(ifgrq);
 		ifgp++;
 	}
 	IFNET_RUNLOCK();
 
 	return (0);
 }
 
 /*
  * Return counter values from counter(9)s stored in ifnet.
  */
 uint64_t
 if_get_counter_default(struct ifnet *ifp, ift_counter cnt)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	return (counter_u64_fetch(ifp->if_counters[cnt]));
 }
 
 /*
  * Increase an ifnet counter. Usually used for counters shared
  * between the stack and a driver, but function supports them all.
  */
 void
 if_inc_counter(struct ifnet *ifp, ift_counter cnt, int64_t inc)
 {
 
 	KASSERT(cnt < IFCOUNTERS, ("%s: invalid cnt %d", __func__, cnt));
 
 	counter_u64_add(ifp->if_counters[cnt], inc);
 }
 
 /*
  * Copy data from ifnet to userland API structure if_data.
  */
 void
 if_data_copy(struct ifnet *ifp, struct if_data *ifd)
 {
 
 	ifd->ifi_type = ifp->if_type;
 	ifd->ifi_physical = 0;
 	ifd->ifi_addrlen = ifp->if_addrlen;
 	ifd->ifi_hdrlen = ifp->if_hdrlen;
 	ifd->ifi_link_state = ifp->if_link_state;
 	ifd->ifi_vhid = 0;
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_mtu = ifp->if_mtu;
 	ifd->ifi_metric = ifp->if_metric;
 	ifd->ifi_baudrate = ifp->if_baudrate;
 	ifd->ifi_hwassist = ifp->if_hwassist;
 	ifd->ifi_epoch = ifp->if_epoch;
 	ifd->ifi_lastchange = ifp->if_lastchange;
 
 	ifd->ifi_ipackets = ifp->if_get_counter(ifp, IFCOUNTER_IPACKETS);
 	ifd->ifi_ierrors = ifp->if_get_counter(ifp, IFCOUNTER_IERRORS);
 	ifd->ifi_opackets = ifp->if_get_counter(ifp, IFCOUNTER_OPACKETS);
 	ifd->ifi_oerrors = ifp->if_get_counter(ifp, IFCOUNTER_OERRORS);
 	ifd->ifi_collisions = ifp->if_get_counter(ifp, IFCOUNTER_COLLISIONS);
 	ifd->ifi_ibytes = ifp->if_get_counter(ifp, IFCOUNTER_IBYTES);
 	ifd->ifi_obytes = ifp->if_get_counter(ifp, IFCOUNTER_OBYTES);
 	ifd->ifi_imcasts = ifp->if_get_counter(ifp, IFCOUNTER_IMCASTS);
 	ifd->ifi_omcasts = ifp->if_get_counter(ifp, IFCOUNTER_OMCASTS);
 	ifd->ifi_iqdrops = ifp->if_get_counter(ifp, IFCOUNTER_IQDROPS);
 	ifd->ifi_oqdrops = ifp->if_get_counter(ifp, IFCOUNTER_OQDROPS);
 	ifd->ifi_noproto = ifp->if_get_counter(ifp, IFCOUNTER_NOPROTO);
 }
 
 /*
  * Initialization, destruction and refcounting functions for ifaddrs.
  */
 struct ifaddr *
 ifa_alloc(size_t size, int flags)
 {
 	struct ifaddr *ifa;
 
 	KASSERT(size >= sizeof(struct ifaddr),
 	    ("%s: invalid size %zu", __func__, size));
 
 	ifa = malloc(size, M_IFADDR, M_ZERO | flags);
 	if (ifa == NULL)
 		return (NULL);
 
 	if ((ifa->ifa_opackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ipackets = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_obytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 	if ((ifa->ifa_ibytes = counter_u64_alloc(flags)) == NULL)
 		goto fail;
 
 	refcount_init(&ifa->ifa_refcnt, 1);
 
 	return (ifa);
 
 fail:
 	/* free(NULL) is okay */
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 
 	return (NULL);
 }
 
 void
 ifa_ref(struct ifaddr *ifa)
 {
 	u_int old __diagused;
 
 	old = refcount_acquire(&ifa->ifa_refcnt);
 	KASSERT(old > 0, ("%s: ifa %p has 0 refs", __func__, ifa));
 }
 
 int
 ifa_try_ref(struct ifaddr *ifa)
 {
 
 	NET_EPOCH_ASSERT();
 	return (refcount_acquire_if_not_zero(&ifa->ifa_refcnt));
 }
 
 static void
 ifa_destroy(epoch_context_t ctx)
 {
 	struct ifaddr *ifa;
 
 	ifa = __containerof(ctx, struct ifaddr, ifa_epoch_ctx);
 	counter_u64_free(ifa->ifa_opackets);
 	counter_u64_free(ifa->ifa_ipackets);
 	counter_u64_free(ifa->ifa_obytes);
 	counter_u64_free(ifa->ifa_ibytes);
 	free(ifa, M_IFADDR);
 }
 
 void
 ifa_free(struct ifaddr *ifa)
 {
 
 	if (refcount_release(&ifa->ifa_refcnt))
 		NET_EPOCH_CALL(ifa_destroy, &ifa->ifa_epoch_ctx);
 }
 
 /*
  * XXX: Because sockaddr_dl has deeper structure than the sockaddr
  * structs used to represent other address families, it is necessary
  * to perform a different comparison.
  */
 
 #define	sa_dl_equal(a1, a2)	\
 	((((const struct sockaddr_dl *)(a1))->sdl_len ==		\
 	 ((const struct sockaddr_dl *)(a2))->sdl_len) &&		\
 	 (bcmp(CLLADDR((const struct sockaddr_dl *)(a1)),		\
 	       CLLADDR((const struct sockaddr_dl *)(a2)),		\
 	       ((const struct sockaddr_dl *)(a1))->sdl_alen) == 0))
 
 /*
  * Locate an interface based on a complete address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithaddr(const struct sockaddr *addr)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (sa_equal(addr, ifa->ifa_addr)) {
 				goto done;
 			}
 			/* IP6 doesn't have broadcast */
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 int
 ifa_ifwithaddr_check(const struct sockaddr *addr)
 {
 	struct epoch_tracker et;
 	int rc;
 
 	NET_EPOCH_ENTER(et);
 	rc = (ifa_ifwithaddr(addr) != NULL);
 	NET_EPOCH_EXIT(et);
 	return (rc);
 }
 
 /*
  * Locate an interface based on the broadcast address.
  */
 /* ARGSUSED */
 struct ifaddr *
 ifa_ifwithbroadaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if ((ifp->if_flags & IFF_BROADCAST) &&
 			    ifa->ifa_broadaddr &&
 			    ifa->ifa_broadaddr->sa_len != 0 &&
 			    sa_equal(ifa->ifa_broadaddr, addr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Locate the point to point interface with a given destination address.
  */
 /*ARGSUSED*/
 struct ifaddr *
 ifa_ifwithdstaddr(const struct sockaddr *addr, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 
 	NET_EPOCH_ASSERT();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
 			continue;
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != addr->sa_family)
 				continue;
 			if (ifa->ifa_dstaddr != NULL &&
 			    sa_equal(addr, ifa->ifa_dstaddr)) {
 				goto done;
 			}
 		}
 	}
 	ifa = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Find an interface on a specific network.  If many, choice
  * is most specific found.
  */
 struct ifaddr *
 ifa_ifwithnet(const struct sockaddr *addr, int ignore_ptp, int fibnum)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 	const char *addr_data = addr->sa_data, *cplim;
 
 	NET_EPOCH_ASSERT();
 	/*
 	 * AF_LINK addresses can be looked up directly by their index number,
 	 * so do that if we can.
 	 */
 	if (af == AF_LINK) {
 		ifp = ifnet_byindex(
 		    ((const struct sockaddr_dl *)addr)->sdl_index);
 		return (ifp ? ifp->if_addr : NULL);
 	}
 
 	/*
 	 * Scan though each interface, looking for ones that have addresses
 	 * in this address family and the requested fib.
 	 */
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if ((fibnum != RT_ALL_FIBS) && (ifp->if_fib != fibnum))
 			continue;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			const char *cp, *cp2, *cp3;
 
 			if (ifa->ifa_addr->sa_family != af)
 next:				continue;
 			if (af == AF_INET && 
 			    ifp->if_flags & IFF_POINTOPOINT && !ignore_ptp) {
 				/*
 				 * This is a bit broken as it doesn't
 				 * take into account that the remote end may
 				 * be a single node in the network we are
 				 * looking for.
 				 * The trouble is that we don't know the
 				 * netmask for the remote end.
 				 */
 				if (ifa->ifa_dstaddr != NULL &&
 				    sa_equal(addr, ifa->ifa_dstaddr)) {
 					goto done;
 				}
 			} else {
 				/*
 				 * Scan all the bits in the ifa's address.
 				 * If a bit dissagrees with what we are
 				 * looking for, mask it with the netmask
 				 * to see if it really matters.
 				 * (A byte at a time)
 				 */
 				if (ifa->ifa_netmask == 0)
 					continue;
 				cp = addr_data;
 				cp2 = ifa->ifa_addr->sa_data;
 				cp3 = ifa->ifa_netmask->sa_data;
 				cplim = ifa->ifa_netmask->sa_len
 					+ (char *)ifa->ifa_netmask;
 				while (cp3 < cplim)
 					if ((*cp++ ^ *cp2++) & *cp3++)
 						goto next; /* next address! */
 				/*
 				 * If the netmask of what we just found
 				 * is more specific than what we had before
 				 * (if we had one), or if the virtual status
 				 * of new prefix is better than of the old one,
 				 * then remember the new one before continuing
 				 * to search for an even better one.
 				 */
 				if (ifa_maybe == NULL ||
 				    ifa_preferred(ifa_maybe, ifa) ||
 				    rn_refines((caddr_t)ifa->ifa_netmask,
 				    (caddr_t)ifa_maybe->ifa_netmask)) {
 					ifa_maybe = ifa;
 				}
 			}
 		}
 	}
 	ifa = ifa_maybe;
 	ifa_maybe = NULL;
 done:
 	return (ifa);
 }
 
 /*
  * Find an interface address specific to an interface best matching
  * a given address.
  */
 struct ifaddr *
 ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
 {
 	struct ifaddr *ifa;
 	const char *cp, *cp2, *cp3;
 	char *cplim;
 	struct ifaddr *ifa_maybe = NULL;
 	u_int af = addr->sa_family;
 
 	if (af >= AF_MAX)
 		return (NULL);
 
 	NET_EPOCH_ASSERT();
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != af)
 			continue;
 		if (ifa_maybe == NULL)
 			ifa_maybe = ifa;
 		if (ifa->ifa_netmask == 0) {
 			if (sa_equal(addr, ifa->ifa_addr) ||
 			    (ifa->ifa_dstaddr &&
 			    sa_equal(addr, ifa->ifa_dstaddr)))
 				goto done;
 			continue;
 		}
 		if (ifp->if_flags & IFF_POINTOPOINT) {
 			if (sa_equal(addr, ifa->ifa_dstaddr))
 				goto done;
 		} else {
 			cp = addr->sa_data;
 			cp2 = ifa->ifa_addr->sa_data;
 			cp3 = ifa->ifa_netmask->sa_data;
 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
 			for (; cp3 < cplim; cp3++)
 				if ((*cp++ ^ *cp2++) & *cp3)
 					break;
 			if (cp3 == cplim)
 				goto done;
 		}
 	}
 	ifa = ifa_maybe;
 done:
 	return (ifa);
 }
 
 /*
  * See whether new ifa is better than current one:
  * 1) A non-virtual one is preferred over virtual.
  * 2) A virtual in master state preferred over any other state.
  *
  * Used in several address selecting functions.
  */
 int
 ifa_preferred(struct ifaddr *cur, struct ifaddr *next)
 {
 
 	return (cur->ifa_carp && (!next->ifa_carp ||
 	    ((*carp_master_p)(next) && !(*carp_master_p)(cur))));
 }
 
 struct sockaddr_dl *
 link_alloc_sdl(size_t size, int flags)
 {
 
 	return (malloc(size, M_TEMP, flags));
 }
 
 void
 link_free_sdl(struct sockaddr *sa)
 {
 	free(sa, M_TEMP);
 }
 
 /*
  * Fills in given sdl with interface basic info.
  * Returns pointer to filled sdl.
  */
 struct sockaddr_dl *
 link_init_sdl(struct ifnet *ifp, struct sockaddr *paddr, u_char iftype)
 {
 	struct sockaddr_dl *sdl;
 
 	sdl = (struct sockaddr_dl *)paddr;
 	memset(sdl, 0, sizeof(struct sockaddr_dl));
 	sdl->sdl_len = sizeof(struct sockaddr_dl);
 	sdl->sdl_family = AF_LINK;
 	sdl->sdl_index = ifp->if_index;
 	sdl->sdl_type = iftype;
 
 	return (sdl);
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 static void
 if_unroute(struct ifnet *ifp, int flag, int fam)
 {
 
 	KASSERT(flag == IFF_UP, ("if_unroute: flag != IFF_UP"));
 
 	ifp->if_flags &= ~flag;
 	getmicrotime(&ifp->if_lastchange);
 	ifp->if_qflush(ifp);
 
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 static void
 if_route(struct ifnet *ifp, int flag, int fam)
 {
 
 	KASSERT(flag == IFF_UP, ("if_route: flag != IFF_UP"));
 
 	ifp->if_flags |= flag;
 	getmicrotime(&ifp->if_lastchange);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	rt_ifmsg(ifp);
 #ifdef INET6
 	in6_if_up(ifp);
 #endif
 }
 
 void	(*vlan_link_state_p)(struct ifnet *);	/* XXX: private from if_vlan */
 void	(*vlan_trunk_cap_p)(struct ifnet *);		/* XXX: private from if_vlan */
 struct ifnet *(*vlan_trunkdev_p)(struct ifnet *);
 struct	ifnet *(*vlan_devat_p)(struct ifnet *, uint16_t);
 int	(*vlan_tag_p)(struct ifnet *, uint16_t *);
 int	(*vlan_pcp_p)(struct ifnet *, uint16_t *);
 int	(*vlan_setcookie_p)(struct ifnet *, void *);
 void	*(*vlan_cookie_p)(struct ifnet *);
 
 /*
  * Handle a change in the interface link state. To avoid LORs
  * between driver lock and upper layer locks, as well as possible
  * recursions, we post event to taskqueue, and all job
  * is done in static do_link_state_change().
  */
 void
 if_link_state_change(struct ifnet *ifp, int link_state)
 {
 	/* Return if state hasn't changed. */
 	if (ifp->if_link_state == link_state)
 		return;
 
 	ifp->if_link_state = link_state;
 
 	/* XXXGL: reference ifp? */
 	taskqueue_enqueue(taskqueue_swi, &ifp->if_linktask);
 }
 
 static void
 do_link_state_change(void *arg, int pending)
 {
 	struct ifnet *ifp;
 	int link_state;
 
 	ifp = arg;
 	link_state = ifp->if_link_state;
 
 	CURVNET_SET(ifp->if_vnet);
 	rt_ifmsg(ifp);
 	if (ifp->if_vlantrunk != NULL)
 		(*vlan_link_state_p)(ifp);
 
 	if ((ifp->if_type == IFT_ETHER || ifp->if_type == IFT_L2VLAN) &&
 	    ifp->if_l2com != NULL)
 		(*ng_ether_link_state_p)(ifp, link_state);
 	if (ifp->if_carp)
 		(*carp_linkstate_p)(ifp);
 	if (ifp->if_bridge)
 		ifp->if_bridge_linkstate(ifp);
 	if (ifp->if_lagg)
 		(*lagg_linkstate_p)(ifp, link_state);
 
 	if (IS_DEFAULT_VNET(curvnet))
 		devctl_notify("IFNET", ifp->if_xname,
 		    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN",
 		    NULL);
 	if (pending > 1)
 		if_printf(ifp, "%d link states coalesced\n", pending);
 	if (log_link_state_change)
 		if_printf(ifp, "link state changed to %s\n",
 		    (link_state == LINK_STATE_UP) ? "UP" : "DOWN" );
 	EVENTHANDLER_INVOKE(ifnet_link_event, ifp, link_state);
 	CURVNET_RESTORE();
 }
 
 /*
  * Mark an interface down and notify protocols of
  * the transition.
  */
 void
 if_down(struct ifnet *ifp)
 {
 
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_DOWN);
 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
 }
 
 /*
  * Mark an interface up and notify protocols of
  * the transition.
  */
 void
 if_up(struct ifnet *ifp)
 {
 
 	if_route(ifp, IFF_UP, AF_UNSPEC);
 	EVENTHANDLER_INVOKE(ifnet_event, ifp, IFNET_EVENT_UP);
 }
 
 /*
  * Flush an interface queue.
  */
 void
 if_qflush(struct ifnet *ifp)
 {
 	struct mbuf *m, *n;
 	struct ifaltq *ifq;
 
 	ifq = &ifp->if_snd;
 	IFQ_LOCK(ifq);
 #ifdef ALTQ
 	if (ALTQ_IS_ENABLED(ifq))
 		ALTQ_PURGE(ifq);
 #endif
 	n = ifq->ifq_head;
 	while ((m = n) != NULL) {
 		n = m->m_nextpkt;
 		m_freem(m);
 	}
 	ifq->ifq_head = 0;
 	ifq->ifq_tail = 0;
 	ifq->ifq_len = 0;
 	IFQ_UNLOCK(ifq);
 }
 
 /*
  * Map interface name to interface structure pointer, with or without
  * returning a reference.
  */
 struct ifnet *
 ifunit_ref(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0 &&
 		    !(ifp->if_flags & IFF_DYING))
 			break;
 	}
 	if (ifp != NULL) {
 		if_ref(ifp);
 		MPASS(ifindex_table[ifp->if_index].ife_ifnet == ifp);
 	}
 
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 struct ifnet *
 ifunit(const char *name)
 {
 	struct epoch_tracker et;
 	struct ifnet *ifp;
 
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (strncmp(name, ifp->if_xname, IFNAMSIZ) == 0)
 			break;
 	}
 	NET_EPOCH_EXIT(et);
 	return (ifp);
 }
 
 void *
 ifr_buffer_get_buffer(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return ((void *)(uintptr_t)
 		    ifrup->ifr32.ifr_ifru.ifru_buffer.buffer);
 #endif
 	return (ifrup->ifr.ifr_ifru.ifru_buffer.buffer);
 }
 
 static void
 ifr_buffer_set_buffer_null(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		ifrup->ifr32.ifr_ifru.ifru_buffer.buffer = 0;
 	else
 #endif
 		ifrup->ifr.ifr_ifru.ifru_buffer.buffer = NULL;
 }
 
 size_t
 ifr_buffer_get_length(void *data)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return (ifrup->ifr32.ifr_ifru.ifru_buffer.length);
 #endif
 	return (ifrup->ifr.ifr_ifru.ifru_buffer.length);
 }
 
 static void
 ifr_buffer_set_length(void *data, size_t len)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = data;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		ifrup->ifr32.ifr_ifru.ifru_buffer.length = len;
 	else
 #endif
 		ifrup->ifr.ifr_ifru.ifru_buffer.length = len;
 }
 
 void *
 ifr_data_get_ptr(void *ifrp)
 {
 	union ifreq_union *ifrup;
 
 	ifrup = ifrp;
 #ifdef COMPAT_FREEBSD32
 	if (SV_CURPROC_FLAG(SV_ILP32))
 		return ((void *)(uintptr_t)
 		    ifrup->ifr32.ifr_ifru.ifru_data);
 #endif
 		return (ifrup->ifr.ifr_ifru.ifru_data);
 }
 
 struct ifcap_nv_bit_name {
 	int cap_bit;
 	const char *cap_name;
 };
 #define CAPNV(x) {.cap_bit = IFCAP_##x, \
     .cap_name = __CONCAT(IFCAP_, __CONCAT(x, _NAME)) }
 const struct ifcap_nv_bit_name ifcap_nv_bit_names[] = {
 	CAPNV(RXCSUM),
 	CAPNV(TXCSUM),
 	CAPNV(NETCONS),
 	CAPNV(VLAN_MTU),
 	CAPNV(VLAN_HWTAGGING),
 	CAPNV(JUMBO_MTU),
 	CAPNV(POLLING),
 	CAPNV(VLAN_HWCSUM),
 	CAPNV(TSO4),
 	CAPNV(TSO6),
 	CAPNV(LRO),
 	CAPNV(WOL_UCAST),
 	CAPNV(WOL_MCAST),
 	CAPNV(WOL_MAGIC),
 	CAPNV(TOE4),
 	CAPNV(TOE6),
 	CAPNV(VLAN_HWFILTER),
 	CAPNV(VLAN_HWTSO),
 	CAPNV(LINKSTATE),
 	CAPNV(NETMAP),
 	CAPNV(RXCSUM_IPV6),
 	CAPNV(TXCSUM_IPV6),
 	CAPNV(HWSTATS),
 	CAPNV(TXRTLMT),
 	CAPNV(HWRXTSTMP),
 	CAPNV(MEXTPG),
 	CAPNV(TXTLS4),
 	CAPNV(TXTLS6),
 	CAPNV(VXLAN_HWCSUM),
 	CAPNV(VXLAN_HWTSO),
 	CAPNV(TXTLS_RTLMT),
 	{0, NULL}
 };
 #define CAP2NV(x) {.cap_bit = IFCAP2_##x, \
     .cap_name = __CONCAT(IFCAP2_, __CONCAT(x, _NAME)) }
 const struct ifcap_nv_bit_name ifcap2_nv_bit_names[] = {
 	CAP2NV(RXTLS4),
 	CAP2NV(RXTLS6),
 	{0, NULL}
 };
 #undef CAPNV
 #undef CAP2NV
 
 int
 if_capnv_to_capint(const nvlist_t *nv, int *old_cap,
     const struct ifcap_nv_bit_name *nn, bool all)
 {
 	int i, res;
 
 	res = 0;
 	for (i = 0; nn[i].cap_name != NULL; i++) {
 		if (nvlist_exists_bool(nv, nn[i].cap_name)) {
 			if (all || nvlist_get_bool(nv, nn[i].cap_name))
 				res |= nn[i].cap_bit;
 		} else {
 			res |= *old_cap & nn[i].cap_bit;
 		}
 	}
 	return (res);
 }
 
 void
 if_capint_to_capnv(nvlist_t *nv, const struct ifcap_nv_bit_name *nn,
     int ifr_cap, int ifr_req)
 {
 	int i;
 
 	for (i = 0; nn[i].cap_name != NULL; i++) {
 		if ((nn[i].cap_bit & ifr_cap) != 0) {
 			nvlist_add_bool(nv, nn[i].cap_name,
 			    (nn[i].cap_bit & ifr_req) != 0);
 		}
 	}
 }
 
 /*
  * Hardware specific interface ioctls.
  */
 int
 ifhwioctl(u_long cmd, struct ifnet *ifp, caddr_t data, struct thread *td)
 {
 	struct ifreq *ifr;
 	int error = 0, do_ifup = 0;
 	int new_flags, temp_flags;
 	size_t namelen, onamelen;
 	size_t descrlen, nvbuflen;
 	char *descrbuf, *odescrbuf;
 	char new_name[IFNAMSIZ];
 	char old_name[IFNAMSIZ], strbuf[IFNAMSIZ + 8];
 	struct ifaddr *ifa;
 	struct sockaddr_dl *sdl;
 	void *buf;
 	nvlist_t *nvcap;
 	struct siocsifcapnv_driver_data drv_ioctl_data;
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 	case SIOCGIFINDEX:
 		ifr->ifr_index = ifp->if_index;
 		break;
 
 	case SIOCGIFFLAGS:
 		temp_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifr->ifr_flags = temp_flags & 0xffff;
 		ifr->ifr_flagshigh = temp_flags >> 16;
 		break;
 
 	case SIOCGIFCAP:
 		ifr->ifr_reqcap = ifp->if_capabilities;
 		ifr->ifr_curcap = ifp->if_capenable;
 		break;
 
 	case SIOCGIFCAPNV:
 		if ((ifp->if_capabilities & IFCAP_NV) == 0) {
 			error = EINVAL;
 			break;
 		}
 		buf = NULL;
 		nvcap = nvlist_create(0);
 		for (;;) {
 			if_capint_to_capnv(nvcap, ifcap_nv_bit_names,
 			    ifp->if_capabilities, ifp->if_capenable);
 			if_capint_to_capnv(nvcap, ifcap2_nv_bit_names,
 			    ifp->if_capabilities2, ifp->if_capenable2);
 			error = (*ifp->if_ioctl)(ifp, SIOCGIFCAPNV,
 			    __DECONST(caddr_t, nvcap));
 			if (error != 0) {
 				if_printf(ifp,
 			    "SIOCGIFCAPNV driver mistake: nvlist error %d\n",
 				    error);
 				break;
 			}
 			buf = nvlist_pack(nvcap, &nvbuflen);
 			if (buf == NULL) {
 				error = nvlist_error(nvcap);
 				if (error == 0)
 					error = EDOOFUS;
 				break;
 			}
 			if (nvbuflen > ifr->ifr_cap_nv.buf_length) {
 				ifr->ifr_cap_nv.length = nvbuflen;
 				ifr->ifr_cap_nv.buffer = NULL;
 				error = EFBIG;
 				break;
 			}
 			ifr->ifr_cap_nv.length = nvbuflen;
 			error = copyout(buf, ifr->ifr_cap_nv.buffer, nvbuflen);
 			break;
 		}
 		free(buf, M_NVLIST);
 		nvlist_destroy(nvcap);
 		break;
 
 	case SIOCGIFDATA:
 	{
 		struct if_data ifd;
 
 		/* Ensure uninitialised padding is not leaked. */
 		memset(&ifd, 0, sizeof(ifd));
 
 		if_data_copy(ifp, &ifd);
 		error = copyout(&ifd, ifr_data_get_ptr(ifr), sizeof(ifd));
 		break;
 	}
 
 #ifdef MAC
 	case SIOCGIFMAC:
 		error = mac_ifnet_ioctl_get(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCGIFMETRIC:
 		ifr->ifr_metric = ifp->if_metric;
 		break;
 
 	case SIOCGIFMTU:
 		ifr->ifr_mtu = ifp->if_mtu;
 		break;
 
 	case SIOCGIFPHYS:
 		/* XXXGL: did this ever worked? */
 		ifr->ifr_phys = 0;
 		break;
 
 	case SIOCGIFDESCR:
 		error = 0;
 		sx_slock(&ifdescr_sx);
 		if (ifp->if_description == NULL)
 			error = ENOMSG;
 		else {
 			/* space for terminating nul */
 			descrlen = strlen(ifp->if_description) + 1;
 			if (ifr_buffer_get_length(ifr) < descrlen)
 				ifr_buffer_set_buffer_null(ifr);
 			else
 				error = copyout(ifp->if_description,
 				    ifr_buffer_get_buffer(ifr), descrlen);
 			ifr_buffer_set_length(ifr, descrlen);
 		}
 		sx_sunlock(&ifdescr_sx);
 		break;
 
 	case SIOCSIFDESCR:
 		error = priv_check(td, PRIV_NET_SETIFDESCR);
 		if (error)
 			return (error);
 
 		/*
 		 * Copy only (length-1) bytes to make sure that
 		 * if_description is always nul terminated.  The
 		 * length parameter is supposed to count the
 		 * terminating nul in.
 		 */
 		if (ifr_buffer_get_length(ifr) > ifdescr_maxlen)
 			return (ENAMETOOLONG);
 		else if (ifr_buffer_get_length(ifr) == 0)
 			descrbuf = NULL;
 		else {
 			descrbuf = malloc(ifr_buffer_get_length(ifr),
 			    M_IFDESCR, M_WAITOK | M_ZERO);
 			error = copyin(ifr_buffer_get_buffer(ifr), descrbuf,
 			    ifr_buffer_get_length(ifr) - 1);
 			if (error) {
 				free(descrbuf, M_IFDESCR);
 				break;
 			}
 		}
 
 		sx_xlock(&ifdescr_sx);
 		odescrbuf = ifp->if_description;
 		ifp->if_description = descrbuf;
 		sx_xunlock(&ifdescr_sx);
 
 		getmicrotime(&ifp->if_lastchange);
 		free(odescrbuf, M_IFDESCR);
 		break;
 
 	case SIOCGIFFIB:
 		ifr->ifr_fib = ifp->if_fib;
 		break;
 
 	case SIOCSIFFIB:
 		error = priv_check(td, PRIV_NET_SETIFFIB);
 		if (error)
 			return (error);
 		if (ifr->ifr_fib >= rt_numfibs)
 			return (EINVAL);
 
 		ifp->if_fib = ifr->ifr_fib;
 		break;
 
 	case SIOCSIFFLAGS:
 		error = priv_check(td, PRIV_NET_SETIFFLAGS);
 		if (error)
 			return (error);
 		/*
 		 * Currently, no driver owned flags pass the IFF_CANTCHANGE
 		 * check, so we don't need special handling here yet.
 		 */
 		new_flags = (ifr->ifr_flags & 0xffff) |
 		    (ifr->ifr_flagshigh << 16);
 		if (ifp->if_flags & IFF_UP &&
 		    (new_flags & IFF_UP) == 0) {
 			if_down(ifp);
 		} else if (new_flags & IFF_UP &&
 		    (ifp->if_flags & IFF_UP) == 0) {
 			do_ifup = 1;
 		}
 		/* See if permanently promiscuous mode bit is about to flip */
 		if ((ifp->if_flags ^ new_flags) & IFF_PPROMISC) {
 			if (new_flags & IFF_PPROMISC)
 				ifp->if_flags |= IFF_PROMISC;
 			else if (ifp->if_pcount == 0)
 				ifp->if_flags &= ~IFF_PROMISC;
 			if (log_promisc_mode_change)
                                 if_printf(ifp, "permanently promiscuous mode %s\n",
                                     ((new_flags & IFF_PPROMISC) ?
                                      "enabled" : "disabled"));
 		}
 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
 			(new_flags &~ IFF_CANTCHANGE);
 		if (ifp->if_ioctl) {
 			(void) (*ifp->if_ioctl)(ifp, cmd, data);
 		}
 		if (do_ifup)
 			if_up(ifp);
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAP:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error != 0)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if (ifr->ifr_reqcap & ~ifp->if_capabilities)
 			return (EINVAL);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFCAPNV:
 		error = priv_check(td, PRIV_NET_SETIFCAP);
 		if (error != 0)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		if ((ifp->if_capabilities & IFCAP_NV) == 0)
 			return (EINVAL);
 		if (ifr->ifr_cap_nv.length > IFR_CAP_NV_MAXBUFSIZE)
 			return (EINVAL);
 		nvcap = NULL;
 		buf = malloc(ifr->ifr_cap_nv.length, M_TEMP, M_WAITOK);
 		for (;;) {
 			error = copyin(ifr->ifr_cap_nv.buffer, buf,
 			    ifr->ifr_cap_nv.length);
 			if (error != 0)
 				break;
 			nvcap = nvlist_unpack(buf, ifr->ifr_cap_nv.length, 0);
 			if (nvcap == NULL) {
 				error = EINVAL;
 				break;
 			}
 			drv_ioctl_data.reqcap = if_capnv_to_capint(nvcap,
 			    &ifp->if_capenable, ifcap_nv_bit_names, false);
 			if ((drv_ioctl_data.reqcap &
 			    ~ifp->if_capabilities) != 0) {
 				error = EINVAL;
 				break;
 			}
 			drv_ioctl_data.reqcap2 = if_capnv_to_capint(nvcap,
 			    &ifp->if_capenable2, ifcap2_nv_bit_names, false);
 			if ((drv_ioctl_data.reqcap2 &
 			    ~ifp->if_capabilities2) != 0) {
 				error = EINVAL;
 				break;
 			}
 			drv_ioctl_data.nvcap = nvcap;
 			error = (*ifp->if_ioctl)(ifp, SIOCSIFCAPNV,
 			    (caddr_t)&drv_ioctl_data);
 			break;
 		}
 		nvlist_destroy(nvcap);
 		free(buf, M_TEMP);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 #ifdef MAC
 	case SIOCSIFMAC:
 		error = mac_ifnet_ioctl_set(td->td_ucred, ifr, ifp);
 		break;
 #endif
 
 	case SIOCSIFNAME:
 		error = priv_check(td, PRIV_NET_SETIFNAME);
 		if (error)
 			return (error);
 		error = copyinstr(ifr_data_get_ptr(ifr), new_name, IFNAMSIZ,
 		    NULL);
 		if (error != 0)
 			return (error);
 		if (new_name[0] == '\0')
 			return (EINVAL);
 		if (strcmp(new_name, ifp->if_xname) == 0)
 			break;
 		if (ifunit(new_name) != NULL)
 			return (EEXIST);
 
 		/*
 		 * XXX: Locking.  Nothing else seems to lock if_flags,
 		 * and there are numerous other races with the
 		 * ifunit() checks not being atomic with namespace
 		 * changes (renames, vmoves, if_attach, etc).
 		 */
 		ifp->if_flags |= IFF_RENAMING;
 		
 		EVENTHANDLER_INVOKE(ifnet_departure_event, ifp);
 
 		if_printf(ifp, "changing name to '%s'\n", new_name);
 
 		IF_ADDR_WLOCK(ifp);
 		strlcpy(old_name, ifp->if_xname, sizeof(old_name));
 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
 		ifa = ifp->if_addr;
 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 		namelen = strlen(new_name);
 		onamelen = sdl->sdl_nlen;
 		/*
 		 * Move the address if needed.  This is safe because we
 		 * allocate space for a name of length IFNAMSIZ when we
 		 * create this in if_attach().
 		 */
 		if (namelen != onamelen) {
 			bcopy(sdl->sdl_data + onamelen,
 			    sdl->sdl_data + namelen, sdl->sdl_alen);
 		}
 		bcopy(new_name, sdl->sdl_data, namelen);
 		sdl->sdl_nlen = namelen;
 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
 		bzero(sdl->sdl_data, onamelen);
 		while (namelen != 0)
 			sdl->sdl_data[--namelen] = 0xff;
 		IF_ADDR_WUNLOCK(ifp);
 
 		EVENTHANDLER_INVOKE(ifnet_arrival_event, ifp);
 
 		ifp->if_flags &= ~IFF_RENAMING;
 
 		snprintf(strbuf, sizeof(strbuf), "name=%s", new_name);
 		devctl_notify("IFNET", old_name, "RENAME", strbuf);
 		break;
 
 #ifdef VIMAGE
 	case SIOCSIFVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error)
 			return (error);
 		error = if_vmove_loan(td, ifp, ifr->ifr_name, ifr->ifr_jid);
 		break;
 #endif
 
 	case SIOCSIFMETRIC:
 		error = priv_check(td, PRIV_NET_SETIFMETRIC);
 		if (error)
 			return (error);
 		ifp->if_metric = ifr->ifr_metric;
 		getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYS:
 		error = priv_check(td, PRIV_NET_SETIFPHYS);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFMTU:
 	{
 		u_long oldmtu = ifp->if_mtu;
 
 		error = priv_check(td, PRIV_NET_SETIFMTU);
 		if (error)
 			return (error);
 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU)
 			return (EINVAL);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		/* Disallow MTU changes on bridge member interfaces. */
 		if (ifp->if_bridge)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0) {
 			getmicrotime(&ifp->if_lastchange);
 			rt_ifmsg(ifp);
 #ifdef INET
 			DEBUGNET_NOTIFY_MTU(ifp);
 #endif
 		}
 		/*
 		 * If the link MTU changed, do network layer specific procedure.
 		 */
 		if (ifp->if_mtu != oldmtu) {
 #ifdef INET6
 			nd6_setmtu(ifp);
 #endif
 			rt_updatemtu(ifp);
 		}
 		break;
 	}
 
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		if (cmd == SIOCADDMULTI)
 			error = priv_check(td, PRIV_NET_ADDMULTI);
 		else
 			error = priv_check(td, PRIV_NET_DELMULTI);
 		if (error)
 			return (error);
 
 		/* Don't allow group membership on non-multicast interfaces. */
 		if ((ifp->if_flags & IFF_MULTICAST) == 0)
 			return (EOPNOTSUPP);
 
 		/* Don't let users screw up protocols' entries. */
 		if (ifr->ifr_addr.sa_family != AF_LINK)
 			return (EINVAL);
 
 		if (cmd == SIOCADDMULTI) {
 			struct epoch_tracker et;
 			struct ifmultiaddr *ifma;
 
 			/*
 			 * Userland is only permitted to join groups once
 			 * via the if_addmulti() KPI, because it cannot hold
 			 * struct ifmultiaddr * between calls. It may also
 			 * lose a race while we check if the membership
 			 * already exists.
 			 */
 			NET_EPOCH_ENTER(et);
 			ifma = if_findmulti(ifp, &ifr->ifr_addr);
 			NET_EPOCH_EXIT(et);
 			if (ifma != NULL)
 				error = EADDRINUSE;
 			else
 				error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
 		} else {
 			error = if_delmulti(ifp, &ifr->ifr_addr);
 		}
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCSIFPHYADDR:
 	case SIOCDIFPHYADDR:
 #ifdef INET6
 	case SIOCSIFPHYADDR_IN6:
 #endif
 	case SIOCSIFMEDIA:
 	case SIOCSIFGENERIC:
 		error = priv_check(td, PRIV_NET_HWIOCTL);
 		if (error)
 			return (error);
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		if (error == 0)
 			getmicrotime(&ifp->if_lastchange);
 		break;
 
 	case SIOCGIFSTATUS:
 	case SIOCGIFPSRCADDR:
 	case SIOCGIFPDSTADDR:
 	case SIOCGIFMEDIA:
 	case SIOCGIFXMEDIA:
 	case SIOCGIFGENERIC:
 	case SIOCGIFRSSKEY:
 	case SIOCGIFRSSHASH:
 	case SIOCGIFDOWNREASON:
 		if (ifp->if_ioctl == NULL)
 			return (EOPNOTSUPP);
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 		break;
 
 	case SIOCSIFLLADDR:
 		error = priv_check(td, PRIV_NET_SETLLADDR);
 		if (error)
 			return (error);
 		error = if_setlladdr(ifp,
 		    ifr->ifr_addr.sa_data, ifr->ifr_addr.sa_len);
 		break;
 
 	case SIOCGHWADDR:
 		error = if_gethwaddr(ifp, ifr);
 		break;
 
 	case SIOCAIFGROUP:
 		error = priv_check(td, PRIV_NET_ADDIFGROUP);
 		if (error)
 			return (error);
 		error = if_addgroup(ifp,
 		    ((struct ifgroupreq *)data)->ifgr_group);
 		if (error != 0)
 			return (error);
 		break;
 
 	case SIOCGIFGROUP:
 	{
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		error = if_getgroup((struct ifgroupreq *)data, ifp);
 		NET_EPOCH_EXIT(et);
 		break;
 	}
 
 	case SIOCDIFGROUP:
 		error = priv_check(td, PRIV_NET_DELIFGROUP);
 		if (error)
 			return (error);
 		error = if_delgroup(ifp,
 		    ((struct ifgroupreq *)data)->ifgr_group);
 		if (error != 0)
 			return (error);
 		break;
 
 	default:
 		error = ENOIOCTL;
 		break;
 	}
 	return (error);
 }
 
 /*
  * Interface ioctls.
  */
 int
 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct thread *td)
 {
 #ifdef COMPAT_FREEBSD32
 	union {
 		struct ifconf ifc;
 		struct ifdrv ifd;
 		struct ifgroupreq ifgr;
 		struct ifmediareq ifmr;
 	} thunk;
 	u_long saved_cmd;
 	struct ifconf32 *ifc32;
 	struct ifdrv32 *ifd32;
 	struct ifgroupreq32 *ifgr32;
 	struct ifmediareq32 *ifmr32;
 #endif
 	struct ifnet *ifp;
 	struct ifreq *ifr;
 	int error;
 	int oif_flags;
 #ifdef VIMAGE
 	bool shutdown;
 #endif
 
 	CURVNET_SET(so->so_vnet);
 #ifdef VIMAGE
 	/* Make sure the VNET is stable. */
 	shutdown = VNET_IS_SHUTTING_DOWN(so->so_vnet);
 	if (shutdown) {
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 #endif
 
 #ifdef COMPAT_FREEBSD32
 	saved_cmd = cmd;
 	switch (cmd) {
 	case SIOCGIFCONF32:
 		ifc32 = (struct ifconf32 *)data;
 		thunk.ifc.ifc_len = ifc32->ifc_len;
 		thunk.ifc.ifc_buf = PTRIN(ifc32->ifc_buf);
 		data = (caddr_t)&thunk.ifc;
 		cmd = SIOCGIFCONF;
 		break;
 	case SIOCGDRVSPEC32:
 	case SIOCSDRVSPEC32:
 		ifd32 = (struct ifdrv32 *)data;
 		memcpy(thunk.ifd.ifd_name, ifd32->ifd_name,
 		    sizeof(thunk.ifd.ifd_name));
 		thunk.ifd.ifd_cmd = ifd32->ifd_cmd;
 		thunk.ifd.ifd_len = ifd32->ifd_len;
 		thunk.ifd.ifd_data = PTRIN(ifd32->ifd_data);
 		data = (caddr_t)&thunk.ifd;
 		cmd = _IOC_NEWTYPE(cmd, struct ifdrv);
 		break;
 	case SIOCAIFGROUP32:
 	case SIOCGIFGROUP32:
 	case SIOCDIFGROUP32:
 	case SIOCGIFGMEMB32:
 		ifgr32 = (struct ifgroupreq32 *)data;
 		memcpy(thunk.ifgr.ifgr_name, ifgr32->ifgr_name,
 		    sizeof(thunk.ifgr.ifgr_name));
 		thunk.ifgr.ifgr_len = ifgr32->ifgr_len;
 		switch (cmd) {
 		case SIOCAIFGROUP32:
 		case SIOCDIFGROUP32:
 			memcpy(thunk.ifgr.ifgr_group, ifgr32->ifgr_group,
 			    sizeof(thunk.ifgr.ifgr_group));
 			break;
 		case SIOCGIFGROUP32:
 		case SIOCGIFGMEMB32:
 			thunk.ifgr.ifgr_groups = PTRIN(ifgr32->ifgr_groups);
 			break;
 		}
 		data = (caddr_t)&thunk.ifgr;
 		cmd = _IOC_NEWTYPE(cmd, struct ifgroupreq);
 		break;
 	case SIOCGIFMEDIA32:
 	case SIOCGIFXMEDIA32:
 		ifmr32 = (struct ifmediareq32 *)data;
 		memcpy(thunk.ifmr.ifm_name, ifmr32->ifm_name,
 		    sizeof(thunk.ifmr.ifm_name));
 		thunk.ifmr.ifm_current = ifmr32->ifm_current;
 		thunk.ifmr.ifm_mask = ifmr32->ifm_mask;
 		thunk.ifmr.ifm_status = ifmr32->ifm_status;
 		thunk.ifmr.ifm_active = ifmr32->ifm_active;
 		thunk.ifmr.ifm_count = ifmr32->ifm_count;
 		thunk.ifmr.ifm_ulist = PTRIN(ifmr32->ifm_ulist);
 		data = (caddr_t)&thunk.ifmr;
 		cmd = _IOC_NEWTYPE(cmd, struct ifmediareq);
 		break;
 	}
 #endif
 
 	switch (cmd) {
 	case SIOCGIFCONF:
 		error = ifconf(cmd, data);
 		goto out_noref;
 	}
 
 	ifr = (struct ifreq *)data;
 	switch (cmd) {
 #ifdef VIMAGE
 	case SIOCSIFRVNET:
 		error = priv_check(td, PRIV_NET_SETIFVNET);
 		if (error == 0)
 			error = if_vmove_reclaim(td, ifr->ifr_name,
 			    ifr->ifr_jid);
 		goto out_noref;
 #endif
 	case SIOCIFCREATE:
 	case SIOCIFCREATE2:
 		error = priv_check(td, PRIV_NET_IFCREATE);
 		if (error == 0)
 			error = if_clone_create(ifr->ifr_name,
 			    sizeof(ifr->ifr_name), cmd == SIOCIFCREATE2 ?
 			    ifr_data_get_ptr(ifr) : NULL);
 		goto out_noref;
 	case SIOCIFDESTROY:
 		error = priv_check(td, PRIV_NET_IFDESTROY);
 
 		if (error == 0) {
 			sx_xlock(&ifnet_detach_sxlock);
 			error = if_clone_destroy(ifr->ifr_name);
 			sx_xunlock(&ifnet_detach_sxlock);
 		}
 		goto out_noref;
 
 	case SIOCIFGCLONERS:
 		error = if_clone_list((struct if_clonereq *)data);
 		goto out_noref;
 
 	case SIOCGIFGMEMB:
 		error = if_getgroupmembers((struct ifgroupreq *)data);
 		goto out_noref;
 
 #if defined(INET) || defined(INET6)
 	case SIOCSVH:
 	case SIOCGVH:
 		if (carp_ioctl_p == NULL)
 			error = EPROTONOSUPPORT;
 		else
 			error = (*carp_ioctl_p)(ifr, cmd, td);
 		goto out_noref;
 #endif
 	}
 
 	ifp = ifunit_ref(ifr->ifr_name);
 	if (ifp == NULL) {
 		error = ENXIO;
 		goto out_noref;
 	}
 
 	error = ifhwioctl(cmd, ifp, data, td);
 	if (error != ENOIOCTL)
 		goto out_ref;
 
 	oif_flags = ifp->if_flags;
 	if (so->so_proto == NULL) {
 		error = EOPNOTSUPP;
 		goto out_ref;
 	}
 
 	/*
 	 * Pass the request on to the socket control method, and if the
 	 * latter returns EOPNOTSUPP, directly to the interface.
 	 *
 	 * Make an exception for the legacy SIOCSIF* requests.  Drivers
 	 * trust SIOCSIFADDR et al to come from an already privileged
 	 * layer, and do not perform any credentials checks or input
 	 * validation.
 	 */
-	error = ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data,
-	    ifp, td));
+	error = so->so_proto->pr_control(so, cmd, data, ifp, td);
 	if (error == EOPNOTSUPP && ifp != NULL && ifp->if_ioctl != NULL &&
 	    cmd != SIOCSIFADDR && cmd != SIOCSIFBRDADDR &&
 	    cmd != SIOCSIFDSTADDR && cmd != SIOCSIFNETMASK)
 		error = (*ifp->if_ioctl)(ifp, cmd, data);
 
 	if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
 #ifdef INET6
 		if (ifp->if_flags & IFF_UP)
 			in6_if_up(ifp);
 #endif
 	}
 
 out_ref:
 	if_rele(ifp);
 out_noref:
 	CURVNET_RESTORE();
 #ifdef COMPAT_FREEBSD32
 	if (error != 0)
 		return (error);
 	switch (saved_cmd) {
 	case SIOCGIFCONF32:
 		ifc32->ifc_len = thunk.ifc.ifc_len;
 		break;
 	case SIOCGDRVSPEC32:
 		/*
 		 * SIOCGDRVSPEC is IOWR, but nothing actually touches
 		 * the struct so just assert that ifd_len (the only
 		 * field it might make sense to update) hasn't
 		 * changed.
 		 */
 		KASSERT(thunk.ifd.ifd_len == ifd32->ifd_len,
 		    ("ifd_len was updated %u -> %zu", ifd32->ifd_len,
 			thunk.ifd.ifd_len));
 		break;
 	case SIOCGIFGROUP32:
 	case SIOCGIFGMEMB32:
 		ifgr32->ifgr_len = thunk.ifgr.ifgr_len;
 		break;
 	case SIOCGIFMEDIA32:
 	case SIOCGIFXMEDIA32:
 		ifmr32->ifm_current = thunk.ifmr.ifm_current;
 		ifmr32->ifm_mask = thunk.ifmr.ifm_mask;
 		ifmr32->ifm_status = thunk.ifmr.ifm_status;
 		ifmr32->ifm_active = thunk.ifmr.ifm_active;
 		ifmr32->ifm_count = thunk.ifmr.ifm_count;
 		break;
 	}
 #endif
 	return (error);
 }
 
 /*
  * The code common to handling reference counted flags,
  * e.g., in ifpromisc() and if_allmulti().
  * The "pflag" argument can specify a permanent mode flag to check,
  * such as IFF_PPROMISC for promiscuous mode; should be 0 if none.
  *
  * Only to be used on stack-owned flags, not driver-owned flags.
  */
 static int
 if_setflag(struct ifnet *ifp, int flag, int pflag, int *refcount, int onswitch)
 {
 	struct ifreq ifr;
 	int error;
 	int oldflags, oldcount;
 
 	/* Sanity checks to catch programming errors */
 	KASSERT((flag & (IFF_DRV_OACTIVE|IFF_DRV_RUNNING)) == 0,
 	    ("%s: setting driver-owned flag %d", __func__, flag));
 
 	if (onswitch)
 		KASSERT(*refcount >= 0,
 		    ("%s: increment negative refcount %d for flag %d",
 		    __func__, *refcount, flag));
 	else
 		KASSERT(*refcount > 0,
 		    ("%s: decrement non-positive refcount %d for flag %d",
 		    __func__, *refcount, flag));
 
 	/* In case this mode is permanent, just touch refcount */
 	if (ifp->if_flags & pflag) {
 		*refcount += onswitch ? 1 : -1;
 		return (0);
 	}
 
 	/* Save ifnet parameters for if_ioctl() may fail */
 	oldcount = *refcount;
 	oldflags = ifp->if_flags;
 
 	/*
 	 * See if we aren't the only and touching refcount is enough.
 	 * Actually toggle interface flag if we are the first or last.
 	 */
 	if (onswitch) {
 		if ((*refcount)++)
 			return (0);
 		ifp->if_flags |= flag;
 	} else {
 		if (--(*refcount))
 			return (0);
 		ifp->if_flags &= ~flag;
 	}
 
 	/* Call down the driver since we've changed interface flags */
 	if (ifp->if_ioctl == NULL) {
 		error = EOPNOTSUPP;
 		goto recover;
 	}
 	ifr.ifr_flags = ifp->if_flags & 0xffff;
 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
 	error = (*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 	if (error)
 		goto recover;
 	/* Notify userland that interface flags have changed */
 	rt_ifmsg(ifp);
 	return (0);
 
 recover:
 	/* Recover after driver error */
 	*refcount = oldcount;
 	ifp->if_flags = oldflags;
 	return (error);
 }
 
 /*
  * Set/clear promiscuous mode on interface ifp based on the truth value
  * of pswitch.  The calls are reference counted so that only the first
  * "on" request actually has an effect, as does the final "off" request.
  * Results are undefined if the "off" and "on" requests are not matched.
  */
 int
 ifpromisc(struct ifnet *ifp, int pswitch)
 {
 	int error;
 	int oldflags = ifp->if_flags;
 
 	error = if_setflag(ifp, IFF_PROMISC, IFF_PPROMISC,
 			   &ifp->if_pcount, pswitch);
 	/* If promiscuous mode status has changed, log a message */
 	if (error == 0 && ((ifp->if_flags ^ oldflags) & IFF_PROMISC) &&
             log_promisc_mode_change)
 		if_printf(ifp, "promiscuous mode %s\n",
 		    (ifp->if_flags & IFF_PROMISC) ? "enabled" : "disabled");
 	return (error);
 }
 
 /*
  * Return interface configuration
  * of system.  List may be used
  * in later ioctl's (above) to get
  * other information.
  */
 /*ARGSUSED*/
 static int
 ifconf(u_long cmd, caddr_t data)
 {
 	struct ifconf *ifc = (struct ifconf *)data;
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 	struct sbuf *sb;
 	int error, full = 0, valid_len, max_len;
 
 	/* Limit initial buffer size to maxphys to avoid DoS from userspace. */
 	max_len = maxphys - 1;
 
 	/* Prevent hostile input from being able to crash the system */
 	if (ifc->ifc_len <= 0)
 		return (EINVAL);
 
 again:
 	if (ifc->ifc_len <= max_len) {
 		max_len = ifc->ifc_len;
 		full = 1;
 	}
 	sb = sbuf_new(NULL, NULL, max_len + 1, SBUF_FIXEDLEN);
 	max_len = 0;
 	valid_len = 0;
 
 	IFNET_RLOCK();
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		struct epoch_tracker et;
 		int addrs;
 
 		/*
 		 * Zero the ifr to make sure we don't disclose the contents
 		 * of the stack.
 		 */
 		memset(&ifr, 0, sizeof(ifr));
 
 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
 		    >= sizeof(ifr.ifr_name)) {
 			sbuf_delete(sb);
 			IFNET_RUNLOCK();
 			return (ENAMETOOLONG);
 		}
 
 		addrs = 0;
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa = ifa->ifa_addr;
 
 			if (prison_if(curthread->td_ucred, sa) != 0)
 				continue;
 			addrs++;
 			if (sa->sa_len <= sizeof(*sa)) {
 				if (sa->sa_len < sizeof(*sa)) {
 					memset(&ifr.ifr_ifru.ifru_addr, 0,
 					    sizeof(ifr.ifr_ifru.ifru_addr));
 					memcpy(&ifr.ifr_ifru.ifru_addr, sa,
 					    sa->sa_len);
 				} else
 					ifr.ifr_ifru.ifru_addr = *sa;
 				sbuf_bcat(sb, &ifr, sizeof(ifr));
 				max_len += sizeof(ifr);
 			} else {
 				sbuf_bcat(sb, &ifr,
 				    offsetof(struct ifreq, ifr_addr));
 				max_len += offsetof(struct ifreq, ifr_addr);
 				sbuf_bcat(sb, sa, sa->sa_len);
 				max_len += sa->sa_len;
 			}
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 		NET_EPOCH_EXIT(et);
 		if (addrs == 0) {
 			sbuf_bcat(sb, &ifr, sizeof(ifr));
 			max_len += sizeof(ifr);
 
 			if (sbuf_error(sb) == 0)
 				valid_len = sbuf_len(sb);
 		}
 	}
 	IFNET_RUNLOCK();
 
 	/*
 	 * If we didn't allocate enough space (uncommon), try again.  If
 	 * we have already allocated as much space as we are allowed,
 	 * return what we've got.
 	 */
 	if (valid_len != max_len && !full) {
 		sbuf_delete(sb);
 		goto again;
 	}
 
 	ifc->ifc_len = valid_len;
 	sbuf_finish(sb);
 	error = copyout(sbuf_data(sb), ifc->ifc_req, ifc->ifc_len);
 	sbuf_delete(sb);
 	return (error);
 }
 
 /*
  * Just like ifpromisc(), but for all-multicast-reception mode.
  */
 int
 if_allmulti(struct ifnet *ifp, int onswitch)
 {
 
 	return (if_setflag(ifp, IFF_ALLMULTI, 0, &ifp->if_amcount, onswitch));
 }
 
 struct ifmultiaddr *
 if_findmulti(struct ifnet *ifp, const struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 
 	IF_ADDR_LOCK_ASSERT(ifp);
 
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (sa->sa_family == AF_LINK) {
 			if (sa_dl_equal(ifma->ifma_addr, sa))
 				break;
 		} else {
 			if (sa_equal(ifma->ifma_addr, sa))
 				break;
 		}
 	}
 
 	return ifma;
 }
 
 /*
  * Allocate a new ifmultiaddr and initialize based on passed arguments.  We
  * make copies of passed sockaddrs.  The ifmultiaddr will not be added to
  * the ifnet multicast address list here, so the caller must do that and
  * other setup work (such as notifying the device driver).  The reference
  * count is initialized to 1.
  */
 static struct ifmultiaddr *
 if_allocmulti(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr *llsa,
     int mflags)
 {
 	struct ifmultiaddr *ifma;
 	struct sockaddr *dupsa;
 
 	ifma = malloc(sizeof *ifma, M_IFMADDR, mflags |
 	    M_ZERO);
 	if (ifma == NULL)
 		return (NULL);
 
 	dupsa = malloc(sa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(sa, dupsa, sa->sa_len);
 	ifma->ifma_addr = dupsa;
 
 	ifma->ifma_ifp = ifp;
 	ifma->ifma_refcount = 1;
 	ifma->ifma_protospec = NULL;
 
 	if (llsa == NULL) {
 		ifma->ifma_lladdr = NULL;
 		return (ifma);
 	}
 
 	dupsa = malloc(llsa->sa_len, M_IFMADDR, mflags);
 	if (dupsa == NULL) {
 		free(ifma->ifma_addr, M_IFMADDR);
 		free(ifma, M_IFMADDR);
 		return (NULL);
 	}
 	bcopy(llsa, dupsa, llsa->sa_len);
 	ifma->ifma_lladdr = dupsa;
 
 	return (ifma);
 }
 
 /*
  * if_freemulti: free ifmultiaddr structure and possibly attached related
  * addresses.  The caller is responsible for implementing reference
  * counting, notifying the driver, handling routing messages, and releasing
  * any dependent link layer state.
  */
 #ifdef MCAST_VERBOSE
 extern void kdb_backtrace(void);
 #endif
 static void
 if_freemulti_internal(struct ifmultiaddr *ifma)
 {
 
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti: refcount %d",
 	    ifma->ifma_refcount));
 
 	if (ifma->ifma_lladdr != NULL)
 		free(ifma->ifma_lladdr, M_IFMADDR);
 #ifdef MCAST_VERBOSE
 	kdb_backtrace();
 	printf("%s freeing ifma: %p\n", __func__, ifma);
 #endif
 	free(ifma->ifma_addr, M_IFMADDR);
 	free(ifma, M_IFMADDR);
 }
 
 static void
 if_destroymulti(epoch_context_t ctx)
 {
 	struct ifmultiaddr *ifma;
 
 	ifma = __containerof(ctx, struct ifmultiaddr, ifma_epoch_ctx);
 	if_freemulti_internal(ifma);
 }
 
 void
 if_freemulti(struct ifmultiaddr *ifma)
 {
 	KASSERT(ifma->ifma_refcount == 0, ("if_freemulti_epoch: refcount %d",
 	    ifma->ifma_refcount));
 
 	NET_EPOCH_CALL(if_destroymulti, &ifma->ifma_epoch_ctx);
 }
 
 /*
  * Register an additional multicast address with a network interface.
  *
  * - If the address is already present, bump the reference count on the
  *   address and return.
  * - If the address is not link-layer, look up a link layer address.
  * - Allocate address structures for one or both addresses, and attach to the
  *   multicast address list on the interface.  If automatically adding a link
  *   layer address, the protocol address will own a reference to the link
  *   layer address, to be freed when it is freed.
  * - Notify the network device driver of an addition to the multicast address
  *   list.
  *
  * 'sa' points to caller-owned memory with the desired multicast address.
  *
  * 'retifma' will be used to return a pointer to the resulting multicast
  * address reference, if desired.
  */
 int
 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
     struct ifmultiaddr **retifma)
 {
 	struct ifmultiaddr *ifma, *ll_ifma;
 	struct sockaddr *llsa;
 	struct sockaddr_dl sdl;
 	int error;
 
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 #ifdef INET6
 	IN6_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	/*
 	 * If the address is already present, return a new reference to it;
 	 * otherwise, allocate storage and set up a new address.
 	 */
 	IF_ADDR_WLOCK(ifp);
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL) {
 		ifma->ifma_refcount++;
 		if (retifma != NULL)
 			*retifma = ifma;
 		IF_ADDR_WUNLOCK(ifp);
 		return (0);
 	}
 
 	/*
 	 * The address isn't already present; resolve the protocol address
 	 * into a link layer address, and then look that up, bump its
 	 * refcount or allocate an ifma for that also.
 	 * Most link layer resolving functions returns address data which
 	 * fits inside default sockaddr_dl structure. However callback
 	 * can allocate another sockaddr structure, in that case we need to
 	 * free it later.
 	 */
 	llsa = NULL;
 	ll_ifma = NULL;
 	if (ifp->if_resolvemulti != NULL) {
 		/* Provide called function with buffer size information */
 		sdl.sdl_len = sizeof(sdl);
 		llsa = (struct sockaddr *)&sdl;
 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
 		if (error)
 			goto unlock_out;
 	}
 
 	/*
 	 * Allocate the new address.  Don't hook it up yet, as we may also
 	 * need to allocate a link layer multicast address.
 	 */
 	ifma = if_allocmulti(ifp, sa, llsa, M_NOWAIT);
 	if (ifma == NULL) {
 		error = ENOMEM;
 		goto free_llsa_out;
 	}
 
 	/*
 	 * If a link layer address is found, we'll need to see if it's
 	 * already present in the address list, or allocate is as well.
 	 * When this block finishes, the link layer address will be on the
 	 * list.
 	 */
 	if (llsa != NULL) {
 		ll_ifma = if_findmulti(ifp, llsa);
 		if (ll_ifma == NULL) {
 			ll_ifma = if_allocmulti(ifp, llsa, NULL, M_NOWAIT);
 			if (ll_ifma == NULL) {
 				--ifma->ifma_refcount;
 				if_freemulti(ifma);
 				error = ENOMEM;
 				goto free_llsa_out;
 			}
 			ll_ifma->ifma_flags |= IFMA_F_ENQUEUED;
 			CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ll_ifma,
 			    ifma_link);
 		} else
 			ll_ifma->ifma_refcount++;
 		ifma->ifma_llifma = ll_ifma;
 	}
 
 	/*
 	 * We now have a new multicast address, ifma, and possibly a new or
 	 * referenced link layer address.  Add the primary address to the
 	 * ifnet address list.
 	 */
 	ifma->ifma_flags |= IFMA_F_ENQUEUED;
 	CK_STAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
 
 	if (retifma != NULL)
 		*retifma = ifma;
 
 	/*
 	 * Must generate the message while holding the lock so that 'ifma'
 	 * pointer is still valid.
 	 */
 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
 	IF_ADDR_WUNLOCK(ifp);
 
 	/*
 	 * We are certain we have added something, so call down to the
 	 * interface to let them know about it.
 	 */
 	if (ifp->if_ioctl != NULL) {
 		if (THREAD_CAN_SLEEP())
 			(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 		else
 			taskqueue_enqueue(taskqueue_swi, &ifp->if_addmultitask);
 	}
 
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 	return (0);
 
 free_llsa_out:
 	if ((llsa != NULL) && (llsa != (struct sockaddr *)&sdl))
 		link_free_sdl(llsa);
 
 unlock_out:
 	IF_ADDR_WUNLOCK(ifp);
 	return (error);
 }
 
 static void
 if_siocaddmulti(void *arg, int pending)
 {
 	struct ifnet *ifp;
 
 	ifp = arg;
 #ifdef DIAGNOSTIC
 	if (pending > 1)
 		if_printf(ifp, "%d SIOCADDMULTI coalesced\n", pending);
 #endif
 	CURVNET_SET(ifp->if_vnet);
 	(void )(*ifp->if_ioctl)(ifp, SIOCADDMULTI, 0);
 	CURVNET_RESTORE();
 }
 
 /*
  * Delete a multicast group membership by network-layer group address.
  *
  * Returns ENOENT if the entry could not be found. If ifp no longer
  * exists, results are undefined. This entry point should only be used
  * from subsystems which do appropriate locking to hold ifp for the
  * duration of the call.
  * Network-layer protocol domains must use if_delmulti_ifma().
  */
 int
 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
 {
 	struct ifmultiaddr *ifma;
 	int lastref;
 
 	KASSERT(ifp, ("%s: NULL ifp", __func__));
 
 	IF_ADDR_WLOCK(ifp);
 	lastref = 0;
 	ifma = if_findmulti(ifp, sa);
 	if (ifma != NULL)
 		lastref = if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 
 	if (ifma == NULL)
 		return (ENOENT);
 
 	if (lastref && ifp->if_ioctl != NULL) {
 		(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Delete all multicast group membership for an interface.
  * Should be used to quickly flush all multicast filters.
  */
 void
 if_delallmulti(struct ifnet *ifp)
 {
 	struct ifmultiaddr *ifma;
 	struct ifmultiaddr *next;
 
 	IF_ADDR_WLOCK(ifp);
 	CK_STAILQ_FOREACH_SAFE(ifma, &ifp->if_multiaddrs, ifma_link, next)
 		if_delmulti_locked(ifp, ifma, 0);
 	IF_ADDR_WUNLOCK(ifp);
 }
 
 void
 if_delmulti_ifma(struct ifmultiaddr *ifma)
 {
 	if_delmulti_ifma_flags(ifma, 0);
 }
 
 /*
  * Delete a multicast group membership by group membership pointer.
  * Network-layer protocol domains must use this routine.
  *
  * It is safe to call this routine if the ifp disappeared.
  */
 void
 if_delmulti_ifma_flags(struct ifmultiaddr *ifma, int flags)
 {
 	struct ifnet *ifp;
 	int lastref;
 	MCDPRINTF("%s freeing ifma: %p\n", __func__, ifma);
 #ifdef INET
 	IN_MULTI_LIST_UNLOCK_ASSERT();
 #endif
 	ifp = ifma->ifma_ifp;
 #ifdef DIAGNOSTIC
 	if (ifp == NULL) {
 		printf("%s: ifma_ifp seems to be detached\n", __func__);
 	} else {
 		struct epoch_tracker et;
 		struct ifnet *oifp;
 
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(oifp, &V_ifnet, if_link)
 			if (ifp == oifp)
 				break;
 		NET_EPOCH_EXIT(et);
 		if (ifp != oifp)
 			ifp = NULL;
 	}
 #endif
 	/*
 	 * If and only if the ifnet instance exists: Acquire the address lock.
 	 */
 	if (ifp != NULL)
 		IF_ADDR_WLOCK(ifp);
 
 	lastref = if_delmulti_locked(ifp, ifma, flags);
 
 	if (ifp != NULL) {
 		/*
 		 * If and only if the ifnet instance exists:
 		 *  Release the address lock.
 		 *  If the group was left: update the hardware hash filter.
 		 */
 		IF_ADDR_WUNLOCK(ifp);
 		if (lastref && ifp->if_ioctl != NULL) {
 			(void)(*ifp->if_ioctl)(ifp, SIOCDELMULTI, 0);
 		}
 	}
 }
 
 /*
  * Perform deletion of network-layer and/or link-layer multicast address.
  *
  * Return 0 if the reference count was decremented.
  * Return 1 if the final reference was released, indicating that the
  * hardware hash filter should be reprogrammed.
  */
 static int
 if_delmulti_locked(struct ifnet *ifp, struct ifmultiaddr *ifma, int detaching)
 {
 	struct ifmultiaddr *ll_ifma;
 
 	if (ifp != NULL && ifma->ifma_ifp != NULL) {
 		KASSERT(ifma->ifma_ifp == ifp,
 		    ("%s: inconsistent ifp %p", __func__, ifp));
 		IF_ADDR_WLOCK_ASSERT(ifp);
 	}
 
 	ifp = ifma->ifma_ifp;
 	MCDPRINTF("%s freeing %p from %s \n", __func__, ifma, ifp ? ifp->if_xname : "");
 
 	/*
 	 * If the ifnet is detaching, null out references to ifnet,
 	 * so that upper protocol layers will notice, and not attempt
 	 * to obtain locks for an ifnet which no longer exists. The
 	 * routing socket announcement must happen before the ifnet
 	 * instance is detached from the system.
 	 */
 	if (detaching) {
 #ifdef DIAGNOSTIC
 		printf("%s: detaching ifnet instance %p\n", __func__, ifp);
 #endif
 		/*
 		 * ifp may already be nulled out if we are being reentered
 		 * to delete the ll_ifma.
 		 */
 		if (ifp != NULL) {
 			rt_newmaddrmsg(RTM_DELMADDR, ifma);
 			ifma->ifma_ifp = NULL;
 		}
 	}
 
 	if (--ifma->ifma_refcount > 0)
 		return 0;
 
 	if (ifp != NULL && detaching == 0 && (ifma->ifma_flags & IFMA_F_ENQUEUED)) {
 		CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifmultiaddr, ifma_link);
 		ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 	}
 	/*
 	 * If this ifma is a network-layer ifma, a link-layer ifma may
 	 * have been associated with it. Release it first if so.
 	 */
 	ll_ifma = ifma->ifma_llifma;
 	if (ll_ifma != NULL) {
 		KASSERT(ifma->ifma_lladdr != NULL,
 		    ("%s: llifma w/o lladdr", __func__));
 		if (detaching)
 			ll_ifma->ifma_ifp = NULL;	/* XXX */
 		if (--ll_ifma->ifma_refcount == 0) {
 			if (ifp != NULL) {
 				if (ll_ifma->ifma_flags & IFMA_F_ENQUEUED) {
 					CK_STAILQ_REMOVE(&ifp->if_multiaddrs, ll_ifma, ifmultiaddr,
 						ifma_link);
 					ll_ifma->ifma_flags &= ~IFMA_F_ENQUEUED;
 				}
 			}
 			if_freemulti(ll_ifma);
 		}
 	}
 #ifdef INVARIANTS
 	if (ifp) {
 		struct ifmultiaddr *ifmatmp;
 
 		CK_STAILQ_FOREACH(ifmatmp, &ifp->if_multiaddrs, ifma_link)
 			MPASS(ifma != ifmatmp);
 	}
 #endif
 	if_freemulti(ifma);
 	/*
 	 * The last reference to this instance of struct ifmultiaddr
 	 * was released; the hardware should be notified of this change.
 	 */
 	return 1;
 }
 
 /*
  * Set the link layer address on an interface.
  *
  * At this time we only support certain types of interfaces,
  * and we don't allow the length of the address to change.
  *
  * Set noinline to be dtrace-friendly
  */
 __noinline int
 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
 {
 	struct sockaddr_dl *sdl;
 	struct ifaddr *ifa;
 	struct ifreq ifr;
 
 	ifa = ifp->if_addr;
 	if (ifa == NULL)
 		return (EINVAL);
 
 	sdl = (struct sockaddr_dl *)ifa->ifa_addr;
 	if (sdl == NULL)
 		return (EINVAL);
 
 	if (len != sdl->sdl_alen)	/* don't allow length to change */
 		return (EINVAL);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_XETHER:
 	case IFT_L2VLAN:
 	case IFT_BRIDGE:
 	case IFT_IEEE8023ADLAG:
 		bcopy(lladdr, LLADDR(sdl), len);
 		break;
 	default:
 		return (ENODEV);
 	}
 
 	/*
 	 * If the interface is already up, we need
 	 * to re-init it in order to reprogram its
 	 * address filter.
 	 */
 	if ((ifp->if_flags & IFF_UP) != 0) {
 		if (ifp->if_ioctl) {
 			ifp->if_flags &= ~IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 			ifp->if_flags |= IFF_UP;
 			ifr.ifr_flags = ifp->if_flags & 0xffff;
 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
 			(*ifp->if_ioctl)(ifp, SIOCSIFFLAGS, (caddr_t)&ifr);
 		}
 	}
 	EVENTHANDLER_INVOKE(iflladdr_event, ifp);
 
 	return (0);
 }
 
 /*
  * Compat function for handling basic encapsulation requests.
  * Not converted stacks (FDDI, IB, ..) supports traditional
  * output model: ARP (and other similar L2 protocols) are handled
  * inside output routine, arpresolve/nd6_resolve() returns MAC
  * address instead of full prepend.
  *
  * This function creates calculated header==MAC for IPv4/IPv6 and
  * returns EAFNOSUPPORT (which is then handled in ARP code) for other
  * address families.
  */
 static int
 if_requestencap_default(struct ifnet *ifp, struct if_encap_req *req)
 {
 
 	if (req->rtype != IFENCAP_LL)
 		return (EOPNOTSUPP);
 
 	if (req->bufsize < req->lladdr_len)
 		return (ENOMEM);
 
 	switch (req->family) {
 	case AF_INET:
 	case AF_INET6:
 		break;
 	default:
 		return (EAFNOSUPPORT);
 	}
 
 	/* Copy lladdr to storage as is */
 	memmove(req->buf, req->lladdr, req->lladdr_len);
 	req->bufsize = req->lladdr_len;
 	req->lladdr_off = 0;
 
 	return (0);
 }
 
 /*
  * Tunnel interfaces can nest, also they may cause infinite recursion
  * calls when misconfigured. We'll prevent this by detecting loops.
  * High nesting level may cause stack exhaustion. We'll prevent this
  * by introducing upper limit.
  *
  * Return 0, if tunnel nesting count is equal or less than limit.
  */
 int
 if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, uint32_t cookie,
     int limit)
 {
 	struct m_tag *mtag;
 	int count;
 
 	count = 1;
 	mtag = NULL;
 	while ((mtag = m_tag_locate(m, cookie, 0, mtag)) != NULL) {
 		if (*(struct ifnet **)(mtag + 1) == ifp) {
 			log(LOG_NOTICE, "%s: loop detected\n", if_name(ifp));
 			return (EIO);
 		}
 		count++;
 	}
 	if (count > limit) {
 		log(LOG_NOTICE,
 		    "%s: if_output recursively called too many times(%d)\n",
 		    if_name(ifp), count);
 		return (EIO);
 	}
 	mtag = m_tag_alloc(cookie, 0, sizeof(struct ifnet *), M_NOWAIT);
 	if (mtag == NULL)
 		return (ENOMEM);
 	*(struct ifnet **)(mtag + 1) = ifp;
 	m_tag_prepend(m, mtag);
 	return (0);
 }
 
 /*
  * Get the link layer address that was read from the hardware at attach.
  *
  * This is only set by Ethernet NICs (IFT_ETHER), but laggX interfaces re-type
  * their component interfaces as IFT_IEEE8023ADLAG.
  */
 int
 if_gethwaddr(struct ifnet *ifp, struct ifreq *ifr)
 {
 
 	if (ifp->if_hw_addr == NULL)
 		return (ENODEV);
 
 	switch (ifp->if_type) {
 	case IFT_ETHER:
 	case IFT_IEEE8023ADLAG:
 		bcopy(ifp->if_hw_addr, ifr->ifr_addr.sa_data, ifp->if_addrlen);
 		return (0);
 	default:
 		return (ENODEV);
 	}
 }
 
 /*
  * The name argument must be a pointer to storage which will last as
  * long as the interface does.  For physical devices, the result of
  * device_get_name(dev) is a good choice and for pseudo-devices a
  * static string works well.
  */
 void
 if_initname(struct ifnet *ifp, const char *name, int unit)
 {
 	ifp->if_dname = name;
 	ifp->if_dunit = unit;
 	if (unit != IF_DUNIT_NONE)
 		snprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
 	else
 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
 }
 
 static int
 if_vlog(struct ifnet *ifp, int pri, const char *fmt, va_list ap)
 {
 	char if_fmt[256];
 
 	snprintf(if_fmt, sizeof(if_fmt), "%s: %s", ifp->if_xname, fmt);
 	vlog(pri, if_fmt, ap);
 	return (0);
 }
 
 
 int
 if_printf(struct ifnet *ifp, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	if_vlog(ifp, LOG_INFO, fmt, ap);
 	va_end(ap);
 	return (0);
 }
 
 int
 if_log(struct ifnet *ifp, int pri, const char *fmt, ...)
 {
 	va_list ap;
 
 	va_start(ap, fmt);
 	if_vlog(ifp, pri, fmt, ap);
 	va_end(ap);
 	return (0);
 }
 
 void
 if_start(struct ifnet *ifp)
 {
 
 	(*(ifp)->if_start)(ifp);
 }
 
 /*
  * Backwards compatibility interface for drivers 
  * that have not implemented it
  */
 static int
 if_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	int error;
 
 	IFQ_HANDOFF(ifp, m, error);
 	return (error);
 }
 
 static void
 if_input_default(struct ifnet *ifp __unused, struct mbuf *m)
 {
 
 	m_freem(m);
 }
 
 int
 if_handoff(struct ifqueue *ifq, struct mbuf *m, struct ifnet *ifp, int adjust)
 {
 	int active = 0;
 
 	IF_LOCK(ifq);
 	if (_IF_QFULL(ifq)) {
 		IF_UNLOCK(ifq);
 		if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
 		m_freem(m);
 		return (0);
 	}
 	if (ifp != NULL) {
 		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len + adjust);
 		if (m->m_flags & (M_BCAST|M_MCAST))
 			if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1);
 		active = ifp->if_drv_flags & IFF_DRV_OACTIVE;
 	}
 	_IF_ENQUEUE(ifq, m);
 	IF_UNLOCK(ifq);
 	if (ifp != NULL && !active)
 		(*(ifp)->if_start)(ifp);
 	return (1);
 }
 
 void
 if_register_com_alloc(u_char type,
     if_com_alloc_t *a, if_com_free_t *f)
 {
 
 	KASSERT(if_com_alloc[type] == NULL,
 	    ("if_register_com_alloc: %d already registered", type));
 	KASSERT(if_com_free[type] == NULL,
 	    ("if_register_com_alloc: %d free already registered", type));
 
 	if_com_alloc[type] = a;
 	if_com_free[type] = f;
 }
 
 void
 if_deregister_com_alloc(u_char type)
 {
 
 	KASSERT(if_com_alloc[type] != NULL,
 	    ("if_deregister_com_alloc: %d not registered", type));
 	KASSERT(if_com_free[type] != NULL,
 	    ("if_deregister_com_alloc: %d free not registered", type));
 
 	/*
 	 * Ensure all pending EPOCH(9) callbacks have been executed. This
 	 * fixes issues about late invocation of if_destroy(), which leads
 	 * to memory leak from if_com_alloc[type] allocated if_l2com.
 	 */
 	NET_EPOCH_DRAIN_CALLBACKS();
 
 	if_com_alloc[type] = NULL;
 	if_com_free[type] = NULL;
 }
 
 /* API for driver access to network stack owned ifnet.*/
 uint64_t
 if_setbaudrate(struct ifnet *ifp, uint64_t baudrate)
 {
 	uint64_t oldbrate;
 
 	oldbrate = ifp->if_baudrate;
 	ifp->if_baudrate = baudrate;
 	return (oldbrate);
 }
 
 uint64_t
 if_getbaudrate(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_baudrate);
 }
 
 int
 if_setcapabilities(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capabilities = capabilities;
 	return (0);
 }
 
 int
 if_setcapabilitiesbit(if_t ifp, int setbit, int clearbit)
 {
 	((struct ifnet *)ifp)->if_capabilities |= setbit;
 	((struct ifnet *)ifp)->if_capabilities &= ~clearbit;
 
 	return (0);
 }
 
 int
 if_getcapabilities(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capabilities;
 }
 
 int 
 if_setcapenable(if_t ifp, int capabilities)
 {
 	((struct ifnet *)ifp)->if_capenable = capabilities;
 	return (0);
 }
 
 int 
 if_setcapenablebit(if_t ifp, int setcap, int clearcap)
 {
 	if(setcap) 
 		((struct ifnet *)ifp)->if_capenable |= setcap;
 	if(clearcap)
 		((struct ifnet *)ifp)->if_capenable &= ~clearcap;
 
 	return (0);
 }
 
 const char *
 if_getdname(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_dname;
 }
 
 int 
 if_togglecapenable(if_t ifp, int togglecap)
 {
 	((struct ifnet *)ifp)->if_capenable ^= togglecap;
 	return (0);
 }
 
 int
 if_getcapenable(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_capenable;
 }
 
 /*
  * This is largely undesirable because it ties ifnet to a device, but does
  * provide flexiblity for an embedded product vendor. Should be used with
  * the understanding that it violates the interface boundaries, and should be
  * a last resort only.
  */
 int
 if_setdev(if_t ifp, void *dev)
 {
 	return (0);
 }
 
 int
 if_setdrvflagbits(if_t ifp, int set_flags, int clear_flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags |= set_flags;
 	((struct ifnet *)ifp)->if_drv_flags &= ~clear_flags;
 
 	return (0);
 }
 
 int
 if_getdrvflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_drv_flags;
 }
 
 int
 if_setdrvflags(if_t ifp, int flags)
 {
 	((struct ifnet *)ifp)->if_drv_flags = flags;
 	return (0);
 }
 
 int
 if_setflags(if_t ifp, int flags)
 {
 
 	ifp->if_flags = flags;
 	return (0);
 }
 
 int
 if_setflagbits(if_t ifp, int set, int clear)
 {
 	((struct ifnet *)ifp)->if_flags |= set;
 	((struct ifnet *)ifp)->if_flags &= ~clear;
 
 	return (0);
 }
 
 int
 if_getflags(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_flags;
 }
 
 int
 if_clearhwassist(if_t ifp)
 {
 	((struct ifnet *)ifp)->if_hwassist = 0;
 	return (0);
 }
 
 int
 if_sethwassistbits(if_t ifp, int toset, int toclear)
 {
 	((struct ifnet *)ifp)->if_hwassist |= toset;
 	((struct ifnet *)ifp)->if_hwassist &= ~toclear;
 
 	return (0);
 }
 
 int
 if_sethwassist(if_t ifp, int hwassist_bit)
 {
 	((struct ifnet *)ifp)->if_hwassist = hwassist_bit;
 	return (0);
 }
 
 int
 if_gethwassist(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_hwassist;
 }
 
 int
 if_setmtu(if_t ifp, int mtu)
 {
 	((struct ifnet *)ifp)->if_mtu = mtu;
 	return (0);
 }
 
 int
 if_getmtu(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_mtu;
 }
 
 int
 if_getmtu_family(if_t ifp, int family)
 {
 	struct domain *dp;
 
 	for (dp = domains; dp; dp = dp->dom_next) {
 		if (dp->dom_family == family && dp->dom_ifmtu != NULL)
 			return (dp->dom_ifmtu((struct ifnet *)ifp));
 	}
 
 	return (((struct ifnet *)ifp)->if_mtu);
 }
 
 /*
  * Methods for drivers to access interface unicast and multicast
  * link level addresses.  Driver shall not know 'struct ifaddr' neither
  * 'struct ifmultiaddr'.
  */
 u_int
 if_lladdr_count(if_t ifp)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	u_int count;
 
 	count = 0;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link)
 		if (ifa->ifa_addr->sa_family == AF_LINK)
 			count++;
 	NET_EPOCH_EXIT(et);
 
 	return (count);
 }
 
 u_int
 if_foreach_lladdr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
 {
 	struct epoch_tracker et;
 	struct ifaddr *ifa;
 	u_int count;
 
 	MPASS(cb);
 
 	count = 0;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family != AF_LINK)
 			continue;
 		count += (*cb)(cb_arg, (struct sockaddr_dl *)ifa->ifa_addr,
 		    count);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (count);
 }
 
 u_int
 if_llmaddr_count(if_t ifp)
 {
 	struct epoch_tracker et;
 	struct ifmultiaddr *ifma;
 	int count;
 
 	count = 0;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
 		if (ifma->ifma_addr->sa_family == AF_LINK)
 			count++;
 	NET_EPOCH_EXIT(et);
 
 	return (count);
 }
 
 u_int
 if_foreach_llmaddr(if_t ifp, iflladdr_cb_t cb, void *cb_arg)
 {
 	struct epoch_tracker et;
 	struct ifmultiaddr *ifma;
 	u_int count;
 
 	MPASS(cb);
 
 	count = 0;
 	NET_EPOCH_ENTER(et);
 	CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 		if (ifma->ifma_addr->sa_family != AF_LINK)
 			continue;
 		count += (*cb)(cb_arg, (struct sockaddr_dl *)ifma->ifma_addr,
 		    count);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (count);
 }
 
 int
 if_setsoftc(if_t ifp, void *softc)
 {
 	((struct ifnet *)ifp)->if_softc = softc;
 	return (0);
 }
 
 void *
 if_getsoftc(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_softc;
 }
 
 void 
 if_setrcvif(struct mbuf *m, if_t ifp)
 {
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	m->m_pkthdr.rcvif = (struct ifnet *)ifp;
 }
 
 void 
 if_setvtag(struct mbuf *m, uint16_t tag)
 {
 	m->m_pkthdr.ether_vtag = tag;	
 }
 
 uint16_t
 if_getvtag(struct mbuf *m)
 {
 
 	return (m->m_pkthdr.ether_vtag);
 }
 
 int
 if_sendq_empty(if_t ifp)
 {
 	return IFQ_DRV_IS_EMPTY(&((struct ifnet *)ifp)->if_snd);
 }
 
 struct ifaddr *
 if_getifaddr(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_addr;
 }
 
 int
 if_getamcount(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_amcount;
 }
 
 int
 if_setsendqready(if_t ifp)
 {
 	IFQ_SET_READY(&((struct ifnet *)ifp)->if_snd);
 	return (0);
 }
 
 int
 if_setsendqlen(if_t ifp, int tx_desc_count)
 {
 	IFQ_SET_MAXLEN(&((struct ifnet *)ifp)->if_snd, tx_desc_count);
 	((struct ifnet *)ifp)->if_snd.ifq_drv_maxlen = tx_desc_count;
 
 	return (0);
 }
 
 int
 if_vlantrunkinuse(if_t ifp)
 {
 	return ((struct ifnet *)ifp)->if_vlantrunk != NULL?1:0;
 }
 
 int
 if_input(if_t ifp, struct mbuf* sendmp)
 {
 	(*((struct ifnet *)ifp)->if_input)((struct ifnet *)ifp, sendmp);
 	return (0);
 
 }
 
 struct mbuf *
 if_dequeue(if_t ifp)
 {
 	struct mbuf *m;
 	IFQ_DRV_DEQUEUE(&((struct ifnet *)ifp)->if_snd, m);
 
 	return (m);
 }
 
 int
 if_sendq_prepend(if_t ifp, struct mbuf *m)
 {
 	IFQ_DRV_PREPEND(&((struct ifnet *)ifp)->if_snd, m);
 	return (0);
 }
 
 int
 if_setifheaderlen(if_t ifp, int len)
 {
 	((struct ifnet *)ifp)->if_hdrlen = len;
 	return (0);
 }
 
 caddr_t
 if_getlladdr(if_t ifp)
 {
 	return (IF_LLADDR((struct ifnet *)ifp));
 }
 
 void *
 if_gethandle(u_char type)
 {
 	return (if_alloc(type));
 }
 
 void
 if_bpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	BPF_MTAP(ifp, m);
 }
 
 void
 if_etherbpfmtap(if_t ifh, struct mbuf *m)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 
 	ETHER_BPF_MTAP(ifp, m);
 }
 
 void
 if_vlancap(if_t ifh)
 {
 	struct ifnet *ifp = (struct ifnet *)ifh;
 	VLAN_CAPABILITIES(ifp);
 }
 
 int
 if_sethwtsomax(if_t ifp, u_int if_hw_tsomax)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomax = if_hw_tsomax;
         return (0);
 }
 
 int
 if_sethwtsomaxsegcount(if_t ifp, u_int if_hw_tsomaxsegcount)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomaxsegcount = if_hw_tsomaxsegcount;
         return (0);
 }
 
 int
 if_sethwtsomaxsegsize(if_t ifp, u_int if_hw_tsomaxsegsize)
 {
 
 	((struct ifnet *)ifp)->if_hw_tsomaxsegsize = if_hw_tsomaxsegsize;
         return (0);
 }
 
 u_int
 if_gethwtsomax(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomax);
 }
 
 u_int
 if_gethwtsomaxsegcount(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomaxsegcount);
 }
 
 u_int
 if_gethwtsomaxsegsize(if_t ifp)
 {
 
 	return (((struct ifnet *)ifp)->if_hw_tsomaxsegsize);
 }
 
 void
 if_setinitfn(if_t ifp, void (*init_fn)(void *))
 {
 	((struct ifnet *)ifp)->if_init = init_fn;
 }
 
 void
 if_setioctlfn(if_t ifp, int (*ioctl_fn)(if_t, u_long, caddr_t))
 {
 	((struct ifnet *)ifp)->if_ioctl = (void *)ioctl_fn;
 }
 
 void
 if_setstartfn(if_t ifp, void (*start_fn)(if_t))
 {
 	((struct ifnet *)ifp)->if_start = (void *)start_fn;
 }
 
 void
 if_settransmitfn(if_t ifp, if_transmit_fn_t start_fn)
 {
 	((struct ifnet *)ifp)->if_transmit = start_fn;
 }
 
 void if_setqflushfn(if_t ifp, if_qflush_fn_t flush_fn)
 {
 	((struct ifnet *)ifp)->if_qflush = flush_fn;
 
 }
 
 void
 if_setgetcounterfn(if_t ifp, if_get_counter_t fn)
 {
 
 	ifp->if_get_counter = fn;
 }
 
 #ifdef DDB
 static void
 if_show_ifnet(struct ifnet *ifp)
 {
 
 	if (ifp == NULL)
 		return;
 	db_printf("%s:\n", ifp->if_xname);
 #define	IF_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, ifp->e);
 	IF_DB_PRINTF("%s", if_dname);
 	IF_DB_PRINTF("%d", if_dunit);
 	IF_DB_PRINTF("%s", if_description);
 	IF_DB_PRINTF("%u", if_index);
 	IF_DB_PRINTF("%d", if_idxgen);
 	IF_DB_PRINTF("%u", if_refcount);
 	IF_DB_PRINTF("%p", if_softc);
 	IF_DB_PRINTF("%p", if_l2com);
 	IF_DB_PRINTF("%p", if_llsoftc);
 	IF_DB_PRINTF("%d", if_amcount);
 	IF_DB_PRINTF("%p", if_addr);
 	IF_DB_PRINTF("%p", if_broadcastaddr);
 	IF_DB_PRINTF("%p", if_afdata);
 	IF_DB_PRINTF("%d", if_afdata_initialized);
 	IF_DB_PRINTF("%u", if_fib);
 	IF_DB_PRINTF("%p", if_vnet);
 	IF_DB_PRINTF("%p", if_home_vnet);
 	IF_DB_PRINTF("%p", if_vlantrunk);
 	IF_DB_PRINTF("%p", if_bpf);
 	IF_DB_PRINTF("%u", if_pcount);
 	IF_DB_PRINTF("%p", if_bridge);
 	IF_DB_PRINTF("%p", if_lagg);
 	IF_DB_PRINTF("%p", if_pf_kif);
 	IF_DB_PRINTF("%p", if_carp);
 	IF_DB_PRINTF("%p", if_label);
 	IF_DB_PRINTF("%p", if_netmap);
 	IF_DB_PRINTF("0x%08x", if_flags);
 	IF_DB_PRINTF("0x%08x", if_drv_flags);
 	IF_DB_PRINTF("0x%08x", if_capabilities);
 	IF_DB_PRINTF("0x%08x", if_capenable);
 	IF_DB_PRINTF("%p", if_snd.ifq_head);
 	IF_DB_PRINTF("%p", if_snd.ifq_tail);
 	IF_DB_PRINTF("%d", if_snd.ifq_len);
 	IF_DB_PRINTF("%d", if_snd.ifq_maxlen);
 	IF_DB_PRINTF("%p", if_snd.ifq_drv_head);
 	IF_DB_PRINTF("%p", if_snd.ifq_drv_tail);
 	IF_DB_PRINTF("%d", if_snd.ifq_drv_len);
 	IF_DB_PRINTF("%d", if_snd.ifq_drv_maxlen);
 	IF_DB_PRINTF("%d", if_snd.altq_type);
 	IF_DB_PRINTF("%x", if_snd.altq_flags);
 #undef IF_DB_PRINTF
 }
 
 DB_SHOW_COMMAND(ifnet, db_show_ifnet)
 {
 
 	if (!have_addr) {
 		db_printf("usage: show ifnet <struct ifnet *>\n");
 		return;
 	}
 
 	if_show_ifnet((struct ifnet *)addr);
 }
 
 DB_SHOW_ALL_COMMAND(ifnets, db_show_all_ifnets)
 {
 	struct ifnet *ifp;
 	u_short idx;
 
 	for (idx = 1; idx <= if_index; idx++) {
 		ifp = ifindex_table[idx].ife_ifnet;
 		if (ifp == NULL)
 			continue;
 		db_printf( "%20s ifp=%p\n", ifp->if_xname, ifp);
 		if (db_pager_quit)
 			break;
 	}
 }
 #endif	/* DDB */
diff --git a/sys/net/if_ovpn.c b/sys/net/if_ovpn.c
index 7d5d384a6f75..a90c11c1dcbf 100644
--- a/sys/net/if_ovpn.c
+++ b/sys/net/if_ovpn.c
@@ -1,2479 +1,2479 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2021-2022 Rubicon Communications, LLC (Netgate)
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  */
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf_ring.h>
 #include <sys/epoch.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/nv.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/sdt.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
 
 #include <machine/atomic.h>
 
 #include <net/bpf.h>
 #include <net/if.h>
 #include <net/if_clone.h>
 #include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 #include <net/route/nhop.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_fib.h>
 
 #include <machine/in_cksum.h>
 
 #include <opencrypto/cryptodev.h>
 
 #include "if_ovpn.h"
 
 struct ovpn_kkey_dir {
 	int			refcount;
 	uint8_t			key[32];
 	uint8_t			keylen;
 	uint8_t			nonce[8];
 	uint8_t			noncelen;
 	enum ovpn_key_cipher	cipher;
 	crypto_session_t	cryptoid;
 
 	struct mtx		replay_mtx;
 	/*
 	 * Last seen gapless sequence number. New rx seq numbers must be
 	 * strictly higher than this.
 	 */
 	uint32_t		rx_seq;
 	/* Seen packets, relative to rx_seq. bit(0) will always be 0. */
 	uint64_t		rx_window;
 };
 
 struct ovpn_kkey {
 	struct ovpn_kkey_dir	*encrypt;
 	struct ovpn_kkey_dir	*decrypt;
 	uint8_t			 keyid;
 	uint32_t		 peerid;
 };
 
 struct ovpn_keepalive {
 	uint32_t	interval;
 	uint32_t	timeout;
 };
 
 struct ovpn_wire_header {
 	uint32_t	 opcode; /* opcode, key id, peer id */
 	uint32_t	 seq;
 	uint8_t		 auth_tag[16];
 };
 
 struct ovpn_notification {
 	enum ovpn_notif_type	type;
 	uint32_t		peerid;
 };
 
 struct ovpn_softc;
 
 struct ovpn_kpeer {
 	int			 refcount;
 	uint32_t		 peerid;
 
 	struct ovpn_softc	*sc;
 	struct sockaddr_storage	 local;
 	struct sockaddr_storage	 remote;
 
 	struct in_addr		 vpn4;
 	struct in6_addr		 vpn6;
 
 	struct ovpn_kkey	 keys[2];
 	uint32_t		 tx_seq;
 
 	struct ovpn_keepalive	 keepalive;
 	uint32_t		*last_active;
 	struct callout		 ping_send;
 	struct callout		 ping_rcv;
 };
 
 #define OVPN_MAX_PEERS	128
 
 struct ovpn_counters {
 	uint64_t	lost_ctrl_pkts_in;
 	uint64_t	lost_ctrl_pkts_out;
 	uint64_t	lost_data_pkts_in;
 	uint64_t	lost_data_pkts_out;
 	uint64_t	nomem_data_pkts_in;
 	uint64_t	nomem_data_pkts_out;
 	uint64_t	received_ctrl_pkts;
 	uint64_t	received_data_pkts;
 	uint64_t	sent_ctrl_pkts;
 	uint64_t	sent_data_pkts;
 
 	uint64_t	transport_bytes_sent;
 	uint64_t	transport_bytes_received;
 	uint64_t	tunnel_bytes_sent;
 	uint64_t	tunnel_bytes_received;
 };
 #define OVPN_COUNTER_SIZE (sizeof(struct ovpn_counters)/sizeof(uint64_t))
 
 struct ovpn_softc {
 	int			 refcount;
 	struct rmlock		 lock;
 	struct ifnet		*ifp;
 	struct socket		*so;
 	int			 peercount;
 	struct ovpn_kpeer	*peers[OVPN_MAX_PEERS]; /* XXX Hard limit for now? */
 
 	/* Pending packets */
 	struct buf_ring		*rxring;
 	struct buf_ring		*notifring;
 
 	counter_u64_t 		 counters[OVPN_COUNTER_SIZE];
 
 	struct epoch_context	 epoch_ctx;
 };
 
 static struct ovpn_kpeer *ovpn_find_peer(struct ovpn_softc *, uint32_t);
 static bool ovpn_udp_input(struct mbuf *, int, struct inpcb *,
     const struct sockaddr *, void *);
 static int ovpn_transmit_to_peer(struct ifnet *, struct mbuf *,
     struct ovpn_kpeer *, struct rm_priotracker *);
 static int ovpn_encap(struct ovpn_softc *, uint32_t, struct mbuf *);
 static int ovpn_get_af(struct mbuf *);
 static void ovpn_free_kkey_dir(struct ovpn_kkey_dir *);
 static bool ovpn_check_replay(struct ovpn_kkey_dir *, uint32_t);
 
 #define OVPN_MTU_MIN		576
 #define OVPN_MTU_MAX		(IP_MAXPACKET - sizeof(struct ip) - \
     sizeof(struct udphdr) - sizeof(struct ovpn_wire_header))
 
 #define OVPN_OP_DATA_V2		0x09
 #define OVPN_OP_SHIFT		3
 
 VNET_DEFINE_STATIC(struct if_clone *, ovpn_cloner);
 #define	V_ovpn_cloner	VNET(ovpn_cloner)
 
 #define OVPN_RLOCK_TRACKER	struct rm_priotracker _ovpn_lock_tracker; \
     struct rm_priotracker *_ovpn_lock_trackerp = &_ovpn_lock_tracker
 #define OVPN_RLOCK(sc)		rm_rlock(&(sc)->lock, _ovpn_lock_trackerp)
 #define OVPN_RUNLOCK(sc)	rm_runlock(&(sc)->lock, _ovpn_lock_trackerp)
 #define OVPN_WLOCK(sc)		rm_wlock(&(sc)->lock)
 #define OVPN_WUNLOCK(sc)	rm_wunlock(&(sc)->lock)
 #define OVPN_ASSERT(sc)		rm_assert(&(sc)->lock, RA_LOCKED)
 #define OVPN_RASSERT(sc)	rm_assert(&(sc)->lock, RA_RLOCKED)
 #define OVPN_WASSERT(sc)	rm_assert(&(sc)->lock, RA_WLOCKED)
 #define OVPN_UNLOCK_ASSERT(sc)	rm_assert(&(sc)->lock, RA_UNLOCKED)
 
 #define OVPN_COUNTER_ADD(sc, name, val)	\
 	counter_u64_add(sc->counters[offsetof(struct ovpn_counters, name) / \
 	    sizeof(uint64_t)], val)
 
 #define TO_IN(x)		((struct sockaddr_in *)(x))
 #define TO_IN6(x)		((struct sockaddr_in6 *)(x))
 
 SDT_PROVIDER_DEFINE(if_ovpn);
 SDT_PROBE_DEFINE1(if_ovpn, tx, transmit, start, "struct mbuf *");
 SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip4, "struct in_addr *", "struct ovpn_kpeer *");
 SDT_PROBE_DEFINE2(if_ovpn, tx, route, ip6, "struct in6_addr *", "struct ovpn_kpeer *");
 
 static const char ovpnname[] = "ovpn";
 static const char ovpngroupname[] = "openvpn";
 
 static MALLOC_DEFINE(M_OVPN, ovpnname, "OpenVPN DCO Interface");
 
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_OTHER, openvpn, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "OpenVPN DCO Interface");
 VNET_DEFINE_STATIC(int, replay_protection) = 0;
 #define	V_replay_protection	VNET(replay_protection)
 SYSCTL_INT(_net_link_openvpn, OID_AUTO, replay_protection, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(replay_protection), 0, "Validate sequence numbers");
 
 static struct ovpn_kpeer *
 ovpn_find_peer(struct ovpn_softc *sc, uint32_t peerid)
 {
 	struct ovpn_kpeer *p = NULL;
 
 	OVPN_ASSERT(sc);
 
 	for (int i = 0; i < OVPN_MAX_PEERS; i++) {
 		p = sc->peers[i];
 		if (p == NULL)
 			continue;
 
 		if (p->peerid == peerid) {
 			MPASS(p->sc == sc);
 			break;
 		}
 	}
 
 	return (p);
 }
 
 static struct ovpn_kpeer *
 ovpn_find_only_peer(struct ovpn_softc *sc)
 {
 	OVPN_ASSERT(sc);
 
 	for (int i = 0; i < OVPN_MAX_PEERS; i++) {
 		if (sc->peers[i] == NULL)
 			continue;
 		return (sc->peers[i]);
 	}
 
 	MPASS(false);
 
 	return (NULL);
 }
 
 static uint16_t
 ovpn_get_port(struct sockaddr_storage *s)
 {
 	switch (s->ss_family) {
 	case AF_INET: {
 		struct sockaddr_in *in = (struct sockaddr_in *)s;
 		return (in->sin_port);
 	}
 	case AF_INET6: {
 		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)s;
 		return (in6->sin6_port);
 	}
 	default:
 		panic("Unsupported address family %d", s->ss_family);
 	}
 }
 
 static int
 ovpn_nvlist_to_sockaddr(const nvlist_t *nvl, struct sockaddr_storage *sa)
 {
 	int af;
 
 	if (! nvlist_exists_number(nvl, "af"))
 		return (EINVAL);
 	if (! nvlist_exists_binary(nvl, "address"))
 		return (EINVAL);
 	if (! nvlist_exists_number(nvl, "port"))
 		return (EINVAL);
 
 	af = nvlist_get_number(nvl, "af");
 
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		struct sockaddr_in *in = (struct sockaddr_in *)sa;
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "address", &len);
 		in->sin_family = af;
 		if (len != sizeof(in->sin_addr))
 			return (EINVAL);
 
 		memcpy(&in->sin_addr, addr, sizeof(in->sin_addr));
 		in->sin_port = nvlist_get_number(nvl, "port");
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)sa;
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "address", &len);
 		in6->sin6_family = af;
 		if (len != sizeof(in6->sin6_addr))
 			return (EINVAL);
 
 		memcpy(&in6->sin6_addr, addr, sizeof(in6->sin6_addr));
 		in6->sin6_port = nvlist_get_number(nvl, "port");
 		break;
 	}
 #endif
 	default:
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static bool
 ovpn_has_peers(struct ovpn_softc *sc)
 {
 	OVPN_ASSERT(sc);
 
 	return (sc->peercount > 0);
 }
 
 static void
 ovpn_rele_so(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
 {
 	bool has_peers;
 
 	OVPN_WASSERT(sc);
 
 	if (sc->so == NULL)
 		return;
 
 	has_peers = ovpn_has_peers(sc);
 
 	/* Only remove the tunnel function if we're releasing the socket for
 	 * the last peer. */
 	if (! has_peers)
 		(void)udp_set_kernel_tunneling(sc->so, NULL, NULL, NULL);
 
 	sorele(sc->so);
 
 	if (! has_peers)
 		sc->so = NULL;
 }
 
 static void
 ovpn_notify_del_peer(struct ovpn_softc *sc, struct ovpn_kpeer *peer)
 {
 	struct ovpn_notification *n;
 
 	OVPN_WASSERT(sc);
 
 	n = malloc(sizeof(*n), M_OVPN, M_NOWAIT);
 	if (n == NULL)
 		return;
 
 	n->peerid = peer->peerid;
 	n->type = OVPN_NOTIF_DEL_PEER;
 	if (buf_ring_enqueue(sc->notifring, n) != 0) {
 		free(n, M_OVPN);
 	} else if (sc->so != NULL) {
 		/* Wake up userspace */
 		sc->so->so_error = EAGAIN;
 		sorwakeup(sc->so);
 		sowwakeup(sc->so);
 	}
 }
 
 static void
 ovpn_peer_release_ref(struct ovpn_kpeer *peer, bool locked)
 {
 	struct ovpn_softc *sc;
 
 	atomic_add_int(&peer->refcount, -1);
 
 	if (atomic_load_int(&peer->refcount) > 0)
 		return;
 
 	sc = peer->sc;
 
 	if (! locked) {
 		OVPN_WLOCK(sc);
 
 		/* Might have changed before we acquired the lock. */
 		if (atomic_load_int(&peer->refcount) > 0) {
 			OVPN_WUNLOCK(sc);
 			return;
 		}
 	}
 
 	/* The peer should have been removed from the list already. */
 	MPASS(ovpn_find_peer(sc, peer->peerid) == NULL);
 
 	ovpn_notify_del_peer(sc, peer);
 
 	for (int i = 0; i < 2; i++) {
 		ovpn_free_kkey_dir(peer->keys[i].encrypt);
 		ovpn_free_kkey_dir(peer->keys[i].decrypt);
 	}
 
 	ovpn_rele_so(sc, peer);
 
 	callout_stop(&peer->ping_send);
 	callout_stop(&peer->ping_rcv);
 	uma_zfree_pcpu(pcpu_zone_4, peer->last_active);
 	free(peer, M_OVPN);
 
 	if (! locked)
 		OVPN_WUNLOCK(sc);
 }
 
 static int
 ovpn_new_peer(struct ifnet *ifp, const nvlist_t *nvl)
 {
 #ifdef INET6
 	struct epoch_tracker et;
 #endif
 	struct sockaddr_storage remote;
 	struct ovpn_kpeer *peer = NULL;
 	struct file *fp = NULL;
 	struct sockaddr *name = NULL;
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct thread *td = curthread;
 	struct socket *so = NULL;
 	u_int fflag;
 	int fd;
 	uint32_t peerid;
 	int ret = 0, i;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "fd"))
 		return (EINVAL);
 
 	if (! nvlist_exists_nvlist(nvl, "remote"))
 		return (EINVAL);
 
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	ret = ovpn_nvlist_to_sockaddr(nvlist_get_nvlist(nvl, "remote"),
 	    &remote);
 	if (ret != 0)
 		return (ret);
 
 	fd = nvlist_get_number(nvl, "fd");
 
 	/* Look up the userspace process and use the fd to find the socket. */
 	ret = getsock_cap(td, fd, &cap_connect_rights, &fp,
 	    &fflag, NULL);
 	if (ret != 0)
 		return (ret);
 
 	so = fp->f_data;
 
 	peer = malloc(sizeof(*peer), M_OVPN, M_WAITOK | M_ZERO);
 	peer->peerid = peerid;
 	peer->sc = sc;
 	peer->tx_seq = 1;
 	peer->refcount = 1;
 	peer->last_active = uma_zalloc_pcpu(pcpu_zone_4, M_WAITOK | M_ZERO);
 
 	if (nvlist_exists_binary(nvl, "vpn_ipv4")) {
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "vpn_ipv4", &len);
 		if (len != sizeof(peer->vpn4)) {
 			ret = EINVAL;
 			goto error;
 		}
 		memcpy(&peer->vpn4, addr, len);
 	}
 
 	if (nvlist_exists_binary(nvl, "vpn_ipv6")) {
 		size_t len;
 		const void *addr = nvlist_get_binary(nvl, "vpn_ipv6", &len);
 		if (len != sizeof(peer->vpn6)) {
 			ret = EINVAL;
 			goto error;
 		}
 		memcpy(&peer->vpn6, addr, len);
 	}
 
 	callout_init_rm(&peer->ping_send, &sc->lock, CALLOUT_SHAREDLOCK);
 	callout_init_rm(&peer->ping_rcv, &sc->lock, 0);
 
-	ret = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &name);
+	ret = so->so_proto->pr_sockaddr(so, &name);
 	if (ret)
 		goto error;
 
 	if (ovpn_get_port((struct sockaddr_storage *)name) == 0) {
 		ret = EINVAL;
 		goto error;
 	}
 	if (name->sa_family != remote.ss_family) {
 		ret = EINVAL;
 		goto error;
 	}
 
 	memcpy(&peer->local, name, name->sa_len);
 	memcpy(&peer->remote, &remote, sizeof(remote));
 	free(name, M_SONAME);
 	name = NULL;
 
 	if (peer->local.ss_family == AF_INET6 &&
 	    IN6_IS_ADDR_V4MAPPED(&TO_IN6(&peer->remote)->sin6_addr)) {
 		/* V4 mapped address, so treat this as v4, not v6. */
 		in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->local);
 		in6_sin6_2_sin_in_sock((struct sockaddr *)&peer->remote);
 	}
 
 #ifdef INET6
 	if (peer->local.ss_family == AF_INET6 &&
 	    IN6_IS_ADDR_UNSPECIFIED(&TO_IN6(&peer->local)->sin6_addr)) {
 		NET_EPOCH_ENTER(et);
 		ret = in6_selectsrc_addr(curthread->td_proc->p_fibnum,
 		    &TO_IN6(&peer->remote)->sin6_addr,
 		    0, NULL, &TO_IN6(&peer->local)->sin6_addr, NULL);
 		NET_EPOCH_EXIT(et);
 		if (ret != 0) {
 			goto error;
 		}
 	}
 #endif
 	OVPN_WLOCK(sc);
 
 	/* Disallow peer id re-use. */
 	if (ovpn_find_peer(sc, peerid) != NULL) {
 		ret = EEXIST;
 		goto error_locked;
 	}
 
 	/* Make sure this is really a UDP socket. */
 	if (so->so_type != SOCK_DGRAM || so->so_proto->pr_type != SOCK_DGRAM) {
 		ret = EPROTOTYPE;
 		goto error_locked;
 	}
 
 	/* Must be the same socket as for other peers on this interface. */
 	if (sc->so != NULL && so != sc->so)
 		goto error_locked;
 
 	if (sc->so == NULL)
 		sc->so = so;
 
 	/* Insert the peer into the list. */
 	for (i = 0; i < OVPN_MAX_PEERS; i++) {
 		if (sc->peers[i] != NULL)
 			continue;
 
 		MPASS(sc->peers[i] == NULL);
 		sc->peers[i] = peer;
 		sc->peercount++;
 		soref(sc->so);
 		break;
 	}
 	if (i == OVPN_MAX_PEERS) {
 		ret = ENOSPC;
 		goto error_locked;
 	}
 
 	ret = udp_set_kernel_tunneling(sc->so, ovpn_udp_input, NULL, sc);
 	if (ret == EBUSY) {
 		/* Fine, another peer already set the input function. */
 		ret = 0;
 	}
 	if (ret != 0) {
 		sc->peers[i] = NULL;
 		sc->peercount--;
 		goto error_locked;
 	}
 
 	OVPN_WUNLOCK(sc);
 
 	goto done;
 
 error_locked:
 	OVPN_WUNLOCK(sc);
 error:
 	free(name, M_SONAME);
 	uma_zfree_pcpu(pcpu_zone_4, peer->last_active);
 	free(peer, M_OVPN);
 done:
 	if (fp != NULL)
 		fdrop(fp, td);
 
 	return (ret);
 }
 
 static int
 _ovpn_del_peer(struct ovpn_softc *sc, uint32_t peerid)
 {
 	struct ovpn_kpeer *peer;
 	int i;
 
 	OVPN_WASSERT(sc);
 
 	for (i = 0; i < OVPN_MAX_PEERS; i++) {
 		if (sc->peers[i] == NULL)
 			continue;
 		if (sc->peers[i]->peerid != peerid)
 			continue;
 
 		peer = sc->peers[i];
 		break;
 	}
 	if (i == OVPN_MAX_PEERS)
 		return (ENOENT);
 
 	sc->peers[i] = NULL;
 	sc->peercount--;
 
 	ovpn_peer_release_ref(peer, true);
 
 	return (0);
 }
 
 static int
 ovpn_del_peer(struct ifnet *ifp, nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	uint32_t peerid;
 	int ret;
 
 	OVPN_WASSERT(sc);
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	ret = _ovpn_del_peer(sc, peerid);
 
 	return (ret);
 }
 
 static int
 ovpn_create_kkey_dir(struct ovpn_kkey_dir **kdirp,
     const nvlist_t *nvl)
 {
 	struct crypto_session_params csp;
 	struct ovpn_kkey_dir *kdir;
 	const char *ciphername;
 	enum ovpn_key_cipher cipher;
 	const void *key, *iv;
 	size_t keylen = 0, ivlen = 0;
 	int error;
 
 	if (! nvlist_exists_string(nvl, "cipher"))
 		return (EINVAL);
 	ciphername = nvlist_get_string(nvl, "cipher");
 
 	if (strcmp(ciphername, "none") == 0)
 		cipher = OVPN_CIPHER_ALG_NONE;
 	else if (strcmp(ciphername, "AES-256-GCM") == 0)
 		cipher = OVPN_CIPHER_ALG_AES_GCM;
 	else if (strcmp(ciphername, "CHACHA20-POLY1305") == 0)
 		cipher = OVPN_CIPHER_ALG_CHACHA20_POLY1305;
 	else
 		return (EINVAL);
 
 	if (cipher != OVPN_CIPHER_ALG_NONE) {
 		if (! nvlist_exists_binary(nvl, "key"))
 			return (EINVAL);
 		key = nvlist_get_binary(nvl, "key", &keylen);
 		if (keylen > sizeof(kdir->key))
 			return (E2BIG);
 
 		if (! nvlist_exists_binary(nvl, "iv"))
 			return (EINVAL);
 		iv = nvlist_get_binary(nvl, "iv", &ivlen);
 		if (ivlen != 8)
 			return (E2BIG);
 	}
 
 	kdir = malloc(sizeof(struct ovpn_kkey_dir), M_OVPN,
 	    M_WAITOK | M_ZERO);
 
 	kdir->cipher = cipher;
 	kdir->keylen = keylen;
 	memcpy(kdir->key, key, keylen);
 	kdir->noncelen = ivlen;
 	memcpy(kdir->nonce, iv, ivlen);
 
 	if (kdir->cipher != OVPN_CIPHER_ALG_NONE) {
 		/* Crypto init */
 		bzero(&csp, sizeof(csp));
 		csp.csp_mode = CSP_MODE_AEAD;
 
 		if (kdir->cipher == OVPN_CIPHER_ALG_CHACHA20_POLY1305)
 			csp.csp_cipher_alg = CRYPTO_CHACHA20_POLY1305;
 		else
 			csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16;
 
 		csp.csp_flags |= CSP_F_SEPARATE_AAD;
 
 		csp.csp_cipher_klen = kdir->keylen;
 		csp.csp_cipher_key = kdir->key;
 		csp.csp_ivlen = 96 / 8;
 
 		error = crypto_newsession(&kdir->cryptoid, &csp,
 		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
 		if (error) {
 			free(kdir, M_OVPN);
 			return (error);
 		}
 	}
 
 	mtx_init(&kdir->replay_mtx, "if_ovpn rx replay", NULL, MTX_DEF);
 	*kdirp = kdir;
 
 	return (0);
 }
 
 static void
 ovpn_free_kkey_dir(struct ovpn_kkey_dir *kdir)
 {
 	if (kdir == NULL)
 		return;
 
 	mtx_destroy(&kdir->replay_mtx);
 
 	crypto_freesession(kdir->cryptoid);
 	free(kdir, M_OVPN);
 }
 
 static int
 ovpn_set_key(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kkey_dir *enc, *dec;
 	struct ovpn_kpeer *peer;
 	int slot, keyid, peerid;
 	int error;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "slot"))
 		return (EINVAL);
 	slot = nvlist_get_number(nvl, "slot");
 
 	if (! nvlist_exists_number(nvl, "keyid"))
 		return (EINVAL);
 	keyid = nvlist_get_number(nvl, "keyid");
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	if (slot != OVPN_KEY_SLOT_PRIMARY &&
 	    slot != OVPN_KEY_SLOT_SECONDARY)
 		return (EINVAL);
 
 	if (! nvlist_exists_nvlist(nvl, "encrypt") ||
 	    ! nvlist_exists_nvlist(nvl, "decrypt"))
 		return (EINVAL);
 
 	error = ovpn_create_kkey_dir(&enc, nvlist_get_nvlist(nvl, "encrypt"));
 	if (error)
 		return (error);
 
 	error = ovpn_create_kkey_dir(&dec, nvlist_get_nvlist(nvl, "decrypt"));
 	if (error) {
 		ovpn_free_kkey_dir(enc);
 		return (error);
 	}
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL) {
 		ovpn_free_kkey_dir(dec);
 		ovpn_free_kkey_dir(enc);
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	ovpn_free_kkey_dir(peer->keys[slot].encrypt);
 	ovpn_free_kkey_dir(peer->keys[slot].decrypt);
 
 	peer->keys[slot].encrypt = enc;
 	peer->keys[slot].decrypt = dec;
 
 	peer->keys[slot].keyid = keyid;
 	peer->keys[slot].peerid = peerid;
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_check_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer, enum ovpn_key_slot slot)
 {
 	OVPN_ASSERT(sc);
 
 	if (peer->keys[slot].encrypt == NULL)
 		return (ENOLINK);
 
 	if (peer->keys[slot].decrypt == NULL)
 		return (ENOLINK);
 
 	return (0);
 }
 
 static int
 ovpn_start(struct ifnet *ifp)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 
 	OVPN_WLOCK(sc);
 
 	ifp->if_flags |= IFF_UP;
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if_link_state_change(ifp, LINK_STATE_UP);
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_swap_keys(struct ifnet *ifp, nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kpeer *peer;
 	struct ovpn_kkey tmpkey;
 	int error;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	/* Check that we have a second key to swap to. */
 	error = ovpn_check_key(sc, peer, OVPN_KEY_SLOT_SECONDARY);
 	if (error) {
 		OVPN_WUNLOCK(sc);
 		return (error);
 	}
 
 	tmpkey = peer->keys[0];
 	peer->keys[0] = peer->keys[1];
 	peer->keys[1] = tmpkey;
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_del_key(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	enum ovpn_key_slot slot;
 	struct ovpn_kpeer *peer;
 	struct ovpn_softc *sc = ifp->if_softc;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "slot"))
 		return (EINVAL);
 	slot = nvlist_get_number(nvl, "slot");
 
 	if (slot != OVPN_KEY_SLOT_PRIMARY &&
 	    slot != OVPN_KEY_SLOT_SECONDARY)
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	ovpn_free_kkey_dir(peer->keys[slot].encrypt);
 	ovpn_free_kkey_dir(peer->keys[slot].decrypt);
 
 	peer->keys[slot].encrypt = NULL;
 	peer->keys[slot].decrypt = NULL;
 
 	peer->keys[slot].keyid = 0;
 	peer->keys[slot].peerid = 0;
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_send_pkt(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct epoch_tracker et;
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct mbuf *m;
 	const uint8_t *pkt;
 	size_t pktlen;
 	uint32_t peerid;
 	int ret;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_binary(nvl, "packet"))
 		return (EINVAL);
 	pkt = nvlist_get_binary(nvl, "packet", &pktlen);
 
 	if (! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	peerid = nvlist_get_number(nvl, "peerid");
 
 	/*
 	 * Check that userspace isn't giving us a data packet. That might lead
 	 * to IV re-use, which would be bad.
 	 */
 	if ((pkt[0] >> OVPN_OP_SHIFT) == OVPN_OP_DATA_V2)
 		return (EINVAL);
 
 	m = m_get2(pktlen, M_WAITOK, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return (ENOMEM);
 
 	m->m_len = m->m_pkthdr.len = pktlen;
 	m_copyback(m, 0, m->m_len, pkt);
 
 	/* Now prepend IP/UDP headers and transmit the mbuf. */
 	NET_EPOCH_ENTER(et);
 	ret = ovpn_encap(sc, peerid, m);
 	NET_EPOCH_EXIT(et);
 	if (ret == 0)
 		OVPN_COUNTER_ADD(sc, sent_ctrl_pkts, 1);
 	else
 		OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_out, 1);
 
 	return (ret);
 }
 
 static void
 ovpn_send_ping(void *arg)
 {
 	static const uint8_t ping_str[] = {
 		0x2a, 0x18, 0x7b, 0xf3, 0x64, 0x1e, 0xb4, 0xcb,
 		0x07, 0xed, 0x2d, 0x0a, 0x98, 0x1f, 0xc7, 0x48
 	};
 
 	struct epoch_tracker et;
 	struct ovpn_kpeer *peer = arg;
 	struct ovpn_softc *sc = peer->sc;
 	struct mbuf *m;
 
 	OVPN_RASSERT(sc);
 
 	/* Ensure we repeat! */
 	callout_reset(&peer->ping_send, peer->keepalive.interval * hz,
 	    ovpn_send_ping, peer);
 
 	m = m_get2(sizeof(ping_str), M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL)
 		return;
 
 	m_copyback(m, 0, sizeof(ping_str), ping_str);
 	m->m_len = m->m_pkthdr.len = sizeof(ping_str);
 
 	CURVNET_SET(sc->ifp->if_vnet);
 	NET_EPOCH_ENTER(et);
 	(void)ovpn_transmit_to_peer(sc->ifp, m, peer, NULL);
 	NET_EPOCH_EXIT(et);
 	CURVNET_RESTORE();
 }
 
 static void
 ovpn_timeout(void *arg)
 {
 	struct ovpn_kpeer *peer = arg;
 	struct ovpn_softc *sc = peer->sc;
 	uint32_t last, _last_active;
 	int ret __diagused;
 	int cpu;
 
 	OVPN_WASSERT(sc);
 
 	last = 0;
 	CPU_FOREACH(cpu) {
 		_last_active = *zpcpu_get_cpu(peer->last_active, cpu);
 		if (_last_active > last)
 			last = _last_active;
 	}
 
 	if (last + peer->keepalive.timeout > time_uptime) {
 		callout_reset(&peer->ping_rcv,
 		    (peer->keepalive.timeout - (time_uptime - last)) * hz,
 		    ovpn_timeout, peer);
 		return;
 	}
 
 	CURVNET_SET(sc->ifp->if_vnet);
 	ret = _ovpn_del_peer(sc, peer->peerid);
 	MPASS(ret == 0);
 	CURVNET_RESTORE();
 }
 
 static int
 ovpn_set_peer(struct ifnet *ifp, const nvlist_t *nvl)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	struct ovpn_kpeer *peer;
 
 	if (nvl == NULL)
 		return (EINVAL);
 
 	if (! nvlist_exists_number(nvl, "interval") ||
 	    ! nvlist_exists_number(nvl, "timeout") ||
 	    ! nvlist_exists_number(nvl, "peerid"))
 		return (EINVAL);
 
 	OVPN_WLOCK(sc);
 
 	peer = ovpn_find_peer(sc, nvlist_get_number(nvl, "peerid"));
 	if (peer == NULL) {
 		OVPN_WUNLOCK(sc);
 		return (ENOENT);
 	}
 
 	peer->keepalive.interval = nvlist_get_number(nvl, "interval");
 	peer->keepalive.timeout = nvlist_get_number(nvl, "timeout");
 
 	if (peer->keepalive.interval > 0)
 		callout_reset(&peer->ping_send, peer->keepalive.interval * hz,
 		    ovpn_send_ping, peer);
 	if (peer->keepalive.timeout > 0)
 		callout_reset(&peer->ping_rcv, peer->keepalive.timeout * hz,
 		    ovpn_timeout, peer);
 
 	OVPN_WUNLOCK(sc);
 
 	return (0);
 }
 
 static int
 ovpn_ioctl_set(struct ifnet *ifp, struct ifdrv *ifd)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	uint8_t *buf = NULL;
 	nvlist_t *nvl = NULL;
 	int ret;
 
 	if (ifd->ifd_len != 0) {
 		if (ifd->ifd_len > OVPN_MAX_REQUEST_SIZE)
 			return (E2BIG);
 
 		buf = malloc(ifd->ifd_len, M_OVPN, M_WAITOK);
 
 		ret = copyin(ifd->ifd_data, buf, ifd->ifd_len);
 		if (ret != 0) {
 			free(buf, M_OVPN);
 			return (ret);
 		}
 
 		nvl = nvlist_unpack(buf, ifd->ifd_len, 0);
 		free(buf, M_OVPN);
 		if (nvl == NULL) {
 			return (EINVAL);
 		}
 	}
 
 	switch (ifd->ifd_cmd) {
 	case OVPN_NEW_PEER:
 		ret = ovpn_new_peer(ifp, nvl);
 		break;
 	case OVPN_DEL_PEER:
 		OVPN_WLOCK(sc);
 		ret = ovpn_del_peer(ifp, nvl);
 		OVPN_WUNLOCK(sc);
 		break;
 	case OVPN_NEW_KEY:
 		ret = ovpn_set_key(ifp, nvl);
 		break;
 	case OVPN_START_VPN:
 		ret = ovpn_start(ifp);
 		break;
 	case OVPN_SWAP_KEYS:
 		ret = ovpn_swap_keys(ifp, nvl);
 		break;
 	case OVPN_DEL_KEY:
 		ret = ovpn_del_key(ifp, nvl);
 		break;
 	case OVPN_SEND_PKT:
 		ret = ovpn_send_pkt(ifp, nvl);
 		break;
 	case OVPN_SET_PEER:
 		ret = ovpn_set_peer(ifp, nvl);
 		break;
 	default:
 		ret = ENOTSUP;
 	}
 
 	nvlist_destroy(nvl);
 	return (ret);
 }
 
 static int
 ovpn_add_counters(nvlist_t *parent, const char *name, counter_u64_t in,
     counter_u64_t out)
 {
 	nvlist_t *nvl;
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		return (ENOMEM);
 
 	nvlist_add_number(nvl, "in", counter_u64_fetch(in));
 	nvlist_add_number(nvl, "out", counter_u64_fetch(out));
 
 	nvlist_add_nvlist(parent, name, nvl);
 
 	nvlist_destroy(nvl);
 
 	return (0);
 }
 
 static int
 ovpn_get_stats(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	nvlist_t *nvl;
 	int ret;
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		return (ENOMEM);
 
 #define OVPN_COUNTER_OUT(name, in, out) \
 	do { \
 		ret = ovpn_add_counters(nvl, name, \
 		    sc->counters[offsetof(struct ovpn_counters, in) / \
 		    sizeof(uint64_t)], \
 		    sc->counters[offsetof(struct ovpn_counters, out) / \
 		    sizeof(uint64_t)]); \
 		if (ret != 0) \
 			goto error; \
 	} while(0)
 
 	OVPN_COUNTER_OUT("lost_ctrl", lost_ctrl_pkts_in, lost_ctrl_pkts_out);
 	OVPN_COUNTER_OUT("lost_data", lost_data_pkts_in, lost_data_pkts_out);
 	OVPN_COUNTER_OUT("nomem_data", nomem_data_pkts_in,
 	    nomem_data_pkts_out);
 	OVPN_COUNTER_OUT("data", received_data_pkts, sent_data_pkts);
 	OVPN_COUNTER_OUT("ctrl", received_ctrl_pkts, sent_ctrl_pkts);
 	OVPN_COUNTER_OUT("tunnel", tunnel_bytes_received,
 	    tunnel_bytes_received);
 	OVPN_COUNTER_OUT("transport", transport_bytes_received,
 	    transport_bytes_received);
 #undef OVPN_COUNTER_OUT
 
 	*onvl = nvl;
 
 	return (0);
 
 error:
 	nvlist_destroy(nvl);
 	return (ret);
 }
 
 static int
 ovpn_poll_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	nvlist_t *nvl;
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL)
 		return (ENOMEM);
 
 	nvlist_add_number(nvl, "pending",
 	    buf_ring_count(sc->rxring) + buf_ring_count(sc->notifring));
 
 	*onvl = nvl;
 
 	return (0);
 }
 
 static int
 opvn_get_pkt(struct ovpn_softc *sc, nvlist_t **onvl)
 {
 	struct ovpn_notification *n;
 	struct ovpn_wire_header *ohdr;
 	struct mbuf *m;
 	uint8_t *buf;
 	nvlist_t *nvl;
 	uint32_t peerid;
 	u_int mlength;
 
 	/* Check if we have notifications pending. */
 	n = buf_ring_dequeue_mc(sc->notifring);
 	if (n != NULL) {
 		nvl = nvlist_create(0);
 		if (nvl == NULL) {
 			free(n, M_OVPN);
 			return (ENOMEM);
 		}
 		nvlist_add_number(nvl, "peerid", n->peerid);
 		nvlist_add_number(nvl, "notification", n->type);
 		free(n, M_OVPN);
 
 		*onvl = nvl;
 
 		return (0);
 	}
 
 	/* Queued packet. */
 	m = buf_ring_dequeue_mc(sc->rxring);
 	if (m == NULL)
 		return (ENOENT);
 
 	mlength = m_length(m, NULL);
 	buf = malloc(mlength, M_NVLIST, M_WAITOK);
 	m_copydata(m, 0, mlength, buf);
 	ohdr = (struct ovpn_wire_header *)buf;
 	peerid = ntohl(ohdr->opcode) & 0x00ffffff;
 
 	nvl = nvlist_create(0);
 	if (nvl == NULL) {
 		OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_in, 1);
 		m_freem(m);
 		free(buf, M_NVLIST);
 		return (ENOMEM);
 	}
 
 	nvlist_move_binary(nvl, "packet", buf, mlength);
 	buf = NULL;
 	nvlist_add_number(nvl, "peerid", peerid);
 
 	*onvl = nvl;
 
 	m_freem(m);
 
 	return (0);
 }
 
 static int
 ovpn_ioctl_get(struct ifnet *ifp, struct ifdrv *ifd)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	nvlist_t *nvl = NULL;
 	int error;
 
 	switch (ifd->ifd_cmd) {
 	case OVPN_GET_STATS:
 		error = ovpn_get_stats(sc, &nvl);
 		break;
 	case OVPN_POLL_PKT:
 		error = ovpn_poll_pkt(sc, &nvl);
 		break;
 	case OVPN_GET_PKT:
 		error = opvn_get_pkt(sc, &nvl);
 		break;
 	default:
 		error = ENOTSUP;
 		break;
 	}
 
 	if (error == 0) {
 		void *packed = NULL;
 		size_t len;
 
 		MPASS(nvl != NULL);
 
 		packed = nvlist_pack(nvl, &len);
 		if (! packed) {
 			nvlist_destroy(nvl);
 			return (ENOMEM);
 		}
 
 		if (len > ifd->ifd_len) {
 			free(packed, M_NVLIST);
 			nvlist_destroy(nvl);
 			return (ENOSPC);
 		}
 
 		error = copyout(packed, ifd->ifd_data, len);
 		ifd->ifd_len = len;
 
 		free(packed, M_NVLIST);
 		nvlist_destroy(nvl);
 	}
 
 	return (error);
 }
 
 static int
 ovpn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifdrv *ifd;
 	int error;
 
 	switch (cmd) {
 	case SIOCSDRVSPEC:
 	case SIOCGDRVSPEC:
 		error = priv_check(curthread, PRIV_NET_OVPN);
 		if (error)
 			return (error);
 		break;
 	}
 
 	switch (cmd) {
 	case SIOCSDRVSPEC:
 		ifd = (struct ifdrv *)data;
 		error = ovpn_ioctl_set(ifp, ifd);
 		break;
 	case SIOCGDRVSPEC:
 		ifd = (struct ifdrv *)data;
 		error = ovpn_ioctl_get(ifp, ifd);
 		break;
 	case SIOCSIFMTU: {
 		struct ifreq *ifr = (struct ifreq *)data;
 		if (ifr->ifr_mtu < OVPN_MTU_MIN || ifr->ifr_mtu > OVPN_MTU_MAX)
 			return (EINVAL);
 
 		ifp->if_mtu = ifr->ifr_mtu;
 		return (0);
 	}
 	case SIOCSIFADDR:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 	case SIOCGIFMTU:
 	case SIOCSIFFLAGS:
 		return (0);
 	default:
 		error = EINVAL;
 	}
 
 	return (error);
 }
 
 static int
 ovpn_encrypt_tx_cb(struct cryptop *crp)
 {
 	struct ovpn_kpeer *peer = crp->crp_opaque;
 	struct ovpn_softc *sc = peer->sc;
 	struct mbuf *m = crp->crp_buf.cb_mbuf;
 	int ret;
 
 	if (crp->crp_etype != 0) {
 		crypto_freereq(crp);
 		ovpn_peer_release_ref(peer, false);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		m_freem(m);
 		return (0);
 	}
 
 	CURVNET_SET(sc->ifp->if_vnet);
 
 	MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF);
 
 	ret = ovpn_encap(sc, peer->peerid, m);
 	if (ret == 0) {
 		OVPN_COUNTER_ADD(sc, sent_data_pkts, 1);
 		OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, m->m_pkthdr.len -
 		    sizeof(struct ovpn_wire_header));
 	}
 
 	CURVNET_RESTORE();
 
 	crypto_freereq(crp);
 	ovpn_peer_release_ref(peer, false);
 
 	return (0);
 }
 
 static void
 ovpn_finish_rx(struct ovpn_softc *sc, struct mbuf *m,
     struct ovpn_kpeer *peer, struct ovpn_kkey *key, uint32_t seq,
     struct rm_priotracker *_ovpn_lock_trackerp)
 {
 	uint32_t af;
 	int ret __diagused;
 
 	OVPN_RASSERT(sc);
 
 	/* Replay protection. */
 	if (V_replay_protection && ! ovpn_check_replay(key->decrypt, seq)) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		return;
 	}
 
 	critical_enter();
 	*zpcpu_get(peer->last_active) = time_uptime;
 	critical_exit();
 
 	OVPN_RUNLOCK(sc);
 
 	OVPN_COUNTER_ADD(sc, received_data_pkts, 1);
 	OVPN_COUNTER_ADD(sc, tunnel_bytes_received, m->m_pkthdr.len);
 
 	/* Receive the packet on our interface. */
 	m->m_pkthdr.rcvif = sc->ifp;
 
 	/* Clear checksum flags in case the real hardware set them. */
 	m->m_pkthdr.csum_flags = 0;
 
 	/* Ensure we can read the first byte. */
 	m = m_pullup(m, 1);
 	if (m == NULL) {
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return;
 	}
 
 	/*
 	 * Check for address family, and disregard any control packets (e.g.
 	 * keepalive).
 	 */
 	af = ovpn_get_af(m);
 	if (af != 0) {
 		BPF_MTAP2(sc->ifp, &af, sizeof(af), m);
 		ret = netisr_dispatch(af == AF_INET ? NETISR_IP : NETISR_IPV6,
 		    m);
 		MPASS(ret == 0);
 	} else {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 	}
 }
 
 static struct ovpn_kkey *
 ovpn_find_key(struct ovpn_softc *sc, struct ovpn_kpeer *peer,
     const struct ovpn_wire_header *ohdr)
 {
 	struct ovpn_kkey *key = NULL;
 	uint8_t keyid;
 
 	OVPN_RASSERT(sc);
 
 	keyid = (ntohl(ohdr->opcode) >> 24) & 0x07;
 
 	if (peer->keys[0].keyid == keyid)
 		key = &peer->keys[0];
 	else if (peer->keys[1].keyid == keyid)
 		key = &peer->keys[1];
 
 	return (key);
 }
 
 static int
 ovpn_decrypt_rx_cb(struct cryptop *crp)
 {
 	struct ovpn_softc *sc = crp->crp_opaque;
 	struct mbuf *m = crp->crp_buf.cb_mbuf;
 	struct ovpn_kkey *key;
 	struct ovpn_kpeer *peer;
 	struct ovpn_wire_header *ohdr;
 	uint32_t peerid;
 
 	OVPN_RLOCK_TRACKER;
 
 	OVPN_RLOCK(sc);
 
 	MPASS(crp->crp_buf.cb_type == CRYPTO_BUF_MBUF);
 
 	if (crp->crp_etype != 0) {
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (0);
 	}
 
 	CURVNET_SET(sc->ifp->if_vnet);
 
 	ohdr = mtodo(m, sizeof(struct udphdr));
 
 	peerid = ntohl(ohdr->opcode) & 0x00ffffff;
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL) {
 		/* No such peer. Drop packet. */
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	key = ovpn_find_key(sc, peer, ohdr);
 	if (key == NULL) {
 		crypto_freereq(crp);
 		atomic_add_int(&sc->refcount, -1);
 		/*
 		 * Has this key been removed between us starting the decrypt
 		 * and finishing it?
 		 */
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		CURVNET_RESTORE();
 		return (0);
 	}
 
 	/* Now remove the outer headers */
 	m_adj_decap(m, sizeof(struct udphdr) +
 	    sizeof(struct ovpn_wire_header));
 
 	ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq), _ovpn_lock_trackerp);
 	OVPN_UNLOCK_ASSERT(sc);
 
 	CURVNET_RESTORE();
 
 	crypto_freereq(crp);
 	atomic_add_int(&sc->refcount, -1);
 
 	return (0);
 }
 
 static uint8_t EMPTY_BUFFER[AES_BLOCK_LEN];
 
 static int
 ovpn_get_af(struct mbuf *m)
 {
 	struct ip *ip;
 	struct ip6_hdr *ip6;
 
 	/*
 	 * We should pullup, but we're only interested in the first byte, so
 	 * that'll always be contiguous.
 	 */
 	ip = mtod(m, struct ip *);
 	if (ip->ip_v == IPVERSION)
 		return (AF_INET);
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	if (ip6->ip6_vfc == IPV6_VERSION)
 		return (AF_INET6);
 
 	return (0);
 }
 
 #ifdef INET
 static struct ovpn_kpeer *
 ovpn_find_peer_by_ip(struct ovpn_softc *sc, const struct in_addr addr)
 {
 	struct ovpn_kpeer *peer = NULL;
 
 	OVPN_ASSERT(sc);
 
 	for (int i = 0; i < OVPN_MAX_PEERS; i++) {
 		if (sc->peers[i] == NULL)
 			continue;
 		if (addr.s_addr == sc->peers[i]->vpn4.s_addr) {
 			peer = sc->peers[i];
 			break;
 		}
 	}
 
 	return (peer);
 }
 #endif
 
 #ifdef INET6
 static struct ovpn_kpeer *
 ovpn_find_peer_by_ip6(struct ovpn_softc *sc, const struct in6_addr *addr)
 {
 	struct ovpn_kpeer *peer = NULL;
 
 	OVPN_ASSERT(sc);
 
 	for (int i = 0; i < OVPN_MAX_PEERS; i++) {
 		if (sc->peers[i] == NULL)
 			continue;
 		if (memcmp(addr, &sc->peers[i]->vpn6, sizeof(*addr)) == 0) {
 			peer = sc->peers[i];
 			break;
 		}
 	}
 
 	return (peer);
 }
 #endif
 
 static struct ovpn_kpeer *
 ovpn_route_peer(struct ovpn_softc *sc, struct mbuf **m0,
     const struct sockaddr *dst)
 {
 	struct ovpn_kpeer *peer = NULL;
 	int af;
 
 	NET_EPOCH_ASSERT();
 	OVPN_ASSERT(sc);
 
 	/* Shortcut if we're a client (or are a server and have only one client). */
 	if (sc->peercount == 1)
 		return (ovpn_find_only_peer(sc));
 
 	if (dst != NULL)
 		af = dst->sa_family;
 	else
 		af = ovpn_get_af(*m0);
 
 	switch (af) {
 #ifdef INET
 	case AF_INET: {
 		const struct sockaddr_in *sa = (const struct sockaddr_in *)dst;
 		struct nhop_object *nh;
 		const struct in_addr *ip_dst;
 
 		if (sa != NULL) {
 			ip_dst = &sa->sin_addr;
 		} else {
 			struct ip *ip;
 
 			*m0 = m_pullup(*m0, sizeof(struct ip));
 			if (*m0 == NULL)
 				return (NULL);
 			ip = mtod(*m0, struct ip *);
 			ip_dst = &ip->ip_dst;
 		}
 
 		peer = ovpn_find_peer_by_ip(sc, *ip_dst);
 		SDT_PROBE2(if_ovpn, tx, route, ip4, ip_dst, peer);
 		if (peer == NULL) {
 			nh = fib4_lookup(M_GETFIB(*m0), *ip_dst, 0,
 			    NHR_NONE, 0);
 			if (nh && (nh->nh_flags & NHF_GATEWAY)) {
 				peer = ovpn_find_peer_by_ip(sc,
 				    nh->gw4_sa.sin_addr);
 				SDT_PROBE2(if_ovpn, tx, route, ip4,
 				    &nh->gw4_sa.sin_addr, peer);
 			}
 		}
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		const struct sockaddr_in6 *sa6 =
 		    (const struct sockaddr_in6 *)dst;
 		struct nhop_object *nh;
 		const struct in6_addr *ip6_dst;
 
 		if (sa6 != NULL) {
 			ip6_dst = &sa6->sin6_addr;
 		} else {
 			struct ip6_hdr *ip6;
 
 			*m0 = m_pullup(*m0, sizeof(struct ip6_hdr));
 			if (*m0 == NULL)
 				return (NULL);
 			ip6 = mtod(*m0, struct ip6_hdr *);
 			ip6_dst = &ip6->ip6_dst;
 		}
 
 		peer = ovpn_find_peer_by_ip6(sc, ip6_dst);
 		SDT_PROBE2(if_ovpn, tx, route, ip6, ip6_dst, peer);
 		if (peer == NULL) {
 			nh = fib6_lookup(M_GETFIB(*m0), ip6_dst, 0,
 			    NHR_NONE, 0);
 			if (nh && (nh->nh_flags & NHF_GATEWAY)) {
 				peer = ovpn_find_peer_by_ip6(sc,
 				    &nh->gw6_sa.sin6_addr);
 				SDT_PROBE2(if_ovpn, tx, route, ip6,
 				    &nh->gw6_sa.sin6_addr, peer);
 			}
 		}
 		break;
 	}
 #endif
 	}
 
 	return (peer);
 }
 
 static int
 ovpn_transmit(struct ifnet *ifp, struct mbuf *m)
 {
 	return (ifp->if_output(ifp, m, NULL, NULL));
 }
 
 static int
 ovpn_transmit_to_peer(struct ifnet *ifp, struct mbuf *m,
     struct ovpn_kpeer *peer, struct rm_priotracker *_ovpn_lock_trackerp)
 {
 	struct ovpn_wire_header *ohdr;
 	struct ovpn_kkey *key;
 	struct ovpn_softc *sc;
 	struct cryptop *crp;
 	uint32_t af, seq;
 	size_t len, real_len, ovpn_hdr_len;
 	int tunnel_len;
 	int ret;
 
 	sc = ifp->if_softc;
 
 	OVPN_RASSERT(sc);
 
 	tunnel_len = m->m_pkthdr.len;
 
 	key = &peer->keys[OVPN_KEY_SLOT_PRIMARY];
 	if (key->encrypt == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (ENOLINK);
 	}
 
 	af = ovpn_get_af(m);
 	/* Don't capture control packets. */
 	if (af != 0)
 		BPF_MTAP2(ifp, &af, sizeof(af), m);
 
 	real_len = len = m->m_pkthdr.len;
 	MPASS(real_len <= ifp->if_mtu);
 
 	ovpn_hdr_len = sizeof(struct ovpn_wire_header);
 	if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE)
 		ovpn_hdr_len -= 16; /* No auth tag. */
 	else {
 		/* Round up the len to a multiple of our block size. */
 		len = roundup2(real_len, AES_BLOCK_LEN);
 
 		/* Now extend the mbuf. */
 		m_append(m, len - real_len, EMPTY_BUFFER);
 	}
 
 	M_PREPEND(m, ovpn_hdr_len, M_NOWAIT);
 	if (m == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		return (ENOBUFS);
 	}
 	ohdr = mtod(m, struct ovpn_wire_header *);
 	ohdr->opcode = (OVPN_OP_DATA_V2 << OVPN_OP_SHIFT) | key->keyid;
 	ohdr->opcode <<= 24;
 	ohdr->opcode |= key->peerid;
 	ohdr->opcode = htonl(ohdr->opcode);
 
 	seq = atomic_fetchadd_32(&peer->tx_seq, 1);
 	seq = htonl(seq);
 	ohdr->seq = seq;
 
 	if (key->encrypt->cipher == OVPN_CIPHER_ALG_NONE) {
 		ret = ovpn_encap(sc, peer->peerid, m);
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		if (ret == 0) {
 			OVPN_COUNTER_ADD(sc, sent_data_pkts, 1);
 			OVPN_COUNTER_ADD(sc, tunnel_bytes_sent, tunnel_len);
 		}
 		return (ret);
 	}
 
 	crp = crypto_getreq(key->encrypt->cryptoid, M_NOWAIT);
 	if (crp == NULL) {
 		if (_ovpn_lock_trackerp != NULL)
 			OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* Encryption covers only the payload, not the header. */
 	crp->crp_payload_start = sizeof(*ohdr);
 	crp->crp_payload_length = len;
 	crp->crp_op = CRYPTO_OP_ENCRYPT;
 
 	/*
 	 * AAD data covers the ovpn_wire_header minus the auth
 	 * tag.
 	 */
 	crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 	crp->crp_aad = ohdr;
 	crp->crp_aad_start = 0;
 	crp->crp_op |= CRYPTO_OP_COMPUTE_DIGEST;
 	crp->crp_digest_start = offsetof(struct ovpn_wire_header, auth_tag);
 
 	crp->crp_flags |= CRYPTO_F_IV_SEPARATE;
 	memcpy(crp->crp_iv, &seq, sizeof(seq));
 	memcpy(crp->crp_iv + sizeof(seq), key->encrypt->nonce,
 	    key->encrypt->noncelen);
 
 	crypto_use_mbuf(crp, m);
 	crp->crp_flags |= CRYPTO_F_CBIFSYNC;
 	crp->crp_callback = ovpn_encrypt_tx_cb;
 	crp->crp_opaque = peer;
 
 	atomic_add_int(&peer->refcount, 1);
 	if (_ovpn_lock_trackerp != NULL)
 		OVPN_RUNLOCK(sc);
 	ret = crypto_dispatch(crp);
 	if (ret) {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 	}
 
 	return (ret);
 }
 
 /*
  * Note: Expects to hold the read lock on entry, and will release it itself.
  */
 static int
 ovpn_encap(struct ovpn_softc *sc, uint32_t peerid, struct mbuf *m)
 {
 	struct udphdr *udp;
 	struct ovpn_kpeer *peer;
 	int len;
 
 	OVPN_RLOCK_TRACKER;
 
 	OVPN_RLOCK(sc);
 	NET_EPOCH_ASSERT();
 
 	peer = ovpn_find_peer(sc, peerid);
 	if (peer == NULL || sc->ifp->if_link_state != LINK_STATE_UP) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	len = m->m_pkthdr.len;
 
 	M_PREPEND(m, sizeof(struct udphdr), M_NOWAIT);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 		m_freem(m);
 		return (ENOBUFS);
 	}
 	udp = mtod(m, struct udphdr *);
 
 	MPASS(peer->local.ss_family == peer->remote.ss_family);
 
 	udp->uh_sport = ovpn_get_port(&peer->local);
 	udp->uh_dport = ovpn_get_port(&peer->remote);
 	udp->uh_ulen = htons(sizeof(struct udphdr) + len);
 
 	switch (peer->remote.ss_family) {
 #ifdef INET
 	case AF_INET: {
 		struct sockaddr_in *in_local = TO_IN(&peer->local);
 		struct sockaddr_in *in_remote = TO_IN(&peer->remote);
 		struct ip *ip;
 
 		/*
 		 * This requires knowing the source IP, which we don't. Happily
 		 * we're allowed to keep this at 0, and the checksum won't do
 		 * anything the crypto won't already do.
 		 */
 		udp->uh_sum = 0;
 
 		/* Set the checksum flags so we recalculate checksums. */
 		m->m_pkthdr.csum_flags |= CSUM_IP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 
 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 		ip = mtod(m, struct ip *);
 
 		ip->ip_tos = 0;
 		ip->ip_len = htons(sizeof(struct ip) + sizeof(struct udphdr) +
 		   len);
 		ip->ip_off = 0;
 		ip->ip_ttl = V_ip_defttl;
 		ip->ip_p = IPPROTO_UDP;
 		ip->ip_sum = 0;
 		if (in_local->sin_port != 0)
 			ip->ip_src = in_local->sin_addr;
 		else
 			ip->ip_src.s_addr = INADDR_ANY;
 		ip->ip_dst = in_remote->sin_addr;
 
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len);
 
 		return (ip_output(m, NULL, NULL, 0, NULL, NULL));
 	}
 #endif
 #ifdef INET6
 	case AF_INET6: {
 		struct sockaddr_in6 *in6_local = TO_IN6(&peer->local);
 		struct sockaddr_in6 *in6_remote = TO_IN6(&peer->remote);
 		struct ip6_hdr *ip6;
 
 		M_PREPEND(m, sizeof(struct ip6_hdr), M_NOWAIT);
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 		m = m_pullup(m, sizeof(*ip6) + sizeof(*udp));
 		if (m == NULL) {
 			OVPN_RUNLOCK(sc);
 			OVPN_COUNTER_ADD(sc, nomem_data_pkts_out, 1);
 			return (ENOBUFS);
 		}
 
 		ip6 = mtod(m, struct ip6_hdr *);
 
 		ip6->ip6_vfc = IPV6_VERSION;
 		ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
 		ip6->ip6_plen = htons(sizeof(*ip6) + sizeof(struct udphdr) +
 		    len);
 		ip6->ip6_nxt = IPPROTO_UDP;
 		ip6->ip6_hlim = V_ip6_defhlim;
 
 		memcpy(&ip6->ip6_src, &in6_local->sin6_addr,
 		    sizeof(ip6->ip6_src));
 		memcpy(&ip6->ip6_dst, &in6_remote->sin6_addr,
 		    sizeof(ip6->ip6_dst));
 
 		udp = mtodo(m, sizeof(*ip6));
 		udp->uh_sum = in6_cksum_pseudo(ip6,
 		    m->m_pkthdr.len - sizeof(struct ip6_hdr),
 		    IPPROTO_UDP, 0);
 
 		m->m_pkthdr.csum_flags |= CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, transport_bytes_sent, m->m_pkthdr.len);
 
 		return (ip6_output(m, NULL, NULL, IPV6_UNSPECSRC, NULL, NULL,
 		    NULL));
 	}
 #endif
 	default:
 		panic("Unsupported address family %d",
 		    peer->remote.ss_family);
 	}
 }
 
 static int
 ovpn_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
 	struct route *ro)
 {
 	struct ovpn_softc *sc;
 	struct ovpn_kpeer *peer;
 
 	OVPN_RLOCK_TRACKER;
 
 	sc = ifp->if_softc;
 
 	OVPN_RLOCK(sc);
 
 	SDT_PROBE1(if_ovpn, tx, transmit, start, m);
 
 	if (__predict_false(ifp->if_link_state != LINK_STATE_UP)) {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	/**
 	 * Only obey 'dst' (i.e. the gateway) if no route is supplied.
 	 * That's our indication that we're being called through pf's route-to,
 	 * and we should route according to 'dst' instead. We can't do so
 	 * consistently, because the usual openvpn configuration sets the first
 	 * non-server IP in the subnet as the gateway. If we always use that
 	 * one we'd end up routing all traffic to the first client.
 	 * tl;dr: 'ro == NULL' tells us pf is doing a route-to, and then but
 	 * only then, we should treat 'dst' as the destination. */
 	peer = ovpn_route_peer(sc, &m, ro == NULL ? dst : NULL);
 	if (peer == NULL) {
 		/* No destination. */
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_out, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (ENETDOWN);
 	}
 
 	return (ovpn_transmit_to_peer(ifp, m, peer, _ovpn_lock_trackerp));
 }
 
 static void
 ovpn_rcv_ctrl(struct ovpn_softc *sc, struct mbuf *m, int off)
 {
 	/* Lop off the IP and UDP headers */
 	m_adj_decap(m, off);
 
 	/* Keep in the local ring until userspace fetches it. */
 	if (buf_ring_enqueue(sc->rxring, m) != 0) {
 		OVPN_COUNTER_ADD(sc, lost_ctrl_pkts_in, 1);
 		m_freem(m);
 		return;
 	}
 
 	OVPN_COUNTER_ADD(sc, received_ctrl_pkts, 1);
 }
 
 static bool
 ovpn_check_replay(struct ovpn_kkey_dir *key, uint32_t seq)
 {
 	uint32_t d;
 
 	mtx_lock(&key->replay_mtx);
 
 	/* Sequence number must be strictly greater than rx_seq */
 	if (seq <= key->rx_seq) {
 		mtx_unlock(&key->replay_mtx);
 		return (false);
 	}
 
 	/* Large jump. The packet authenticated okay, so just accept that. */
 	if (seq > (key->rx_seq + (sizeof(key->rx_window) * 8))) {
 		key->rx_seq = seq;
 		key->rx_window = 0;
 		mtx_unlock(&key->replay_mtx);
 		return (true);
 	}
 
 	/* Happy case. */
 	if ((seq == key->rx_seq + 1) && key->rx_window == 0) {
 		key->rx_seq++;
 		mtx_unlock(&key->replay_mtx);
 		return (true);
 	}
 
 	d = seq - key->rx_seq - 1;
 
 	if (key->rx_window & ((uint64_t)1 << d)) {
 		/* Dupe! */
 		mtx_unlock(&key->replay_mtx);
 		return (false);
 	}
 
 	key->rx_window |= (uint64_t)1 << d;
 
 	while (key->rx_window & 1) {
 		key->rx_seq++;
 		key->rx_window >>= 1;
 	}
 
 	mtx_unlock(&key->replay_mtx);
 
 	return (true);
 }
 
 static struct ovpn_kpeer *
 ovpn_peer_from_mbuf(struct ovpn_softc *sc, struct mbuf *m, int off)
 {
 	struct ovpn_wire_header ohdr;
 	uint32_t peerid;
 	const size_t hdrlen = sizeof(ohdr) - sizeof(ohdr.auth_tag);
 
 	OVPN_RASSERT(sc);
 
 	if (m_length(m, NULL) < (off + sizeof(struct udphdr) + hdrlen))
 		return (NULL);
 
 	m_copydata(m, off + sizeof(struct udphdr), hdrlen, (caddr_t)&ohdr);
 
 	peerid = ntohl(ohdr.opcode) & 0x00ffffff;
 
 	return (ovpn_find_peer(sc, peerid));
 }
 
 static bool
 ovpn_udp_input(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct ovpn_softc *sc = ctx;
 	struct ovpn_wire_header *ohdr;
 	struct udphdr *uhdr;
 	struct ovpn_kkey *key;
 	struct cryptop *crp;
 	struct ovpn_kpeer *peer;
 	size_t ohdrlen;
 	int ret;
 	uint8_t op;
 
 	OVPN_RLOCK_TRACKER;
 
 	M_ASSERTPKTHDR(m);
 
 	OVPN_COUNTER_ADD(sc, transport_bytes_received, m->m_pkthdr.len - off);
 
 	ohdrlen = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 
 	OVPN_RLOCK(sc);
 
 	peer = ovpn_peer_from_mbuf(sc, m, off);
 	if (peer == NULL) {
 		OVPN_RUNLOCK(sc);
 		return (false);
 	}
 
 	m = m_pullup(m, off + sizeof(*uhdr) + ohdrlen);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return (true);
 	}
 	uhdr = mtodo(m, off);
 	ohdr = mtodo(m, off + sizeof(*uhdr));
 
 	op = ntohl(ohdr->opcode) >> 24 >> OVPN_OP_SHIFT;
 
 	/*
 	 * Simplify things by getting rid of the preceding headers, we don't
 	 * care about them.
 	 */
 	m_adj_decap(m, off);
 
 	uhdr = mtodo(m, 0);
 	ohdr = mtodo(m, sizeof(*uhdr));
 
 	if (op != OVPN_OP_DATA_V2) {
 		OVPN_RUNLOCK(sc);
 		ovpn_rcv_ctrl(sc, m, sizeof(struct udphdr));
 		INP_WLOCK(inp);
 		udp_notify(inp, EAGAIN);
 		INP_WUNLOCK(inp);
 		return (true);
 	}
 
 	key = ovpn_find_key(sc, peer, ohdr);
 	if (key == NULL || key->decrypt == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 		m_freem(m);
 		return (true);
 	}
 
 	if (key->decrypt->cipher == OVPN_CIPHER_ALG_NONE) {
 		/* Now remove the outer headers */
 		m_adj_decap(m, sizeof(struct udphdr) + ohdrlen);
 
 		ohdr = mtodo(m, sizeof(*uhdr));
 
 		ovpn_finish_rx(sc, m, peer, key, ntohl(ohdr->seq),
 		    _ovpn_lock_trackerp);
 		OVPN_UNLOCK_ASSERT(sc);
 		return (true);
 	}
 
 	ohdrlen += sizeof(ohdr->auth_tag);
 
 	m = m_pullup(m, sizeof(*uhdr) + ohdrlen);
 	if (m == NULL) {
 		OVPN_RUNLOCK(sc);
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		return (true);
 	}
 	uhdr = mtodo(m, 0);
 	ohdr = mtodo(m, sizeof(*uhdr));
 
 	/* Decrypt */
 	crp = crypto_getreq(key->decrypt->cryptoid, M_NOWAIT);
 	if (crp == NULL) {
 		OVPN_COUNTER_ADD(sc, nomem_data_pkts_in, 1);
 		OVPN_RUNLOCK(sc);
 		m_freem(m);
 		return (true);
 	}
 
 	crp->crp_payload_start = sizeof(struct udphdr) + sizeof(*ohdr);
 	crp->crp_payload_length = ntohs(uhdr->uh_ulen) -
 	    sizeof(*uhdr) - sizeof(*ohdr);
 	crp->crp_op = CRYPTO_OP_DECRYPT;
 
 	/* AAD validation. */
 	crp->crp_aad_length = sizeof(*ohdr) - sizeof(ohdr->auth_tag);
 	crp->crp_aad = ohdr;
 	crp->crp_aad_start = 0;
 	crp->crp_op |= CRYPTO_OP_VERIFY_DIGEST;
 	crp->crp_digest_start = sizeof(struct udphdr) +
 	    offsetof(struct ovpn_wire_header, auth_tag);
 
 	crp->crp_flags |= CRYPTO_F_IV_SEPARATE;
 	memcpy(crp->crp_iv, &ohdr->seq, sizeof(ohdr->seq));
 	memcpy(crp->crp_iv + sizeof(ohdr->seq), key->decrypt->nonce,
 	    key->decrypt->noncelen);
 
 	crypto_use_mbuf(crp, m);
 	crp->crp_flags |= CRYPTO_F_CBIFSYNC;
 	crp->crp_callback = ovpn_decrypt_rx_cb;
 	crp->crp_opaque = sc;
 
 	atomic_add_int(&sc->refcount, 1);
 	OVPN_RUNLOCK(sc);
 	ret = crypto_dispatch(crp);
 	if (ret != 0) {
 		OVPN_COUNTER_ADD(sc, lost_data_pkts_in, 1);
 	}
 
 	return (true);
 }
 
 static void
 ovpn_qflush(struct ifnet *ifp __unused)
 {
 
 }
 
 static void
 ovpn_flush_rxring(struct ovpn_softc *sc)
 {
 	struct mbuf *m;
 	struct ovpn_notification *n;
 
 	OVPN_WASSERT(sc);
 
 	while (! buf_ring_empty(sc->rxring)) {
 		m = buf_ring_dequeue_sc(sc->rxring);
 		m_freem(m);
 	}
 
 	while (! buf_ring_empty(sc->notifring)) {
 		n = buf_ring_dequeue_sc(sc->notifring);
 		free(n, M_OVPN);
 	}
 }
 
 #ifdef VIMAGE
 static void
 ovpn_reassign(struct ifnet *ifp, struct vnet *new_vnet __unused,
     char *unused __unused)
 {
 	struct ovpn_softc *sc = ifp->if_softc;
 	int i;
 	int ret __diagused;
 
 	i = 0;
 
 	OVPN_WLOCK(sc);
 
 	/* Flush keys & configuration. */
 	do {
 		if (sc->peers[i] != NULL) {
 			ret = _ovpn_del_peer(sc, sc->peers[i]->peerid);
 			MPASS(ret == 0);
 		}
 		i++;
 	} while (i < OVPN_MAX_PEERS);
 
 	ovpn_flush_rxring(sc);
 
 	OVPN_WUNLOCK(sc);
 }
 #endif
 
 static int
 ovpn_clone_match(struct if_clone *ifc, const char *name)
 {
 	/*
 	 * Allow all names that start with 'ovpn', specifically because pfSense
 	 * uses ovpnc1 / ovpns2
 	 */
 	return (strncmp(ovpnname, name, strlen(ovpnname)) == 0);
 }
 
 static int
 ovpn_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	struct ovpn_softc *sc;
 	struct ifnet *ifp;
 	char *dp;
 	int error, unit, wildcard;
 
 	/* Try to see if a special unit was requested. */
 	error = ifc_name2unit(name, &unit);
 	if (error != 0)
 		return (error);
 	wildcard = (unit < 0);
 
 	error = ifc_alloc_unit(ifc, &unit);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If no unit had been given, we need to adjust the ifName.
 	 */
 	for (dp = name; *dp != '\0'; dp++);
 	if (wildcard) {
 		error = snprintf(dp, len - (dp - name), "%d", unit);
 		if (error > len - (dp - name)) {
 			/* ifName too long. */
 			ifc_free_unit(ifc, unit);
 			return (ENOSPC);
 		}
 		dp += error;
 	}
 
 	/* Make sure it doesn't already exist. */
 	if (ifunit(name) != NULL)
 		return (EEXIST);
 
 	sc = malloc(sizeof(struct ovpn_softc), M_OVPN, M_WAITOK | M_ZERO);
 	sc->ifp = if_alloc(IFT_ENC);
 	rm_init_flags(&sc->lock, "if_ovpn_lock", RM_RECURSE);
 	sc->refcount = 0;
 
 	sc->rxring = buf_ring_alloc(32, M_OVPN, M_WAITOK, NULL);
 	sc->notifring = buf_ring_alloc(32, M_OVPN, M_WAITOK, NULL);
 
 	COUNTER_ARRAY_ALLOC(sc->counters, OVPN_COUNTER_SIZE, M_WAITOK);
 
 	ifp = sc->ifp;
 	ifp->if_softc = sc;
 	strlcpy(ifp->if_xname, name, IFNAMSIZ);
 	ifp->if_dname = ovpngroupname;
 	ifp->if_dunit = unit;
 
 	ifp->if_addrlen = 0;
 	ifp->if_mtu = 1428;
 	ifp->if_flags = IFF_POINTOPOINT | IFF_MULTICAST;
 	ifp->if_ioctl = ovpn_ioctl;
 	ifp->if_transmit = ovpn_transmit;
 	ifp->if_output = ovpn_output;
 	ifp->if_qflush = ovpn_qflush;
 #ifdef VIMAGE
 	ifp->if_reassign = ovpn_reassign;
 #endif
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if_attach(ifp);
 	bpfattach(ifp, DLT_NULL, sizeof(uint32_t));
 
 	return (0);
 }
 
 static void
 ovpn_clone_destroy_cb(struct epoch_context *ctx)
 {
 	struct ovpn_softc *sc;
 
 	sc = __containerof(ctx, struct ovpn_softc, epoch_ctx);
 
 	MPASS(sc->peercount == 0);
 	for (int i = 0; i < OVPN_MAX_PEERS; i++) {
 		MPASS(sc->peers[i] == NULL);
 	}
 
 	COUNTER_ARRAY_FREE(sc->counters, OVPN_COUNTER_SIZE);
 
 	if_free(sc->ifp);
 	free(sc, M_OVPN);
 }
 
 static int
 ovpn_clone_destroy(struct if_clone *ifc, struct ifnet *ifp)
 {
 	struct ovpn_softc *sc;
 	int unit;
 	int i;
 	int ret __diagused;
 
 	sc = ifp->if_softc;
 	unit = ifp->if_dunit;
 
 	OVPN_WLOCK(sc);
 
 	if (atomic_load_int(&sc->refcount) > 0) {
 		OVPN_WUNLOCK(sc);
 		return (EBUSY);
 	}
 
 	i = 0;
 	do {
 		if (sc->peers[i] != NULL) {
 			ret = _ovpn_del_peer(sc, sc->peers[i]->peerid);
 			MPASS(ret == 0);
 		}
 		i++;
 	} while (i < OVPN_MAX_PEERS);
 
 	ovpn_flush_rxring(sc);
 	buf_ring_free(sc->rxring, M_OVPN);
 	buf_ring_free(sc->notifring, M_OVPN);
 
 	OVPN_WUNLOCK(sc);
 
 	bpfdetach(ifp);
 	if_detach(ifp);
 	ifp->if_softc = NULL;
 
 	NET_EPOCH_CALL(ovpn_clone_destroy_cb, &sc->epoch_ctx);
 
 	if (unit != IF_DUNIT_NONE)
 		ifc_free_unit(ifc, unit);
 
 	return (0);
 }
 
 static void
 vnet_ovpn_init(const void *unused __unused)
 {
 	V_ovpn_cloner = if_clone_advanced(ovpngroupname, 0, ovpn_clone_match,
 	    ovpn_clone_create, ovpn_clone_destroy);
 }
 VNET_SYSINIT(vnet_ovpn_init, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ovpn_init, NULL);
 
 static void
 vnet_ovpn_uninit(const void *unused __unused)
 {
 	if_clone_detach(V_ovpn_cloner);
 }
 VNET_SYSUNINIT(vnet_ovpn_uninit, SI_SUB_PSEUDO, SI_ORDER_ANY,
     vnet_ovpn_uninit, NULL);
 
 static int
 ovpnmodevent(module_t mod, int type, void *data)
 {
 	switch (type) {
 	case MOD_LOAD:
 		/* Done in vnet_ovpn_init() */
 		break;
 	case MOD_UNLOAD:
 		/* Done in vnet_ovpn_uninit() */
 		break;
 	default:
 		return (EOPNOTSUPP);
 	}
 
 	return (0);
 }
 
 static moduledata_t ovpn_mod = {
 	"if_ovpn",
 	ovpnmodevent,
 	0
 };
 
 DECLARE_MODULE(if_ovpn, ovpn_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_ovpn, 1);
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index f0fcc7ab9004..1ddb72d926b9 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -1,2676 +1,2669 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1988, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)rtsock.c	8.7 (Berkeley) 10/12/95
  * $FreeBSD$
  */
 #include "opt_ddb.h"
 #include "opt_route.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/domain.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/if_llatbl.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/route/route_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip_carp.h>
 #ifdef INET6
 #include <netinet6/in6_var.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 #include <net/route/nhop.h>
 
 #define	DEBUG_MOD_NAME	rtsock
 #define	DEBUG_MAX_LEVEL	LOG_DEBUG
 #include <net/route/route_debug.h>
 _DECLARE_DEBUG(LOG_INFO);
 
 #ifdef COMPAT_FREEBSD32
 #include <sys/mount.h>
 #include <compat/freebsd32/freebsd32.h>
 
 struct if_msghdr32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	struct	if_data ifm_data;
 };
 
 struct if_msghdrl32 {
 	uint16_t ifm_msglen;
 	uint8_t	ifm_version;
 	uint8_t	ifm_type;
 	int32_t	ifm_addrs;
 	int32_t	ifm_flags;
 	uint16_t ifm_index;
 	uint16_t _ifm_spare1;
 	uint16_t ifm_len;
 	uint16_t ifm_data_off;
 	uint32_t _ifm_spare2;
 	struct	if_data ifm_data;
 };
 
 struct ifa_msghdrl32 {
 	uint16_t ifam_msglen;
 	uint8_t	ifam_version;
 	uint8_t	ifam_type;
 	int32_t	ifam_addrs;
 	int32_t	ifam_flags;
 	uint16_t ifam_index;
 	uint16_t _ifam_spare1;
 	uint16_t ifam_len;
 	uint16_t ifam_data_off;
 	int32_t	ifam_metric;
 	struct	if_data ifam_data;
 };
 
 #define SA_SIZE32(sa)						\
     (  (((struct sockaddr *)(sa))->sa_len == 0) ?		\
 	sizeof(int)		:				\
 	1 + ( (((struct sockaddr *)(sa))->sa_len - 1) | (sizeof(int) - 1) ) )
 
 #endif /* COMPAT_FREEBSD32 */
 
 struct linear_buffer {
 	char		*base;	/* Base allocated memory pointer */
 	uint32_t	offset;	/* Currently used offset */
 	uint32_t	size;	/* Total buffer size */
 };
 #define	SCRATCH_BUFFER_SIZE	1024
 
 #define	RTS_PID_LOG(_l, _fmt, ...)	RT_LOG_##_l(_l, "PID %d: " _fmt, curproc ? curproc->p_pid : 0, ## __VA_ARGS__)
 
 MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
 
 /* NB: these are not modified */
 static struct	sockaddr route_src = { 2, PF_ROUTE, };
 static struct	sockaddr sa_zero   = { sizeof(sa_zero), AF_INET, };
 
 /* These are external hooks for CARP. */
 int	(*carp_get_vhid_p)(struct ifaddr *);
 
 /*
  * Used by rtsock callback code to decide whether to filter the update
  * notification to a socket bound to a particular FIB.
  */
 #define	RTS_FILTER_FIB	M_PROTO8
 /*
  * Used to store address family of the notification.
  */
 #define	m_rtsock_family	m_pkthdr.PH_loc.eight[0]
 
 struct rcb {
 	LIST_ENTRY(rcb) list;
 	struct socket	*rcb_socket;
 	sa_family_t	rcb_family;
 };
 
 typedef struct {
 	LIST_HEAD(, rcb)	cblist;
 	int	ip_count;	/* attached w/ AF_INET */
 	int	ip6_count;	/* attached w/ AF_INET6 */
 	int	any_count;	/* total attached */
 } route_cb_t;
 VNET_DEFINE_STATIC(route_cb_t, route_cb);
 #define	V_route_cb VNET(route_cb)
 
 struct mtx rtsock_mtx;
 MTX_SYSINIT(rtsock, &rtsock_mtx, "rtsock route_cb lock", MTX_DEF);
 
 #define	RTSOCK_LOCK()	mtx_lock(&rtsock_mtx)
 #define	RTSOCK_UNLOCK()	mtx_unlock(&rtsock_mtx)
 #define	RTSOCK_LOCK_ASSERT()	mtx_assert(&rtsock_mtx, MA_OWNED)
 
 SYSCTL_NODE(_net, OID_AUTO, route, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, "");
 
 struct walkarg {
 	int	family;
 	int	w_tmemsize;
 	int	w_op, w_arg;
 	caddr_t	w_tmem;
 	struct sysctl_req *w_req;
 	struct sockaddr *dst;
 	struct sockaddr *mask;
 };
 
 static void	rts_input(struct mbuf *m);
 static struct mbuf *rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo);
 static int	rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo,
 			struct walkarg *w, int *plen);
 static int	rt_xaddrs(caddr_t cp, caddr_t cplim,
 			struct rt_addrinfo *rtinfo);
 static int	cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb);
 static int	sysctl_dumpentry(struct rtentry *rt, void *vw);
 static int	sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh,
 			uint32_t weight, struct walkarg *w);
 static int	sysctl_iflist(int af, struct walkarg *w);
 static int	sysctl_ifmalist(int af, struct walkarg *w);
 static void	rt_getmetrics(const struct rtentry *rt,
 			const struct nhop_object *nh, struct rt_metrics *out);
 static void	rt_dispatch(struct mbuf *, sa_family_t);
 static void	rt_ifannouncemsg(struct ifnet *ifp, int what);
 static int	handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
 			struct rt_msghdr *rtm, struct rib_cmd_info *rc);
 static int	update_rtm_from_rc(struct rt_addrinfo *info,
 			struct rt_msghdr **prtm, int alloc_len,
 			struct rib_cmd_info *rc, struct nhop_object *nh);
 static void	send_rtm_reply(struct socket *so, struct rt_msghdr *rtm,
 			struct mbuf *m, sa_family_t saf, u_int fibnum,
 			int rtm_errno);
 static bool	can_export_rte(struct ucred *td_ucred, bool rt_is_host,
 			const struct sockaddr *rt_dst);
 
 static struct netisr_handler rtsock_nh = {
 	.nh_name = "rtsock",
 	.nh_handler = rts_input,
 	.nh_proto = NETISR_ROUTE,
 	.nh_policy = NETISR_POLICY_SOURCE,
 };
 
 static int
 sysctl_route_netisr_maxqlen(SYSCTL_HANDLER_ARGS)
 {
 	int error, qlimit;
 
 	netisr_getqlimit(&rtsock_nh, &qlimit);
 	error = sysctl_handle_int(oidp, &qlimit, 0, req);
         if (error || !req->newptr)
                 return (error);
 	if (qlimit < 1)
 		return (EINVAL);
 	return (netisr_setqlimit(&rtsock_nh, qlimit));
 }
 SYSCTL_PROC(_net_route, OID_AUTO, netisr_maxqlen,
     CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, sysctl_route_netisr_maxqlen, "I",
     "maximum routing socket dispatch queue length");
 
 static void
 vnet_rts_init(void)
 {
 	int tmp;
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		if (TUNABLE_INT_FETCH("net.route.netisr_maxqlen", &tmp))
 			rtsock_nh.nh_qlimit = tmp;
 		netisr_register(&rtsock_nh);
 	}
 #ifdef VIMAGE
 	 else
 		netisr_register_vnet(&rtsock_nh);
 #endif
 }
 VNET_SYSINIT(vnet_rtsock, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_init, 0);
 
 #ifdef VIMAGE
 static void
 vnet_rts_uninit(void)
 {
 
 	netisr_unregister_vnet(&rtsock_nh);
 }
 VNET_SYSUNINIT(vnet_rts_uninit, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     vnet_rts_uninit, 0);
 #endif
 
 static void
 rts_handle_ifnet_arrival(void *arg __unused, struct ifnet *ifp)
 {
 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
 }
 EVENTHANDLER_DEFINE(ifnet_arrival_event, rts_handle_ifnet_arrival, NULL, 0);
 
 static void
 rts_handle_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 {
 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
 }
 EVENTHANDLER_DEFINE(ifnet_departure_event, rts_handle_ifnet_departure, NULL, 0);
 
 static void
 rts_append_data(struct socket *so, struct mbuf *m)
 {
 
 	if (sbappendaddr(&so->so_rcv, &route_src, m, NULL) == 0) {
 		soroverflow(so);
 		m_freem(m);
 	} else
 		sorwakeup(so);
 }
 
 static void
 rts_input(struct mbuf *m)
 {
 	struct rcb *rcb;
 	struct socket *last;
 
 	last = NULL;
 	RTSOCK_LOCK();
 	LIST_FOREACH(rcb, &V_route_cb.cblist, list) {
 		if (rcb->rcb_family != AF_UNSPEC &&
 		    rcb->rcb_family != m->m_rtsock_family)
 			continue;
 		if ((m->m_flags & RTS_FILTER_FIB) &&
 		    M_GETFIB(m) != rcb->rcb_socket->so_fibnum)
 			continue;
 		if (last != NULL) {
 			struct mbuf *n;
 
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (n != NULL)
 				rts_append_data(last, n);
 		}
 		last = rcb->rcb_socket;
 	}
 	if (last != NULL)
 		rts_append_data(last, m);
 	else
 		m_freem(m);
 	RTSOCK_UNLOCK();
 }
 
 static void
 rts_close(struct socket *so)
 {
 
 	soisdisconnected(so);
 }
 
 static SYSCTL_NODE(_net, OID_AUTO, rtsock, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Routing socket infrastructure");
 static u_long rts_sendspace = 8192;
 SYSCTL_ULONG(_net_rtsock, OID_AUTO, sendspace, CTLFLAG_RW, &rts_sendspace, 0,
     "Default routing socket send space");
 static u_long rts_recvspace = 8192;
 SYSCTL_ULONG(_net_rtsock, OID_AUTO, recvspace, CTLFLAG_RW, &rts_recvspace, 0,
     "Default routing socket receive space");
 
 static int
 rts_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct rcb *rcb;
 	int error;
 
 	error = soreserve(so, rts_sendspace, rts_recvspace);
 	if (error)
 		return (error);
 
 	rcb = malloc(sizeof(*rcb), M_PCB, M_WAITOK);
 	rcb->rcb_socket = so;
 	rcb->rcb_family = proto;
 
 	so->so_pcb = rcb;
 	so->so_fibnum = td->td_proc->p_fibnum;
 	so->so_options |= SO_USELOOPBACK;
 
 	RTSOCK_LOCK();
 	LIST_INSERT_HEAD(&V_route_cb.cblist, rcb, list);
 	switch (proto) {
 	case AF_INET:
 		V_route_cb.ip_count++;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count++;
 		break;
 	}
 	V_route_cb.any_count++;
 	RTSOCK_UNLOCK();
 	soisconnected(so);
 
 	return (0);
 }
 
 static void
 rts_detach(struct socket *so)
 {
 	struct rcb *rcb = so->so_pcb;
 
 	RTSOCK_LOCK();
 	LIST_REMOVE(rcb, list);
 	switch(rcb->rcb_family) {
 	case AF_INET:
 		V_route_cb.ip_count--;
 		break;
 	case AF_INET6:
 		V_route_cb.ip6_count--;
 		break;
 	}
 	V_route_cb.any_count--;
 	RTSOCK_UNLOCK();
 	free(rcb, M_PCB);
 	so->so_pcb = NULL;
 }
 
 static int
 rts_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 	return (0);
 }
 
 #ifndef _SOCKADDR_UNION_DEFINED
 #define	_SOCKADDR_UNION_DEFINED
 /*
  * The union of all possible address formats we handle.
  */
 union sockaddr_union {
 	struct sockaddr		sa;
 	struct sockaddr_in	sin;
 	struct sockaddr_in6	sin6;
 };
 #endif /* _SOCKADDR_UNION_DEFINED */
 
 static int
 rtm_get_jailed(struct rt_addrinfo *info, struct ifnet *ifp,
     struct nhop_object *nh, union sockaddr_union *saun, struct ucred *cred)
 {
 #if defined(INET) || defined(INET6)
 	struct epoch_tracker et;
 #endif
 
 	/* First, see if the returned address is part of the jail. */
 	if (prison_if(cred, nh->nh_ifa->ifa_addr) == 0) {
 		info->rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
 		return (0);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 	{
 		struct in_addr ia;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET)
 				continue;
 			ia = ((struct sockaddr_in *)sa)->sin_addr;
 			if (prison_check_ip4(cred, &ia) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia = ((struct sockaddr_in *)nh->nh_ifa->ifa_addr)->
 			    sin_addr;
 			if (prison_get_ip4(cred, &ia) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin, sizeof(struct sockaddr_in));
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_addr.s_addr = ia.s_addr;
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin;
 		break;
 	}
 #endif
 #ifdef INET6
 	case AF_INET6:
 	{
 		struct in6_addr ia6;
 		struct ifaddr *ifa;
 		int found;
 
 		found = 0;
 		/*
 		 * Try to find an address on the given outgoing interface
 		 * that belongs to the jail.
 		 */
 		NET_EPOCH_ENTER(et);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			struct sockaddr *sa;
 			sa = ifa->ifa_addr;
 			if (sa->sa_family != AF_INET6)
 				continue;
 			bcopy(&((struct sockaddr_in6 *)sa)->sin6_addr,
 			    &ia6, sizeof(struct in6_addr));
 			if (prison_check_ip6(cred, &ia6) == 0) {
 				found = 1;
 				break;
 			}
 		}
 		NET_EPOCH_EXIT(et);
 		if (!found) {
 			/*
 			 * As a last resort return the 'default' jail address.
 			 */
 			ia6 = ((struct sockaddr_in6 *)nh->nh_ifa->ifa_addr)->
 			    sin6_addr;
 			if (prison_get_ip6(cred, &ia6) != 0)
 				return (ESRCH);
 		}
 		bzero(&saun->sin6, sizeof(struct sockaddr_in6));
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_family = AF_INET6;
 		bcopy(&ia6, &saun->sin6.sin6_addr, sizeof(struct in6_addr));
 		if (sa6_recoverscope(&saun->sin6) != 0)
 			return (ESRCH);
 		info->rti_info[RTAX_IFA] = (struct sockaddr *)&saun->sin6;
 		break;
 	}
 #endif
 	default:
 		return (ESRCH);
 	}
 	return (0);
 }
 
 static int
 fill_blackholeinfo(struct rt_addrinfo *info, union sockaddr_union *saun)
 {
 	struct ifaddr *ifa;
 	sa_family_t saf;
 
 	if (V_loif == NULL) {
 		RTS_PID_LOG(LOG_INFO, "Unable to add blackhole/reject nhop without loopback");
 		return (ENOTSUP);
 	}
 	info->rti_ifp = V_loif;
 
 	saf = info->rti_info[RTAX_DST]->sa_family;
 
 	CK_STAILQ_FOREACH(ifa, &info->rti_ifp->if_addrhead, ifa_link) {
 		if (ifa->ifa_addr->sa_family == saf) {
 			info->rti_ifa = ifa;
 			break;
 		}
 	}
 	if (info->rti_ifa == NULL) {
 		RTS_PID_LOG(LOG_INFO, "Unable to find ifa for blackhole/reject nhop");
 		return (ENOTSUP);
 	}
 
 	bzero(saun, sizeof(union sockaddr_union));
 	switch (saf) {
 #ifdef INET
 	case AF_INET:
 		saun->sin.sin_family = AF_INET;
 		saun->sin.sin_len = sizeof(struct sockaddr_in);
 		saun->sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		saun->sin6.sin6_family = AF_INET6;
 		saun->sin6.sin6_len = sizeof(struct sockaddr_in6);
 		saun->sin6.sin6_addr = in6addr_loopback;
 		break;
 #endif
 	default:
 		RTS_PID_LOG(LOG_INFO, "unsupported family: %d", saf);
 		return (ENOTSUP);
 	}
 	info->rti_info[RTAX_GATEWAY] = &saun->sa;
 	info->rti_flags |= RTF_GATEWAY;
 
 	return (0);
 }
 
 /*
  * Fills in @info based on userland-provided @rtm message.
  *
  * Returns 0 on success.
  */
 static int
 fill_addrinfo(struct rt_msghdr *rtm, int len, struct linear_buffer *lb, u_int fibnum,
     struct rt_addrinfo *info)
 {
 	int error;
 
 	rtm->rtm_pid = curproc->p_pid;
 	info->rti_addrs = rtm->rtm_addrs;
 
 	info->rti_mflags = rtm->rtm_inits;
 	info->rti_rmx = &rtm->rtm_rmx;
 
 	/*
 	 * rt_xaddrs() performs s6_addr[2] := sin6_scope_id for AF_INET6
 	 * link-local address because rtrequest requires addresses with
 	 * embedded scope id.
 	 */
 	if (rt_xaddrs((caddr_t)(rtm + 1), len + (caddr_t)rtm, info))
 		return (EINVAL);
 
 	info->rti_flags = rtm->rtm_flags;
 	error = cleanup_xaddrs(info, lb);
 	if (error != 0)
 		return (error);
 	/*
 	 * Verify that the caller has the appropriate privilege; RTM_GET
 	 * is the only operation the non-superuser is allowed.
 	 */
 	if (rtm->rtm_type != RTM_GET) {
 		error = priv_check(curthread, PRIV_NET_ROUTE);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * The given gateway address may be an interface address.
 	 * For example, issuing a "route change" command on a route
 	 * entry that was created from a tunnel, and the gateway
 	 * address given is the local end point. In this case the 
 	 * RTF_GATEWAY flag must be cleared or the destination will
 	 * not be reachable even though there is no error message.
 	 */
 	if (info->rti_info[RTAX_GATEWAY] != NULL &&
 	    info->rti_info[RTAX_GATEWAY]->sa_family != AF_LINK) {
 		struct nhop_object *nh;
 
 		/* 
 		 * A host route through the loopback interface is 
 		 * installed for each interface adddress. In pre 8.0
 		 * releases the interface address of a PPP link type
 		 * is not reachable locally. This behavior is fixed as 
 		 * part of the new L2/L3 redesign and rewrite work. The
 		 * signature of this interface address route is the
 		 * AF_LINK sa_family type of the gateway, and the
 		 * rt_ifp has the IFF_LOOPBACK flag set.
 		 */
 		nh = rib_lookup(fibnum, info->rti_info[RTAX_GATEWAY], NHR_NONE, 0);
 		if (nh != NULL && nh->gw_sa.sa_family == AF_LINK &&
 		    nh->nh_ifp->if_flags & IFF_LOOPBACK) {
 				info->rti_flags &= ~RTF_GATEWAY;
 				info->rti_flags |= RTF_GWFLAG_COMPAT;
 		}
 	}
 
 	return (0);
 }
 
 static struct nhop_object *
 select_nhop(struct nhop_object *nh, const struct sockaddr *gw)
 {
 	if (!NH_IS_NHGRP(nh))
 		return (nh);
 #ifdef ROUTE_MPATH
 	const struct weightened_nhop *wn;
 	uint32_t num_nhops;
 	wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 	if (gw == NULL)
 		return (wn[0].nh);
 	for (int i = 0; i < num_nhops; i++) {
 		if (match_nhop_gw(wn[i].nh, gw))
 			return (wn[i].nh);
 	}
 #endif
 	return (NULL);
 }
 
 /*
  * Handles RTM_GET message from routing socket, returning matching rt.
  *
  * Returns:
  * 0 on success, with locked and referenced matching rt in @rt_nrt
  * errno of failure
  */
 static int
 handle_rtm_get(struct rt_addrinfo *info, u_int fibnum,
     struct rt_msghdr *rtm, struct rib_cmd_info *rc)
 {
 	RIB_RLOCK_TRACKER;
 	struct rib_head *rnh;
 	struct nhop_object *nh;
 	sa_family_t saf;
 
 	saf = info->rti_info[RTAX_DST]->sa_family;
 
 	rnh = rt_tables_get_rnh(fibnum, saf);
 	if (rnh == NULL)
 		return (EAFNOSUPPORT);
 
 	RIB_RLOCK(rnh);
 
 	/*
 	 * By (implicit) convention host route (one without netmask)
 	 * means longest-prefix-match request and the route with netmask
 	 * means exact-match lookup.
 	 * As cleanup_xaddrs() cleans up info flags&addrs for the /32,/128
 	 * prefixes, use original data to check for the netmask presence.
 	 */
 	if ((rtm->rtm_addrs & RTA_NETMASK) == 0) {
 		/*
 		 * Provide longest prefix match for
 		 * address lookup (no mask).
 		 * 'route -n get addr'
 		 */
 		rc->rc_rt = (struct rtentry *) rnh->rnh_matchaddr(
 		    info->rti_info[RTAX_DST], &rnh->head);
 	} else
 		rc->rc_rt = (struct rtentry *) rnh->rnh_lookup(
 		    info->rti_info[RTAX_DST],
 		    info->rti_info[RTAX_NETMASK], &rnh->head);
 
 	if (rc->rc_rt == NULL) {
 		RIB_RUNLOCK(rnh);
 		return (ESRCH);
 	}
 
 	nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
 	if (nh == NULL) {
 		RIB_RUNLOCK(rnh);
 		return (ESRCH);
 	}
 	/*
 	 * If performing proxied L2 entry insertion, and
 	 * the actual PPP host entry is found, perform
 	 * another search to retrieve the prefix route of
 	 * the local end point of the PPP link.
 	 * TODO: move this logic to userland.
 	 */
 	if (rtm->rtm_flags & RTF_ANNOUNCE) {
 		struct sockaddr_storage laddr;
 
 		if (nh->nh_ifp != NULL &&
 		    nh->nh_ifp->if_type == IFT_PROPVIRTUAL) {
 			struct ifaddr *ifa;
 
 			ifa = ifa_ifwithnet(info->rti_info[RTAX_DST], 1,
 					RT_ALL_FIBS);
 			if (ifa != NULL)
 				rt_maskedcopy(ifa->ifa_addr,
 					      (struct sockaddr *)&laddr,
 					      ifa->ifa_netmask);
 		} else
 			rt_maskedcopy(nh->nh_ifa->ifa_addr,
 				      (struct sockaddr *)&laddr,
 				      nh->nh_ifa->ifa_netmask);
 		/* 
 		 * refactor rt and no lock operation necessary
 		 */
 		rc->rc_rt = (struct rtentry *)rnh->rnh_matchaddr(
 		    (struct sockaddr *)&laddr, &rnh->head);
 		if (rc->rc_rt == NULL) {
 			RIB_RUNLOCK(rnh);
 			return (ESRCH);
 		}
 		nh = select_nhop(rt_get_raw_nhop(rc->rc_rt), info->rti_info[RTAX_GATEWAY]);
 		if (nh == NULL) {
 			RIB_RUNLOCK(rnh);
 			return (ESRCH);
 		}
 	}
 	rc->rc_nh_new = nh;
 	rc->rc_nh_weight = rc->rc_rt->rt_weight;
 	RIB_RUNLOCK(rnh);
 
 	return (0);
 }
 
 static void
 init_sockaddrs_family(int family, struct sockaddr *dst, struct sockaddr *mask)
 {
 #ifdef INET
 	if (family == AF_INET) {
 		struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
 		struct sockaddr_in *mask4 = (struct sockaddr_in *)mask;
 
 		bzero(dst4, sizeof(struct sockaddr_in));
 		bzero(mask4, sizeof(struct sockaddr_in));
 
 		dst4->sin_family = AF_INET;
 		dst4->sin_len = sizeof(struct sockaddr_in);
 		mask4->sin_family = AF_INET;
 		mask4->sin_len = sizeof(struct sockaddr_in);
 	}
 #endif
 #ifdef INET6
 	if (family == AF_INET6) {
 		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
 		struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask;
 
 		bzero(dst6, sizeof(struct sockaddr_in6));
 		bzero(mask6, sizeof(struct sockaddr_in6));
 
 		dst6->sin6_family = AF_INET6;
 		dst6->sin6_len = sizeof(struct sockaddr_in6);
 		mask6->sin6_family = AF_INET6;
 		mask6->sin6_len = sizeof(struct sockaddr_in6);
 	}
 #endif
 }
 
 static void
 export_rtaddrs(const struct rtentry *rt, struct sockaddr *dst,
     struct sockaddr *mask)
 {
 #ifdef INET
 	if (dst->sa_family == AF_INET) {
 		struct sockaddr_in *dst4 = (struct sockaddr_in *)dst;
 		struct sockaddr_in *mask4 = (struct sockaddr_in *)mask;
 		uint32_t scopeid = 0;
 		rt_get_inet_prefix_pmask(rt, &dst4->sin_addr, &mask4->sin_addr,
 		    &scopeid);
 		return;
 	}
 #endif
 #ifdef INET6
 	if (dst->sa_family == AF_INET6) {
 		struct sockaddr_in6 *dst6 = (struct sockaddr_in6 *)dst;
 		struct sockaddr_in6 *mask6 = (struct sockaddr_in6 *)mask;
 		uint32_t scopeid = 0;
 		rt_get_inet6_prefix_pmask(rt, &dst6->sin6_addr,
 		    &mask6->sin6_addr, &scopeid);
 		dst6->sin6_scope_id = scopeid;
 		return;
 	}
 #endif
 }
 
 static int
 update_rtm_from_info(struct rt_addrinfo *info, struct rt_msghdr **prtm,
     int alloc_len)
 {
 	struct rt_msghdr *rtm, *orig_rtm = NULL;
 	struct walkarg w;
 	int len;
 
 	rtm = *prtm;
 	/* Check if we need to realloc storage */
 	rtsock_msg_buffer(rtm->rtm_type, info, NULL, &len);
 	if (len > alloc_len) {
 		struct rt_msghdr *tmp_rtm;
 
 		tmp_rtm = malloc(len, M_TEMP, M_NOWAIT);
 		if (tmp_rtm == NULL)
 			return (ENOBUFS);
 		bcopy(rtm, tmp_rtm, rtm->rtm_msglen);
 		orig_rtm = rtm;
 		rtm = tmp_rtm;
 		alloc_len = len;
 
 		/*
 		 * Delay freeing original rtm as info contains
 		 * data referencing it.
 		 */
 	}
 
 	w.w_tmem = (caddr_t)rtm;
 	w.w_tmemsize = alloc_len;
 	rtsock_msg_buffer(rtm->rtm_type, info, &w, &len);
 	rtm->rtm_addrs = info->rti_addrs;
 
 	if (orig_rtm != NULL)
 		free(orig_rtm, M_TEMP);
 	*prtm = rtm;
 	return (0);
 }
 
 
 /*
  * Update sockaddrs, flags, etc in @prtm based on @rc data.
  * rtm can be reallocated.
  *
  * Returns 0 on success, along with pointer to (potentially reallocated)
  *  rtm.
  *
  */
 static int
 update_rtm_from_rc(struct rt_addrinfo *info, struct rt_msghdr **prtm,
     int alloc_len, struct rib_cmd_info *rc, struct nhop_object *nh)
 {
 	union sockaddr_union saun;
 	struct rt_msghdr *rtm;
 	struct ifnet *ifp;
 	int error;
 
 	rtm = *prtm;
 	union sockaddr_union sa_dst, sa_mask;
 	int family = info->rti_info[RTAX_DST]->sa_family;
 	init_sockaddrs_family(family, &sa_dst.sa, &sa_mask.sa);
 	export_rtaddrs(rc->rc_rt, &sa_dst.sa, &sa_mask.sa);
 
 	info->rti_info[RTAX_DST] = &sa_dst.sa;
 	info->rti_info[RTAX_NETMASK] = rt_is_host(rc->rc_rt) ? NULL : &sa_mask.sa;
 	info->rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info->rti_info[RTAX_GENMASK] = 0;
 	ifp = nh->nh_ifp;
 	if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) {
 		if (ifp) {
 			info->rti_info[RTAX_IFP] =
 			    ifp->if_addr->ifa_addr;
 			error = rtm_get_jailed(info, ifp, nh,
 			    &saun, curthread->td_ucred);
 			if (error != 0)
 				return (error);
 			if (ifp->if_flags & IFF_POINTOPOINT)
 				info->rti_info[RTAX_BRD] =
 				    nh->nh_ifa->ifa_dstaddr;
 			rtm->rtm_index = ifp->if_index;
 		} else {
 			info->rti_info[RTAX_IFP] = NULL;
 			info->rti_info[RTAX_IFA] = NULL;
 		}
 	} else if (ifp != NULL)
 		rtm->rtm_index = ifp->if_index;
 
 	if ((error = update_rtm_from_info(info, prtm, alloc_len)) != 0)
 		return (error);
 
 	rtm = *prtm;
 	rtm->rtm_flags = rc->rc_rt->rte_flags | nhop_get_rtflags(nh);
 	if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
 		rtm->rtm_flags = RTF_GATEWAY | 
 			(rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
 	rt_getmetrics(rc->rc_rt, nh, &rtm->rtm_rmx);
 	rtm->rtm_rmx.rmx_weight = rc->rc_nh_weight;
 
 	return (0);
 }
 
 #ifdef ROUTE_MPATH
 static void
 save_del_notification(struct rib_cmd_info *rc, void *_cbdata)
 {
 	struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
 
 	if (rc->rc_cmd == RTM_DELETE)
 		*rc_new = *rc;
 }
 
 static void
 save_add_notification(struct rib_cmd_info *rc, void *_cbdata)
 {
 	struct rib_cmd_info *rc_new = (struct rib_cmd_info *)_cbdata;
 
 	if (rc->rc_cmd == RTM_ADD)
 		*rc_new = *rc;
 }
 #endif
 
 #if defined(INET6) || defined(INET)
 static struct sockaddr *
 alloc_sockaddr_aligned(struct linear_buffer *lb, int len)
 {
 	len = roundup2(len, sizeof(uint64_t));
 	if (lb->offset + len > lb->size)
 		return (NULL);
 	struct sockaddr *sa = (struct sockaddr *)(lb->base + lb->offset);
 	lb->offset += len;
 	return (sa);
 }
 #endif
 
 static int
 rts_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct rt_msghdr *rtm = NULL;
 	struct rt_addrinfo info;
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_storage ss;
 	struct sockaddr_in6 *sin6;
 	int i, rti_need_deembed = 0;
 #endif
 	int alloc_len = 0, len, error = 0, fibnum;
 	sa_family_t saf = AF_UNSPEC;
 	struct rib_cmd_info rc;
 	struct nhop_object *nh;
 
 	if ((flags & PRUS_OOB) || control != NULL) {
 		m_freem(m);
 		if (control != NULL)
 			m_freem(control);
 		return (EOPNOTSUPP);
 	}
 
 	fibnum = so->so_fibnum;
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
 		       (m = m_pullup(m, sizeof(long))) == NULL))
 		return (ENOBUFS);
 	if ((m->m_flags & M_PKTHDR) == 0)
 		panic("route_output");
 	NET_EPOCH_ENTER(et);
 	len = m->m_pkthdr.len;
 	if (len < sizeof(*rtm) ||
 	    len != mtod(m, struct rt_msghdr *)->rtm_msglen)
 		senderr(EINVAL);
 
 	/*
 	 * Most of current messages are in range 200-240 bytes,
 	 * minimize possible re-allocation on reply using larger size
 	 * buffer aligned on 1k boundaty.
 	 */
 	alloc_len = roundup2(len, 1024);
 	int total_len = alloc_len + SCRATCH_BUFFER_SIZE;
 	if ((rtm = malloc(total_len, M_TEMP, M_NOWAIT)) == NULL)
 		senderr(ENOBUFS);
 
 	m_copydata(m, 0, len, (caddr_t)rtm);
 	bzero(&info, sizeof(info));
 	nh = NULL;
 	struct linear_buffer lb = {
 		.base = (char *)rtm + alloc_len,
 		.size = SCRATCH_BUFFER_SIZE,
 	};
 
 	if (rtm->rtm_version != RTM_VERSION) {
 		/* Do not touch message since format is unknown */
 		free(rtm, M_TEMP);
 		rtm = NULL;
 		senderr(EPROTONOSUPPORT);
 	}
 
 	/*
 	 * Starting from here, it is possible
 	 * to alter original message and insert
 	 * caller PID and error value.
 	 */
 
 	if ((error = fill_addrinfo(rtm, len, &lb, fibnum, &info)) != 0) {
 		senderr(error);
 	}
 	/* fill_addringo() embeds scope into IPv6 addresses */
 #ifdef INET6
 	rti_need_deembed = 1;
 #endif
 
 	saf = info.rti_info[RTAX_DST]->sa_family;
 
 	/* support for new ARP code */
 	if (rtm->rtm_flags & RTF_LLDATA) {
 		error = lla_rt_output(rtm, &info);
 		goto flush;
 	}
 
 	union sockaddr_union gw_saun;
 	int blackhole_flags = rtm->rtm_flags & (RTF_BLACKHOLE|RTF_REJECT);
 	if (blackhole_flags != 0) {
 		if (blackhole_flags != (RTF_BLACKHOLE | RTF_REJECT))
 			error = fill_blackholeinfo(&info, &gw_saun);
 		else {
 			RTS_PID_LOG(LOG_DEBUG, "both BLACKHOLE and REJECT flags specifiied");
 			error = EINVAL;
 		}
 		if (error != 0)
 			senderr(error);
 	}
 
 	switch (rtm->rtm_type) {
 	case RTM_ADD:
 	case RTM_CHANGE:
 		if (rtm->rtm_type == RTM_ADD) {
 			if (info.rti_info[RTAX_GATEWAY] == NULL) {
 				RTS_PID_LOG(LOG_DEBUG, "RTM_ADD w/o gateway");
 				senderr(EINVAL);
 			}
 		}
 		error = rib_action(fibnum, rtm->rtm_type, &info, &rc);
 		if (error == 0) {
 #ifdef ROUTE_MPATH
 			if (NH_IS_NHGRP(rc.rc_nh_new) ||
 			    (rc.rc_nh_old && NH_IS_NHGRP(rc.rc_nh_old))) {
 				struct rib_cmd_info rc_simple = {};
 				rib_decompose_notification(&rc,
 				    save_add_notification, (void *)&rc_simple);
 				rc = rc_simple;
 			}
 #endif
 			/* nh MAY be empty if RTM_CHANGE request is no-op */
 			nh = rc.rc_nh_new;
 			if (nh != NULL) {
 				rtm->rtm_index = nh->nh_ifp->if_index;
 				rtm->rtm_flags = rc.rc_rt->rte_flags | nhop_get_rtflags(nh);
 			}
 		}
 		break;
 
 	case RTM_DELETE:
 		error = rib_action(fibnum, RTM_DELETE, &info, &rc);
 		if (error == 0) {
 #ifdef ROUTE_MPATH
 			if (NH_IS_NHGRP(rc.rc_nh_old) ||
 			    (rc.rc_nh_new && NH_IS_NHGRP(rc.rc_nh_new))) {
 				struct rib_cmd_info rc_simple = {};
 				rib_decompose_notification(&rc,
 				    save_del_notification, (void *)&rc_simple);
 				rc = rc_simple;
 			}
 #endif
 			nh = rc.rc_nh_old;
 		}
 		break;
 
 	case RTM_GET:
 		error = handle_rtm_get(&info, fibnum, rtm, &rc);
 		if (error != 0)
 			senderr(error);
 		nh = rc.rc_nh_new;
 
 		if (!can_export_rte(curthread->td_ucred,
 		    info.rti_info[RTAX_NETMASK] == NULL,
 		    info.rti_info[RTAX_DST])) {
 			senderr(ESRCH);
 		}
 		break;
 
 	default:
 		senderr(EOPNOTSUPP);
 	}
 
 	if (error == 0 && nh != NULL) {
 		error = update_rtm_from_rc(&info, &rtm, alloc_len, &rc, nh);
 		/*
 		 * Note that some sockaddr pointers may have changed to
 		 * point to memory outsize @rtm. Some may be pointing
 		 * to the on-stack variables.
 		 * Given that, any pointer in @info CANNOT BE USED.
 		 */
 
 		/*
 		 * scopeid deembedding has been performed while
 		 * writing updated rtm in rtsock_msg_buffer().
 		 * With that in mind, skip deembedding procedure below.
 		 */
 #ifdef INET6
 		rti_need_deembed = 0;
 #endif
 	}
 
 flush:
 	NET_EPOCH_EXIT(et);
 
 #ifdef INET6
 	if (rtm != NULL) {
 		if (rti_need_deembed) {
 			/* sin6_scope_id is recovered before sending rtm. */
 			sin6 = (struct sockaddr_in6 *)&ss;
 			for (i = 0; i < RTAX_MAX; i++) {
 				if (info.rti_info[i] == NULL)
 					continue;
 				if (info.rti_info[i]->sa_family != AF_INET6)
 					continue;
 				bcopy(info.rti_info[i], sin6, sizeof(*sin6));
 				if (sa6_recoverscope(sin6) == 0)
 					bcopy(sin6, info.rti_info[i],
 						    sizeof(*sin6));
 			}
 			if (update_rtm_from_info(&info, &rtm, alloc_len) != 0) {
 				if (error != 0)
 					error = ENOBUFS;
 			}
 		}
 	}
 #endif
 	send_rtm_reply(so, rtm, m, saf, fibnum, error);
 
 	return (error);
 }
 
 /*
  * Sends the prepared reply message in @rtm to all rtsock clients.
  * Frees @m and @rtm.
  *
  */
 static void
 send_rtm_reply(struct socket *so, struct rt_msghdr *rtm, struct mbuf *m,
     sa_family_t saf, u_int fibnum, int rtm_errno)
 {
 	struct rcb *rcb = NULL;
 
 	/*
 	 * Check to see if we don't want our own messages.
 	 */
 	if ((so->so_options & SO_USELOOPBACK) == 0) {
 		if (V_route_cb.any_count <= 1) {
 			if (rtm != NULL)
 				free(rtm, M_TEMP);
 			m_freem(m);
 			return;
 		}
 		/* There is another listener, so construct message */
 		rcb = so->so_pcb;
 	}
 
 	if (rtm != NULL) {
 		if (rtm_errno!= 0)
 			rtm->rtm_errno = rtm_errno;
 		else
 			rtm->rtm_flags |= RTF_DONE;
 
 		m_copyback(m, 0, rtm->rtm_msglen, (caddr_t)rtm);
 		if (m->m_pkthdr.len < rtm->rtm_msglen) {
 			m_freem(m);
 			m = NULL;
 		} else if (m->m_pkthdr.len > rtm->rtm_msglen)
 			m_adj(m, rtm->rtm_msglen - m->m_pkthdr.len);
 
 		free(rtm, M_TEMP);
 	}
 	if (m != NULL) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 		if (rcb) {
 			/*
 			 * XXX insure we don't get a copy by
 			 * invalidating our protocol
 			 */
 			sa_family_t family = rcb->rcb_family;
 			rcb->rcb_family = AF_UNSPEC;
 			rt_dispatch(m, saf);
 			rcb->rcb_family = family;
 		} else
 			rt_dispatch(m, saf);
 	}
 }
 
 static void
 rt_getmetrics(const struct rtentry *rt, const struct nhop_object *nh,
     struct rt_metrics *out)
 {
 
 	bzero(out, sizeof(*out));
 	out->rmx_mtu = nh->nh_mtu;
 	out->rmx_weight = rt->rt_weight;
 	out->rmx_nhidx = nhop_get_idx(nh);
 	/* Kernel -> userland timebase conversion. */
 	out->rmx_expire = nhop_get_expire(nh) ?
 	    nhop_get_expire(nh) - time_uptime + time_second : 0;
 }
 
 /*
  * Extract the addresses of the passed sockaddrs.
  * Do a little sanity checking so as to avoid bad memory references.
  * This data is derived straight from userland.
  */
 static int
 rt_xaddrs(caddr_t cp, caddr_t cplim, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr *sa;
 	int i;
 
 	for (i = 0; i < RTAX_MAX && cp < cplim; i++) {
 		if ((rtinfo->rti_addrs & (1 << i)) == 0)
 			continue;
 		sa = (struct sockaddr *)cp;
 		/*
 		 * It won't fit.
 		 */
 		if (cp + sa->sa_len > cplim) {
 			RTS_PID_LOG(LOG_DEBUG, "sa_len too big for sa type %d", i);
 			return (EINVAL);
 		}
 		/*
 		 * there are no more.. quit now
 		 * If there are more bits, they are in error.
 		 * I've seen this. route(1) can evidently generate these. 
 		 * This causes kernel to core dump.
 		 * for compatibility, If we see this, point to a safe address.
 		 */
 		if (sa->sa_len == 0) {
 			rtinfo->rti_info[i] = &sa_zero;
 			return (0); /* should be EINVAL but for compat */
 		}
 		/* accept it */
 #ifdef INET6
 		if (sa->sa_family == AF_INET6)
 			sa6_embedscope((struct sockaddr_in6 *)sa,
 			    V_ip6_use_defzone);
 #endif
 		rtinfo->rti_info[i] = sa;
 		cp += SA_SIZE(sa);
 	}
 	return (0);
 }
 
 #ifdef INET
 static inline void
 fill_sockaddr_inet(struct sockaddr_in *sin, struct in_addr addr)
 {
 
 	const struct sockaddr_in nsin = {
 		.sin_family = AF_INET,
 		.sin_len = sizeof(struct sockaddr_in),
 		.sin_addr = addr,
 	};
 	*sin = nsin;
 }
 #endif
 
 #ifdef INET6
 static inline void
 fill_sockaddr_inet6(struct sockaddr_in6 *sin6, const struct in6_addr *addr6,
     uint32_t scopeid)
 {
 
 	const struct sockaddr_in6 nsin6 = {
 		.sin6_family = AF_INET6,
 		.sin6_len = sizeof(struct sockaddr_in6),
 		.sin6_addr = *addr6,
 		.sin6_scope_id = scopeid,
 	};
 	*sin6 = nsin6;
 }
 #endif
 
 #if defined(INET6) || defined(INET)
 /*
  * Checks if gateway is suitable for lltable operations.
  * Lltable code requires AF_LINK gateway with ifindex
  *  and mac address specified.
  * Returns 0 on success.
  */
 static int
 cleanup_xaddrs_lladdr(struct rt_addrinfo *info)
 {
 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)info->rti_info[RTAX_GATEWAY];
 
 	if (sdl->sdl_family != AF_LINK)
 		return (EINVAL);
 
 	if (sdl->sdl_index == 0) {
 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gateway w/o ifindex");
 		return (EINVAL);
 	}
 
 	if (offsetof(struct sockaddr_dl, sdl_data) + sdl->sdl_nlen + sdl->sdl_alen > sdl->sdl_len) {
 		RTS_PID_LOG(LOG_DEBUG, "AF_LINK gw: sdl_nlen/sdl_alen too large");
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 cleanup_xaddrs_gateway(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr *gw = info->rti_info[RTAX_GATEWAY];
 	struct sockaddr *sa;
 
 	if (info->rti_flags & RTF_LLDATA)
 		return (cleanup_xaddrs_lladdr(info));
 
 	switch (gw->sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *gw_sin = (struct sockaddr_in *)gw;
 
 			/* Ensure reads do not go beyoud SA boundary */
 			if (SA_SIZE(gw) < offsetof(struct sockaddr_in, sin_zero)) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sin_len too small: %d",
 				    gw->sa_len);
 				return (EINVAL);
 			}
 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_in));
 			if (sa == NULL)
 				return (ENOBUFS);
 			fill_sockaddr_inet((struct sockaddr_in *)sa, gw_sin->sin_addr);
 			info->rti_info[RTAX_GATEWAY] = sa;
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *gw_sin6 = (struct sockaddr_in6 *)gw;
 			if (gw_sin6->sin6_len < sizeof(struct sockaddr_in6)) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sin6_len too small: %d",
 				    gw->sa_len);
 				return (EINVAL);
 			}
 			fill_sockaddr_inet6(gw_sin6, &gw_sin6->sin6_addr, 0);
 			break;
 		}
 #endif
 	case AF_LINK:
 		{
 			struct sockaddr_dl *gw_sdl;
 
 			size_t sdl_min_len = offsetof(struct sockaddr_dl, sdl_data);
 			gw_sdl = (struct sockaddr_dl *)gw;
 			if (gw_sdl->sdl_len < sdl_min_len) {
 				RTS_PID_LOG(LOG_DEBUG, "gateway sdl_len too small: %d",
 				    gw_sdl->sdl_len);
 				return (EINVAL);
 			}
 			sa = alloc_sockaddr_aligned(lb, sizeof(struct sockaddr_dl_short));
 			if (sa == NULL)
 				return (ENOBUFS);
 
 			const struct sockaddr_dl_short sdl = {
 				.sdl_family = AF_LINK,
 				.sdl_len = sizeof(struct sockaddr_dl_short),
 				.sdl_index = gw_sdl->sdl_index,
 			};
 			*((struct sockaddr_dl_short *)sa) = sdl;
 			info->rti_info[RTAX_GATEWAY] = sa;
 			break;
 		}
 	}
 
 	return (0);
 }
 #endif
 
 static void
 remove_netmask(struct rt_addrinfo *info)
 {
 	info->rti_info[RTAX_NETMASK] = NULL;
 	info->rti_flags |= RTF_HOST;
 	info->rti_addrs &= ~RTA_NETMASK;
 }
 
 #ifdef INET
 static int
 cleanup_xaddrs_inet(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr_in *dst_sa, *mask_sa;
 	const int sa_len = sizeof(struct sockaddr_in);
 	struct in_addr dst, mask;
 
 	/* Check & fixup dst/netmask combination first */
 	dst_sa = (struct sockaddr_in *)info->rti_info[RTAX_DST];
 	mask_sa = (struct sockaddr_in *)info->rti_info[RTAX_NETMASK];
 
 	/* Ensure reads do not go beyound the buffer size */
 	if (SA_SIZE(dst_sa) < offsetof(struct sockaddr_in, sin_zero)) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin_len too small: %d",
 		    dst_sa->sin_len);
 		return (EINVAL);
 	}
 
 	if ((mask_sa != NULL) && mask_sa->sin_len < sizeof(struct sockaddr_in)) {
 		/*
 		 * Some older routing software encode mask length into the
 		 * sin_len, thus resulting in "truncated" sockaddr.
 		 */
 		int len = mask_sa->sin_len - offsetof(struct sockaddr_in, sin_addr);
 		if (len >= 0) {
 			mask.s_addr = 0;
 			if (len > sizeof(struct in_addr))
 				len = sizeof(struct in_addr);
 			memcpy(&mask, &mask_sa->sin_addr, len);
 		} else {
 			RTS_PID_LOG(LOG_DEBUG, "prefix mask sin_len too small: %d",
 			    mask_sa->sin_len);
 			return (EINVAL);
 		}
 	} else
 		mask.s_addr = mask_sa ? mask_sa->sin_addr.s_addr : INADDR_BROADCAST;
 
 	dst.s_addr = htonl(ntohl(dst_sa->sin_addr.s_addr) & ntohl(mask.s_addr));
 
 	/* Construct new "clean" dst/mask sockaddresses */
 	if ((dst_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 		return (ENOBUFS);
 	fill_sockaddr_inet(dst_sa, dst);
 	info->rti_info[RTAX_DST] = (struct sockaddr *)dst_sa;
 
 	if (mask.s_addr != INADDR_BROADCAST) {
 		if ((mask_sa = (struct sockaddr_in *)alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 			return (ENOBUFS);
 		fill_sockaddr_inet(mask_sa, mask);
 		info->rti_info[RTAX_NETMASK] = (struct sockaddr *)mask_sa;
 		info->rti_flags &= ~RTF_HOST;
 	} else
 		remove_netmask(info);
 
 	/* Check gateway */
 	if (info->rti_info[RTAX_GATEWAY] != NULL)
 		return (cleanup_xaddrs_gateway(info, lb));
 
 	return (0);
 }
 #endif
 
 #ifdef INET6
 static int
 cleanup_xaddrs_inet6(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	struct sockaddr *sa;
 	struct sockaddr_in6 *dst_sa, *mask_sa;
 	struct in6_addr mask, *dst;
 	const int sa_len = sizeof(struct sockaddr_in6);
 
 	/* Check & fixup dst/netmask combination first */
 	dst_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_DST];
 	mask_sa = (struct sockaddr_in6 *)info->rti_info[RTAX_NETMASK];
 
 	if (dst_sa->sin6_len < sizeof(struct sockaddr_in6)) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst sin6_len too small: %d",
 		    dst_sa->sin6_len);
 		return (EINVAL);
 	}
 
 	if (mask_sa && mask_sa->sin6_len < sizeof(struct sockaddr_in6)) {
 		/*
 		 * Some older routing software encode mask length into the
 		 * sin6_len, thus resulting in "truncated" sockaddr.
 		 */
 		int len = mask_sa->sin6_len - offsetof(struct sockaddr_in6, sin6_addr);
 		if (len >= 0) {
 			bzero(&mask, sizeof(mask));
 			if (len > sizeof(struct in6_addr))
 				len = sizeof(struct in6_addr);
 			memcpy(&mask, &mask_sa->sin6_addr, len);
 		} else {
 			RTS_PID_LOG(LOG_DEBUG, "rtsock: prefix mask sin6_len too small: %d",
 			    mask_sa->sin6_len);
 			return (EINVAL);
 		}
 	} else
 		mask = mask_sa ? mask_sa->sin6_addr : in6mask128;
 
 	dst = &dst_sa->sin6_addr;
 	IN6_MASK_ADDR(dst, &mask);
 
 	if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 		return (ENOBUFS);
 	fill_sockaddr_inet6((struct sockaddr_in6 *)sa, dst, 0);
 	info->rti_info[RTAX_DST] = sa;
 
 	if (!IN6_ARE_ADDR_EQUAL(&mask, &in6mask128)) {
 		if ((sa = alloc_sockaddr_aligned(lb, sa_len)) == NULL)
 			return (ENOBUFS);
 		fill_sockaddr_inet6((struct sockaddr_in6 *)sa, &mask, 0);
 		info->rti_info[RTAX_NETMASK] = sa;
 		info->rti_flags &= ~RTF_HOST;
 	} else
 		remove_netmask(info);
 
 	/* Check gateway */
 	if (info->rti_info[RTAX_GATEWAY] != NULL)
 		return (cleanup_xaddrs_gateway(info, lb));
 
 	return (0);
 }
 #endif
 
 static int
 cleanup_xaddrs(struct rt_addrinfo *info, struct linear_buffer *lb)
 {
 	int error = EAFNOSUPPORT;
 
 	if (info->rti_info[RTAX_DST] == NULL) {
 		RTS_PID_LOG(LOG_DEBUG, "prefix dst is not set");
 		return (EINVAL);
 	}
 
 	if (info->rti_flags & RTF_LLDATA) {
 		/*
 		 * arp(8)/ndp(8) sends RTA_NETMASK for the associated
 		 * prefix along with the actual address in RTA_DST.
 		 * Remove netmask to avoid unnecessary address masking.
 		 */
 		remove_netmask(info);
 	}
 
 	switch (info->rti_info[RTAX_DST]->sa_family) {
 #ifdef INET
 	case AF_INET:
 		error = cleanup_xaddrs_inet(info, lb);
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		error = cleanup_xaddrs_inet6(info, lb);
 		break;
 #endif
 	}
 
 	return (error);
 }
 
 /*
  * Fill in @dmask with valid netmask leaving original @smask
  * intact. Mostly used with radix netmasks.
  */
 struct sockaddr *
 rtsock_fix_netmask(const struct sockaddr *dst, const struct sockaddr *smask,
     struct sockaddr_storage *dmask)
 {
 	if (dst == NULL || smask == NULL)
 		return (NULL);
 
 	memset(dmask, 0, dst->sa_len);
 	memcpy(dmask, smask, smask->sa_len);
 	dmask->ss_len = dst->sa_len;
 	dmask->ss_family = dst->sa_family;
 
 	return ((struct sockaddr *)dmask);
 }
 
 /*
  * Writes information related to @rtinfo object to newly-allocated mbuf.
  * Assumes MCLBYTES is enough to construct any message.
  * Used for OS notifications of vaious events (if/ifa announces,etc)
  *
  * Returns allocated mbuf or NULL on failure.
  */
 static struct mbuf *
 rtsock_msg_mbuf(int type, struct rt_addrinfo *rtinfo)
 {
 	struct sockaddr_storage ss;
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	int i;
 	struct sockaddr *sa;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	int len, dlen;
 
 	switch (type) {
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_DELMADDR:
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	case RTM_IFINFO:
 		len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_IFANNOUNCE:
 	case RTM_IEEE80211:
 		len = sizeof(struct if_announcemsghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	/* XXXGL: can we use MJUMPAGESIZE cluster here? */
 	KASSERT(len <= MCLBYTES, ("%s: message too big", __func__));
 	if (len > MHLEN)
 		m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR);
 	else
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (m);
 
 	m->m_pkthdr.len = m->m_len = len;
 	rtm = mtod(m, struct rt_msghdr *);
 	bzero((caddr_t)rtm, len);
 	for (i = 0; i < RTAX_MAX; i++) {
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 
 		dlen = SA_SIZE(sa);
 		KASSERT(dlen <= sizeof(ss),
 		    ("%s: sockaddr size overflow", __func__));
 		bzero(&ss, sizeof(ss));
 		bcopy(sa, &ss, sa->sa_len);
 		sa = (struct sockaddr *)&ss;
 #ifdef INET6
 		if (sa->sa_family == AF_INET6) {
 			sin6 = (struct sockaddr_in6 *)sa;
 			(void)sa6_recoverscope(sin6);
 		}
 #endif
 		m_copyback(m, len, dlen, (caddr_t)sa);
 		len += dlen;
 	}
 	if (m->m_pkthdr.len != len) {
 		m_freem(m);
 		return (NULL);
 	}
 	rtm->rtm_msglen = len;
 	rtm->rtm_version = RTM_VERSION;
 	rtm->rtm_type = type;
 	return (m);
 }
 
 /*
  * Writes information related to @rtinfo object to preallocated buffer.
  * Stores needed size in @plen. If @w is NULL, calculates size without
  * writing.
  * Used for sysctl dumps and rtsock answers (RTM_DEL/RTM_GET) generation.
  *
  * Returns 0 on success.
  *
  */
 static int
 rtsock_msg_buffer(int type, struct rt_addrinfo *rtinfo, struct walkarg *w, int *plen)
 {
 	struct sockaddr_storage ss;
 	int len, buflen = 0, dlen, i;
 	caddr_t cp = NULL;
 	struct rt_msghdr *rtm = NULL;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 #ifdef COMPAT_FREEBSD32
 	bool compat32 = false;
 #endif
 
 	switch (type) {
 	case RTM_DELADDR:
 	case RTM_NEWADDR:
 		if (w != NULL && w->w_op == NET_RT_IFLISTL) {
 #ifdef COMPAT_FREEBSD32
 			if (w->w_req->flags & SCTL_MASK32) {
 				len = sizeof(struct ifa_msghdrl32);
 				compat32 = true;
 			} else
 #endif
 				len = sizeof(struct ifa_msghdrl);
 		} else
 			len = sizeof(struct ifa_msghdr);
 		break;
 
 	case RTM_IFINFO:
 #ifdef COMPAT_FREEBSD32
 		if (w != NULL && w->w_req->flags & SCTL_MASK32) {
 			if (w->w_op == NET_RT_IFLISTL)
 				len = sizeof(struct if_msghdrl32);
 			else
 				len = sizeof(struct if_msghdr32);
 			compat32 = true;
 			break;
 		}
 #endif
 		if (w != NULL && w->w_op == NET_RT_IFLISTL)
 			len = sizeof(struct if_msghdrl);
 		else
 			len = sizeof(struct if_msghdr);
 		break;
 
 	case RTM_NEWMADDR:
 		len = sizeof(struct ifma_msghdr);
 		break;
 
 	default:
 		len = sizeof(struct rt_msghdr);
 	}
 
 	if (w != NULL) {
 		rtm = (struct rt_msghdr *)w->w_tmem;
 		buflen = w->w_tmemsize - len;
 		cp = (caddr_t)w->w_tmem + len;
 	}
 
 	rtinfo->rti_addrs = 0;
 	for (i = 0; i < RTAX_MAX; i++) {
 		struct sockaddr *sa;
 
 		if ((sa = rtinfo->rti_info[i]) == NULL)
 			continue;
 		rtinfo->rti_addrs |= (1 << i);
 #ifdef COMPAT_FREEBSD32
 		if (compat32)
 			dlen = SA_SIZE32(sa);
 		else
 #endif
 			dlen = SA_SIZE(sa);
 		if (cp != NULL && buflen >= dlen) {
 			KASSERT(dlen <= sizeof(ss),
 			    ("%s: sockaddr size overflow", __func__));
 			bzero(&ss, sizeof(ss));
 			bcopy(sa, &ss, sa->sa_len);
 			sa = (struct sockaddr *)&ss;
 #ifdef INET6
 			if (sa->sa_family == AF_INET6) {
 				sin6 = (struct sockaddr_in6 *)sa;
 				(void)sa6_recoverscope(sin6);
 			}
 #endif
 			bcopy((caddr_t)sa, cp, (unsigned)dlen);
 			cp += dlen;
 			buflen -= dlen;
 		} else if (cp != NULL) {
 			/*
 			 * Buffer too small. Count needed size
 			 * and return with error.
 			 */
 			cp = NULL;
 		}
 
 		len += dlen;
 	}
 
 	if (cp != NULL) {
 		dlen = ALIGN(len) - len;
 		if (buflen < dlen)
 			cp = NULL;
 		else {
 			bzero(cp, dlen);
 			cp += dlen;
 			buflen -= dlen;
 		}
 	}
 	len = ALIGN(len);
 
 	if (cp != NULL) {
 		/* fill header iff buffer is large enough */
 		rtm->rtm_version = RTM_VERSION;
 		rtm->rtm_type = type;
 		rtm->rtm_msglen = len;
 	}
 
 	*plen = len;
 
 	if (w != NULL && cp == NULL)
 		return (ENOBUFS);
 
 	return (0);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that a redirect has occurred, a routing lookup
  * has failed, or that a protocol has detected timeouts to a particular
  * destination.
  */
 void
 rt_missmsg_fib(int type, struct rt_addrinfo *rtinfo, int flags, int error,
     int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct mbuf *m;
 	struct sockaddr *sa = rtinfo->rti_info[RTAX_DST];
 
 	if (V_route_cb.any_count == 0)
 		return;
 	m = rtsock_msg_mbuf(type, rtinfo);
 	if (m == NULL)
 		return;
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_flags = RTF_DONE | flags;
 	rtm->rtm_errno = error;
 	rtm->rtm_addrs = rtinfo->rti_addrs;
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 }
 
 void
 rt_missmsg(int type, struct rt_addrinfo *rtinfo, int flags, int error)
 {
 
 	rt_missmsg_fib(type, rtinfo, flags, error, RT_ALL_FIBS);
 }
 
 /*
  * This routine is called to generate a message from the routing
  * socket indicating that the status of a network interface has changed.
  */
 void
 rt_ifmsg(struct ifnet *ifp)
 {
 	struct if_msghdr *ifm;
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return;
 	bzero((caddr_t)&info, sizeof(info));
 	m = rtsock_msg_mbuf(RTM_IFINFO, &info);
 	if (m == NULL)
 		return;
 	ifm = mtod(m, struct if_msghdr *);
 	ifm->ifm_index = ifp->if_index;
 	ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 	if_data_copy(ifp, &ifm->ifm_data);
 	ifm->ifm_addrs = 0;
 	rt_dispatch(m, AF_UNSPEC);
 }
 
 /*
  * Announce interface address arrival/withdraw.
  * Please do not call directly, use rt_addrmsg().
  * Assume input data to be valid.
  * Returns 0 on success.
  */
 int
 rtsock_addrmsg(int cmd, struct ifaddr *ifa, int fibnum)
 {
 	struct rt_addrinfo info;
 	struct sockaddr *sa;
 	int ncmd;
 	struct mbuf *m;
 	struct ifa_msghdr *ifam;
 	struct ifnet *ifp = ifa->ifa_ifp;
 	struct sockaddr_storage ss;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	ncmd = cmd == RTM_ADD ? RTM_NEWADDR : RTM_DELADDR;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
 	info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 	    info.rti_info[RTAX_IFA], ifa->ifa_netmask, &ss);
 	info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 	if ((m = rtsock_msg_mbuf(ncmd, &info)) == NULL)
 		return (ENOBUFS);
 	ifam = mtod(m, struct ifa_msghdr *);
 	ifam->ifam_index = ifp->if_index;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_addrs = info.rti_addrs;
 
 	if (fibnum != RT_ALL_FIBS) {
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * Announce route addition/removal to rtsock based on @rt data.
  * Callers are advives to use rt_routemsg() instead of using this
  *  function directly.
  * Assume @rt data is consistent.
  *
  * Returns 0 on success.
  */
 int
 rtsock_routemsg(int cmd, struct rtentry *rt, struct nhop_object *nh,
     int fibnum)
 {
 	union sockaddr_union dst, mask;
 	struct rt_addrinfo info;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	int family = rt_get_family(rt);
 	init_sockaddrs_family(family, &dst.sa, &mask.sa);
 	export_rtaddrs(rt, &dst.sa, &mask.sa);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = &dst.sa;
 	info.rti_info[RTAX_NETMASK] = &mask.sa;
 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info.rti_flags = rt->rte_flags | nhop_get_rtflags(nh);
 	info.rti_ifp = nh->nh_ifp;
 
 	return (rtsock_routemsg_info(cmd, &info, fibnum));
 }
 
 int
 rtsock_routemsg_info(int cmd, struct rt_addrinfo *info, int fibnum)
 {
 	struct rt_msghdr *rtm;
 	struct sockaddr *sa;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return (0);
 
 	if (info->rti_flags & RTF_HOST)
 		info->rti_info[RTAX_NETMASK] = NULL;
 
 	m = rtsock_msg_mbuf(cmd, info);
 	if (m == NULL)
 		return (ENOBUFS);
 
 	if (fibnum != RT_ALL_FIBS) {
 		KASSERT(fibnum >= 0 && fibnum < rt_numfibs, ("%s: fibnum out "
 		    "of range 0 <= %d < %d", __func__, fibnum, rt_numfibs));
 		M_SETFIB(m, fibnum);
 		m->m_flags |= RTS_FILTER_FIB;
 	}
 
 	rtm = mtod(m, struct rt_msghdr *);
 	rtm->rtm_addrs = info->rti_addrs;
 	if (info->rti_ifp != NULL)
 		rtm->rtm_index = info->rti_ifp->if_index;
 	/* Add RTF_DONE to indicate command 'completion' required by API */
 	info->rti_flags |= RTF_DONE;
 	/* Reported routes has to be up */
 	if (cmd == RTM_ADD || cmd == RTM_CHANGE)
 		info->rti_flags |= RTF_UP;
 	rtm->rtm_flags = info->rti_flags;
 
 	sa = info->rti_info[RTAX_DST];
 	rt_dispatch(m, sa ? sa->sa_family : AF_UNSPEC);
 
 	return (0);
 }
 
 /*
  * This is the analogue to the rt_newaddrmsg which performs the same
  * function but for multicast group memberhips.  This is easier since
  * there is no route state to worry about.
  */
 void
 rt_newmaddrmsg(int cmd, struct ifmultiaddr *ifma)
 {
 	struct rt_addrinfo info;
 	struct mbuf *m = NULL;
 	struct ifnet *ifp = ifma->ifma_ifp;
 	struct ifma_msghdr *ifmam;
 
 	if (V_route_cb.any_count == 0)
 		return;
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 	if (ifp && ifp->if_addr)
 		info.rti_info[RTAX_IFP] = ifp->if_addr->ifa_addr;
 	else
 		info.rti_info[RTAX_IFP] = NULL;
 	/*
 	 * If a link-layer address is present, present it as a ``gateway''
 	 * (similarly to how ARP entries, e.g., are presented).
 	 */
 	info.rti_info[RTAX_GATEWAY] = ifma->ifma_lladdr;
 	m = rtsock_msg_mbuf(cmd, &info);
 	if (m == NULL)
 		return;
 	ifmam = mtod(m, struct ifma_msghdr *);
 	KASSERT(ifp != NULL, ("%s: link-layer multicast address w/o ifp\n",
 	    __func__));
 	ifmam->ifmam_index = ifp->if_index;
 	ifmam->ifmam_addrs = info.rti_addrs;
 	rt_dispatch(m, ifma->ifma_addr ? ifma->ifma_addr->sa_family : AF_UNSPEC);
 }
 
 static struct mbuf *
 rt_makeifannouncemsg(struct ifnet *ifp, int type, int what,
 	struct rt_addrinfo *info)
 {
 	struct if_announcemsghdr *ifan;
 	struct mbuf *m;
 
 	if (V_route_cb.any_count == 0)
 		return NULL;
 	bzero((caddr_t)info, sizeof(*info));
 	m = rtsock_msg_mbuf(type, info);
 	if (m != NULL) {
 		ifan = mtod(m, struct if_announcemsghdr *);
 		ifan->ifan_index = ifp->if_index;
 		strlcpy(ifan->ifan_name, ifp->if_xname,
 			sizeof(ifan->ifan_name));
 		ifan->ifan_what = what;
 	}
 	return m;
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * IEEE80211 wireless events.
  * XXX we piggyback on the RTM_IFANNOUNCE msg format in a clumsy way.
  */
 void
 rt_ieee80211msg(struct ifnet *ifp, int what, void *data, size_t data_len)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IEEE80211, what, &info);
 	if (m != NULL) {
 		/*
 		 * Append the ieee80211 data.  Try to stick it in the
 		 * mbuf containing the ifannounce msg; otherwise allocate
 		 * a new mbuf and append.
 		 *
 		 * NB: we assume m is a single mbuf.
 		 */
 		if (data_len > M_TRAILINGSPACE(m)) {
 			struct mbuf *n = m_get(M_NOWAIT, MT_DATA);
 			if (n == NULL) {
 				m_freem(m);
 				return;
 			}
 			bcopy(data, mtod(n, void *), data_len);
 			n->m_len = data_len;
 			m->m_next = n;
 		} else if (data_len > 0) {
 			bcopy(data, mtod(m, u_int8_t *) + m->m_len, data_len);
 			m->m_len += data_len;
 		}
 		if (m->m_flags & M_PKTHDR)
 			m->m_pkthdr.len += data_len;
 		mtod(m, struct if_announcemsghdr *)->ifan_msglen += data_len;
 		rt_dispatch(m, AF_UNSPEC);
 	}
 }
 
 /*
  * This is called to generate routing socket messages indicating
  * network interface arrival and departure.
  */
 static void
 rt_ifannouncemsg(struct ifnet *ifp, int what)
 {
 	struct mbuf *m;
 	struct rt_addrinfo info;
 
 	m = rt_makeifannouncemsg(ifp, RTM_IFANNOUNCE, what, &info);
 	if (m != NULL)
 		rt_dispatch(m, AF_UNSPEC);
 }
 
 static void
 rt_dispatch(struct mbuf *m, sa_family_t saf)
 {
 
 	M_ASSERTPKTHDR(m);
 
 	m->m_rtsock_family = saf;
 	if (V_loif)
 		m->m_pkthdr.rcvif = V_loif;
 	else {
 		m_freem(m);
 		return;
 	}
 	netisr_queue(NETISR_ROUTE, m);	/* mbuf is free'd on failure. */
 }
 
 /*
  * Checks if rte can be exported w.r.t jails/vnets.
  *
  * Returns true if it can, false otherwise.
  */
 static bool
 can_export_rte(struct ucred *td_ucred, bool rt_is_host,
     const struct sockaddr *rt_dst)
 {
 
 	if ((!rt_is_host) ? jailed_without_vnet(td_ucred)
 	    : prison_if(td_ucred, rt_dst) != 0)
 		return (false);
 	return (true);
 }
 
 
 /*
  * This is used in dumping the kernel table via sysctl().
  */
 static int
 sysctl_dumpentry(struct rtentry *rt, void *vw)
 {
 	struct walkarg *w = vw;
 	struct nhop_object *nh;
 
 	NET_EPOCH_ASSERT();
 
 	export_rtaddrs(rt, w->dst, w->mask);
 	if (!can_export_rte(w->w_req->td->td_ucred, rt_is_host(rt), w->dst))
 		return (0);
 	nh = rt_get_raw_nhop(rt);
 #ifdef ROUTE_MPATH
 	if (NH_IS_NHGRP(nh)) {
 		const struct weightened_nhop *wn;
 		uint32_t num_nhops;
 		int error;
 		wn = nhgrp_get_nhops((struct nhgrp_object *)nh, &num_nhops);
 		for (int i = 0; i < num_nhops; i++) {
 			error = sysctl_dumpnhop(rt, wn[i].nh, wn[i].weight, w);
 			if (error != 0)
 				return (error);
 		}
 	} else
 #endif
 		sysctl_dumpnhop(rt, nh, rt->rt_weight, w);
 
 	return (0);
 }
 
 
 static int
 sysctl_dumpnhop(struct rtentry *rt, struct nhop_object *nh, uint32_t weight,
     struct walkarg *w)
 {
 	struct rt_addrinfo info;
 	int error = 0, size;
 	uint32_t rtflags;
 
 	rtflags = nhop_get_rtflags(nh);
 
 	if (w->w_op == NET_RT_FLAGS && !(rtflags & w->w_arg))
 		return (0);
 
 	bzero((caddr_t)&info, sizeof(info));
 	info.rti_info[RTAX_DST] = w->dst;
 	info.rti_info[RTAX_GATEWAY] = &nh->gw_sa;
 	info.rti_info[RTAX_NETMASK] = (rtflags & RTF_HOST) ? NULL : w->mask;
 	info.rti_info[RTAX_GENMASK] = 0;
 	if (nh->nh_ifp && !(nh->nh_ifp->if_flags & IFF_DYING)) {
 		info.rti_info[RTAX_IFP] = nh->nh_ifp->if_addr->ifa_addr;
 		info.rti_info[RTAX_IFA] = nh->nh_ifa->ifa_addr;
 		if (nh->nh_ifp->if_flags & IFF_POINTOPOINT)
 			info.rti_info[RTAX_BRD] = nh->nh_ifa->ifa_dstaddr;
 	}
 	if ((error = rtsock_msg_buffer(RTM_GET, &info, w, &size)) != 0)
 		return (error);
 	if (w->w_req && w->w_tmem) {
 		struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
 
 		bzero(&rtm->rtm_index,
 		    sizeof(*rtm) - offsetof(struct rt_msghdr, rtm_index));
 
 		/*
 		 * rte flags may consist of RTF_HOST (duplicated in nhop rtflags)
 		 * and RTF_UP (if entry is linked, which is always true here).
 		 * Given that, use nhop rtflags & add RTF_UP.
 		 */
 		rtm->rtm_flags = rtflags | RTF_UP;
 		if (rtm->rtm_flags & RTF_GWFLAG_COMPAT)
 			rtm->rtm_flags = RTF_GATEWAY | 
 				(rtm->rtm_flags & ~RTF_GWFLAG_COMPAT);
 		rt_getmetrics(rt, nh, &rtm->rtm_rmx);
 		rtm->rtm_rmx.rmx_weight = weight;
 		rtm->rtm_index = nh->nh_ifp->if_index;
 		rtm->rtm_addrs = info.rti_addrs;
 		error = SYSCTL_OUT(w->w_req, (caddr_t)rtm, size);
 		return (error);
 	}
 	return (error);
 }
 
 static int
 sysctl_iflist_ifml(struct ifnet *ifp, const struct if_data *src_ifd,
     struct rt_addrinfo *info, struct walkarg *w, int len)
 {
 	struct if_msghdrl *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdrl32 *ifm32;
 
 		ifm32 = (struct if_msghdrl32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifm32->ifm_len = sizeof(*ifm32);
 		ifm32->ifm_data_off = offsetof(struct if_msghdrl32, ifm_data);
 		ifm32->_ifm_spare2 = 0;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifm->ifm_len = sizeof(*ifm);
 		ifm->ifm_data_off = offsetof(struct if_msghdrl, ifm_data);
 		ifm->_ifm_spare2 = 0;
 		ifd = &ifm->ifm_data;
 	}
 
 	memcpy(ifd, src_ifd, sizeof(*ifd));
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifm(struct ifnet *ifp, const struct if_data *src_ifd,
     struct rt_addrinfo *info, struct walkarg *w, int len)
 {
 	struct if_msghdr *ifm;
 	struct if_data *ifd;
 
 	ifm = (struct if_msghdr *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct if_msghdr32 *ifm32;
 
 		ifm32 = (struct if_msghdr32 *)ifm;
 		ifm32->ifm_addrs = info->rti_addrs;
 		ifm32->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm32->ifm_index = ifp->if_index;
 		ifm32->_ifm_spare1 = 0;
 		ifd = &ifm32->ifm_data;
 	} else
 #endif
 	{
 		ifm->ifm_addrs = info->rti_addrs;
 		ifm->ifm_flags = ifp->if_flags | ifp->if_drv_flags;
 		ifm->ifm_index = ifp->if_index;
 		ifm->_ifm_spare1 = 0;
 		ifd = &ifm->ifm_data;
 	}
 
 	memcpy(ifd, src_ifd, sizeof(*ifd));
 
 	return (SYSCTL_OUT(w->w_req, (caddr_t)ifm, len));
 }
 
 static int
 sysctl_iflist_ifaml(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdrl *ifam;
 	struct if_data *ifd;
 
 	ifam = (struct ifa_msghdrl *)w->w_tmem;
 
 #ifdef COMPAT_FREEBSD32
 	if (w->w_req->flags & SCTL_MASK32) {
 		struct ifa_msghdrl32 *ifam32;
 
 		ifam32 = (struct ifa_msghdrl32 *)ifam;
 		ifam32->ifam_addrs = info->rti_addrs;
 		ifam32->ifam_flags = ifa->ifa_flags;
 		ifam32->ifam_index = ifa->ifa_ifp->if_index;
 		ifam32->_ifam_spare1 = 0;
 		ifam32->ifam_len = sizeof(*ifam32);
 		ifam32->ifam_data_off =
 		    offsetof(struct ifa_msghdrl32, ifam_data);
 		ifam32->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam32->ifam_data;
 	} else
 #endif
 	{
 		ifam->ifam_addrs = info->rti_addrs;
 		ifam->ifam_flags = ifa->ifa_flags;
 		ifam->ifam_index = ifa->ifa_ifp->if_index;
 		ifam->_ifam_spare1 = 0;
 		ifam->ifam_len = sizeof(*ifam);
 		ifam->ifam_data_off = offsetof(struct ifa_msghdrl, ifam_data);
 		ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 		ifd = &ifam->ifam_data;
 	}
 
 	bzero(ifd, sizeof(*ifd));
 	ifd->ifi_datalen = sizeof(struct if_data);
 	ifd->ifi_ipackets = counter_u64_fetch(ifa->ifa_ipackets);
 	ifd->ifi_opackets = counter_u64_fetch(ifa->ifa_opackets);
 	ifd->ifi_ibytes = counter_u64_fetch(ifa->ifa_ibytes);
 	ifd->ifi_obytes = counter_u64_fetch(ifa->ifa_obytes);
 
 	/* Fixup if_data carp(4) vhid. */
 	if (carp_get_vhid_p != NULL)
 		ifd->ifi_vhid = (*carp_get_vhid_p)(ifa);
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist_ifam(struct ifaddr *ifa, struct rt_addrinfo *info,
     struct walkarg *w, int len)
 {
 	struct ifa_msghdr *ifam;
 
 	ifam = (struct ifa_msghdr *)w->w_tmem;
 	ifam->ifam_addrs = info->rti_addrs;
 	ifam->ifam_flags = ifa->ifa_flags;
 	ifam->ifam_index = ifa->ifa_ifp->if_index;
 	ifam->_ifam_spare1 = 0;
 	ifam->ifam_metric = ifa->ifa_ifp->if_metric;
 
 	return (SYSCTL_OUT(w->w_req, w->w_tmem, len));
 }
 
 static int
 sysctl_iflist(int af, struct walkarg *w)
 {
 	struct ifnet *ifp;
 	struct ifaddr *ifa;
 	struct if_data ifd;
 	struct rt_addrinfo info;
 	int len, error = 0;
 	struct sockaddr_storage ss;
 
 	bzero((caddr_t)&info, sizeof(info));
 	bzero(&ifd, sizeof(ifd));
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		if_data_copy(ifp, &ifd);
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa->ifa_addr;
 		error = rtsock_msg_buffer(RTM_IFINFO, &info, w, &len);
 		if (error != 0)
 			goto done;
 		info.rti_info[RTAX_IFP] = NULL;
 		if (w->w_req && w->w_tmem) {
 			if (w->w_op == NET_RT_IFLISTL)
 				error = sysctl_iflist_ifml(ifp, &ifd, &info, w,
 				    len);
 			else
 				error = sysctl_iflist_ifm(ifp, &ifd, &info, w,
 				    len);
 			if (error)
 				goto done;
 		}
 		while ((ifa = CK_STAILQ_NEXT(ifa, ifa_link)) != NULL) {
 			if (af && af != ifa->ifa_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifa->ifa_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifa->ifa_addr;
 			info.rti_info[RTAX_NETMASK] = rtsock_fix_netmask(
 			    ifa->ifa_addr, ifa->ifa_netmask, &ss);
 			info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
 			error = rtsock_msg_buffer(RTM_NEWADDR, &info, w, &len);
 			if (error != 0)
 				goto done;
 			if (w->w_req && w->w_tmem) {
 				if (w->w_op == NET_RT_IFLISTL)
 					error = sysctl_iflist_ifaml(ifa, &info,
 					    w, len);
 				else
 					error = sysctl_iflist_ifam(ifa, &info,
 					    w, len);
 				if (error)
 					goto done;
 			}
 		}
 		info.rti_info[RTAX_IFA] = NULL;
 		info.rti_info[RTAX_NETMASK] = NULL;
 		info.rti_info[RTAX_BRD] = NULL;
 	}
 done:
 	return (error);
 }
 
 static int
 sysctl_ifmalist(int af, struct walkarg *w)
 {
 	struct rt_addrinfo info;
 	struct ifaddr *ifa;
 	struct ifmultiaddr *ifma;
 	struct ifnet *ifp;
 	int error, len;
 
 	NET_EPOCH_ASSERT();
 
 	error = 0;
 	bzero((caddr_t)&info, sizeof(info));
 
 	CK_STAILQ_FOREACH(ifp, &V_ifnet, if_link) {
 		if (w->w_arg && w->w_arg != ifp->if_index)
 			continue;
 		ifa = ifp->if_addr;
 		info.rti_info[RTAX_IFP] = ifa ? ifa->ifa_addr : NULL;
 		CK_STAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
 			if (af && af != ifma->ifma_addr->sa_family)
 				continue;
 			if (prison_if(w->w_req->td->td_ucred,
 			    ifma->ifma_addr) != 0)
 				continue;
 			info.rti_info[RTAX_IFA] = ifma->ifma_addr;
 			info.rti_info[RTAX_GATEWAY] =
 			    (ifma->ifma_addr->sa_family != AF_LINK) ?
 			    ifma->ifma_lladdr : NULL;
 			error = rtsock_msg_buffer(RTM_NEWMADDR, &info, w, &len);
 			if (error != 0)
 				break;
 			if (w->w_req && w->w_tmem) {
 				struct ifma_msghdr *ifmam;
 
 				ifmam = (struct ifma_msghdr *)w->w_tmem;
 				ifmam->ifmam_index = ifma->ifma_ifp->if_index;
 				ifmam->ifmam_flags = 0;
 				ifmam->ifmam_addrs = info.rti_addrs;
 				ifmam->_ifmam_spare1 = 0;
 				error = SYSCTL_OUT(w->w_req, w->w_tmem, len);
 				if (error != 0)
 					break;
 			}
 		}
 		if (error != 0)
 			break;
 	}
 	return (error);
 }
 
 static void
 rtable_sysctl_dump(uint32_t fibnum, int family, struct walkarg *w)
 {
 	union sockaddr_union sa_dst, sa_mask;
 
 	w->family = family;
 	w->dst = (struct sockaddr *)&sa_dst;
 	w->mask = (struct sockaddr *)&sa_mask;
 
 	init_sockaddrs_family(family, w->dst, w->mask);
 
 	rib_walk(fibnum, family, false, sysctl_dumpentry, w);
 }
 
 static int
 sysctl_rtsock(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker et;
 	int	*name = (int *)arg1;
 	u_int	namelen = arg2;
 	struct rib_head *rnh = NULL; /* silence compiler. */
 	int	i, lim, error = EINVAL;
 	int	fib = 0;
 	u_char	af;
 	struct	walkarg w;
 
 	if (namelen < 3)
 		return (EINVAL);
 
 	name++;
 	namelen--;
 	if (req->newptr)
 		return (EPERM);
 	if (name[1] == NET_RT_DUMP || name[1] == NET_RT_NHOP || name[1] == NET_RT_NHGRP) {
 		if (namelen == 3)
 			fib = req->td->td_proc->p_fibnum;
 		else if (namelen == 4)
 			fib = (name[3] == RT_ALL_FIBS) ?
 			    req->td->td_proc->p_fibnum : name[3];
 		else
 			return ((namelen < 3) ? EISDIR : ENOTDIR);
 		if (fib < 0 || fib >= rt_numfibs)
 			return (EINVAL);
 	} else if (namelen != 3)
 		return ((namelen < 3) ? EISDIR : ENOTDIR);
 	af = name[0];
 	if (af > AF_MAX)
 		return (EINVAL);
 	bzero(&w, sizeof(w));
 	w.w_op = name[1];
 	w.w_arg = name[2];
 	w.w_req = req;
 
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error)
 		return (error);
 
 	/*
 	 * Allocate reply buffer in advance.
 	 * All rtsock messages has maximum length of u_short.
 	 */
 	w.w_tmemsize = 65536;
 	w.w_tmem = malloc(w.w_tmemsize, M_TEMP, M_WAITOK);
 
 	NET_EPOCH_ENTER(et);
 	switch (w.w_op) {
 	case NET_RT_DUMP:
 	case NET_RT_FLAGS:
 		if (af == 0) {			/* dump all tables */
 			i = 1;
 			lim = AF_MAX;
 		} else				/* dump only one table */
 			i = lim = af;
 
 		/*
 		 * take care of llinfo entries, the caller must
 		 * specify an AF
 		 */
 		if (w.w_op == NET_RT_FLAGS &&
 		    (w.w_arg == 0 || w.w_arg & RTF_LLINFO)) {
 			if (af != 0)
 				error = lltable_sysctl_dumparp(af, w.w_req);
 			else
 				error = EINVAL;
 			break;
 		}
 		/*
 		 * take care of routing entries
 		 */
 		for (error = 0; error == 0 && i <= lim; i++) {
 			rnh = rt_tables_get_rnh(fib, i);
 			if (rnh != NULL) {
 				rtable_sysctl_dump(fib, i, &w);
 			} else if (af != 0)
 				error = EAFNOSUPPORT;
 		}
 		break;
 	case NET_RT_NHOP:
 	case NET_RT_NHGRP:
 		/* Allow dumping one specific af/fib at a time */
 		if (namelen < 4) {
 			error = EINVAL;
 			break;
 		}
 		fib = name[3];
 		if (fib < 0 || fib > rt_numfibs) {
 			error = EINVAL;
 			break;
 		}
 		rnh = rt_tables_get_rnh(fib, af);
 		if (rnh == NULL) {
 			error = EAFNOSUPPORT;
 			break;
 		}
 		if (w.w_op == NET_RT_NHOP)
 			error = nhops_dump_sysctl(rnh, w.w_req);
 		else
 #ifdef ROUTE_MPATH
 			error = nhgrp_dump_sysctl(rnh, w.w_req);
 #else
 			error = ENOTSUP;
 #endif
 		break;
 	case NET_RT_IFLIST:
 	case NET_RT_IFLISTL:
 		error = sysctl_iflist(af, &w);
 		break;
 
 	case NET_RT_IFMALIST:
 		error = sysctl_ifmalist(af, &w);
 		break;
 	}
 	NET_EPOCH_EXIT(et);
 
 	free(w.w_tmem, M_TEMP);
 	return (error);
 }
 
 static SYSCTL_NODE(_net, PF_ROUTE, routetable, CTLFLAG_RD | CTLFLAG_MPSAFE,
     sysctl_rtsock, "Return route tables and interface/address lists");
 
 /*
  * Definitions of protocols supported in the ROUTE domain.
  */
 
 static struct domain routedomain;		/* or at least forward */
 
-static struct pr_usrreqs route_usrreqs = {
-	.pru_abort =		rts_close,
-	.pru_attach =		rts_attach,
-	.pru_detach =		rts_detach,
-	.pru_send =		rts_send,
-	.pru_shutdown =		rts_shutdown,
-	.pru_close =		rts_close,
-};
-
-static struct protosw routesw[] = {
-{
+static struct protosw routesw = {
 	.pr_type =		SOCK_RAW,
-	.pr_domain =		&routedomain,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&route_usrreqs
-}
+	.pr_abort =		rts_close,
+	.pr_attach =		rts_attach,
+	.pr_detach =		rts_detach,
+	.pr_send =		rts_send,
+	.pr_shutdown =		rts_shutdown,
+	.pr_close =		rts_close,
 };
 
 static struct domain routedomain = {
 	.dom_family =		PF_ROUTE,
 	.dom_name =		"route",
-	.dom_protosw =		routesw,
-	.dom_protoswNPROTOSW =	&routesw[nitems(routesw)]
+	.dom_nprotosw =		1,
+	.dom_protosw =		{ &routesw },
 };
 
 DOMAIN_SET(route);
diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket.c b/sys/netgraph/bluetooth/socket/ng_btsocket.c
index 1ea3b1f649b3..0ed694cb19db 100644
--- a/sys/netgraph/bluetooth/socket/ng_btsocket.c
+++ b/sys/netgraph/bluetooth/socket/ng_btsocket.c
@@ -1,285 +1,249 @@
 /*
  * ng_btsocket.c
  */
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001-2002 Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $Id: ng_btsocket.c,v 1.4 2003/09/14 23:29:06 max Exp $
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitstring.h>
 #include <sys/errno.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/bluetooth/include/ng_bluetooth.h>
 #include <netgraph/bluetooth/include/ng_hci.h>
 #include <netgraph/bluetooth/include/ng_l2cap.h>
 #include <netgraph/bluetooth/include/ng_btsocket.h>
 #include <netgraph/bluetooth/include/ng_btsocket_hci_raw.h>
 #include <netgraph/bluetooth/include/ng_btsocket_l2cap.h>
 #include <netgraph/bluetooth/include/ng_btsocket_rfcomm.h>
 #include <netgraph/bluetooth/include/ng_btsocket_sco.h>
 
 static int			ng_btsocket_modevent (module_t, int, void *);
-static struct domain		ng_btsocket_domain;
-
-/*
- * Bluetooth raw HCI sockets
- */
-
-static struct pr_usrreqs	ng_btsocket_hci_raw_usrreqs = {
-	.pru_abort =		ng_btsocket_hci_raw_abort,
-	.pru_attach =		ng_btsocket_hci_raw_attach,
-	.pru_bind =		ng_btsocket_hci_raw_bind,
-	.pru_connect =		ng_btsocket_hci_raw_connect,
-	.pru_control =		ng_btsocket_hci_raw_control,
-	.pru_detach =		ng_btsocket_hci_raw_detach,
-	.pru_disconnect =	ng_btsocket_hci_raw_disconnect,
-	.pru_peeraddr =		ng_btsocket_hci_raw_peeraddr,
-	.pru_send =		ng_btsocket_hci_raw_send,
-	.pru_sockaddr =		ng_btsocket_hci_raw_sockaddr,
-	.pru_close =		ng_btsocket_hci_raw_close,
-};
-
-/*
- * Bluetooth raw L2CAP sockets
- */
-
-static struct pr_usrreqs	ng_btsocket_l2cap_raw_usrreqs = {
-	.pru_abort =		ng_btsocket_l2cap_raw_abort,
-	.pru_attach =		ng_btsocket_l2cap_raw_attach,
-	.pru_bind =		ng_btsocket_l2cap_raw_bind,
-	.pru_connect =		ng_btsocket_l2cap_raw_connect,
-	.pru_control =		ng_btsocket_l2cap_raw_control,
-	.pru_detach =		ng_btsocket_l2cap_raw_detach,
-	.pru_disconnect =	ng_btsocket_l2cap_raw_disconnect,
-	.pru_peeraddr =		ng_btsocket_l2cap_raw_peeraddr,
-	.pru_send =		ng_btsocket_l2cap_raw_send,
-	.pru_sockaddr =		ng_btsocket_l2cap_raw_sockaddr,
-	.pru_close =		ng_btsocket_l2cap_raw_close,
-};
-
-/*
- * Bluetooth SEQPACKET L2CAP sockets
- */
-
-static struct pr_usrreqs	ng_btsocket_l2cap_usrreqs = {
-	.pru_abort =		ng_btsocket_l2cap_abort,
-	.pru_accept =		ng_btsocket_l2cap_accept,
-	.pru_attach =		ng_btsocket_l2cap_attach,
-	.pru_bind =		ng_btsocket_l2cap_bind,
-	.pru_connect =		ng_btsocket_l2cap_connect,
-	.pru_control =		ng_btsocket_l2cap_control,
-	.pru_detach =		ng_btsocket_l2cap_detach,
-	.pru_disconnect =	ng_btsocket_l2cap_disconnect,
-        .pru_listen =		ng_btsocket_l2cap_listen,
-	.pru_peeraddr =		ng_btsocket_l2cap_peeraddr,
-	.pru_send =		ng_btsocket_l2cap_send,
-	.pru_sockaddr =		ng_btsocket_l2cap_sockaddr,
-	.pru_close =		ng_btsocket_l2cap_close,
-};
-
-/*
- * Bluetooth STREAM RFCOMM sockets
- */
-
-static struct pr_usrreqs	ng_btsocket_rfcomm_usrreqs = {
-	.pru_abort =		ng_btsocket_rfcomm_abort,
-	.pru_accept =		ng_btsocket_rfcomm_accept,
-	.pru_attach =		ng_btsocket_rfcomm_attach,
-	.pru_bind =		ng_btsocket_rfcomm_bind,
-	.pru_connect =		ng_btsocket_rfcomm_connect,
-	.pru_control =		ng_btsocket_rfcomm_control,
-	.pru_detach =		ng_btsocket_rfcomm_detach,
-	.pru_disconnect =	ng_btsocket_rfcomm_disconnect,
-        .pru_listen =		ng_btsocket_rfcomm_listen,
-	.pru_peeraddr =		ng_btsocket_rfcomm_peeraddr,
-	.pru_send =		ng_btsocket_rfcomm_send,
-	.pru_sockaddr =		ng_btsocket_rfcomm_sockaddr,
-	.pru_close =		ng_btsocket_rfcomm_close,
-};
-
-/*
- * Bluetooth SEQPACKET SCO sockets
- */
-
-static struct pr_usrreqs	ng_btsocket_sco_usrreqs = {
-	.pru_abort =		ng_btsocket_sco_abort,
-	.pru_accept =		ng_btsocket_sco_accept,
-	.pru_attach =		ng_btsocket_sco_attach,
-	.pru_bind =		ng_btsocket_sco_bind,
-	.pru_connect =		ng_btsocket_sco_connect,
-	.pru_control =		ng_btsocket_sco_control,
-	.pru_detach =		ng_btsocket_sco_detach,
-	.pru_disconnect =	ng_btsocket_sco_disconnect,
-	.pru_listen =		ng_btsocket_sco_listen,
-	.pru_peeraddr =		ng_btsocket_sco_peeraddr,
-	.pru_send =		ng_btsocket_sco_send,
-	.pru_sockaddr =		ng_btsocket_sco_sockaddr,
-	.pru_close =		ng_btsocket_sco_close,
-};
 
 /* 
  * Definitions of protocols supported in the BLUETOOTH domain 
  */
 
-static struct protosw		ng_btsocket_protosw[] = {
-{
+/* Bluetooth raw HCI sockets */
+static struct protosw ng_btsocket_hci_raw_protosw = {
 	.pr_type =		SOCK_RAW,
-	.pr_domain =		&ng_btsocket_domain,
 	.pr_protocol =		BLUETOOTH_PROTO_HCI,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_ctloutput =		ng_btsocket_hci_raw_ctloutput,
-	.pr_usrreqs =		&ng_btsocket_hci_raw_usrreqs,
-},
-{
+	.pr_abort =		ng_btsocket_hci_raw_abort,
+	.pr_attach =		ng_btsocket_hci_raw_attach,
+	.pr_bind =		ng_btsocket_hci_raw_bind,
+	.pr_connect =		ng_btsocket_hci_raw_connect,
+	.pr_control =		ng_btsocket_hci_raw_control,
+	.pr_detach =		ng_btsocket_hci_raw_detach,
+	.pr_disconnect =	ng_btsocket_hci_raw_disconnect,
+	.pr_peeraddr =		ng_btsocket_hci_raw_peeraddr,
+	.pr_send =		ng_btsocket_hci_raw_send,
+	.pr_sockaddr =		ng_btsocket_hci_raw_sockaddr,
+	.pr_close =		ng_btsocket_hci_raw_close,
+};
+
+/* Bluetooth raw L2CAP sockets */
+static struct protosw ng_btsocket_l2cap_raw_protosw = {
 	.pr_type =		SOCK_RAW,
-	.pr_domain =		&ng_btsocket_domain,
 	.pr_protocol =		BLUETOOTH_PROTO_L2CAP,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&ng_btsocket_l2cap_raw_usrreqs,
-},
-{
+	.pr_abort =		ng_btsocket_l2cap_raw_abort,
+	.pr_attach =		ng_btsocket_l2cap_raw_attach,
+	.pr_bind =		ng_btsocket_l2cap_raw_bind,
+	.pr_connect =		ng_btsocket_l2cap_raw_connect,
+	.pr_control =		ng_btsocket_l2cap_raw_control,
+	.pr_detach =		ng_btsocket_l2cap_raw_detach,
+	.pr_disconnect =	ng_btsocket_l2cap_raw_disconnect,
+	.pr_peeraddr =		ng_btsocket_l2cap_raw_peeraddr,
+	.pr_send =		ng_btsocket_l2cap_raw_send,
+	.pr_sockaddr =		ng_btsocket_l2cap_raw_sockaddr,
+	.pr_close =		ng_btsocket_l2cap_raw_close,
+};
+
+/* Bluetooth SEQPACKET L2CAP sockets */
+static struct protosw ng_btsocket_l2cap_protosw = {
 	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&ng_btsocket_domain,
 	.pr_protocol =		BLUETOOTH_PROTO_L2CAP,
 	.pr_flags =		PR_ATOMIC|PR_CONNREQUIRED,
 	.pr_ctloutput =		ng_btsocket_l2cap_ctloutput,
-	.pr_usrreqs =		&ng_btsocket_l2cap_usrreqs,
-},
-{
+	.pr_abort =		ng_btsocket_l2cap_abort,
+	.pr_accept =		ng_btsocket_l2cap_accept,
+	.pr_attach =		ng_btsocket_l2cap_attach,
+	.pr_bind =		ng_btsocket_l2cap_bind,
+	.pr_connect =		ng_btsocket_l2cap_connect,
+	.pr_control =		ng_btsocket_l2cap_control,
+	.pr_detach =		ng_btsocket_l2cap_detach,
+	.pr_disconnect =	ng_btsocket_l2cap_disconnect,
+        .pr_listen =		ng_btsocket_l2cap_listen,
+	.pr_peeraddr =		ng_btsocket_l2cap_peeraddr,
+	.pr_send =		ng_btsocket_l2cap_send,
+	.pr_sockaddr =		ng_btsocket_l2cap_sockaddr,
+	.pr_close =		ng_btsocket_l2cap_close,
+};
+
+/* Bluetooth STREAM RFCOMM sockets */
+static struct protosw ng_btsocket_rfcomm_protosw = {
 	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&ng_btsocket_domain,
 	.pr_protocol =		BLUETOOTH_PROTO_RFCOMM,
 	.pr_flags =		PR_CONNREQUIRED,
 	.pr_ctloutput =		ng_btsocket_rfcomm_ctloutput,
-	.pr_usrreqs =		&ng_btsocket_rfcomm_usrreqs,
-},
-{
+	.pr_abort =		ng_btsocket_rfcomm_abort,
+	.pr_accept =		ng_btsocket_rfcomm_accept,
+	.pr_attach =		ng_btsocket_rfcomm_attach,
+	.pr_bind =		ng_btsocket_rfcomm_bind,
+	.pr_connect =		ng_btsocket_rfcomm_connect,
+	.pr_control =		ng_btsocket_rfcomm_control,
+	.pr_detach =		ng_btsocket_rfcomm_detach,
+	.pr_disconnect =	ng_btsocket_rfcomm_disconnect,
+        .pr_listen =		ng_btsocket_rfcomm_listen,
+	.pr_peeraddr =		ng_btsocket_rfcomm_peeraddr,
+	.pr_send =		ng_btsocket_rfcomm_send,
+	.pr_sockaddr =		ng_btsocket_rfcomm_sockaddr,
+	.pr_close =		ng_btsocket_rfcomm_close,
+};
+
+/* Bluetooth SEQPACKET SCO sockets */
+static struct protosw ng_btsocket_sco_protosw = {
 	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&ng_btsocket_domain,
 	.pr_protocol =		BLUETOOTH_PROTO_SCO,
 	.pr_flags =		PR_ATOMIC|PR_CONNREQUIRED,
 	.pr_ctloutput =		ng_btsocket_sco_ctloutput,
-	.pr_usrreqs =		&ng_btsocket_sco_usrreqs,
-},
+	.pr_abort =		ng_btsocket_sco_abort,
+	.pr_accept =		ng_btsocket_sco_accept,
+	.pr_attach =		ng_btsocket_sco_attach,
+	.pr_bind =		ng_btsocket_sco_bind,
+	.pr_connect =		ng_btsocket_sco_connect,
+	.pr_control =		ng_btsocket_sco_control,
+	.pr_detach =		ng_btsocket_sco_detach,
+	.pr_disconnect =	ng_btsocket_sco_disconnect,
+	.pr_listen =		ng_btsocket_sco_listen,
+	.pr_peeraddr =		ng_btsocket_sco_peeraddr,
+	.pr_send =		ng_btsocket_sco_send,
+	.pr_sockaddr =		ng_btsocket_sco_sockaddr,
+	.pr_close =		ng_btsocket_sco_close,
 };
 
-#define ng_btsocket_protosw_end \
-	&ng_btsocket_protosw[nitems(ng_btsocket_protosw)]
-
 /*
  * BLUETOOTH domain
  */
 
 static struct domain ng_btsocket_domain = {
 	.dom_family =		AF_BLUETOOTH,
 	.dom_name =		"bluetooth",
-	.dom_protosw =		ng_btsocket_protosw,
-	.dom_protoswNPROTOSW =	ng_btsocket_protosw_end
+	.dom_nprotosw =		5,
+	.dom_protosw = {
+		&ng_btsocket_hci_raw_protosw,
+		&ng_btsocket_l2cap_raw_protosw,
+		&ng_btsocket_l2cap_protosw,
+		&ng_btsocket_rfcomm_protosw,
+		&ng_btsocket_sco_protosw,
+	},
 };
 
 /* 
  * Socket sysctl tree 
  */
 
 SYSCTL_NODE(_net_bluetooth_hci, OID_AUTO, sockets,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bluetooth HCI sockets family");
 SYSCTL_NODE(_net_bluetooth_l2cap, OID_AUTO, sockets,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bluetooth L2CAP sockets family");
 SYSCTL_NODE(_net_bluetooth_rfcomm, OID_AUTO, sockets,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bluetooth RFCOMM sockets family");
 SYSCTL_NODE(_net_bluetooth_sco, OID_AUTO, sockets,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bluetooth SCO sockets family");
 
 /* 
  * Module 
  */
 
 static moduledata_t	ng_btsocket_mod = {
 	"ng_btsocket",
 	ng_btsocket_modevent,
 	NULL
 };
 
 DECLARE_MODULE(ng_btsocket, ng_btsocket_mod, SI_SUB_PROTO_DOMAIN,
 	SI_ORDER_ANY);
 MODULE_VERSION(ng_btsocket, NG_BLUETOOTH_VERSION);
 MODULE_DEPEND(ng_btsocket, ng_bluetooth, NG_BLUETOOTH_VERSION,
 	NG_BLUETOOTH_VERSION, NG_BLUETOOTH_VERSION);
 MODULE_DEPEND(ng_btsocket, netgraph, NG_ABI_VERSION,
 	NG_ABI_VERSION, NG_ABI_VERSION);
 
 /*
  * Handle loading and unloading for this node type.
  * This is to handle auxiliary linkages (e.g protocol domain addition).
  */
 
 static int  
 ng_btsocket_modevent(module_t mod, int event, void *data)
 {
 	int	error = 0;
         
 	switch (event) {
 	case MOD_LOAD:
 		break;
 
 	case MOD_UNLOAD:
 		/* XXX can't unload protocol domain yet */
 		error = EBUSY;
 		break;
 
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 
 	return (error);
 } /* ng_btsocket_modevent */
 
 DOMAIN_SET(ng_btsocket_);
diff --git a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
index ac504483b0a8..ee1b3b85f1dc 100644
--- a/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
+++ b/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
@@ -1,3558 +1,3558 @@
 /*
  * ng_btsocket_rfcomm.c
  */
 
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2001-2003 Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * $Id: ng_btsocket_rfcomm.c,v 1.28 2003/09/14 23:29:06 max Exp $
  * $FreeBSD$
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitstring.h>
 #include <sys/domain.h>
 #include <sys/endian.h>
 #include <sys/errno.h>
 #include <sys/filedesc.h>
 #include <sys/ioccom.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
 
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/bluetooth/include/ng_bluetooth.h>
 #include <netgraph/bluetooth/include/ng_hci.h>
 #include <netgraph/bluetooth/include/ng_l2cap.h>
 #include <netgraph/bluetooth/include/ng_btsocket.h>
 #include <netgraph/bluetooth/include/ng_btsocket_l2cap.h>
 #include <netgraph/bluetooth/include/ng_btsocket_rfcomm.h>
 
 /* MALLOC define */
 #ifdef NG_SEPARATE_MALLOC
 static MALLOC_DEFINE(M_NETGRAPH_BTSOCKET_RFCOMM, "netgraph_btsocks_rfcomm",
 		"Netgraph Bluetooth RFCOMM sockets");
 #else
 #define M_NETGRAPH_BTSOCKET_RFCOMM M_NETGRAPH
 #endif /* NG_SEPARATE_MALLOC */
 
 /* Debug */
 #define NG_BTSOCKET_RFCOMM_INFO \
 	if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_INFO_LEVEL && \
 	    ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \
 		printf
 
 #define NG_BTSOCKET_RFCOMM_WARN \
 	if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_WARN_LEVEL && \
 	    ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \
 		printf
 
 #define NG_BTSOCKET_RFCOMM_ERR \
 	if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_ERR_LEVEL && \
 	    ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \
 		printf
 
 #define NG_BTSOCKET_RFCOMM_ALERT \
 	if (ng_btsocket_rfcomm_debug_level >= NG_BTSOCKET_ALERT_LEVEL && \
 	    ppsratecheck(&ng_btsocket_rfcomm_lasttime, &ng_btsocket_rfcomm_curpps, 1)) \
 		printf
 
 #define	ALOT	0x7fff
 
 /* Local prototypes */
 static int ng_btsocket_rfcomm_upcall
 	(struct socket *so, void *arg, int waitflag);
 static void ng_btsocket_rfcomm_sessions_task
 	(void *ctx, int pending);
 static void ng_btsocket_rfcomm_session_task
 	(ng_btsocket_rfcomm_session_p s);
 #define ng_btsocket_rfcomm_task_wakeup() \
 	taskqueue_enqueue(taskqueue_swi_giant, &ng_btsocket_rfcomm_task)
 
 static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_connect_ind
 	(ng_btsocket_rfcomm_session_p s, int channel);
 static void ng_btsocket_rfcomm_connect_cfm
 	(ng_btsocket_rfcomm_session_p s);
 
 static int ng_btsocket_rfcomm_session_create
 	(ng_btsocket_rfcomm_session_p *sp, struct socket *l2so,
 	 bdaddr_p src, bdaddr_p dst, struct thread *td);
 static int ng_btsocket_rfcomm_session_accept
 	(ng_btsocket_rfcomm_session_p s0);
 static int ng_btsocket_rfcomm_session_connect
 	(ng_btsocket_rfcomm_session_p s);
 static int ng_btsocket_rfcomm_session_receive
 	(ng_btsocket_rfcomm_session_p s);
 static int ng_btsocket_rfcomm_session_send
 	(ng_btsocket_rfcomm_session_p s);
 static void ng_btsocket_rfcomm_session_clean
 	(ng_btsocket_rfcomm_session_p s);
 static void ng_btsocket_rfcomm_session_process_pcb
 	(ng_btsocket_rfcomm_session_p s);
 static ng_btsocket_rfcomm_session_p ng_btsocket_rfcomm_session_by_addr
 	(bdaddr_p src, bdaddr_p dst);
 
 static int ng_btsocket_rfcomm_receive_frame
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_sabm
 	(ng_btsocket_rfcomm_session_p s, int dlci);
 static int ng_btsocket_rfcomm_receive_disc
 	(ng_btsocket_rfcomm_session_p s, int dlci);
 static int ng_btsocket_rfcomm_receive_ua
 	(ng_btsocket_rfcomm_session_p s, int dlci);
 static int ng_btsocket_rfcomm_receive_dm
 	(ng_btsocket_rfcomm_session_p s, int dlci);
 static int ng_btsocket_rfcomm_receive_uih
 	(ng_btsocket_rfcomm_session_p s, int dlci, int pf, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_mcc
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_test
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_fc
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_msc
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_rpn
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_rls
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static int ng_btsocket_rfcomm_receive_pn
 	(ng_btsocket_rfcomm_session_p s, struct mbuf *m0);
 static void ng_btsocket_rfcomm_set_pn
 	(ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr, u_int8_t flow_control, 
 	 u_int8_t credits, u_int16_t mtu);
 
 static int ng_btsocket_rfcomm_send_command
 	(ng_btsocket_rfcomm_session_p s, u_int8_t type, u_int8_t dlci);
 static int ng_btsocket_rfcomm_send_uih
 	(ng_btsocket_rfcomm_session_p s, u_int8_t address, u_int8_t pf, 
 	 u_int8_t credits, struct mbuf *data);
 static int ng_btsocket_rfcomm_send_msc
 	(ng_btsocket_rfcomm_pcb_p pcb);
 static int ng_btsocket_rfcomm_send_pn
 	(ng_btsocket_rfcomm_pcb_p pcb);
 static int ng_btsocket_rfcomm_send_credits
 	(ng_btsocket_rfcomm_pcb_p pcb);
 
 static int ng_btsocket_rfcomm_pcb_send
 	(ng_btsocket_rfcomm_pcb_p pcb, int limit);
 static void ng_btsocket_rfcomm_pcb_kill
 	(ng_btsocket_rfcomm_pcb_p pcb, int error);
 static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_by_dlci
 	(ng_btsocket_rfcomm_session_p s, int dlci);
 static ng_btsocket_rfcomm_pcb_p ng_btsocket_rfcomm_pcb_listener
 	(bdaddr_p src, int channel);
 
 static void ng_btsocket_rfcomm_timeout
 	(ng_btsocket_rfcomm_pcb_p pcb);
 static void ng_btsocket_rfcomm_untimeout
 	(ng_btsocket_rfcomm_pcb_p pcb);
 static void ng_btsocket_rfcomm_process_timeout
 	(void *xpcb);
 
 static struct mbuf * ng_btsocket_rfcomm_prepare_packet
 	(struct sockbuf *sb, int length);
 
 /* Globals */
 extern int					ifqmaxlen;
 static u_int32_t				ng_btsocket_rfcomm_debug_level;
 static u_int32_t				ng_btsocket_rfcomm_timo;
 struct task					ng_btsocket_rfcomm_task;
 static LIST_HEAD(, ng_btsocket_rfcomm_session)	ng_btsocket_rfcomm_sessions;
 static struct mtx				ng_btsocket_rfcomm_sessions_mtx;
 static LIST_HEAD(, ng_btsocket_rfcomm_pcb)	ng_btsocket_rfcomm_sockets;
 static struct mtx				ng_btsocket_rfcomm_sockets_mtx;
 static struct timeval				ng_btsocket_rfcomm_lasttime;
 static int					ng_btsocket_rfcomm_curpps;
 
 /* Sysctl tree */
 SYSCTL_DECL(_net_bluetooth_rfcomm_sockets);
 static SYSCTL_NODE(_net_bluetooth_rfcomm_sockets, OID_AUTO, stream,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Bluetooth STREAM RFCOMM sockets family");
 SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, debug_level,
 	CTLFLAG_RW,
 	&ng_btsocket_rfcomm_debug_level, NG_BTSOCKET_INFO_LEVEL,
 	"Bluetooth STREAM RFCOMM sockets debug level");
 SYSCTL_UINT(_net_bluetooth_rfcomm_sockets_stream, OID_AUTO, timeout,
 	CTLFLAG_RW,
 	&ng_btsocket_rfcomm_timo, 60,
 	"Bluetooth STREAM RFCOMM sockets timeout");
 
 /*****************************************************************************
  *****************************************************************************
  **                              RFCOMM CRC
  *****************************************************************************
  *****************************************************************************/
 
 static u_int8_t	ng_btsocket_rfcomm_crc_table[256] = {
 	0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75,
 	0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b,
 	0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69,
 	0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67,
 
 	0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d,
 	0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43,
 	0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51,
 	0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f,
 
 	0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05,
 	0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b,
 	0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19,
 	0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17,
 
 	0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d,
 	0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33,
 	0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21,
 	0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f,
 
 	0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95,
 	0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b,
 	0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89,
 	0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87,
 
 	0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad,
 	0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3,
 	0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1,
 	0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf,
 
 	0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5,
 	0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb,
 	0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9,
 	0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7,
 
 	0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd,
 	0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3,
 	0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1,
 	0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf
 };
 
 /* CRC */
 static u_int8_t
 ng_btsocket_rfcomm_crc(u_int8_t *data, int length)
 {
 	u_int8_t	crc = 0xff;
 
 	while (length --)
 		crc = ng_btsocket_rfcomm_crc_table[crc ^ *data++];
 
 	return (crc);
 } /* ng_btsocket_rfcomm_crc */
 
 /* FCS on 2 bytes */
 static u_int8_t
 ng_btsocket_rfcomm_fcs2(u_int8_t *data)
 {
 	return (0xff - ng_btsocket_rfcomm_crc(data, 2));
 } /* ng_btsocket_rfcomm_fcs2 */
   
 /* FCS on 3 bytes */
 static u_int8_t
 ng_btsocket_rfcomm_fcs3(u_int8_t *data)
 {
 	return (0xff - ng_btsocket_rfcomm_crc(data, 3));
 } /* ng_btsocket_rfcomm_fcs3 */
 
 /* 
  * Check FCS
  *
  * From Bluetooth spec
  *
  * "... In 07.10, the frame check sequence (FCS) is calculated on different 
  * sets of fields for different frame types. These are the fields that the 
  * FCS are calculated on:
  *
  * For SABM, DISC, UA, DM frames: on Address, Control and length field.
  * For UIH frames: on Address and Control field.
  *
  * (This is stated here for clarification, and to set the standard for RFCOMM;
  * the fields included in FCS calculation have actually changed in version
  * 7.0.0 of TS 07.10, but RFCOMM will not change the FCS calculation scheme
  * from the one above.) ..."
  */
 
 static int
 ng_btsocket_rfcomm_check_fcs(u_int8_t *data, int type, u_int8_t fcs)
 {
 	if (type != RFCOMM_FRAME_UIH)
 		return (ng_btsocket_rfcomm_fcs3(data) != fcs);
 
 	return (ng_btsocket_rfcomm_fcs2(data) != fcs);
 } /* ng_btsocket_rfcomm_check_fcs */
 
 /*****************************************************************************
  *****************************************************************************
  **                              Socket interface
  *****************************************************************************
  *****************************************************************************/
 
 /* 
  * Initialize everything
  */
 
 static void
 ng_btsocket_rfcomm_init(void *arg __unused)
 {
 
 	ng_btsocket_rfcomm_debug_level = NG_BTSOCKET_WARN_LEVEL;
 	ng_btsocket_rfcomm_timo = 60;
 
 	/* RFCOMM task */
 	TASK_INIT(&ng_btsocket_rfcomm_task, 0,
 		ng_btsocket_rfcomm_sessions_task, NULL);
 
 	/* RFCOMM sessions list */
 	LIST_INIT(&ng_btsocket_rfcomm_sessions);
 	mtx_init(&ng_btsocket_rfcomm_sessions_mtx,
 		"btsocks_rfcomm_sessions_mtx", NULL, MTX_DEF);
 
 	/* RFCOMM sockets list */
 	LIST_INIT(&ng_btsocket_rfcomm_sockets);
 	mtx_init(&ng_btsocket_rfcomm_sockets_mtx,
 		"btsocks_rfcomm_sockets_mtx", NULL, MTX_DEF);
 } /* ng_btsocket_rfcomm_init */
 SYSINIT(ng_btsocket_rfcomm_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD,
     ng_btsocket_rfcomm_init, NULL);
 
 /*
  * Abort connection on socket
  */
 
 void
 ng_btsocket_rfcomm_abort(struct socket *so)
 {
 
 	so->so_error = ECONNABORTED;
 	(void)ng_btsocket_rfcomm_disconnect(so);
 } /* ng_btsocket_rfcomm_abort */
 
 void
 ng_btsocket_rfcomm_close(struct socket *so)
 {
 
 	(void)ng_btsocket_rfcomm_disconnect(so);
 } /* ng_btsocket_rfcomm_close */
 
 /*
  * Accept connection on socket. Nothing to do here, socket must be connected
  * and ready, so just return peer address and be done with it.
  */
 
 int
 ng_btsocket_rfcomm_accept(struct socket *so, struct sockaddr **nam)
 {
 	return (ng_btsocket_rfcomm_peeraddr(so, nam));
 } /* ng_btsocket_rfcomm_accept */
 
 /*
  * Create and attach new socket
  */
 
 int
 ng_btsocket_rfcomm_attach(struct socket *so, int proto, struct thread *td)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = so2rfcomm_pcb(so);
 	int				error;
 
 	/* Check socket and protocol */
 	if (so->so_type != SOCK_STREAM)
 		return (ESOCKTNOSUPPORT);
 
 #if 0 /* XXX sonewconn() calls "pru_attach" with proto == 0 */
 	if (proto != 0) 
 		if (proto != BLUETOOTH_PROTO_RFCOMM)
 			return (EPROTONOSUPPORT);
 #endif /* XXX */
 
 	if (pcb != NULL)
 		return (EISCONN);
 
 	/* Reserve send and receive space if it is not reserved yet */
 	if ((so->so_snd.sb_hiwat == 0) || (so->so_rcv.sb_hiwat == 0)) {
 		error = soreserve(so, NG_BTSOCKET_RFCOMM_SENDSPACE,
 					NG_BTSOCKET_RFCOMM_RECVSPACE);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Allocate the PCB */
         pcb = malloc(sizeof(*pcb),
 		M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO);
         if (pcb == NULL)
                 return (ENOMEM);
 
 	/* Link the PCB and the socket */
 	so->so_pcb = (caddr_t) pcb;
 	pcb->so = so;
 
 	/* Initialize PCB */
 	pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED;
 	pcb->flags = NG_BTSOCKET_RFCOMM_DLC_CFC;
 
 	pcb->lmodem =
 	pcb->rmodem = (RFCOMM_MODEM_RTC | RFCOMM_MODEM_RTR | RFCOMM_MODEM_DV);
 
 	pcb->mtu = RFCOMM_DEFAULT_MTU;
 	pcb->tx_cred = 0;
 	pcb->rx_cred = RFCOMM_DEFAULT_CREDITS;
 
 	mtx_init(&pcb->pcb_mtx, "btsocks_rfcomm_pcb_mtx", NULL, MTX_DEF);
 	callout_init_mtx(&pcb->timo, &pcb->pcb_mtx, 0);
 
 	/* Add the PCB to the list */
 	mtx_lock(&ng_btsocket_rfcomm_sockets_mtx);
 	LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sockets, pcb, next);
 	mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 
         return (0);
 } /* ng_btsocket_rfcomm_attach */
 
 /*
  * Bind socket
  */
 
 int
 ng_btsocket_rfcomm_bind(struct socket *so, struct sockaddr *nam, 
 		struct thread *td)
 {
 	ng_btsocket_rfcomm_pcb_t	*pcb = so2rfcomm_pcb(so), *pcb1;
 	struct sockaddr_rfcomm		*sa = (struct sockaddr_rfcomm *) nam;
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/* Verify address */
 	if (sa == NULL)
 		return (EINVAL);
 	if (sa->rfcomm_family != AF_BLUETOOTH)
 		return (EAFNOSUPPORT);
 	if (sa->rfcomm_len != sizeof(*sa))
 		return (EINVAL);
 	if (sa->rfcomm_channel > 30)
 		return (EINVAL);
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	if (sa->rfcomm_channel != 0) {
 		mtx_lock(&ng_btsocket_rfcomm_sockets_mtx);
 
 		LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next) {
 			if (pcb1->channel == sa->rfcomm_channel &&
 			    bcmp(&pcb1->src, &sa->rfcomm_bdaddr,
 					sizeof(pcb1->src)) == 0) {
 				mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 				mtx_unlock(&pcb->pcb_mtx);
 
 				return (EADDRINUSE);
 			}
 		}
 
 		mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 	}
 
 	bcopy(&sa->rfcomm_bdaddr, &pcb->src, sizeof(pcb->src));
 	pcb->channel = sa->rfcomm_channel;
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	return (0);
 } /* ng_btsocket_rfcomm_bind */
 
 /*
  * Connect socket
  */
 
 int
 ng_btsocket_rfcomm_connect(struct socket *so, struct sockaddr *nam, 
 		struct thread *td)
 {
 	ng_btsocket_rfcomm_pcb_t	*pcb = so2rfcomm_pcb(so);
 	struct sockaddr_rfcomm		*sa = (struct sockaddr_rfcomm *) nam;
 	ng_btsocket_rfcomm_session_t	*s = NULL;
 	struct socket			*l2so = NULL;
 	int				 dlci, error = 0;
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	/* Verify address */
 	if (sa == NULL)
 		return (EINVAL);
 	if (sa->rfcomm_family != AF_BLUETOOTH)
 		return (EAFNOSUPPORT);
 	if (sa->rfcomm_len != sizeof(*sa))
 		return (EINVAL);
 	if (sa->rfcomm_channel > 30)
 		return (EINVAL);
 	if (sa->rfcomm_channel == 0 ||
 	    bcmp(&sa->rfcomm_bdaddr, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0)
 		return (EDESTADDRREQ);
 
 	/*
 	 * Note that we will not check for errors in socreate() because
 	 * if we failed to create L2CAP socket at this point we still
 	 * might have already open session.
 	 */
 
 	error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET,
 			BLUETOOTH_PROTO_L2CAP, td->td_ucred, td);
 
 	/* 
 	 * Look for session between "pcb->src" and "sa->rfcomm_bdaddr" (dst)
 	 */
 
 	mtx_lock(&ng_btsocket_rfcomm_sessions_mtx);
 
 	s = ng_btsocket_rfcomm_session_by_addr(&pcb->src, &sa->rfcomm_bdaddr);
 	if (s == NULL) {
 		/*
 		 * We need to create new RFCOMM session. Check if we have L2CAP
 		 * socket. If l2so == NULL then error has the error code from
 		 * socreate()
 		 */
 
 		if (l2so == NULL) {
 			mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 			return (error);
 		}
 
 		error = ng_btsocket_rfcomm_session_create(&s, l2so,
 				&pcb->src, &sa->rfcomm_bdaddr, td);
 		if (error != 0) {
 			mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 			soclose(l2so);
 
 			return (error);
 		}
 	} else if (l2so != NULL)
 		soclose(l2so); /* we don't need new L2CAP socket */
 
 	/*
 	 * Check if we already have the same DLCI the same session
 	 */
 
 	mtx_lock(&s->session_mtx);
 	mtx_lock(&pcb->pcb_mtx);
 
 	dlci = RFCOMM_MKDLCI(!INITIATOR(s), sa->rfcomm_channel);
 
 	if (ng_btsocket_rfcomm_pcb_by_dlci(s, dlci) != NULL) {
 		mtx_unlock(&pcb->pcb_mtx);
 		mtx_unlock(&s->session_mtx);
 		mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 
 		return (EBUSY);
 	}
 
 	/*
 	 * Check session state and if its not acceptable then refuse connection
 	 */
 
 	switch (s->state) {
 	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING:
 	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
 	case NG_BTSOCKET_RFCOMM_SESSION_OPEN:
 		/*
 		 * Update destination address and channel and attach 
 		 * DLC to the session
 		 */
 
 		bcopy(&sa->rfcomm_bdaddr, &pcb->dst, sizeof(pcb->dst));
 		pcb->channel = sa->rfcomm_channel;
 		pcb->dlci = dlci;
 
 		LIST_INSERT_HEAD(&s->dlcs, pcb, session_next);
 		pcb->session = s;
 
 		ng_btsocket_rfcomm_timeout(pcb);
 		soisconnecting(pcb->so);
 
 		if (s->state == NG_BTSOCKET_RFCOMM_SESSION_OPEN) {
 			pcb->mtu = s->mtu;
 			bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src,
 				sizeof(pcb->src));
 
 			pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING;
 
 			error = ng_btsocket_rfcomm_send_pn(pcb);
 			if (error == 0)
 				error = ng_btsocket_rfcomm_task_wakeup();
 		} else
 			pcb->state = NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT;
 		break;
 
 	default:
 		error = ECONNRESET;
 		break;
 	}
 
 	mtx_unlock(&pcb->pcb_mtx);
 	mtx_unlock(&s->session_mtx);
 	mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 
 	return (error);
 } /* ng_btsocket_rfcomm_connect */
 
 /*
  * Process ioctl's calls on socket.
  * XXX FIXME this should provide interface to the RFCOMM multiplexor channel
  */
 
 int
 ng_btsocket_rfcomm_control(struct socket *so, u_long cmd, void *data,
 		struct ifnet *ifp, struct thread *td)
 {
 	return (EINVAL);
 } /* ng_btsocket_rfcomm_control */
 
 /*
  * Process getsockopt/setsockopt system calls
  */
 
 int
 ng_btsocket_rfcomm_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	ng_btsocket_rfcomm_pcb_p		pcb = so2rfcomm_pcb(so);
 	struct ng_btsocket_rfcomm_fc_info	fcinfo;
 	int					error = 0;
 
 	if (pcb == NULL)
 		return (EINVAL);
 	if (sopt->sopt_level != SOL_RFCOMM)
 		return (0);
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case SO_RFCOMM_MTU:
 			error = sooptcopyout(sopt, &pcb->mtu, sizeof(pcb->mtu));
 			break;
 
 		case SO_RFCOMM_FC_INFO:
 			fcinfo.lmodem = pcb->lmodem;
 			fcinfo.rmodem = pcb->rmodem;
 			fcinfo.tx_cred = pcb->tx_cred;
 			fcinfo.rx_cred = pcb->rx_cred;
 			fcinfo.cfc = (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)?
 				1 : 0;
 			fcinfo.reserved = 0;
 
 			error = sooptcopyout(sopt, &fcinfo, sizeof(fcinfo));
 			break;
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	default:
 		error = EINVAL;
 		break;
 	}
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	return (error);
 } /* ng_btsocket_rfcomm_ctloutput */
 
 /*
  * Detach and destroy socket
  */
 
 void
 ng_btsocket_rfcomm_detach(struct socket *so)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = so2rfcomm_pcb(so);
 
 	KASSERT(pcb != NULL, ("ng_btsocket_rfcomm_detach: pcb == NULL"));
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	switch (pcb->state) {
 	case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT:
 	case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING:
 	case NG_BTSOCKET_RFCOMM_DLC_CONNECTING:
 	case NG_BTSOCKET_RFCOMM_DLC_CONNECTED:
 		/* XXX What to do with pending request? */
 		if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)
 			ng_btsocket_rfcomm_untimeout(pcb);
 
 		if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT)
 			pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_DETACHED;
 		else
 			pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING;
 
 		ng_btsocket_rfcomm_task_wakeup();
 		break;
 
 	case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING:
 		ng_btsocket_rfcomm_task_wakeup();
 		break;
 	}
 
 	while (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CLOSED)
 		msleep(&pcb->state, &pcb->pcb_mtx, PZERO, "rf_det", 0);
 
 	if (pcb->session != NULL)
 		panic("%s: pcb->session != NULL\n", __func__);
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)
 		panic("%s: timeout on closed DLC, flags=%#x\n",
 			__func__, pcb->flags);
 
 	mtx_lock(&ng_btsocket_rfcomm_sockets_mtx);
 	LIST_REMOVE(pcb, next);
 	mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	mtx_destroy(&pcb->pcb_mtx);
 	bzero(pcb, sizeof(*pcb));
 	free(pcb, M_NETGRAPH_BTSOCKET_RFCOMM);
 
 	soisdisconnected(so);
 	so->so_pcb = NULL;
 } /* ng_btsocket_rfcomm_detach */
 
 /*
  * Disconnect socket
  */
 
 int
 ng_btsocket_rfcomm_disconnect(struct socket *so)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = so2rfcomm_pcb(so);
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING) {
 		mtx_unlock(&pcb->pcb_mtx);
 		return (EINPROGRESS);
 	}
 
 	/* XXX What to do with pending request? */
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)
 		ng_btsocket_rfcomm_untimeout(pcb);
 
 	switch (pcb->state) {
 	case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING: /* XXX can we get here? */
 	case NG_BTSOCKET_RFCOMM_DLC_CONNECTING: /* XXX can we get here? */
 	case NG_BTSOCKET_RFCOMM_DLC_CONNECTED:
 
 		/*
 		 * Just change DLC state and enqueue RFCOMM task. It will
 		 * queue and send DISC on the DLC.
 		 */ 
 
 		pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING;
 		soisdisconnecting(so);
 
 		ng_btsocket_rfcomm_task_wakeup();
 		break;
 
 	case NG_BTSOCKET_RFCOMM_DLC_CLOSED:
 	case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT:
 		break;
 
 	default:
 		panic("%s: Invalid DLC state=%d, flags=%#x\n",
 			__func__, pcb->state, pcb->flags);
 		break;
 	}
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	return (0);
 } /* ng_btsocket_rfcomm_disconnect */
 
 /*
  * Listen on socket. First call to listen() will create listening RFCOMM session
  */
 
 int
 ng_btsocket_rfcomm_listen(struct socket *so, int backlog, struct thread *td)
 {
 	ng_btsocket_rfcomm_pcb_p	 pcb = so2rfcomm_pcb(so), pcb1;
 	ng_btsocket_rfcomm_session_p	 s = NULL;
 	struct socket			*l2so = NULL;
 	int				 error, socreate_error, usedchannels;
 
 	if (pcb == NULL)
 		return (EINVAL);
 	if (pcb->channel > 30)
 		return (EADDRNOTAVAIL);
 
 	usedchannels = 0;
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	if (pcb->channel == 0) {
 		mtx_lock(&ng_btsocket_rfcomm_sockets_mtx);
 
 		LIST_FOREACH(pcb1, &ng_btsocket_rfcomm_sockets, next)
 			if (pcb1->channel != 0 &&
 			    bcmp(&pcb1->src, &pcb->src, sizeof(pcb->src)) == 0)
 				usedchannels |= (1 << (pcb1->channel - 1));
 
 		for (pcb->channel = 30; pcb->channel > 0; pcb->channel --)
 			if (!(usedchannels & (1 << (pcb->channel - 1))))
 				break;
 
 		if (pcb->channel == 0) {
 			mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 			mtx_unlock(&pcb->pcb_mtx);
 
 			return (EADDRNOTAVAIL);
 		}
 
 		mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 	}
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	/*
 	 * Note that we will not check for errors in socreate() because
 	 * if we failed to create L2CAP socket at this point we still
 	 * might have already open session.
 	 */
 
 	socreate_error = socreate(PF_BLUETOOTH, &l2so, SOCK_SEQPACKET,
 			BLUETOOTH_PROTO_L2CAP, td->td_ucred, td);
 
 	/*
 	 * Transition the socket and session into the LISTENING state.  Check
 	 * for collisions first, as there can only be one.
 	 */
 	mtx_lock(&ng_btsocket_rfcomm_sessions_mtx);
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	SOCK_UNLOCK(so);
 	if (error != 0)
 		goto out;
 
 	LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next)
 		if (s->state == NG_BTSOCKET_RFCOMM_SESSION_LISTENING)
 			break;
 
 	if (s == NULL) {
 		/*
 		 * We need to create default RFCOMM session. Check if we have 
 		 * L2CAP socket. If l2so == NULL then error has the error code 
 		 * from socreate()
 		 */
 		if (l2so == NULL) {
 			solisten_proto_abort(so);
 			error = socreate_error;
 			goto out;
 		}
 
 		/* 
 		 * Create default listen RFCOMM session. The default RFCOMM 
 		 * session will listen on ANY address.
 		 *
 		 * XXX FIXME Note that currently there is no way to adjust MTU
 		 * for the default session.
 		 */
 		error = ng_btsocket_rfcomm_session_create(&s, l2so,
 					NG_HCI_BDADDR_ANY, NULL, td);
 		if (error != 0) {
 			solisten_proto_abort(so);
 			goto out;
 		}
 		l2so = NULL;
 	}
 	SOCK_LOCK(so);
 	solisten_proto(so, backlog);
 	SOCK_UNLOCK(so);
 out:
 	mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 	/*
 	 * If we still have an l2so reference here, it's unneeded, so release
 	 * it.
 	 */
 	if (l2so != NULL)
 		soclose(l2so);
 	return (error);
 } /* ng_btsocket_listen */
 
 /*
  * Get peer address
  */
 
 int
 ng_btsocket_rfcomm_peeraddr(struct socket *so, struct sockaddr **nam)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = so2rfcomm_pcb(so);
 	struct sockaddr_rfcomm		sa;
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	bcopy(&pcb->dst, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr));
 	sa.rfcomm_channel = pcb->channel;
 	sa.rfcomm_len = sizeof(sa);
 	sa.rfcomm_family = AF_BLUETOOTH;
 
 	*nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT);
 
 	return ((*nam == NULL)? ENOMEM : 0);
 } /* ng_btsocket_rfcomm_peeraddr */
 
 /*
  * Send data to socket
  */
 
 int
 ng_btsocket_rfcomm_send(struct socket *so, int flags, struct mbuf *m,
 		struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	ng_btsocket_rfcomm_pcb_t	*pcb = so2rfcomm_pcb(so);
 	int				 error = 0;
 
 	/* Check socket and input */
 	if (pcb == NULL || m == NULL || control != NULL) {
 		error = EINVAL;
 		goto drop;
 	}
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	/* Make sure DLC is connected */
 	if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) {
 		mtx_unlock(&pcb->pcb_mtx);
 		error = ENOTCONN;
 		goto drop;
 	}
 
 	/* Put the packet on the socket's send queue and wakeup RFCOMM task */
 	sbappend(&pcb->so->so_snd, m, flags);
 	m = NULL;
 
 	if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_SENDING)) {
 		pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_SENDING;
 		error = ng_btsocket_rfcomm_task_wakeup();
 	}
 
 	mtx_unlock(&pcb->pcb_mtx);
 drop:
 	NG_FREE_M(m); /* checks for != NULL */
 	NG_FREE_M(control);
 
 	return (error);
 } /* ng_btsocket_rfcomm_send */
 
 /*
  * Get socket address
  */
 
 int
 ng_btsocket_rfcomm_sockaddr(struct socket *so, struct sockaddr **nam)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = so2rfcomm_pcb(so);
 	struct sockaddr_rfcomm		sa;
 
 	if (pcb == NULL)
 		return (EINVAL);
 
 	bcopy(&pcb->src, &sa.rfcomm_bdaddr, sizeof(sa.rfcomm_bdaddr));
 	sa.rfcomm_channel = pcb->channel;
 	sa.rfcomm_len = sizeof(sa);
 	sa.rfcomm_family = AF_BLUETOOTH;
 
 	*nam = sodupsockaddr((struct sockaddr *) &sa, M_NOWAIT);
 
 	return ((*nam == NULL)? ENOMEM : 0);
 } /* ng_btsocket_rfcomm_sockaddr */
 
 /*
  * Upcall function for L2CAP sockets. Enqueue RFCOMM task.
  */
 
 static int
 ng_btsocket_rfcomm_upcall(struct socket *so, void *arg, int waitflag)
 {
 	int	error;
 
 	if (so == NULL)
 		panic("%s: so == NULL\n", __func__);
 
 	if ((error = ng_btsocket_rfcomm_task_wakeup()) != 0)
 		NG_BTSOCKET_RFCOMM_ALERT(
 "%s: Could not enqueue RFCOMM task, error=%d\n", __func__, error);
 	return (SU_OK);
 } /* ng_btsocket_rfcomm_upcall */
 
 /*
  * RFCOMM task. Will handle all RFCOMM sessions in one pass.
  * XXX FIXME does not scale very well
  */
 
 static void
 ng_btsocket_rfcomm_sessions_task(void *ctx, int pending)
 {
 	ng_btsocket_rfcomm_session_p	s = NULL, s_next = NULL;
 
 	mtx_lock(&ng_btsocket_rfcomm_sessions_mtx);
 
 	for (s = LIST_FIRST(&ng_btsocket_rfcomm_sessions); s != NULL; ) {
 		mtx_lock(&s->session_mtx);
 		s_next = LIST_NEXT(s, next);
 
 		ng_btsocket_rfcomm_session_task(s);
 
 		if (s->state == NG_BTSOCKET_RFCOMM_SESSION_CLOSED) {
 			/* Unlink and clean the session */
 			LIST_REMOVE(s, next);
 
 			NG_BT_MBUFQ_DRAIN(&s->outq);
 			if (!LIST_EMPTY(&s->dlcs))
 				panic("%s: DLC list is not empty\n", __func__);
 
 			/* Close L2CAP socket */
 			SOCKBUF_LOCK(&s->l2so->so_rcv);
 			soupcall_clear(s->l2so, SO_RCV);
 			SOCKBUF_UNLOCK(&s->l2so->so_rcv);
 			SOCKBUF_LOCK(&s->l2so->so_snd);
 			soupcall_clear(s->l2so, SO_SND);
 			SOCKBUF_UNLOCK(&s->l2so->so_snd);
 			soclose(s->l2so);
 
 			mtx_unlock(&s->session_mtx);
 
 			mtx_destroy(&s->session_mtx);
 			bzero(s, sizeof(*s));
 			free(s, M_NETGRAPH_BTSOCKET_RFCOMM);
 		} else
 			mtx_unlock(&s->session_mtx);
 
 		s = s_next;
 	}
 
 	mtx_unlock(&ng_btsocket_rfcomm_sessions_mtx);
 } /* ng_btsocket_rfcomm_sessions_task */
 
 /*
  * Process RFCOMM session. Will handle all RFCOMM sockets in one pass.
  */
 
 static void
 ng_btsocket_rfcomm_session_task(ng_btsocket_rfcomm_session_p s)
 {
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	if (s->l2so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: L2CAP connection has been terminated, so=%p, so_state=%#x, so_count=%d, " \
 "state=%d, flags=%#x\n", __func__, s->l2so, s->l2so->so_state, 
 			s->l2so->so_count, s->state, s->flags);
 
 		s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 		ng_btsocket_rfcomm_session_clean(s);
 	}
 
 	/* Now process upcall */
 	switch (s->state) {
 	/* Try to accept new L2CAP connection(s) */
 	case NG_BTSOCKET_RFCOMM_SESSION_LISTENING:
 		while (ng_btsocket_rfcomm_session_accept(s) == 0)
 			;
 		break;
 
 	/* Process the results of the L2CAP connect */
 	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING:
 		ng_btsocket_rfcomm_session_process_pcb(s);
 
 		if (ng_btsocket_rfcomm_session_connect(s) != 0) {
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 			ng_btsocket_rfcomm_session_clean(s);
 		} 
 		break;
 
 	/* Try to receive/send more data */
 	case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
 	case NG_BTSOCKET_RFCOMM_SESSION_OPEN:
 	case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING:
 		ng_btsocket_rfcomm_session_process_pcb(s);
 
 		if (ng_btsocket_rfcomm_session_receive(s) != 0) {
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 			ng_btsocket_rfcomm_session_clean(s);
 		} else if (ng_btsocket_rfcomm_session_send(s) != 0) {
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 			ng_btsocket_rfcomm_session_clean(s);
 		}
 		break;
 
 	case NG_BTSOCKET_RFCOMM_SESSION_CLOSED:
 		break;
 
 	default:
 		panic("%s: Invalid session state=%d, flags=%#x\n",
 			__func__, s->state, s->flags);
 		break;
 	}
 } /* ng_btsocket_rfcomm_session_task */
 
 /*
  * Process RFCOMM connection indicator. Caller must hold s->session_mtx
  */
 
 static ng_btsocket_rfcomm_pcb_p
 ng_btsocket_rfcomm_connect_ind(ng_btsocket_rfcomm_session_p s, int channel)
 {
 	ng_btsocket_rfcomm_pcb_p	 pcb = NULL, pcb1 = NULL;
 	ng_btsocket_l2cap_pcb_p		 l2pcb = NULL;
 	struct socket			*so1;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * Try to find RFCOMM socket that listens on given source address 
 	 * and channel. This will return the best possible match.
 	 */
 
 	l2pcb = so2l2cap_pcb(s->l2so);
 	pcb = ng_btsocket_rfcomm_pcb_listener(&l2pcb->src, channel);
 	if (pcb == NULL)
 		return (NULL);
 
 	/*
 	 * Check the pending connections queue and if we have space then 
 	 * create new socket and set proper source and destination address,
 	 * and channel.
 	 */
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	CURVNET_SET(pcb->so->so_vnet);
 	so1 = sonewconn(pcb->so, 0);
 	CURVNET_RESTORE();
 
 	mtx_unlock(&pcb->pcb_mtx);
 
 	if (so1 == NULL)
 		return (NULL);
 
 	/*
 	 * If we got here than we have created new socket. So complete the 
 	 * connection. Set source and destination address from the session.
 	 */
 
 	pcb1 = so2rfcomm_pcb(so1);
 	if (pcb1 == NULL)
 		panic("%s: pcb1 == NULL\n", __func__);
 
 	mtx_lock(&pcb1->pcb_mtx);
 
 	bcopy(&l2pcb->src, &pcb1->src, sizeof(pcb1->src));
 	bcopy(&l2pcb->dst, &pcb1->dst, sizeof(pcb1->dst));
 	pcb1->channel = channel;
 
 	/* Link new DLC to the session. We already hold s->session_mtx */
 	LIST_INSERT_HEAD(&s->dlcs, pcb1, session_next);
 	pcb1->session = s;
 			
 	mtx_unlock(&pcb1->pcb_mtx);
 
 	return (pcb1);
 } /* ng_btsocket_rfcomm_connect_ind */
 
 /*
  * Process RFCOMM connect confirmation. Caller must hold s->session_mtx.
  */
 
 static void
 ng_btsocket_rfcomm_connect_cfm(ng_btsocket_rfcomm_session_p s)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL, pcb_next = NULL;
 	int				error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * Wake up all waiting sockets and send PN request for each of them. 
 	 * Note that timeout already been set in ng_btsocket_rfcomm_connect()
 	 *
 	 * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill
 	 * will unlink DLC from the session
 	 */
 
 	for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) {
 		mtx_lock(&pcb->pcb_mtx);
 		pcb_next = LIST_NEXT(pcb, session_next);
 
 		if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT) {
 			pcb->mtu = s->mtu;
 			bcopy(&so2l2cap_pcb(s->l2so)->src, &pcb->src,
 				sizeof(pcb->src));
 
 			error = ng_btsocket_rfcomm_send_pn(pcb);
 			if (error == 0)
 				pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONFIGURING;
 			else
 				ng_btsocket_rfcomm_pcb_kill(pcb, error);
 		}
 
 		mtx_unlock(&pcb->pcb_mtx);
 		pcb = pcb_next;
 	}
 } /* ng_btsocket_rfcomm_connect_cfm */
 
 /*****************************************************************************
  *****************************************************************************
  **                              RFCOMM sessions
  *****************************************************************************
  *****************************************************************************/
 
 /*
  * Create new RFCOMM session. That function WILL NOT take ownership over l2so.
  * Caller MUST free l2so if function failed.
  */
 
 static int
 ng_btsocket_rfcomm_session_create(ng_btsocket_rfcomm_session_p *sp,
 		struct socket *l2so, bdaddr_p src, bdaddr_p dst,
 		struct thread *td)
 {
 	ng_btsocket_rfcomm_session_p	s = NULL;
 	struct sockaddr_l2cap		l2sa;
 	struct sockopt			l2sopt;
 	int				error;
 	u_int16_t			mtu;
 
 	mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED);
 
 	/* Allocate the RFCOMM session */
         s = malloc(sizeof(*s),
 		M_NETGRAPH_BTSOCKET_RFCOMM, M_NOWAIT | M_ZERO);
         if (s == NULL)
                 return (ENOMEM);
 
 	/* Set defaults */
 	s->mtu = RFCOMM_DEFAULT_MTU;
 	s->flags = 0;
 	s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 	NG_BT_MBUFQ_INIT(&s->outq, ifqmaxlen);
 
 	/*
 	 * XXX Mark session mutex as DUPOK to prevent "duplicated lock of 
 	 * the same type" message. When accepting new L2CAP connection
 	 * ng_btsocket_rfcomm_session_accept() holds both session mutexes 
 	 * for "old" (accepting) session and "new" (created) session.
 	 */
 
 	mtx_init(&s->session_mtx, "btsocks_rfcomm_session_mtx", NULL,
 		MTX_DEF|MTX_DUPOK);
 
 	LIST_INIT(&s->dlcs);
 
 	/* Prepare L2CAP socket */
 	SOCKBUF_LOCK(&l2so->so_rcv);
 	soupcall_set(l2so, SO_RCV, ng_btsocket_rfcomm_upcall, NULL);
 	SOCKBUF_UNLOCK(&l2so->so_rcv);
 	SOCKBUF_LOCK(&l2so->so_snd);
 	soupcall_set(l2so, SO_SND, ng_btsocket_rfcomm_upcall, NULL);
 	SOCKBUF_UNLOCK(&l2so->so_snd);
 	l2so->so_state |= SS_NBIO;
 	s->l2so = l2so;
 
 	mtx_lock(&s->session_mtx);
 
 	/*
 	 * "src" == NULL and "dst" == NULL means just create session.
 	 * caller must do the rest
 	 */
 
 	if (src == NULL && dst == NULL)
 		goto done;
 
 	/*
 	 * Set incoming MTU on L2CAP socket. It is RFCOMM session default MTU 
 	 * plus 5 bytes: RFCOMM frame header, one extra byte for length and one
 	 * extra byte for credits.
 	 */
 
 	mtu = s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1;
 
 	l2sopt.sopt_dir = SOPT_SET;
 	l2sopt.sopt_level = SOL_L2CAP;
 	l2sopt.sopt_name = SO_L2CAP_IMTU;
 	l2sopt.sopt_val = (void *) &mtu;
 	l2sopt.sopt_valsize = sizeof(mtu);
 	l2sopt.sopt_td = NULL;
 
 	error = sosetopt(s->l2so, &l2sopt);
 	if (error != 0)
 		goto bad;
 
 	/* Bind socket to "src" address */
 	l2sa.l2cap_len = sizeof(l2sa);
 	l2sa.l2cap_family = AF_BLUETOOTH;
 	l2sa.l2cap_psm = (dst == NULL)? htole16(NG_L2CAP_PSM_RFCOMM) : 0;
 	bcopy(src, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr));
 	l2sa.l2cap_cid = 0;
 	l2sa.l2cap_bdaddr_type = BDADDR_BREDR;
 
 	error = sobind(s->l2so, (struct sockaddr *) &l2sa, td);
 	if (error != 0)
 		goto bad;
 
 	/* If "dst" is not NULL then initiate connect(), otherwise listen() */
 	if (dst == NULL) {
 		s->flags = 0;
 		s->state = NG_BTSOCKET_RFCOMM_SESSION_LISTENING;
 
 		error = solisten(s->l2so, 10, td);
 		if (error != 0)
 			goto bad;
 	} else {
 		s->flags = NG_BTSOCKET_RFCOMM_SESSION_INITIATOR;
 		s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTING;
 
 		l2sa.l2cap_len = sizeof(l2sa);   
 		l2sa.l2cap_family = AF_BLUETOOTH;
 		l2sa.l2cap_psm = htole16(NG_L2CAP_PSM_RFCOMM);
 	        bcopy(dst, &l2sa.l2cap_bdaddr, sizeof(l2sa.l2cap_bdaddr));
 		l2sa.l2cap_cid = 0;
 		l2sa.l2cap_bdaddr_type = BDADDR_BREDR;
 
 		error = soconnect(s->l2so, (struct sockaddr *) &l2sa, td);
 		if (error != 0)
 			goto bad;
 	}
 
 done:
 	LIST_INSERT_HEAD(&ng_btsocket_rfcomm_sessions, s, next);
 	*sp = s;
 
 	mtx_unlock(&s->session_mtx);
 
 	return (0);
 
 bad:
 	mtx_unlock(&s->session_mtx);
 
 	/* Return L2CAP socket back to its original state */
 	SOCKBUF_LOCK(&l2so->so_rcv);
 	soupcall_clear(s->l2so, SO_RCV);
 	SOCKBUF_UNLOCK(&l2so->so_rcv);
 	SOCKBUF_LOCK(&l2so->so_snd);
 	soupcall_clear(s->l2so, SO_SND);
 	SOCKBUF_UNLOCK(&l2so->so_snd);
 	l2so->so_state &= ~SS_NBIO;
 
 	mtx_destroy(&s->session_mtx);
 	bzero(s, sizeof(*s));
 	free(s, M_NETGRAPH_BTSOCKET_RFCOMM);
 
 	return (error);
 } /* ng_btsocket_rfcomm_session_create */
 
 /*
  * Process accept() on RFCOMM session
  * XXX FIXME locking for "l2so"?
  */
 
 static int
 ng_btsocket_rfcomm_session_accept(ng_btsocket_rfcomm_session_p s0)
 {
 	struct socket			*l2so;
 	struct sockaddr_l2cap		*l2sa = NULL;
 	ng_btsocket_l2cap_pcb_t		*l2pcb = NULL;
 	ng_btsocket_rfcomm_session_p	 s = NULL;
 	int				 error;
 
 	mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED);
 	mtx_assert(&s0->session_mtx, MA_OWNED);
 
 	SOLISTEN_LOCK(s0->l2so);
 	error = solisten_dequeue(s0->l2so, &l2so, 0);
 	if (error == EWOULDBLOCK)
 		return (error);
 	if (error) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not accept connection on L2CAP socket, error=%d\n", __func__, error);
 		return (error);
 	}
 
 	error = soaccept(l2so, (struct sockaddr **) &l2sa);
 	if (error != 0) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: soaccept() on L2CAP socket failed, error=%d\n", __func__, error);
 		soclose(l2so);
 
 		return (error);
 	}
 
 	/*
 	 * Check if there is already active RFCOMM session between two devices.
 	 * If so then close L2CAP connection. We only support one RFCOMM session
 	 * between each pair of devices. Note that here we assume session in any
 	 * state. The session even could be in the middle of disconnecting.
 	 */
 
 	l2pcb = so2l2cap_pcb(l2so);
 	s = ng_btsocket_rfcomm_session_by_addr(&l2pcb->src, &l2pcb->dst);
 	if (s == NULL) {
 		/* Create a new RFCOMM session */
 		error = ng_btsocket_rfcomm_session_create(&s, l2so, NULL, NULL,
 				curthread /* XXX */);
 		if (error == 0) {
 			mtx_lock(&s->session_mtx);
 
 			s->flags = 0;
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED;
 
 			/*
 			 * Adjust MTU on incoming connection. Reserve 5 bytes:
 			 * RFCOMM frame header, one extra byte for length and 
 			 * one extra byte for credits.
 			 */
 
 			s->mtu = min(l2pcb->imtu, l2pcb->omtu) -
 					sizeof(struct rfcomm_frame_hdr) - 1 - 1;
 
 			mtx_unlock(&s->session_mtx);
 		} else {
 			NG_BTSOCKET_RFCOMM_ALERT(
 "%s: Failed to create new RFCOMM session, error=%d\n", __func__, error);
 
 			soclose(l2so);
 		}
 	} else {
 		NG_BTSOCKET_RFCOMM_WARN(
 "%s: Rejecting duplicating RFCOMM session between src=%x:%x:%x:%x:%x:%x and " \
 "dst=%x:%x:%x:%x:%x:%x, state=%d, flags=%#x\n",	__func__,
 			l2pcb->src.b[5], l2pcb->src.b[4], l2pcb->src.b[3],
 			l2pcb->src.b[2], l2pcb->src.b[1], l2pcb->src.b[0],
 			l2pcb->dst.b[5], l2pcb->dst.b[4], l2pcb->dst.b[3],
 			l2pcb->dst.b[2], l2pcb->dst.b[1], l2pcb->dst.b[0],
 			s->state, s->flags);
 
 		error = EBUSY;
 		soclose(l2so);
 	}
 
 	return (error);
 } /* ng_btsocket_rfcomm_session_accept */
 
 /*
  * Process connect() on RFCOMM session
  * XXX FIXME locking for "l2so"?
  */
 
 static int
 ng_btsocket_rfcomm_session_connect(ng_btsocket_rfcomm_session_p s)
 {
 	ng_btsocket_l2cap_pcb_p	l2pcb = so2l2cap_pcb(s->l2so);
 	int			error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/* First check if connection has failed */
 	if ((error = s->l2so->so_error) != 0) {
 		s->l2so->so_error = 0;
 
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not connect RFCOMM session, error=%d, state=%d, flags=%#x\n",
 			__func__, error, s->state, s->flags);
 
 		return (error);
 	}
 
 	/* Is connection still in progress? */
 	if (s->l2so->so_state & SS_ISCONNECTING)
 		return (0); 
 
 	/* 
 	 * If we got here then we are connected. Send SABM on DLCI 0 to 
 	 * open multiplexor channel.
 	 */
 
 	if (error == 0) {
 		s->state = NG_BTSOCKET_RFCOMM_SESSION_CONNECTED;
 
 		/*
 		 * Adjust MTU on outgoing connection. Reserve 5 bytes: RFCOMM 
 		 * frame header, one extra byte for length and one extra byte 
 		 * for credits.
 		 */
 
 		s->mtu = min(l2pcb->imtu, l2pcb->omtu) -
 				sizeof(struct rfcomm_frame_hdr) - 1 - 1;
 
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_SABM,0);
 		if (error == 0)
 			error = ng_btsocket_rfcomm_task_wakeup();
 	}
 
 	return (error);
 }/* ng_btsocket_rfcomm_session_connect */
 
 /*
  * Receive data on RFCOMM session
  * XXX FIXME locking for "l2so"?
  */
 
 static int
 ng_btsocket_rfcomm_session_receive(ng_btsocket_rfcomm_session_p s)
 {
 	struct mbuf	*m = NULL;
 	struct uio	 uio;
 	int		 more, flags, error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/* Can we read from the L2CAP socket? */
 	if (!soreadable(s->l2so))
 		return (0);
 
 	/* First check for error on L2CAP socket */
 	if ((error = s->l2so->so_error) != 0) {
 		s->l2so->so_error = 0;
 
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not receive data from L2CAP socket, error=%d, state=%d, flags=%#x\n",
 			__func__, error, s->state, s->flags);
 
 		return (error);
 	}
 
 	/*
 	 * Read all packets from the L2CAP socket. 
 	 * XXX FIXME/VERIFY is that correct? For now use m->m_nextpkt as
 	 * indication that there is more packets on the socket's buffer.
 	 * Also what should we use in uio.uio_resid?
 	 * May be s->mtu + sizeof(struct rfcomm_frame_hdr) + 1 + 1?
 	 */
 
 	for (more = 1; more; ) {
 		/* Try to get next packet from socket */
 		bzero(&uio, sizeof(uio));
 /*		uio.uio_td = NULL; */
 		uio.uio_resid = 1000000000;
 		flags = MSG_DONTWAIT;
 
 		m = NULL;
 		error = soreceive(s->l2so, NULL, &uio, &m,
 		    (struct mbuf **) NULL, &flags);
 		if (error != 0) {
 			if (error == EWOULDBLOCK)
 				return (0); /* XXX can happen? */
 
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not receive data from L2CAP socket, error=%d\n", __func__, error);
 
 			return (error);
 		}
 
 		more = (m->m_nextpkt != NULL);
 		m->m_nextpkt = NULL;
 
 		ng_btsocket_rfcomm_receive_frame(s, m);
 	}
 
 	return (0);
 } /* ng_btsocket_rfcomm_session_receive */
 
 /*
  * Send data on RFCOMM session
  * XXX FIXME locking for "l2so"?
  */
 
 static int
 ng_btsocket_rfcomm_session_send(ng_btsocket_rfcomm_session_p s)
 {
 	struct mbuf	*m = NULL;
 	int		 error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/* Send as much as we can from the session queue */
 	while (sowriteable(s->l2so)) {
 		/* Check if socket still OK */
 		if ((error = s->l2so->so_error) != 0) {
 			s->l2so->so_error = 0;
 
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Detected error=%d on L2CAP socket, state=%d, flags=%#x\n",
 				__func__, error, s->state, s->flags);
 
 			return (error);
 		}
 
 		NG_BT_MBUFQ_DEQUEUE(&s->outq, m);
 		if (m == NULL)
 			return (0); /* we are done */
 
 		/* Call send function on the L2CAP socket */
-		error = (*s->l2so->so_proto->pr_usrreqs->pru_send)(s->l2so,
-				0, m, NULL, NULL, curthread /* XXX */);
+		error = s->l2so->so_proto->pr_send(s->l2so, 0, m, NULL, NULL,
+		    curthread /* XXX */);
 		if (error != 0) {
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not send data to L2CAP socket, error=%d\n", __func__, error);
 
 			return (error);
 		}
 	}
 
 	return (0);
 } /* ng_btsocket_rfcomm_session_send */
 
 /*
  * Close and disconnect all DLCs for the given session. Caller must hold 
  * s->sesson_mtx. Will wakeup session.
  */
 
 static void
 ng_btsocket_rfcomm_session_clean(ng_btsocket_rfcomm_session_p s)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL, pcb_next = NULL;
 	int				error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill
 	 * will unlink DLC from the session
 	 */
 
 	for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) {
 		mtx_lock(&pcb->pcb_mtx);
 		pcb_next = LIST_NEXT(pcb, session_next);
 
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: Disconnecting dlci=%d, state=%d, flags=%#x\n",
 			__func__, pcb->dlci, pcb->state, pcb->flags);
 
 		if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED)
 			error = ECONNRESET;
 		else
 			error = ECONNREFUSED;
 
 		ng_btsocket_rfcomm_pcb_kill(pcb, error);
 
 		mtx_unlock(&pcb->pcb_mtx);
 		pcb = pcb_next;
 	}
 } /* ng_btsocket_rfcomm_session_clean */
 
 /*
  * Process all DLCs on the session. Caller MUST hold s->session_mtx.
  */
 
 static void
 ng_btsocket_rfcomm_session_process_pcb(ng_btsocket_rfcomm_session_p s)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL, pcb_next = NULL;
 	int				error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * Note: cannot use LIST_FOREACH because ng_btsocket_rfcomm_pcb_kill
 	 * will unlink DLC from the session
 	 */
 
 	for (pcb = LIST_FIRST(&s->dlcs); pcb != NULL; ) {
 		mtx_lock(&pcb->pcb_mtx);
 		pcb_next = LIST_NEXT(pcb, session_next);
 
 		switch (pcb->state) {
 		/*
 		 * If DLC in W4_CONNECT state then we should check for both
 		 * timeout and detach.
 		 */
 
 		case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT:
 			if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_DETACHED)
 				ng_btsocket_rfcomm_pcb_kill(pcb, 0);
 			else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT)
 				ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT);
 			break;
 
 		/*
 		 * If DLC in CONFIGURING or CONNECTING state then we only
 		 * should check for timeout. If detach() was called then
 		 * DLC will be moved into DISCONNECTING state.
 		 */
 
 		case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING:
 		case NG_BTSOCKET_RFCOMM_DLC_CONNECTING:
 			if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT)
 				ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT);
 			break;
 
 		/*
 		 * If DLC in CONNECTED state then we need to send data (if any)
 		 * from the socket's send queue. Note that we will send data
 		 * from either all sockets or none. This may overload session's
 		 * outgoing queue (but we do not check for that).
 		 *
  		 * XXX FIXME need scheduler for RFCOMM sockets
 		 */
 
 		case NG_BTSOCKET_RFCOMM_DLC_CONNECTED:
 			error = ng_btsocket_rfcomm_pcb_send(pcb, ALOT);
 			if (error != 0)
 				ng_btsocket_rfcomm_pcb_kill(pcb, error);
 			break;
 
 		/*
 		 * If DLC in DISCONNECTING state then we must send DISC frame.
 		 * Note that if DLC has timeout set then we do not need to 
 		 * resend DISC frame.
 		 *
 		 * XXX FIXME need to drain all data from the socket's queue
 		 * if LINGER option was set
 		 */
 
 		case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING:
 			if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) {
 				error = ng_btsocket_rfcomm_send_command(
 						pcb->session, RFCOMM_FRAME_DISC,
 						pcb->dlci);
 				if (error == 0)
 					ng_btsocket_rfcomm_timeout(pcb);
 				else
 					ng_btsocket_rfcomm_pcb_kill(pcb, error);
 			} else if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT)
 				ng_btsocket_rfcomm_pcb_kill(pcb, ETIMEDOUT);
 			break;
 		
 /*		case NG_BTSOCKET_RFCOMM_DLC_CLOSED: */
 		default:
 			panic("%s: Invalid DLC state=%d, flags=%#x\n",
 				__func__, pcb->state, pcb->flags);
 			break;
 		}
 
 		mtx_unlock(&pcb->pcb_mtx);
 		pcb = pcb_next;
 	}
 } /* ng_btsocket_rfcomm_session_process_pcb */
 
 /*
  * Find RFCOMM session between "src" and "dst".
  * Caller MUST hold ng_btsocket_rfcomm_sessions_mtx.
  */
 
 static ng_btsocket_rfcomm_session_p
 ng_btsocket_rfcomm_session_by_addr(bdaddr_p src, bdaddr_p dst)
 {
 	ng_btsocket_rfcomm_session_p	s = NULL;
 	ng_btsocket_l2cap_pcb_p		l2pcb = NULL;
 	int				any_src;
 
 	mtx_assert(&ng_btsocket_rfcomm_sessions_mtx, MA_OWNED);
 
 	any_src = (bcmp(src, NG_HCI_BDADDR_ANY, sizeof(*src)) == 0);
 
 	LIST_FOREACH(s, &ng_btsocket_rfcomm_sessions, next) {
 		l2pcb = so2l2cap_pcb(s->l2so);
 
 		if ((any_src || bcmp(&l2pcb->src, src, sizeof(*src)) == 0) &&
 		    bcmp(&l2pcb->dst, dst, sizeof(*dst)) == 0)
 			break;
 	}
 
 	return (s);
 } /* ng_btsocket_rfcomm_session_by_addr */
 
 /*****************************************************************************
  *****************************************************************************
  **                                  RFCOMM 
  *****************************************************************************
  *****************************************************************************/
 
 /*
  * Process incoming RFCOMM frame. Caller must hold s->session_mtx.
  * XXX FIXME check frame length
  */
 
 static int
 ng_btsocket_rfcomm_receive_frame(ng_btsocket_rfcomm_session_p s,
 		struct mbuf *m0)
 {
 	struct rfcomm_frame_hdr	*hdr = NULL;
 	struct mbuf		*m = NULL;
 	u_int16_t		 length;
 	u_int8_t		 dlci, type;
 	int			 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/* Pullup as much as we can into first mbuf (for direct access) */
 	length = min(m0->m_pkthdr.len, MHLEN);
 	if (m0->m_len < length) {
 		if ((m0 = m_pullup(m0, length)) == NULL) {
 			NG_BTSOCKET_RFCOMM_ALERT(
 "%s: m_pullup(%d) failed\n", __func__, length);
 
 			return (ENOBUFS);
 		}
 	}
 
 	hdr = mtod(m0, struct rfcomm_frame_hdr *);
 	dlci = RFCOMM_DLCI(hdr->address);
 	type = RFCOMM_TYPE(hdr->control);
 
 	/* Test EA bit in length. If not set then we have 2 bytes of length */
 	if (!RFCOMM_EA(hdr->length)) {
 		bcopy(&hdr->length, &length, sizeof(length));
 		length = le16toh(length) >> 1;
 		m_adj(m0, sizeof(*hdr) + 1);
 	} else {
 		length = hdr->length >> 1;
 		m_adj(m0, sizeof(*hdr));
 	}
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got frame type=%#x, dlci=%d, length=%d, cr=%d, pf=%d, len=%d\n",
 		__func__, type, dlci, length, RFCOMM_CR(hdr->address),
 		RFCOMM_PF(hdr->control), m0->m_pkthdr.len);
 
 	/*
 	 * Get FCS (the last byte in the frame)
 	 * XXX this will not work if mbuf chain ends with empty mbuf.
 	 * XXX let's hope it never happens :)
 	 */
 
 	for (m = m0; m->m_next != NULL; m = m->m_next)
 		;
 	if (m->m_len <= 0)
 		panic("%s: Empty mbuf at the end of the chain, len=%d\n",
 			__func__, m->m_len);
 
 	/*
 	 * Check FCS. We only need to calculate FCS on first 2 or 3 bytes
 	 * and already m_pullup'ed mbuf chain, so it should be safe.
 	 */
 
 	if (ng_btsocket_rfcomm_check_fcs((u_int8_t *) hdr, type, m->m_data[m->m_len - 1])) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Invalid RFCOMM packet. Bad checksum\n", __func__);
 		NG_FREE_M(m0);
 
 		return (EINVAL);
 	}
 
 	m_adj(m0, -1); /* Trim FCS byte */
 
 	/*
 	 * Process RFCOMM frame.
 	 *
 	 * From TS 07.10 spec
 	 * 
 	 * "... In the case where a SABM or DISC command with the P bit set
 	 * to 0 is received then the received frame shall be discarded..."
  	 *
 	 * "... If a unsolicited DM response is received then the frame shall
 	 * be processed irrespective of the P/F setting... "
 	 *
 	 * "... The station may transmit response frames with the F bit set 
 	 * to 0 at any opportunity on an asynchronous basis. However, in the 
 	 * case where a UA response is received with the F bit set to 0 then 
 	 * the received frame shall be discarded..."
 	 *
 	 * From Bluetooth spec
 	 *
 	 * "... When credit based flow control is being used, the meaning of
 	 * the P/F bit in the control field of the RFCOMM header is redefined
 	 * for UIH frames..."
 	 */
 
 	switch (type) {
 	case RFCOMM_FRAME_SABM:
 		if (RFCOMM_PF(hdr->control))
 			error = ng_btsocket_rfcomm_receive_sabm(s, dlci);
 		break;
 
 	case RFCOMM_FRAME_DISC:
 		if (RFCOMM_PF(hdr->control))
 			error = ng_btsocket_rfcomm_receive_disc(s, dlci);
 		break;
 
 	case RFCOMM_FRAME_UA:
 		if (RFCOMM_PF(hdr->control))
 			error = ng_btsocket_rfcomm_receive_ua(s, dlci);
 		break;
 
 	case RFCOMM_FRAME_DM:
 		error = ng_btsocket_rfcomm_receive_dm(s, dlci);
 		break;
 
 	case RFCOMM_FRAME_UIH:
 		if (dlci == 0)
 			error = ng_btsocket_rfcomm_receive_mcc(s, m0);
 		else
 			error = ng_btsocket_rfcomm_receive_uih(s, dlci,
 					RFCOMM_PF(hdr->control), m0);
 
 		return (error);
 		/* NOT REACHED */
 
 	default:
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Invalid RFCOMM packet. Unknown type=%#x\n", __func__, type);
 		error = EINVAL;
 		break;
 	}
 
 	NG_FREE_M(m0);
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_frame */
 
 /*
  * Process RFCOMM SABM frame
  */
 
 static int
 ng_btsocket_rfcomm_receive_sabm(ng_btsocket_rfcomm_session_p s, int dlci)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 	int				error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got SABM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n",
 		__func__, s->state, s->flags, s->mtu, dlci);
 
 	/* DLCI == 0 means open multiplexor channel */
 	if (dlci == 0) {
 		switch (s->state) {
 		case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
 		case NG_BTSOCKET_RFCOMM_SESSION_OPEN:
 			error = ng_btsocket_rfcomm_send_command(s,
 					RFCOMM_FRAME_UA, dlci);
 			if (error == 0) {
 				s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN;
 				ng_btsocket_rfcomm_connect_cfm(s);
 			} else {
 				s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 				ng_btsocket_rfcomm_session_clean(s);
 			}
 			break;
 
 		default:
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got SABM for session in invalid state state=%d, flags=%#x\n",
 				__func__, s->state, s->flags);
 			error = EINVAL;
 			break;
 		}
 
 		return (error);
 	}
 
 	/* Make sure multiplexor channel is open */
 	if (s->state != NG_BTSOCKET_RFCOMM_SESSION_OPEN) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got SABM for dlci=%d with multiplexor channel closed, state=%d, " \
 "flags=%#x\n",		__func__, dlci, s->state, s->flags);
 
 		return (EINVAL);
 	}
 
 	/*
 	 * Check if we have this DLCI. This might happen when remote
 	 * peer uses PN command before actual open (SABM) happens.
 	 */
 
 	pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci);
 	if (pcb != NULL) {
 		mtx_lock(&pcb->pcb_mtx);
 
 		if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING) {
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got SABM for dlci=%d in invalid state=%d, flags=%#x\n",
 				__func__, dlci, pcb->state, pcb->flags);
 			mtx_unlock(&pcb->pcb_mtx);
 
 			return (ENOENT);
 		}
 
 		ng_btsocket_rfcomm_untimeout(pcb);
 
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci);
 		if (error == 0)
 			error = ng_btsocket_rfcomm_send_msc(pcb);
 
 		if (error == 0) {
 			pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED;
 			soisconnected(pcb->so);
 		} else
 			ng_btsocket_rfcomm_pcb_kill(pcb, error);
 
 		mtx_unlock(&pcb->pcb_mtx);
 
 		return (error);
 	}
 
 	/*
 	 * We do not have requested DLCI, so it must be an incoming connection
 	 * with default parameters. Try to accept it.
 	 */ 
 
 	pcb = ng_btsocket_rfcomm_connect_ind(s, RFCOMM_SRVCHANNEL(dlci));
 	if (pcb != NULL) {
 		mtx_lock(&pcb->pcb_mtx);
 
 		pcb->dlci = dlci;
 
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_UA,dlci);
 		if (error == 0)
 			error = ng_btsocket_rfcomm_send_msc(pcb);
 
 		if (error == 0) {
 			pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED;
 			soisconnected(pcb->so);
 		} else
 			ng_btsocket_rfcomm_pcb_kill(pcb, error);
 
 		mtx_unlock(&pcb->pcb_mtx);
 	} else
 		/* Nobody is listen()ing on the requested DLCI */
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci);
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_sabm */
 
 /*
  * Process RFCOMM DISC frame
  */
 
 static int
 ng_btsocket_rfcomm_receive_disc(ng_btsocket_rfcomm_session_p s, int dlci)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 	int				error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got DISC, session state=%d, flags=%#x, mtu=%d, dlci=%d\n",
 		__func__, s->state, s->flags, s->mtu, dlci);
 
 	/* DLCI == 0 means close multiplexor channel */
 	if (dlci == 0) {
 		/* XXX FIXME assume that remote side will close the socket */
 		error = ng_btsocket_rfcomm_send_command(s, RFCOMM_FRAME_UA, 0);
 		if (error == 0) {
 			if (s->state == NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING)
 				s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */
 			else
 				s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING;
 		} else
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED; /* XXX */
 
 		ng_btsocket_rfcomm_session_clean(s);
 	} else {
 		pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci);
 		if (pcb != NULL) {
 			int	err;
 
 			mtx_lock(&pcb->pcb_mtx);
 
 			NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got DISC for dlci=%d, state=%d, flags=%#x\n",
 				__func__, dlci, pcb->state, pcb->flags);
 
 			error = ng_btsocket_rfcomm_send_command(s,
 					RFCOMM_FRAME_UA, dlci);
 
 			if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED)
 				err = 0;
 			else
 				err = ECONNREFUSED;
 
 			ng_btsocket_rfcomm_pcb_kill(pcb, err);
 
 			mtx_unlock(&pcb->pcb_mtx);
 		} else {
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got DISC for non-existing dlci=%d\n", __func__, dlci);
 
 			error = ng_btsocket_rfcomm_send_command(s,
 					RFCOMM_FRAME_DM, dlci);
 		}
 	}
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_disc */
 
 /*
  * Process RFCOMM UA frame
  */
 
 static int
 ng_btsocket_rfcomm_receive_ua(ng_btsocket_rfcomm_session_p s, int dlci)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 	int				error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got UA, session state=%d, flags=%#x, mtu=%d, dlci=%d\n",
 		__func__, s->state, s->flags, s->mtu, dlci);
 
 	/* dlci == 0 means multiplexor channel */
 	if (dlci == 0) {
 		switch (s->state) {
 		case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_OPEN;
 			ng_btsocket_rfcomm_connect_cfm(s);
 			break;
 
 		case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING:
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 			ng_btsocket_rfcomm_session_clean(s);
 			break;
 
 		default:
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UA for session in invalid state=%d(%d), flags=%#x, mtu=%d\n",
 				__func__, s->state, INITIATOR(s), s->flags,
 				s->mtu);
 			error = ENOENT;
 			break;
 		}
 
 		return (error);
 	}
 
 	/* Check if we have this DLCI */
 	pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci);
 	if (pcb != NULL) {
 		mtx_lock(&pcb->pcb_mtx);
 
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got UA for dlci=%d, state=%d, flags=%#x\n",
 			__func__, dlci, pcb->state, pcb->flags);
 
 		switch (pcb->state) {
 		case NG_BTSOCKET_RFCOMM_DLC_CONNECTING:
 			ng_btsocket_rfcomm_untimeout(pcb);
 
 			error = ng_btsocket_rfcomm_send_msc(pcb);
 			if (error == 0) {
 				pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTED;
 				soisconnected(pcb->so);
 			}
 			break;
 
 		case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING:
 			ng_btsocket_rfcomm_pcb_kill(pcb, 0);
 			break;
 
 		default:
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UA for dlci=%d in invalid state=%d, flags=%#x\n",
 				__func__, dlci, pcb->state, pcb->flags);
 			error = ENOENT;
 			break;
 		}
 
 		mtx_unlock(&pcb->pcb_mtx);
 	} else {
 		NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UA for non-existing dlci=%d\n", __func__, dlci);
 
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci);
 	}
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_ua */
 
 /*
  * Process RFCOMM DM frame
  */
 
 static int
 ng_btsocket_rfcomm_receive_dm(ng_btsocket_rfcomm_session_p s, int dlci)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 	int				error;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got DM, session state=%d, flags=%#x, mtu=%d, dlci=%d\n",
 		__func__, s->state, s->flags, s->mtu, dlci);
 
 	/* DLCI == 0 means multiplexor channel */
 	if (dlci == 0) {
 		/* Disconnect all dlc's on the session */
 		s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 		ng_btsocket_rfcomm_session_clean(s);
 	} else {
 		pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci);
 		if (pcb != NULL) {
 			mtx_lock(&pcb->pcb_mtx);
 
 			NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got DM for dlci=%d, state=%d, flags=%#x\n",
 				__func__, dlci, pcb->state, pcb->flags);
 
 			if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONNECTED)
 				error = ECONNRESET;
 			else
 				error = ECONNREFUSED;
 
 			ng_btsocket_rfcomm_pcb_kill(pcb, error);
 
 			mtx_unlock(&pcb->pcb_mtx);
 		} else
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got DM for non-existing dlci=%d\n", __func__, dlci);
 	}
 
 	return (0);
 } /* ng_btsocket_rfcomm_receive_dm */
 
 /*
  * Process RFCOMM UIH frame (data)
  */
 
 static int
 ng_btsocket_rfcomm_receive_uih(ng_btsocket_rfcomm_session_p s, int dlci,
 		int pf, struct mbuf *m0)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 	int				error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got UIH, session state=%d, flags=%#x, mtu=%d, dlci=%d, pf=%d, len=%d\n",
 		__func__, s->state, s->flags, s->mtu, dlci, pf,
 		m0->m_pkthdr.len);
 
 	/* XXX should we do it here? Check for session flow control */
 	if (s->flags & NG_BTSOCKET_RFCOMM_SESSION_LFC) {
 		NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UIH with session flow control asserted, state=%d, flags=%#x\n",
 			__func__, s->state, s->flags);
 		goto drop;
 	}
 
 	/* Check if we have this dlci */
 	pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, dlci);
 	if (pcb == NULL) {
 		NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UIH for non-existing dlci=%d\n", __func__, dlci);
 		error = ng_btsocket_rfcomm_send_command(s,RFCOMM_FRAME_DM,dlci);
 		goto drop;
 	}
 
 	mtx_lock(&pcb->pcb_mtx);
 
 	/* Check dlci state */	
 	if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) {
 		NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got UIH for dlci=%d in invalid state=%d, flags=%#x\n",
 			__func__, dlci, pcb->state, pcb->flags);
 		error = EINVAL;
 		goto drop1;
 	}
 
 	/* Check dlci flow control */
 	if (((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pcb->rx_cred <= 0) ||
 	     (pcb->lmodem & RFCOMM_MODEM_FC)) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got UIH for dlci=%d with asserted flow control, state=%d, " \
 "flags=%#x, rx_cred=%d, lmodem=%#x\n",
 			__func__, dlci, pcb->state, pcb->flags,
 			pcb->rx_cred, pcb->lmodem);
 		goto drop1;
 	}
 
 	/* Did we get any credits? */
 	if ((pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) && pf) {
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got %d more credits for dlci=%d, state=%d, flags=%#x, " \
 "rx_cred=%d, tx_cred=%d\n",
 			__func__, *mtod(m0, u_int8_t *), dlci, pcb->state, 
 			pcb->flags, pcb->rx_cred, pcb->tx_cred);
 
 		pcb->tx_cred += *mtod(m0, u_int8_t *);
 		m_adj(m0, 1);
 
 		/* Send more from the DLC. XXX check for errors? */
 		ng_btsocket_rfcomm_pcb_send(pcb, ALOT);
 	} 
 
 	/* OK the of the rest of the mbuf is the data */
 	if (m0->m_pkthdr.len > 0) {
 		/* If we are using credit flow control decrease rx_cred here */
 		if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) {
 			/* Give remote peer more credits (if needed) */
 			if (-- pcb->rx_cred <= RFCOMM_MAX_CREDITS / 2)
 				ng_btsocket_rfcomm_send_credits(pcb);
 			else
 				NG_BTSOCKET_RFCOMM_INFO(
 "%s: Remote side still has credits, dlci=%d, state=%d, flags=%#x, " \
 "rx_cred=%d, tx_cred=%d\n",		__func__, dlci, pcb->state, pcb->flags,
 					pcb->rx_cred, pcb->tx_cred);
 		}
 		
 		/* Check packet against mtu on dlci */
 		if (m0->m_pkthdr.len > pcb->mtu) {
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got oversized UIH for dlci=%d, state=%d, flags=%#x, mtu=%d, len=%d\n",
 				__func__, dlci, pcb->state, pcb->flags,
 				pcb->mtu, m0->m_pkthdr.len);
 
 			error = EMSGSIZE;
 		} else if (m0->m_pkthdr.len > sbspace(&pcb->so->so_rcv)) {
 			/*
 			 * This is really bad. Receive queue on socket does
 			 * not have enough space for the packet. We do not
 			 * have any other choice but drop the packet. 
 			 */
 
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Not enough space in socket receive queue. Dropping UIH for dlci=%d, " \
 "state=%d, flags=%#x, len=%d, space=%ld\n",
 				__func__, dlci, pcb->state, pcb->flags,
 				m0->m_pkthdr.len, sbspace(&pcb->so->so_rcv));
 
 			error = ENOBUFS;
 		} else {
 			/* Append packet to the socket receive queue */
 			sbappend(&pcb->so->so_rcv, m0, 0);
 			m0 = NULL;
 
 			sorwakeup(pcb->so);
 		}
 	}
 drop1:
 	mtx_unlock(&pcb->pcb_mtx);
 drop:
 	NG_FREE_M(m0); /* checks for != NULL */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_uih */
 
 /*
  * Process RFCOMM MCC command (Multiplexor)
  * 
  * From TS 07.10 spec
  *
  * "5.4.3.1 Information Data
  * 
  *  ...The frames (UIH) sent by the initiating station have the C/R bit set 
  *  to 1 and those sent by the responding station have the C/R bit set to 0..."
  *
  * "5.4.6.2 Operating procedures
  *
  *  Messages always exist in pairs; a command message and a corresponding 
  *  response message. If the C/R bit is set to 1 the message is a command, 
  *  if it is set to 0 the message is a response...
  *
  *  ...
  * 
  *  NOTE: Notice that when UIH frames are used to convey information on DLCI 0
  *  there are at least two different fields that contain a C/R bit, and the 
  *  bits are set of different form. The C/R bit in the Type field shall be set
  *  as it is stated above, while the C/R bit in the Address field (see subclause
  *  5.2.1.2) shall be set as it is described in subclause 5.4.3.1."
  */
 
 static int
 ng_btsocket_rfcomm_receive_mcc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr	*hdr = NULL;
 	u_int8_t		 cr, type, length;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * We can access data directly in the first mbuf, because we have
 	 * m_pullup()'ed mbuf chain in ng_btsocket_rfcomm_receive_frame().
 	 * All MCC commands should fit into single mbuf (except probably TEST).
 	 */
 
 	hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 	cr = RFCOMM_CR(hdr->type);
 	type = RFCOMM_MCC_TYPE(hdr->type);
 	length = RFCOMM_MCC_LENGTH(hdr->length);
 
 	/* Check MCC frame length */
 	if (sizeof(*hdr) + length != m0->m_pkthdr.len) {
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Invalid MCC frame length=%d, len=%d\n",
 			__func__, length, m0->m_pkthdr.len);
 		NG_FREE_M(m0);
 
 		return (EMSGSIZE);
 	}
 
 	switch (type) {
 	case RFCOMM_MCC_TEST:
 		return (ng_btsocket_rfcomm_receive_test(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_FCON:
 	case RFCOMM_MCC_FCOFF:
 		return (ng_btsocket_rfcomm_receive_fc(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_MSC:
 		return (ng_btsocket_rfcomm_receive_msc(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_RPN:
 		return (ng_btsocket_rfcomm_receive_rpn(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_RLS:
 		return (ng_btsocket_rfcomm_receive_rls(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_PN:
 		return (ng_btsocket_rfcomm_receive_pn(s, m0));
 		/* NOT REACHED */
 
 	case RFCOMM_MCC_NSC:
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got MCC NSC, type=%#x, cr=%d, length=%d, session state=%d, flags=%#x, " \
 "mtu=%d, len=%d\n",	__func__, RFCOMM_MCC_TYPE(*((u_int8_t *)(hdr + 1))), cr,
 			 length, s->state, s->flags, s->mtu, m0->m_pkthdr.len);
 		NG_FREE_M(m0);
 		break;
 
 	default:
 		NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got unknown MCC, type=%#x, cr=%d, length=%d, session state=%d, " \
 "flags=%#x, mtu=%d, len=%d\n",
 			__func__, type, cr, length, s->state, s->flags,
 			s->mtu, m0->m_pkthdr.len);
 
 		/* Reuse mbuf to send NSC */
 		hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 		m0->m_pkthdr.len = m0->m_len = sizeof(*hdr);
 
 		/* Create MCC NSC header */
 		hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_NSC);
 		hdr->length = RFCOMM_MKLEN8(1);
 
 		/* Put back MCC command type we did not like */
 		m0->m_data[m0->m_len] = RFCOMM_MKMCC_TYPE(cr, type);
 		m0->m_pkthdr.len ++;
 		m0->m_len ++;
 
 		/* Send UIH frame */
 		return (ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0));
 		/* NOT REACHED */
 	}
 
 	return (0);
 } /* ng_btsocket_rfcomm_receive_mcc */
 
 /*
  * Receive RFCOMM TEST MCC command
  */
 
 static int
 ng_btsocket_rfcomm_receive_test(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr	*hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 	int			 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC TEST, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \
 "len=%d\n",	__func__, RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length),
 		s->state, s->flags, s->mtu, m0->m_pkthdr.len);
 
 	if (RFCOMM_CR(hdr->type)) {
 		hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_TEST);
 		error = ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0);
 	} else
 		NG_FREE_M(m0); /* XXX ignore response */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_test */
 
 /*
  * Receive RFCOMM FCON/FCOFF MCC command
  */
 
 static int
 ng_btsocket_rfcomm_receive_fc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr	*hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 	u_int8_t		 type = RFCOMM_MCC_TYPE(hdr->type);
 	int			 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * Turn ON/OFF aggregate flow on the entire session. When remote peer 
 	 * asserted flow control no transmission shall occur except on dlci 0
 	 * (control channel).
 	 */
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC FC%s, cr=%d, length=%d, session state=%d, flags=%#x, mtu=%d, " \
 "len=%d\n",	__func__, (type == RFCOMM_MCC_FCON)? "ON" : "OFF",
 		RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length),
 		s->state, s->flags, s->mtu, m0->m_pkthdr.len);
 
 	if (RFCOMM_CR(hdr->type)) {
 		if (type == RFCOMM_MCC_FCON)
 			s->flags &= ~NG_BTSOCKET_RFCOMM_SESSION_RFC;
 		else
 			s->flags |= NG_BTSOCKET_RFCOMM_SESSION_RFC;
 
 		hdr->type = RFCOMM_MKMCC_TYPE(0, type);
 		error = ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0);
 	} else
 		NG_FREE_M(m0); /* XXX ignore response */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_fc  */
 
 /*
  * Receive RFCOMM MSC MCC command
  */
 
 static int
 ng_btsocket_rfcomm_receive_msc(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr		*hdr = mtod(m0, struct rfcomm_mcc_hdr*);
 	struct rfcomm_mcc_msc		*msc = (struct rfcomm_mcc_msc *)(hdr+1);
 	ng_btsocket_rfcomm_pcb_t	*pcb = NULL;
 	int				 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC MSC, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \
 "mtu=%d, len=%d\n",
 		__func__,  RFCOMM_DLCI(msc->address), RFCOMM_CR(hdr->type),
 		RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags,
 		s->mtu, m0->m_pkthdr.len);
 
 	if (RFCOMM_CR(hdr->type)) {
 		pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, RFCOMM_DLCI(msc->address));
 		if (pcb == NULL) {
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got MSC command for non-existing dlci=%d\n",
 				__func__, RFCOMM_DLCI(msc->address));
 			NG_FREE_M(m0);
 
 			return (ENOENT);
 		}
 
 		mtx_lock(&pcb->pcb_mtx);
 
 		if (pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTING &&
 		    pcb->state != NG_BTSOCKET_RFCOMM_DLC_CONNECTED) {
 			NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got MSC on dlci=%d in invalid state=%d\n",
 				__func__, RFCOMM_DLCI(msc->address),
 				pcb->state);
 
 			mtx_unlock(&pcb->pcb_mtx);
 			NG_FREE_M(m0);
 
 			return (EINVAL);
 		}
 
 		pcb->rmodem = msc->modem; /* Update remote port signals */
 
 		hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_MSC);
 		error = ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0);
 
 #if 0 /* YYY */
 		/* Send more data from DLC. XXX check for errors? */
 		if (!(pcb->rmodem & RFCOMM_MODEM_FC) &&
 		    !(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC))
 			ng_btsocket_rfcomm_pcb_send(pcb, ALOT);
 #endif /* YYY */
 
 		mtx_unlock(&pcb->pcb_mtx);
 	} else
 		NG_FREE_M(m0); /* XXX ignore response */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_msc */
 
 /*
  * Receive RFCOMM RPN MCC command
  * XXX FIXME do we need htole16/le16toh for RPN param_mask?
  */
 
 static int
 ng_btsocket_rfcomm_receive_rpn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr	*hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 	struct rfcomm_mcc_rpn	*rpn = (struct rfcomm_mcc_rpn *)(hdr + 1);
 	int			 error = 0;
 	u_int16_t		 param_mask;
 	u_int8_t		 bit_rate, data_bits, stop_bits, parity,
 				 flow_control, xon_char, xoff_char;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC RPN, dlci=%d, cr=%d, length=%d, session state=%d, flags=%#x, " \
 "mtu=%d, len=%d\n",
 		__func__, RFCOMM_DLCI(rpn->dlci), RFCOMM_CR(hdr->type),
 		RFCOMM_MCC_LENGTH(hdr->length), s->state, s->flags,
 		s->mtu, m0->m_pkthdr.len);
 
 	if (RFCOMM_CR(hdr->type)) {
 		param_mask = RFCOMM_RPN_PM_ALL;
 
 		if (RFCOMM_MCC_LENGTH(hdr->length) == 1) {
 			/* Request - return default setting */
 			bit_rate = RFCOMM_RPN_BR_115200;
 			data_bits = RFCOMM_RPN_DATA_8;
 			stop_bits = RFCOMM_RPN_STOP_1;
 			parity = RFCOMM_RPN_PARITY_NONE;
 			flow_control = RFCOMM_RPN_FLOW_NONE;
 			xon_char = RFCOMM_RPN_XON_CHAR;
 			xoff_char = RFCOMM_RPN_XOFF_CHAR;
                 } else {
 			/*
 			 * Ignore/accept bit_rate, 8 bits, 1 stop bit, no 
 			 * parity, no flow control lines, default XON/XOFF 
 			 * chars.
 			 */
 
 			bit_rate = rpn->bit_rate;
 			rpn->param_mask = le16toh(rpn->param_mask); /* XXX */
 
 			data_bits = RFCOMM_RPN_DATA_BITS(rpn->line_settings);
 			if (rpn->param_mask & RFCOMM_RPN_PM_DATA &&
 			    data_bits != RFCOMM_RPN_DATA_8) {
 				data_bits = RFCOMM_RPN_DATA_8;
 				param_mask ^= RFCOMM_RPN_PM_DATA;
 			}
 
 			stop_bits = RFCOMM_RPN_STOP_BITS(rpn->line_settings);
 			if (rpn->param_mask & RFCOMM_RPN_PM_STOP &&
 			    stop_bits != RFCOMM_RPN_STOP_1) {
 				stop_bits = RFCOMM_RPN_STOP_1;
 				param_mask ^= RFCOMM_RPN_PM_STOP;
 			}
 
 			parity = RFCOMM_RPN_PARITY(rpn->line_settings);
 			if (rpn->param_mask & RFCOMM_RPN_PM_PARITY &&
 			    parity != RFCOMM_RPN_PARITY_NONE) {
 				parity = RFCOMM_RPN_PARITY_NONE;
 				param_mask ^= RFCOMM_RPN_PM_PARITY;
 			}
 
 			flow_control = rpn->flow_control;
 			if (rpn->param_mask & RFCOMM_RPN_PM_FLOW &&
 			    flow_control != RFCOMM_RPN_FLOW_NONE) {
 				flow_control = RFCOMM_RPN_FLOW_NONE;
 				param_mask ^= RFCOMM_RPN_PM_FLOW;
 			}
 
 			xon_char = rpn->xon_char;
 			if (rpn->param_mask & RFCOMM_RPN_PM_XON &&
 			    xon_char != RFCOMM_RPN_XON_CHAR) {
 				xon_char = RFCOMM_RPN_XON_CHAR;
 				param_mask ^= RFCOMM_RPN_PM_XON;
 			}
 
 			xoff_char = rpn->xoff_char;
 			if (rpn->param_mask & RFCOMM_RPN_PM_XOFF &&
 			    xoff_char != RFCOMM_RPN_XOFF_CHAR) {
 				xoff_char = RFCOMM_RPN_XOFF_CHAR;
 				param_mask ^= RFCOMM_RPN_PM_XOFF;
 			}
 		}
 
 		rpn->bit_rate = bit_rate;
 		rpn->line_settings = RFCOMM_MKRPN_LINE_SETTINGS(data_bits, 
 						stop_bits, parity);
 		rpn->flow_control = flow_control;
 		rpn->xon_char = xon_char;
 		rpn->xoff_char = xoff_char;
 		rpn->param_mask = htole16(param_mask); /* XXX */
 
 		m0->m_pkthdr.len = m0->m_len = sizeof(*hdr) + sizeof(*rpn);
 
 		hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RPN);
 		error = ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0);
 	} else
 		NG_FREE_M(m0); /* XXX ignore response */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_rpn */
 
 /*
  * Receive RFCOMM RLS MCC command
  */
 
 static int
 ng_btsocket_rfcomm_receive_rls(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr	*hdr = mtod(m0, struct rfcomm_mcc_hdr *);
 	struct rfcomm_mcc_rls	*rls = (struct rfcomm_mcc_rls *)(hdr + 1);
 	int			 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	/*
 	 * XXX FIXME Do we have to do anything else here? Remote peer tries to 
 	 * tell us something about DLCI. Just report what we have received and
 	 * return back received values as required by TS 07.10 spec.
 	 */
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC RLS, dlci=%d, status=%#x, cr=%d, length=%d, session state=%d, " \
 "flags=%#x, mtu=%d, len=%d\n",
 		__func__, RFCOMM_DLCI(rls->address), rls->status,
 		RFCOMM_CR(hdr->type), RFCOMM_MCC_LENGTH(hdr->length),
 		s->state, s->flags, s->mtu, m0->m_pkthdr.len);
 
 	if (RFCOMM_CR(hdr->type)) {
 		if (rls->status & 0x1)
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Got RLS dlci=%d, error=%#x\n", __func__, RFCOMM_DLCI(rls->address),
 				rls->status >> 1);
 
 		hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_RLS);
 		error = ng_btsocket_rfcomm_send_uih(s,
 				RFCOMM_MKADDRESS(INITIATOR(s), 0), 0, 0, m0);
 	} else
 		NG_FREE_M(m0); /* XXX ignore responses */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_rls */
 
 /*
  * Receive RFCOMM PN MCC command
  */
 
 static int
 ng_btsocket_rfcomm_receive_pn(ng_btsocket_rfcomm_session_p s, struct mbuf *m0)
 {
 	struct rfcomm_mcc_hdr		*hdr = mtod(m0, struct rfcomm_mcc_hdr*);
 	struct rfcomm_mcc_pn		*pn = (struct rfcomm_mcc_pn *)(hdr+1);
 	ng_btsocket_rfcomm_pcb_t	*pcb = NULL;
 	int				 error = 0;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Got MCC PN, dlci=%d, cr=%d, length=%d, flow_control=%#x, priority=%d, " \
 "ack_timer=%d, mtu=%d, max_retrans=%d, credits=%d, session state=%d, " \
 "flags=%#x, session mtu=%d, len=%d\n",
 		__func__, pn->dlci, RFCOMM_CR(hdr->type),
 		RFCOMM_MCC_LENGTH(hdr->length), pn->flow_control, pn->priority,
 		pn->ack_timer, le16toh(pn->mtu), pn->max_retrans, pn->credits,
 		s->state, s->flags, s->mtu, m0->m_pkthdr.len);
 
 	if (pn->dlci == 0) {
 		NG_BTSOCKET_RFCOMM_ERR("%s: Zero dlci in MCC PN\n", __func__);
 		NG_FREE_M(m0);
 
 		return (EINVAL);
 	}
 
 	/* Check if we have this dlci */
 	pcb = ng_btsocket_rfcomm_pcb_by_dlci(s, pn->dlci);
 	if (pcb != NULL) {
 		mtx_lock(&pcb->pcb_mtx);
 
 		if (RFCOMM_CR(hdr->type)) {
 			/* PN Request */
 			ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control,
 				pn->credits, pn->mtu);
 
 			if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) {
 				pn->flow_control = 0xe0;
 				pn->credits = RFCOMM_DEFAULT_CREDITS;
 			} else {
 				pn->flow_control = 0;
 				pn->credits = 0;
 			}
 
 			hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN);
 			error = ng_btsocket_rfcomm_send_uih(s, 
 					RFCOMM_MKADDRESS(INITIATOR(s), 0),
 					0, 0, m0);
 		} else {
 			/* PN Response - proceed with SABM. Timeout still set */
 			if (pcb->state == NG_BTSOCKET_RFCOMM_DLC_CONFIGURING) {
 				ng_btsocket_rfcomm_set_pn(pcb, 0,
 					pn->flow_control, pn->credits, pn->mtu);
 
 				pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING;
 				error = ng_btsocket_rfcomm_send_command(s,
 						RFCOMM_FRAME_SABM, pn->dlci);
 			} else
 				NG_BTSOCKET_RFCOMM_WARN(
 "%s: Got PN response for dlci=%d in invalid state=%d\n",
 					__func__, pn->dlci, pcb->state);
 
 			NG_FREE_M(m0);
 		}
 
 		mtx_unlock(&pcb->pcb_mtx);
 	} else if (RFCOMM_CR(hdr->type)) {
 		/* PN request to non-existing dlci - incoming connection */
 		pcb = ng_btsocket_rfcomm_connect_ind(s,
 				RFCOMM_SRVCHANNEL(pn->dlci));
 		if (pcb != NULL) {
 			mtx_lock(&pcb->pcb_mtx);
 
 			pcb->dlci = pn->dlci;
 
 			ng_btsocket_rfcomm_set_pn(pcb, 1, pn->flow_control,
 				pn->credits, pn->mtu);
 
 			if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) {
 				pn->flow_control = 0xe0;
 				pn->credits = RFCOMM_DEFAULT_CREDITS;
 			} else {
 				pn->flow_control = 0;
 				pn->credits = 0;
 			}
 
 			hdr->type = RFCOMM_MKMCC_TYPE(0, RFCOMM_MCC_PN);
 			error = ng_btsocket_rfcomm_send_uih(s, 
 					RFCOMM_MKADDRESS(INITIATOR(s), 0),
 					0, 0, m0);
 
 			if (error == 0) {
 				ng_btsocket_rfcomm_timeout(pcb);
 				pcb->state = NG_BTSOCKET_RFCOMM_DLC_CONNECTING;
 				soisconnecting(pcb->so);
 			} else
 				ng_btsocket_rfcomm_pcb_kill(pcb, error);
 
 			mtx_unlock(&pcb->pcb_mtx);
 		} else {
 			/* Nobody is listen()ing on this channel */
 			error = ng_btsocket_rfcomm_send_command(s,
 					RFCOMM_FRAME_DM, pn->dlci);
 			NG_FREE_M(m0);
 		}
 	} else
 		NG_FREE_M(m0); /* XXX ignore response to non-existing dlci */
 
 	return (error);
 } /* ng_btsocket_rfcomm_receive_pn */
 
 /*
  * Set PN parameters for dlci. Caller must hold pcb->pcb_mtx.
  * 
  * From Bluetooth spec.
  * 
  * "... The CL1 - CL4 field is completely redefined. (In TS07.10 this defines 
  *  the convergence layer to use, which is not applicable to RFCOMM. In RFCOMM,
  *  in Bluetooth versions up to 1.0B, this field was forced to 0).
  *
  *  In the PN request sent prior to a DLC establishment, this field must contain
  *  the value 15 (0xF), indicating support of credit based flow control in the 
  *  sender. See Table 5.3 below. If the PN response contains any other value 
  *  than 14 (0xE) in this field, it is inferred that the peer RFCOMM entity is 
  *  not supporting the credit based flow control feature. (This is only possible
  *  if the peer RFCOMM implementation is only conforming to Bluetooth version 
  *  1.0B.) If a PN request is sent on an already open DLC, then this field must
  *  contain the value zero; it is not possible to set initial credits  more 
  *  than once per DLC activation. A responding implementation must set this 
  *  field in the PN response to 14 (0xE), if (and only if) the value in the PN 
  *  request was 15..."
  */
 
 static void
 ng_btsocket_rfcomm_set_pn(ng_btsocket_rfcomm_pcb_p pcb, u_int8_t cr,
 		u_int8_t flow_control, u_int8_t credits, u_int16_t mtu)
 {
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	pcb->mtu = le16toh(mtu);
 
 	if (cr) {
 		if (flow_control == 0xf0) {
 			pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC;
 			pcb->tx_cred = credits;
 		} else {
 			pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC;
 			pcb->tx_cred = 0;
 		}
 	} else {
 		if (flow_control == 0xe0) {
 			pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_CFC;
 			pcb->tx_cred = credits;
 		} else {
 			pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_CFC;
 			pcb->tx_cred = 0;
 		}
 	}
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: cr=%d, dlci=%d, state=%d, flags=%#x, mtu=%d, rx_cred=%d, tx_cred=%d\n",
 		__func__, cr, pcb->dlci, pcb->state, pcb->flags, pcb->mtu,
 		pcb->rx_cred, pcb->tx_cred);
 } /* ng_btsocket_rfcomm_set_pn */
 
 /*
  * Send RFCOMM SABM/DISC/UA/DM frames. Caller must hold s->session_mtx
  */
 
 static int
 ng_btsocket_rfcomm_send_command(ng_btsocket_rfcomm_session_p s,
 		u_int8_t type, u_int8_t dlci)
 {
 	struct rfcomm_cmd_hdr	*hdr = NULL;
 	struct mbuf		*m = NULL;
 	int			 cr;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Sending command type %#x, session state=%d, flags=%#x, mtu=%d, dlci=%d\n",
 		__func__, type, s->state, s->flags, s->mtu, dlci);
 
 	switch (type) {
 	case RFCOMM_FRAME_SABM:
 	case RFCOMM_FRAME_DISC:
 		cr = INITIATOR(s);
 		break;
 
 	case RFCOMM_FRAME_UA:
 	case RFCOMM_FRAME_DM:
 		cr = !INITIATOR(s);
 		break;
 
 	default:
 		panic("%s: Invalid frame type=%#x\n", __func__, type);
 		return (EINVAL);
 		/* NOT REACHED */
 	}
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 
 	m->m_pkthdr.len = m->m_len = sizeof(*hdr);
 
 	hdr = mtod(m, struct rfcomm_cmd_hdr *);
 	hdr->address = RFCOMM_MKADDRESS(cr, dlci);
 	hdr->control = RFCOMM_MKCONTROL(type, 1);
 	hdr->length = RFCOMM_MKLEN8(0);
 	hdr->fcs = ng_btsocket_rfcomm_fcs3((u_int8_t *) hdr);
 
 	NG_BT_MBUFQ_ENQUEUE(&s->outq, m);
 
 	return (0);
 } /* ng_btsocket_rfcomm_send_command */
 
 /*
  * Send RFCOMM UIH frame. Caller must hold s->session_mtx
  */
 
 static int
 ng_btsocket_rfcomm_send_uih(ng_btsocket_rfcomm_session_p s, u_int8_t address,
 		u_int8_t pf, u_int8_t credits, struct mbuf *data)
 {
 	struct rfcomm_frame_hdr	*hdr = NULL;
 	struct mbuf		*m = NULL, *mcrc = NULL;
 	u_int16_t		 length;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL) {
 		NG_FREE_M(data);
 		return (ENOBUFS);
 	}
 	m->m_pkthdr.len = m->m_len = sizeof(*hdr);
 
 	MGET(mcrc, M_NOWAIT, MT_DATA);
 	if (mcrc == NULL) {
 		NG_FREE_M(data);
 		return (ENOBUFS);
 	}
 	mcrc->m_len = 1;
 
 	/* Fill UIH frame header */
 	hdr = mtod(m, struct rfcomm_frame_hdr *);
 	hdr->address = address;
 	hdr->control = RFCOMM_MKCONTROL(RFCOMM_FRAME_UIH, pf);
 
 	/* Calculate FCS */
 	mcrc->m_data[0] = ng_btsocket_rfcomm_fcs2((u_int8_t *) hdr);
 
 	/* Put length back */
 	length = (data != NULL)? data->m_pkthdr.len : 0;
 	if (length > 127) {
 		u_int16_t	l = htole16(RFCOMM_MKLEN16(length));
 
 		bcopy(&l, &hdr->length, sizeof(l));
 		m->m_pkthdr.len ++;
 		m->m_len ++;
 	} else
 		hdr->length = RFCOMM_MKLEN8(length);
 
 	if (pf) {
 		m->m_data[m->m_len] = credits;
 		m->m_pkthdr.len ++;
 		m->m_len ++;
 	}
 
 	/* Add payload */
 	if (data != NULL) {
 		m_cat(m, data);
 		m->m_pkthdr.len += length;
 	}
 
 	/* Put FCS back */
 	m_cat(m, mcrc);
 	m->m_pkthdr.len ++;
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Sending UIH state=%d, flags=%#x, address=%d, length=%d, pf=%d, " \
 "credits=%d, len=%d\n",
 		__func__, s->state, s->flags, address, length, pf, credits,
 		m->m_pkthdr.len);
 
 	NG_BT_MBUFQ_ENQUEUE(&s->outq, m);
 
 	return (0);
 } /* ng_btsocket_rfcomm_send_uih */
 
 /*
  * Send MSC request. Caller must hold pcb->pcb_mtx and pcb->session->session_mtx
  */
 
 static int
 ng_btsocket_rfcomm_send_msc(ng_btsocket_rfcomm_pcb_p pcb)
 {
 	struct mbuf		*m = NULL;
 	struct rfcomm_mcc_hdr	*hdr = NULL;
 	struct rfcomm_mcc_msc	*msc = NULL;
 
 	mtx_assert(&pcb->session->session_mtx, MA_OWNED);
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 
 	m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*msc);
 
 	hdr = mtod(m, struct rfcomm_mcc_hdr *);
 	msc = (struct rfcomm_mcc_msc *)(hdr + 1);
 
 	hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_MSC);
 	hdr->length = RFCOMM_MKLEN8(sizeof(*msc));
 
 	msc->address = RFCOMM_MKADDRESS(1, pcb->dlci);
 	msc->modem = pcb->lmodem;
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Sending MSC dlci=%d, state=%d, flags=%#x, address=%d, modem=%#x\n",
 		__func__, pcb->dlci, pcb->state, pcb->flags, msc->address,
 		msc->modem);
 
 	return (ng_btsocket_rfcomm_send_uih(pcb->session,
 			RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m));
 } /* ng_btsocket_rfcomm_send_msc */
 
 /*
  * Send PN request. Caller must hold pcb->pcb_mtx and pcb->session->session_mtx
  */
 
 static int
 ng_btsocket_rfcomm_send_pn(ng_btsocket_rfcomm_pcb_p pcb)
 {
 	struct mbuf		*m = NULL;
 	struct rfcomm_mcc_hdr	*hdr = NULL;
 	struct rfcomm_mcc_pn	*pn = NULL;
 
 	mtx_assert(&pcb->session->session_mtx, MA_OWNED);
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	MGETHDR(m, M_NOWAIT, MT_DATA);
 	if (m == NULL)
 		return (ENOBUFS);
 
 	m->m_pkthdr.len = m->m_len = sizeof(*hdr) + sizeof(*pn);
 
 	hdr = mtod(m, struct rfcomm_mcc_hdr *);
 	pn = (struct rfcomm_mcc_pn *)(hdr + 1);
 
 	hdr->type = RFCOMM_MKMCC_TYPE(1, RFCOMM_MCC_PN);
 	hdr->length = RFCOMM_MKLEN8(sizeof(*pn));
 
 	pn->dlci = pcb->dlci;
 
 	/*
 	 * Set default DLCI priority as described in GSM 07.10
 	 * (ETSI TS 101 369) clause 5.6 page 42
 	 */
 
 	pn->priority = (pcb->dlci < 56)? (((pcb->dlci >> 3) << 3) + 7) : 61;
 	pn->ack_timer = 0;
 	pn->mtu = htole16(pcb->mtu);
 	pn->max_retrans = 0;
 
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC) {
 		pn->flow_control = 0xf0;
 		pn->credits = pcb->rx_cred;
 	} else {
 		pn->flow_control = 0;
 		pn->credits = 0;
 	}
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Sending PN dlci=%d, state=%d, flags=%#x, mtu=%d, flow_control=%#x, " \
 "credits=%d\n",	__func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu,
 		pn->flow_control, pn->credits);
 
 	return (ng_btsocket_rfcomm_send_uih(pcb->session,
 			RFCOMM_MKADDRESS(INITIATOR(pcb->session), 0), 0, 0, m));
 } /* ng_btsocket_rfcomm_send_pn */
 
 /*
  * Calculate and send credits based on available space in receive buffer
  */
 
 static int
 ng_btsocket_rfcomm_send_credits(ng_btsocket_rfcomm_pcb_p pcb)
 {
 	int		error = 0;
 	u_int8_t	credits;
 
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 	mtx_assert(&pcb->session->session_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Sending more credits, dlci=%d, state=%d, flags=%#x, mtu=%d, " \
 "space=%ld, tx_cred=%d, rx_cred=%d\n",
 		__func__, pcb->dlci, pcb->state, pcb->flags, pcb->mtu,
 		sbspace(&pcb->so->so_rcv), pcb->tx_cred, pcb->rx_cred);
 
 	credits = sbspace(&pcb->so->so_rcv) / pcb->mtu;
 	if (credits > 0) {
 		if (pcb->rx_cred + credits > RFCOMM_MAX_CREDITS)
 			credits = RFCOMM_MAX_CREDITS - pcb->rx_cred;
 
 		error = ng_btsocket_rfcomm_send_uih(
 				pcb->session,
 				RFCOMM_MKADDRESS(INITIATOR(pcb->session),
 					pcb->dlci), 1, credits, NULL);
 		if (error == 0) {
 			pcb->rx_cred += credits;
 
 			NG_BTSOCKET_RFCOMM_INFO(
 "%s: Gave remote side %d more credits, dlci=%d, state=%d, flags=%#x, " \
 "rx_cred=%d, tx_cred=%d\n",	__func__, credits, pcb->dlci, pcb->state,
 				pcb->flags, pcb->rx_cred, pcb->tx_cred);
 		} else
 			NG_BTSOCKET_RFCOMM_ERR(
 "%s: Could not send credits, error=%d, dlci=%d, state=%d, flags=%#x, " \
 "mtu=%d, space=%ld, tx_cred=%d, rx_cred=%d\n",
 				__func__, error, pcb->dlci, pcb->state,
 				pcb->flags, pcb->mtu, sbspace(&pcb->so->so_rcv),
 				pcb->tx_cred, pcb->rx_cred);
 	}
 
 	return (error);
 } /* ng_btsocket_rfcomm_send_credits */
 
 /*****************************************************************************
  *****************************************************************************
  **                              RFCOMM DLCs
  *****************************************************************************
  *****************************************************************************/
 
 /*
  * Send data from socket send buffer
  * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx
  */
 
 static int
 ng_btsocket_rfcomm_pcb_send(ng_btsocket_rfcomm_pcb_p pcb, int limit)
 {
 	struct mbuf	*m = NULL;
 	int		 sent, length, error;
 
 	mtx_assert(&pcb->session->session_mtx, MA_OWNED);
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)
 		limit = min(limit, pcb->tx_cred);
 	else if (!(pcb->rmodem & RFCOMM_MODEM_FC))
 		limit = min(limit, RFCOMM_MAX_CREDITS); /* XXX ??? */
 	else
 		limit = 0;
 
 	if (limit == 0) {
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: Could not send - remote flow control asserted, dlci=%d, flags=%#x, " \
 "rmodem=%#x, tx_cred=%d\n",
 			__func__, pcb->dlci, pcb->flags, pcb->rmodem,
 			pcb->tx_cred);
 
 		return (0);
 	}
 
 	for (error = 0, sent = 0; sent < limit; sent ++) { 
 		length = min(pcb->mtu, sbavail(&pcb->so->so_snd));
 		if (length == 0)
 			break;
 
 		/* Get the chunk from the socket's send buffer */
 		m = ng_btsocket_rfcomm_prepare_packet(&pcb->so->so_snd, length);
 		if (m == NULL) {
 			error = ENOBUFS;
 			break;
 		}
 
 		sbdrop(&pcb->so->so_snd, length);
 
 		error = ng_btsocket_rfcomm_send_uih(pcb->session,
 				RFCOMM_MKADDRESS(INITIATOR(pcb->session),
 					pcb->dlci), 0, 0, m);
 		if (error != 0)
 			break;
 	}
 
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_CFC)
 		pcb->tx_cred -= sent;
 
 	if (error == 0 && sent > 0) {
 		pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_SENDING;
 		sowwakeup(pcb->so);
 	}
 
 	return (error);
 } /* ng_btsocket_rfcomm_pcb_send */
 
 /*
  * Unlink and disconnect DLC. If ng_btsocket_rfcomm_pcb_kill() returns
  * non zero value than socket has no reference and has to be detached.
  * Caller must hold pcb->pcb_mtx and pcb->session->session_mtx
  */
 
 static void
 ng_btsocket_rfcomm_pcb_kill(ng_btsocket_rfcomm_pcb_p pcb, int error)
 {
 	ng_btsocket_rfcomm_session_p	s = pcb->session;
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Killing DLC, so=%p, dlci=%d, state=%d, flags=%#x, error=%d\n",
 		__func__, pcb->so, pcb->dlci, pcb->state, pcb->flags, error);
 
 	if (pcb->session == NULL)
 		panic("%s: DLC without session, pcb=%p, state=%d, flags=%#x\n",
 			__func__, pcb, pcb->state, pcb->flags);
 
 	mtx_assert(&pcb->session->session_mtx, MA_OWNED);
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)
 		ng_btsocket_rfcomm_untimeout(pcb);
 
 	/* Detach DLC from the session. Does not matter which state DLC in */
 	LIST_REMOVE(pcb, session_next);
 	pcb->session = NULL;
 
 	/* Change DLC state and wakeup all sleepers */
 	pcb->state = NG_BTSOCKET_RFCOMM_DLC_CLOSED;
 	pcb->so->so_error = error;
 	soisdisconnected(pcb->so);
 	wakeup(&pcb->state);
 
 	/* Check if we have any DLCs left on the session */
 	if (LIST_EMPTY(&s->dlcs) && INITIATOR(s)) {
 		NG_BTSOCKET_RFCOMM_INFO(
 "%s: Disconnecting session, state=%d, flags=%#x, mtu=%d\n",
 			__func__, s->state, s->flags, s->mtu);
 
 		switch (s->state) {
 		case NG_BTSOCKET_RFCOMM_SESSION_CLOSED:
 		case NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING:
 			/*
 			 * Do not have to do anything here. We can get here
 			 * when L2CAP connection was terminated or we have 
 			 * received DISC on multiplexor channel
 			 */
 			break;
 
 		case NG_BTSOCKET_RFCOMM_SESSION_OPEN:
 			/* Send DISC on multiplexor channel */
 			error = ng_btsocket_rfcomm_send_command(s,
 					RFCOMM_FRAME_DISC, 0);
 			if (error == 0) {
 				s->state = NG_BTSOCKET_RFCOMM_SESSION_DISCONNECTING;
 				break;
 			}
 			/* FALL THROUGH */
 
 		case NG_BTSOCKET_RFCOMM_SESSION_CONNECTING:
 		case NG_BTSOCKET_RFCOMM_SESSION_CONNECTED:
 			s->state = NG_BTSOCKET_RFCOMM_SESSION_CLOSED;
 			break;
 
 /*		case NG_BTSOCKET_RFCOMM_SESSION_LISTENING: */
 		default:
 			panic("%s: Invalid session state=%d, flags=%#x\n",
 				__func__, s->state, s->flags);
 			break;
 		}
 
 		ng_btsocket_rfcomm_task_wakeup();
 	}
 } /* ng_btsocket_rfcomm_pcb_kill */
 
 /*
  * Look for given dlci for given RFCOMM session. Caller must hold s->session_mtx
  */
 
 static ng_btsocket_rfcomm_pcb_p
 ng_btsocket_rfcomm_pcb_by_dlci(ng_btsocket_rfcomm_session_p s, int dlci)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL;
 
 	mtx_assert(&s->session_mtx, MA_OWNED);
 
 	LIST_FOREACH(pcb, &s->dlcs, session_next)
 		if (pcb->dlci == dlci)
 			break;
 
 	return (pcb);
 } /* ng_btsocket_rfcomm_pcb_by_dlci */
 
 /*
  * Look for socket that listens on given src address and given channel
  */
 
 static ng_btsocket_rfcomm_pcb_p
 ng_btsocket_rfcomm_pcb_listener(bdaddr_p src, int channel)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = NULL, pcb1 = NULL;
 
 	mtx_lock(&ng_btsocket_rfcomm_sockets_mtx);
 
 	LIST_FOREACH(pcb, &ng_btsocket_rfcomm_sockets, next) {
 		if (pcb->channel != channel || !SOLISTENING(pcb->so))
 			continue;
 
 		if (bcmp(&pcb->src, src, sizeof(*src)) == 0)
 			break;
 
 		if (bcmp(&pcb->src, NG_HCI_BDADDR_ANY, sizeof(bdaddr_t)) == 0)
 			pcb1 = pcb;
 	}
 
 	mtx_unlock(&ng_btsocket_rfcomm_sockets_mtx);
 
 	return ((pcb != NULL)? pcb : pcb1);
 } /* ng_btsocket_rfcomm_pcb_listener */
 
 /*****************************************************************************
  *****************************************************************************
  **                              Misc. functions 
  *****************************************************************************
  *****************************************************************************/
 
 /*
  *  Set timeout. Caller MUST hold pcb_mtx
  */
 
 static void
 ng_btsocket_rfcomm_timeout(ng_btsocket_rfcomm_pcb_p pcb)
 {
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	if (!(pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO)) {
 		pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMO;
 		pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT;
 		callout_reset(&pcb->timo, ng_btsocket_rfcomm_timo * hz,
 		    ng_btsocket_rfcomm_process_timeout, pcb);
 	} else
 		panic("%s: Duplicated socket timeout?!\n", __func__);
 } /* ng_btsocket_rfcomm_timeout */
 
 /*
  *  Unset pcb timeout. Caller MUST hold pcb_mtx
  */
 
 static void
 ng_btsocket_rfcomm_untimeout(ng_btsocket_rfcomm_pcb_p pcb)
 {
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	if (pcb->flags & NG_BTSOCKET_RFCOMM_DLC_TIMO) {
 		callout_stop(&pcb->timo);
 		pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO;
 		pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT;
 	} else
 		panic("%s: No socket timeout?!\n", __func__);
 } /* ng_btsocket_rfcomm_timeout */
 
 /*
  * Process pcb timeout
  */
 
 static void
 ng_btsocket_rfcomm_process_timeout(void *xpcb)
 {
 	ng_btsocket_rfcomm_pcb_p	pcb = (ng_btsocket_rfcomm_pcb_p) xpcb;
 
 	mtx_assert(&pcb->pcb_mtx, MA_OWNED);
 
 	NG_BTSOCKET_RFCOMM_INFO(
 "%s: Timeout, so=%p, dlci=%d, state=%d, flags=%#x\n",
 		__func__, pcb->so, pcb->dlci, pcb->state, pcb->flags);
 
 	pcb->flags &= ~NG_BTSOCKET_RFCOMM_DLC_TIMO;
 	pcb->flags |= NG_BTSOCKET_RFCOMM_DLC_TIMEDOUT;
 
 	switch (pcb->state) {
 	case NG_BTSOCKET_RFCOMM_DLC_CONFIGURING:
 	case NG_BTSOCKET_RFCOMM_DLC_CONNECTING:
 		pcb->state = NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING;
 		break;
 
 	case NG_BTSOCKET_RFCOMM_DLC_W4_CONNECT:
 	case NG_BTSOCKET_RFCOMM_DLC_DISCONNECTING:
 		break;
 
 	default:
 		panic(
 "%s: DLC timeout in invalid state, dlci=%d, state=%d, flags=%#x\n",
 			__func__, pcb->dlci, pcb->state, pcb->flags);
 		break;
 	}
 
 	ng_btsocket_rfcomm_task_wakeup();
 } /* ng_btsocket_rfcomm_process_timeout */
 
 /*
  * Get up to length bytes from the socket buffer
  */
 
 static struct mbuf *
 ng_btsocket_rfcomm_prepare_packet(struct sockbuf *sb, int length)
 {
 	struct mbuf	*top = NULL, *m = NULL, *n = NULL, *nextpkt = NULL;
 	int		 mlen, noff, len;
 
 	MGETHDR(top, M_NOWAIT, MT_DATA);
 	if (top == NULL)
 		return (NULL);
 
 	top->m_pkthdr.len = length;
 	top->m_len = 0;
 	mlen = MHLEN;
 
 	m = top;
 	n = sb->sb_mb;
 	nextpkt = n->m_nextpkt;
 	noff = 0;
 
 	while (length > 0 && n != NULL) {
 		len = min(mlen - m->m_len, n->m_len - noff);
 		if (len > length)
 			len = length;
 
 		bcopy(mtod(n, caddr_t)+noff, mtod(m, caddr_t)+m->m_len, len);
 		m->m_len += len;
 		noff += len;
 		length -= len;
 
 		if (length > 0 && m->m_len == mlen) {
 			MGET(m->m_next, M_NOWAIT, MT_DATA);
 			if (m->m_next == NULL) {
 				NG_FREE_M(top);
 				return (NULL);
 			}
 
 			m = m->m_next;
 			m->m_len = 0;
 			mlen = MLEN;
 		}
 
 		if (noff == n->m_len) {
 			noff = 0;
 			n = n->m_next;
 
 			if (n == NULL)
 				n = nextpkt;
 
 			nextpkt = (n != NULL)? n->m_nextpkt : NULL;
 		}
 	}
 
 	if (length < 0)
 		panic("%s: length=%d\n", __func__, length);
 	if (length > 0 && n == NULL)
 		panic("%s: bogus length=%d, n=%p\n", __func__, length, n);
 
 	return (top);
 } /* ng_btsocket_rfcomm_prepare_packet */
diff --git a/sys/netgraph/ng_ksocket.c b/sys/netgraph/ng_ksocket.c
index 7c8cbd38e98e..d4f41fe02205 100644
--- a/sys/netgraph/ng_ksocket.c
+++ b/sys/netgraph/ng_ksocket.c
@@ -1,1270 +1,1270 @@
 /*
  * ng_ksocket.c
  */
 
 /*-
  * Copyright (c) 1996-1999 Whistle Communications, Inc.
  * All rights reserved.
  * 
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  * 
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Author: Archie Cobbs <archie@freebsd.org>
  *
  * $FreeBSD$
  * $Whistle: ng_ksocket.c,v 1.1 1999/11/16 20:04:40 archie Exp $
  */
 
 /*
  * Kernel socket node type.  This node type is basically a kernel-mode
  * version of a socket... kindof like the reverse of the socket node type.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
 #include <sys/malloc.h>
 #include <sys/ctype.h>
 #include <sys/protosw.h>
 #include <sys/errno.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/uio.h>
 #include <sys/un.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_parse.h>
 #include <netgraph/ng_ksocket.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 
 #ifdef NG_SEPARATE_MALLOC
 static MALLOC_DEFINE(M_NETGRAPH_KSOCKET, "netgraph_ksock",
     "netgraph ksock node");
 #else
 #define M_NETGRAPH_KSOCKET M_NETGRAPH
 #endif
 
 #define OFFSETOF(s, e) ((char *)&((s *)0)->e - (char *)((s *)0))
 #define SADATA_OFFSET	(OFFSETOF(struct sockaddr, sa_data))
 
 /* Node private data */
 struct ng_ksocket_private {
 	node_p		node;
 	hook_p		hook;
 	struct socket	*so;
 	int		fn_sent;	/* FN call on incoming event was sent */
 	LIST_HEAD(, ng_ksocket_private)	embryos;
 	LIST_ENTRY(ng_ksocket_private)	siblings;
 	u_int32_t	flags;
 	u_int32_t	response_token;
 	ng_ID_t		response_addr;
 };
 typedef struct ng_ksocket_private *priv_p;
 
 /* Flags for priv_p */
 #define	KSF_CONNECTING	0x00000001	/* Waiting for connection complete */
 #define	KSF_ACCEPTING	0x00000002	/* Waiting for accept complete */
 #define	KSF_EOFSEEN	0x00000004	/* Have sent 0-length EOF mbuf */
 #define	KSF_CLONED	0x00000008	/* Cloned from an accepting socket */
 #define	KSF_EMBRYONIC	0x00000010	/* Cloned node with no hooks yet */
 
 /* Netgraph node methods */
 static ng_constructor_t	ng_ksocket_constructor;
 static ng_rcvmsg_t	ng_ksocket_rcvmsg;
 static ng_shutdown_t	ng_ksocket_shutdown;
 static ng_newhook_t	ng_ksocket_newhook;
 static ng_rcvdata_t	ng_ksocket_rcvdata;
 static ng_connect_t	ng_ksocket_connect;
 static ng_disconnect_t	ng_ksocket_disconnect;
 
 /* Alias structure */
 struct ng_ksocket_alias {
 	const char	*name;
 	const int	value;
 	const int	family;
 };
 
 /* Protocol family aliases */
 static const struct ng_ksocket_alias ng_ksocket_families[] = {
 	{ "local",	PF_LOCAL	},
 	{ "inet",	PF_INET		},
 	{ "inet6",	PF_INET6	},
 	{ "atm",	PF_ATM		},
 	{ NULL,		-1		},
 };
 
 /* Socket type aliases */
 static const struct ng_ksocket_alias ng_ksocket_types[] = {
 	{ "stream",	SOCK_STREAM	},
 	{ "dgram",	SOCK_DGRAM	},
 	{ "raw",	SOCK_RAW	},
 	{ "rdm",	SOCK_RDM	},
 	{ "seqpacket",	SOCK_SEQPACKET	},
 	{ NULL,		-1		},
 };
 
 /* Protocol aliases */
 static const struct ng_ksocket_alias ng_ksocket_protos[] = {
 	{ "ip",		IPPROTO_IP,		PF_INET		},
 	{ "raw",	IPPROTO_RAW,		PF_INET		},
 	{ "icmp",	IPPROTO_ICMP,		PF_INET		},
 	{ "igmp",	IPPROTO_IGMP,		PF_INET		},
 	{ "tcp",	IPPROTO_TCP,		PF_INET		},
 	{ "udp",	IPPROTO_UDP,		PF_INET		},
 	{ "gre",	IPPROTO_GRE,		PF_INET		},
 	{ "esp",	IPPROTO_ESP,		PF_INET		},
 	{ "ah",		IPPROTO_AH,		PF_INET		},
 	{ "swipe",	IPPROTO_SWIPE,		PF_INET		},
 	{ "encap",	IPPROTO_ENCAP,		PF_INET		},
 	{ "divert",	IPPROTO_DIVERT,		PF_INET		},
 	{ "pim",	IPPROTO_PIM,		PF_INET		},
 	{ NULL,		-1					},
 };
 
 /* Helper functions */
 static int	ng_ksocket_accept(priv_p);
 static int	ng_ksocket_incoming(struct socket *so, void *arg, int waitflag);
 static int	ng_ksocket_parse(const struct ng_ksocket_alias *aliases,
 			const char *s, int family);
 static void	ng_ksocket_incoming2(node_p node, hook_p hook,
 			void *arg1, int arg2);
 
 /************************************************************************
 			STRUCT SOCKADDR PARSE TYPE
  ************************************************************************/
 
 /* Get the length of the data portion of a generic struct sockaddr */
 static int
 ng_parse_generic_sockdata_getLength(const struct ng_parse_type *type,
 	const u_char *start, const u_char *buf)
 {
 	const struct sockaddr *sa;
 
 	sa = (const struct sockaddr *)(buf - SADATA_OFFSET);
 	return (sa->sa_len < SADATA_OFFSET) ? 0 : sa->sa_len - SADATA_OFFSET;
 }
 
 /* Type for the variable length data portion of a generic struct sockaddr */
 static const struct ng_parse_type ng_ksocket_generic_sockdata_type = {
 	&ng_parse_bytearray_type,
 	&ng_parse_generic_sockdata_getLength
 };
 
 /* Type for a generic struct sockaddr */
 static const struct ng_parse_struct_field
     ng_parse_generic_sockaddr_type_fields[] = {
 	  { "len",	&ng_parse_uint8_type			},
 	  { "family",	&ng_parse_uint8_type			},
 	  { "data",	&ng_ksocket_generic_sockdata_type	},
 	  { NULL }
 };
 static const struct ng_parse_type ng_ksocket_generic_sockaddr_type = {
 	&ng_parse_struct_type,
 	&ng_parse_generic_sockaddr_type_fields
 };
 
 /* Convert a struct sockaddr from ASCII to binary.  If its a protocol
    family that we specially handle, do that, otherwise defer to the
    generic parse type ng_ksocket_generic_sockaddr_type. */
 static int
 ng_ksocket_sockaddr_parse(const struct ng_parse_type *type,
 	const char *s, int *off, const u_char *const start,
 	u_char *const buf, int *buflen)
 {
 	struct sockaddr *const sa = (struct sockaddr *)buf;
 	enum ng_parse_token tok;
 	char fambuf[32];
 	int family, len;
 	char *t;
 
 	/* If next token is a left curly brace, use generic parse type */
 	if ((tok = ng_parse_get_token(s, off, &len)) == T_LBRACE) {
 		return (*ng_ksocket_generic_sockaddr_type.supertype->parse)
 		    (&ng_ksocket_generic_sockaddr_type,
 		    s, off, start, buf, buflen);
 	}
 
 	/* Get socket address family followed by a slash */
 	while (isspace(s[*off]))
 		(*off)++;
 	if ((t = strchr(s + *off, '/')) == NULL)
 		return (EINVAL);
 	if ((len = t - (s + *off)) > sizeof(fambuf) - 1)
 		return (EINVAL);
 	strncpy(fambuf, s + *off, len);
 	fambuf[len] = '\0';
 	*off += len + 1;
 	if ((family = ng_ksocket_parse(ng_ksocket_families, fambuf, 0)) == -1)
 		return (EINVAL);
 
 	/* Set family */
 	if (*buflen < SADATA_OFFSET)
 		return (ERANGE);
 	sa->sa_family = family;
 
 	/* Set family-specific data and length */
 	switch (sa->sa_family) {
 	case PF_LOCAL:		/* Get pathname */
 	    {
 		const int pathoff = OFFSETOF(struct sockaddr_un, sun_path);
 		struct sockaddr_un *const sun = (struct sockaddr_un *)sa;
 		int toklen, pathlen;
 		char *path;
 
 		if ((path = ng_get_string_token(s, off, &toklen, NULL)) == NULL)
 			return (EINVAL);
 		pathlen = strlen(path);
 		if (pathlen > SOCK_MAXADDRLEN) {
 			free(path, M_NETGRAPH_KSOCKET);
 			return (E2BIG);
 		}
 		if (*buflen < pathoff + pathlen) {
 			free(path, M_NETGRAPH_KSOCKET);
 			return (ERANGE);
 		}
 		*off += toklen;
 		bcopy(path, sun->sun_path, pathlen);
 		sun->sun_len = pathoff + pathlen;
 		free(path, M_NETGRAPH_KSOCKET);
 		break;
 	    }
 
 	case PF_INET:		/* Get an IP address with optional port */
 	    {
 		struct sockaddr_in *const sin = (struct sockaddr_in *)sa;
 		int i;
 
 		/* Parse this: <ipaddress>[:port] */
 		for (i = 0; i < 4; i++) {
 			u_long val;
 			char *eptr;
 
 			val = strtoul(s + *off, &eptr, 10);
 			if (val > 0xff || eptr == s + *off)
 				return (EINVAL);
 			*off += (eptr - (s + *off));
 			((u_char *)&sin->sin_addr)[i] = (u_char)val;
 			if (i < 3) {
 				if (s[*off] != '.')
 					return (EINVAL);
 				(*off)++;
 			} else if (s[*off] == ':') {
 				(*off)++;
 				val = strtoul(s + *off, &eptr, 10);
 				if (val > 0xffff || eptr == s + *off)
 					return (EINVAL);
 				*off += (eptr - (s + *off));
 				sin->sin_port = htons(val);
 			} else
 				sin->sin_port = 0;
 		}
 		bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 		sin->sin_len = sizeof(*sin);
 		break;
 	    }
 
 #if 0
 	case PF_INET6:	/* XXX implement this someday */
 #endif
 
 	default:
 		return (EINVAL);
 	}
 
 	/* Done */
 	*buflen = sa->sa_len;
 	return (0);
 }
 
 /* Convert a struct sockaddr from binary to ASCII */
 static int
 ng_ksocket_sockaddr_unparse(const struct ng_parse_type *type,
 	const u_char *data, int *off, char *cbuf, int cbuflen)
 {
 	const struct sockaddr *sa = (const struct sockaddr *)(data + *off);
 	int slen = 0;
 
 	/* Output socket address, either in special or generic format */
 	switch (sa->sa_family) {
 	case PF_LOCAL:
 	    {
 		const int pathoff = OFFSETOF(struct sockaddr_un, sun_path);
 		const struct sockaddr_un *sun = (const struct sockaddr_un *)sa;
 		const int pathlen = sun->sun_len - pathoff;
 		char pathbuf[SOCK_MAXADDRLEN + 1];
 		char *pathtoken;
 
 		bcopy(sun->sun_path, pathbuf, pathlen);
 		if ((pathtoken = ng_encode_string(pathbuf, pathlen)) == NULL)
 			return (ENOMEM);
 		slen += snprintf(cbuf, cbuflen, "local/%s", pathtoken);
 		free(pathtoken, M_NETGRAPH_KSOCKET);
 		if (slen >= cbuflen)
 			return (ERANGE);
 		*off += sun->sun_len;
 		return (0);
 	    }
 
 	case PF_INET:
 	    {
 		const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
 
 		slen += snprintf(cbuf, cbuflen, "inet/%d.%d.%d.%d",
 		  ((const u_char *)&sin->sin_addr)[0],
 		  ((const u_char *)&sin->sin_addr)[1],
 		  ((const u_char *)&sin->sin_addr)[2],
 		  ((const u_char *)&sin->sin_addr)[3]);
 		if (sin->sin_port != 0) {
 			slen += snprintf(cbuf + strlen(cbuf),
 			    cbuflen - strlen(cbuf), ":%d",
 			    (u_int)ntohs(sin->sin_port));
 		}
 		if (slen >= cbuflen)
 			return (ERANGE);
 		*off += sizeof(*sin);
 		return(0);
 	    }
 
 #if 0
 	case PF_INET6:	/* XXX implement this someday */
 #endif
 
 	default:
 		return (*ng_ksocket_generic_sockaddr_type.supertype->unparse)
 		    (&ng_ksocket_generic_sockaddr_type,
 		    data, off, cbuf, cbuflen);
 	}
 }
 
 /* Parse type for struct sockaddr */
 static const struct ng_parse_type ng_ksocket_sockaddr_type = {
 	NULL,
 	NULL,
 	NULL,
 	&ng_ksocket_sockaddr_parse,
 	&ng_ksocket_sockaddr_unparse,
 	NULL		/* no such thing as a default struct sockaddr */
 };
 
 /************************************************************************
 		STRUCT NG_KSOCKET_SOCKOPT PARSE TYPE
  ************************************************************************/
 
 /* Get length of the struct ng_ksocket_sockopt value field, which is the
    just the excess of the message argument portion over the length of
    the struct ng_ksocket_sockopt. */
 static int
 ng_parse_sockoptval_getLength(const struct ng_parse_type *type,
 	const u_char *start, const u_char *buf)
 {
 	static const int offset = OFFSETOF(struct ng_ksocket_sockopt, value);
 	const struct ng_ksocket_sockopt *sopt;
 	const struct ng_mesg *msg;
 
 	sopt = (const struct ng_ksocket_sockopt *)(buf - offset);
 	msg = (const struct ng_mesg *)((const u_char *)sopt - sizeof(*msg));
 	return msg->header.arglen - sizeof(*sopt);
 }
 
 /* Parse type for the option value part of a struct ng_ksocket_sockopt
    XXX Eventually, we should handle the different socket options specially.
    XXX This would avoid byte order problems, eg an integer value of 1 is
    XXX going to be "[1]" for little endian or "[3=1]" for big endian. */
 static const struct ng_parse_type ng_ksocket_sockoptval_type = {
 	&ng_parse_bytearray_type,
 	&ng_parse_sockoptval_getLength
 };
 
 /* Parse type for struct ng_ksocket_sockopt */
 static const struct ng_parse_struct_field ng_ksocket_sockopt_type_fields[]
 	= NG_KSOCKET_SOCKOPT_INFO(&ng_ksocket_sockoptval_type);
 static const struct ng_parse_type ng_ksocket_sockopt_type = {
 	&ng_parse_struct_type,
 	&ng_ksocket_sockopt_type_fields
 };
 
 /* Parse type for struct ng_ksocket_accept */
 static const struct ng_parse_struct_field ng_ksocket_accept_type_fields[]
 	= NGM_KSOCKET_ACCEPT_INFO;
 static const struct ng_parse_type ng_ksocket_accept_type = {
 	&ng_parse_struct_type,
 	&ng_ksocket_accept_type_fields
 };
 
 /* List of commands and how to convert arguments to/from ASCII */
 static const struct ng_cmdlist ng_ksocket_cmds[] = {
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_BIND,
 	  "bind",
 	  &ng_ksocket_sockaddr_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_LISTEN,
 	  "listen",
 	  &ng_parse_int32_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_ACCEPT,
 	  "accept",
 	  NULL,
 	  &ng_ksocket_accept_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_CONNECT,
 	  "connect",
 	  &ng_ksocket_sockaddr_type,
 	  &ng_parse_int32_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETNAME,
 	  "getname",
 	  NULL,
 	  &ng_ksocket_sockaddr_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETPEERNAME,
 	  "getpeername",
 	  NULL,
 	  &ng_ksocket_sockaddr_type
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_SETOPT,
 	  "setopt",
 	  &ng_ksocket_sockopt_type,
 	  NULL
 	},
 	{
 	  NGM_KSOCKET_COOKIE,
 	  NGM_KSOCKET_GETOPT,
 	  "getopt",
 	  &ng_ksocket_sockopt_type,
 	  &ng_ksocket_sockopt_type
 	},
 	{ 0 }
 };
 
 /* Node type descriptor */
 static struct ng_type ng_ksocket_typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_KSOCKET_NODE_TYPE,
 	.constructor =	ng_ksocket_constructor,
 	.rcvmsg =	ng_ksocket_rcvmsg,
 	.shutdown =	ng_ksocket_shutdown,
 	.newhook =	ng_ksocket_newhook,
 	.connect =	ng_ksocket_connect,
 	.rcvdata =	ng_ksocket_rcvdata,
 	.disconnect =	ng_ksocket_disconnect,
 	.cmdlist =	ng_ksocket_cmds,
 };
 NETGRAPH_INIT(ksocket, &ng_ksocket_typestruct);
 
 #define ERROUT(x)	do { error = (x); goto done; } while (0)
 
 /************************************************************************
 			NETGRAPH NODE STUFF
  ************************************************************************/
 
 /*
  * Node type constructor
  * The NODE part is assumed to be all set up.
  * There is already a reference to the node for us.
  */
 static int
 ng_ksocket_constructor(node_p node)
 {
 	priv_p priv;
 
 	/* Allocate private structure */
 	priv = malloc(sizeof(*priv), M_NETGRAPH_KSOCKET, M_NOWAIT | M_ZERO);
 	if (priv == NULL)
 		return (ENOMEM);
 
 	LIST_INIT(&priv->embryos);
 	/* cross link them */
 	priv->node = node;
 	NG_NODE_SET_PRIVATE(node, priv);
 
 	/* Done */
 	return (0);
 }
 
 /*
  * Give our OK for a hook to be added. The hook name is of the
  * form "<family>/<type>/<proto>" where the three components may
  * be decimal numbers or else aliases from the above lists.
  *
  * Connecting a hook amounts to opening the socket.  Disconnecting
  * the hook closes the socket and destroys the node as well.
  */
 static int
 ng_ksocket_newhook(node_p node, hook_p hook, const char *name0)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	char *s1, *s2, name[NG_HOOKSIZ];
 	int family, type, protocol, error;
 
 	/* Check if we're already connected */
 	if (priv->hook != NULL)
 		return (EISCONN);
 
 	if (priv->flags & KSF_CLONED) {
 		if (priv->flags & KSF_EMBRYONIC) {
 			/* Remove ourselves from our parent's embryo list */
 			LIST_REMOVE(priv, siblings);
 			priv->flags &= ~KSF_EMBRYONIC;
 		}
 	} else {
 		/* Extract family, type, and protocol from hook name */
 		snprintf(name, sizeof(name), "%s", name0);
 		s1 = name;
 		if ((s2 = strchr(s1, '/')) == NULL)
 			return (EINVAL);
 		*s2++ = '\0';
 		family = ng_ksocket_parse(ng_ksocket_families, s1, 0);
 		if (family == -1)
 			return (EINVAL);
 		s1 = s2;
 		if ((s2 = strchr(s1, '/')) == NULL)
 			return (EINVAL);
 		*s2++ = '\0';
 		type = ng_ksocket_parse(ng_ksocket_types, s1, 0);
 		if (type == -1)
 			return (EINVAL);
 		s1 = s2;
 		protocol = ng_ksocket_parse(ng_ksocket_protos, s1, family);
 		if (protocol == -1)
 			return (EINVAL);
 
 		/* Create the socket */
 		error = socreate(family, &priv->so, type, protocol,
 		   td->td_ucred, td);
 		if (error != 0)
 			return (error);
 
 		/* XXX call soreserve() ? */
 	}
 
 	/* OK */
 	priv->hook = hook;
 
 	/*
 	 * In case of misconfigured routing a packet may reenter
 	 * ksocket node recursively. Decouple stack to avoid possible
 	 * panics about sleeping with locks held.
 	 */
 	NG_HOOK_FORCE_QUEUE(hook);
 
 	return(0);
 }
 
 static int
 ng_ksocket_connect(hook_p hook)
 {
 	node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 
 	/* Add our hook for incoming data and other events */
 	SOCKBUF_LOCK(&priv->so->so_rcv);
 	soupcall_set(priv->so, SO_RCV, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&priv->so->so_rcv);
 	SOCKBUF_LOCK(&priv->so->so_snd);
 	soupcall_set(priv->so, SO_SND, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&priv->so->so_snd);
 	SOCK_LOCK(priv->so);
 	priv->so->so_state |= SS_NBIO;
 	SOCK_UNLOCK(priv->so);
 	/*
 	 * --Original comment--
 	 * On a cloned socket we may have already received one or more
 	 * upcalls which we couldn't handle without a hook.  Handle
 	 * those now.
 	 * We cannot call the upcall function directly
 	 * from here, because until this function has returned our
 	 * hook isn't connected.
 	 *
 	 * ---meta comment for -current ---
 	 * XXX This is dubius.
 	 * Upcalls between the time that the hook was
 	 * first created and now (on another processesor) will
 	 * be earlier on the queue than the request to finalise the hook.
 	 * By the time the hook is finalised,
 	 * The queued upcalls will have happened and the code
 	 * will have discarded them because of a lack of a hook.
 	 * (socket not open).
 	 *
 	 * This is a bad byproduct of the complicated way in which hooks
 	 * are now created (3 daisy chained async events).
 	 *
 	 * Since we are a netgraph operation
 	 * We know that we hold a lock on this node. This forces the
 	 * request we make below to be queued rather than implemented
 	 * immediately which will cause the upcall function to be called a bit
 	 * later.
 	 * However, as we will run any waiting queued operations immediately
 	 * after doing this one, if we have not finalised the other end
 	 * of the hook, those queued operations will fail.
 	 */
 	if (priv->flags & KSF_CLONED) {
 		ng_send_fn(node, NULL, &ng_ksocket_incoming2, so, M_NOWAIT);
 	}
 
 	return (0);
 }
 
 /*
  * Receive a control message
  */
 static int
 ng_ksocket_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 	struct ng_mesg *resp = NULL;
 	int error = 0;
 	struct ng_mesg *msg;
 
 	NGI_GET_MSG(item, msg);
 	switch (msg->header.typecookie) {
 	case NGM_KSOCKET_COOKIE:
 		switch (msg->header.cmd) {
 		case NGM_KSOCKET_BIND:
 		    {
 			struct sockaddr *const sa
 			    = (struct sockaddr *)msg->data;
 
 			/* Sanity check */
 			if (msg->header.arglen < SADATA_OFFSET
 			    || msg->header.arglen < sa->sa_len)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Bind */
 			error = sobind(so, sa, td);
 			break;
 		    }
 		case NGM_KSOCKET_LISTEN:
 		    {
 			/* Sanity check */
 			if (msg->header.arglen != sizeof(int32_t))
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Listen */
 			so->so_state |= SS_NBIO;
 			error = solisten(so, *((int32_t *)msg->data), td);
 			break;
 		    }
 
 		case NGM_KSOCKET_ACCEPT:
 		    {
 			/* Sanity check */
 			if (msg->header.arglen != 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Make sure the socket is capable of accepting */
 			if (!(so->so_options & SO_ACCEPTCONN))
 				ERROUT(EINVAL);
 			if (priv->flags & KSF_ACCEPTING)
 				ERROUT(EALREADY);
 
 			/*
 			 * If a connection is already complete, take it.
 			 * Otherwise let the upcall function deal with
 			 * the connection when it comes in.
 			 */
 			error = ng_ksocket_accept(priv);
 			if (error != 0 && error != EWOULDBLOCK)
 				ERROUT(error);
 			priv->response_token = msg->header.token;
 			priv->response_addr = NGI_RETADDR(item);
 			break;
 		    }
 
 		case NGM_KSOCKET_CONNECT:
 		    {
 			struct sockaddr *const sa
 			    = (struct sockaddr *)msg->data;
 
 			/* Sanity check */
 			if (msg->header.arglen < SADATA_OFFSET
 			    || msg->header.arglen < sa->sa_len)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Do connect */
 			if ((so->so_state & SS_ISCONNECTING) != 0)
 				ERROUT(EALREADY);
 			if ((error = soconnect(so, sa, td)) != 0) {
 				so->so_state &= ~SS_ISCONNECTING;
 				ERROUT(error);
 			}
 			if ((so->so_state & SS_ISCONNECTING) != 0) {
 				/* We will notify the sender when we connect */
 				priv->response_token = msg->header.token;
 				priv->response_addr = NGI_RETADDR(item);
 				priv->flags |= KSF_CONNECTING;
 				ERROUT(EINPROGRESS);
 			}
 			break;
 		    }
 
 		case NGM_KSOCKET_GETNAME:
 		case NGM_KSOCKET_GETPEERNAME:
 		    {
 			int (*func)(struct socket *so, struct sockaddr **nam);
 			struct sockaddr *sa = NULL;
 			int len;
 
 			/* Sanity check */
 			if (msg->header.arglen != 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Get function */
 			if (msg->header.cmd == NGM_KSOCKET_GETPEERNAME) {
 				if ((so->so_state
 				    & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
 					ERROUT(ENOTCONN);
-				func = so->so_proto->pr_usrreqs->pru_peeraddr;
+				func = so->so_proto->pr_peeraddr;
 			} else
-				func = so->so_proto->pr_usrreqs->pru_sockaddr;
+				func = so->so_proto->pr_sockaddr;
 
 			/* Get local or peer address */
 			if ((error = (*func)(so, &sa)) != 0)
 				goto bail;
 			len = (sa == NULL) ? 0 : sa->sa_len;
 
 			/* Send it back in a response */
 			NG_MKRESPONSE(resp, msg, len, M_NOWAIT);
 			if (resp == NULL) {
 				error = ENOMEM;
 				goto bail;
 			}
 			bcopy(sa, resp->data, len);
 
 		bail:
 			/* Cleanup */
 			if (sa != NULL)
 				free(sa, M_SONAME);
 			break;
 		    }
 
 		case NGM_KSOCKET_GETOPT:
 		    {
 			struct ng_ksocket_sockopt *ksopt =
 			    (struct ng_ksocket_sockopt *)msg->data;
 			struct sockopt sopt;
 
 			/* Sanity check */
 			if (msg->header.arglen != sizeof(*ksopt))
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Get response with room for option value */
 			NG_MKRESPONSE(resp, msg, sizeof(*ksopt)
 			    + NG_KSOCKET_MAX_OPTLEN, M_NOWAIT);
 			if (resp == NULL)
 				ERROUT(ENOMEM);
 
 			/* Get socket option, and put value in the response */
 			sopt.sopt_dir = SOPT_GET;
 			sopt.sopt_level = ksopt->level;
 			sopt.sopt_name = ksopt->name;
 			sopt.sopt_td = NULL;
 			sopt.sopt_valsize = NG_KSOCKET_MAX_OPTLEN;
 			ksopt = (struct ng_ksocket_sockopt *)resp->data;
 			sopt.sopt_val = ksopt->value;
 			if ((error = sogetopt(so, &sopt)) != 0) {
 				NG_FREE_MSG(resp);
 				break;
 			}
 
 			/* Set actual value length */
 			resp->header.arglen = sizeof(*ksopt)
 			    + sopt.sopt_valsize;
 			break;
 		    }
 
 		case NGM_KSOCKET_SETOPT:
 		    {
 			struct ng_ksocket_sockopt *const ksopt =
 			    (struct ng_ksocket_sockopt *)msg->data;
 			const int valsize = msg->header.arglen - sizeof(*ksopt);
 			struct sockopt sopt;
 
 			/* Sanity check */
 			if (valsize < 0)
 				ERROUT(EINVAL);
 			if (so == NULL)
 				ERROUT(ENXIO);
 
 			/* Set socket option */
 			sopt.sopt_dir = SOPT_SET;
 			sopt.sopt_level = ksopt->level;
 			sopt.sopt_name = ksopt->name;
 			sopt.sopt_val = ksopt->value;
 			sopt.sopt_valsize = valsize;
 			sopt.sopt_td = NULL;
 			error = sosetopt(so, &sopt);
 			break;
 		    }
 
 		default:
 			error = EINVAL;
 			break;
 		}
 		break;
 	default:
 		error = EINVAL;
 		break;
 	}
 done:
 	NG_RESPOND_MSG(error, node, item, resp);
 	NG_FREE_MSG(msg);
 	return (error);
 }
 
 /*
  * Receive incoming data on our hook.  Send it out the socket.
  */
 static int
 ng_ksocket_rcvdata(hook_p hook, item_p item)
 {
 	struct thread *td = curthread;	/* XXX broken */
 	const node_p node = NG_HOOK_NODE(hook);
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct socket *const so = priv->so;
 	struct sockaddr *sa = NULL;
 	int error;
 	struct mbuf *m;
 #ifdef ALIGNED_POINTER
 	struct mbuf *n;
 #endif /* ALIGNED_POINTER */
 	struct sa_tag *stag;
 
 	/* Extract data */
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 #ifdef ALIGNED_POINTER
 	if (!ALIGNED_POINTER(mtod(m, caddr_t), uint32_t)) {
 		n = m_defrag(m, M_NOWAIT);
 		if (n == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		m = n;
 	}
 #endif /* ALIGNED_POINTER */
 	/*
 	 * Look if socket address is stored in packet tags.
 	 * If sockaddr is ours, or provided by a third party (zero id),
 	 * then we accept it.
 	 */
 	if (((stag = (struct sa_tag *)m_tag_locate(m, NGM_KSOCKET_COOKIE,
 	    NG_KSOCKET_TAG_SOCKADDR, NULL)) != NULL) &&
 	    (stag->id == NG_NODE_ID(node) || stag->id == 0))
 		sa = &stag->sa;
 
 	/* Reset specific mbuf flags to prevent addressing problems. */
 	m->m_flags &= ~(M_BCAST|M_MCAST);
 
 	/* Send packet */
 	error = sosend(so, sa, 0, m, 0, 0, td);
 
 	return (error);
 }
 
 /*
  * Destroy node
  */
 static int
 ng_ksocket_shutdown(node_p node)
 {
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	priv_p embryo;
 
 	/* Close our socket (if any) */
 	if (priv->so != NULL) {
 		SOCKBUF_LOCK(&priv->so->so_rcv);
 		soupcall_clear(priv->so, SO_RCV);
 		SOCKBUF_UNLOCK(&priv->so->so_rcv);
 		SOCKBUF_LOCK(&priv->so->so_snd);
 		soupcall_clear(priv->so, SO_SND);
 		SOCKBUF_UNLOCK(&priv->so->so_snd);
 		soclose(priv->so);
 		priv->so = NULL;
 	}
 
 	/* If we are an embryo, take ourselves out of the parent's list */
 	if (priv->flags & KSF_EMBRYONIC) {
 		LIST_REMOVE(priv, siblings);
 		priv->flags &= ~KSF_EMBRYONIC;
 	}
 
 	/* Remove any embryonic children we have */
 	while (!LIST_EMPTY(&priv->embryos)) {
 		embryo = LIST_FIRST(&priv->embryos);
 		ng_rmnode_self(embryo->node);
 	}
 
 	/* Take down netgraph node */
 	bzero(priv, sizeof(*priv));
 	free(priv, M_NETGRAPH_KSOCKET);
 	NG_NODE_SET_PRIVATE(node, NULL);
 	NG_NODE_UNREF(node);		/* let the node escape */
 	return (0);
 }
 
 /*
  * Hook disconnection
  */
 static int
 ng_ksocket_disconnect(hook_p hook)
 {
 	KASSERT(NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook)) == 0,
 	    ("%s: numhooks=%d?", __func__,
 	    NG_NODE_NUMHOOKS(NG_HOOK_NODE(hook))));
 	if (NG_NODE_IS_VALID(NG_HOOK_NODE(hook)))
 		ng_rmnode_self(NG_HOOK_NODE(hook));
 	return (0);
 }
 
 /************************************************************************
 			HELPER STUFF
  ************************************************************************/
 /*
  * You should not "just call" a netgraph node function from an external
  * asynchronous event. This is because in doing so you are ignoring the
  * locking on the netgraph nodes. Instead call your function via ng_send_fn().
  * This will call the function you chose, but will first do all the
  * locking rigmarole. Your function MAY only be called at some distant future
  * time (several millisecs away) so don't give it any arguments
  * that may be revoked soon (e.g. on your stack).
  *
  * To decouple stack, we use queue version of ng_send_fn().
  */
 
 static int
 ng_ksocket_incoming(struct socket *so, void *arg, int waitflag)
 {
 	const node_p node = arg;
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	int wait = ((waitflag & M_WAITOK) ? NG_WAITOK : 0) | NG_QUEUE;
 
 	/*
 	 * Even if node is not locked, as soon as we are called, we assume
 	 * it exist and it's private area is valid. With some care we can
 	 * access it. Mark node that incoming event for it was sent to
 	 * avoid unneded queue trashing.
 	 */
 	if (atomic_cmpset_int(&priv->fn_sent, 0, 1) &&
 	    ng_send_fn1(node, NULL, &ng_ksocket_incoming2, so, 0, wait)) {
 		atomic_store_rel_int(&priv->fn_sent, 0);
 	}
 	return (SU_OK);
 }
 
 /*
  * When incoming data is appended to the socket, we get notified here.
  * This is also called whenever a significant event occurs for the socket.
  * Our original caller may have queued this even some time ago and
  * we cannot trust that he even still exists. The node however is being
  * held with a reference by the queueing code and guarantied to be valid.
  */
 static void
 ng_ksocket_incoming2(node_p node, hook_p hook, void *arg1, int arg2)
 {
 	struct socket *so = arg1;
 	const priv_p priv = NG_NODE_PRIVATE(node);
 	struct ng_mesg *response;
 	int error;
 
 	KASSERT(so == priv->so, ("%s: wrong socket", __func__));
 
 	/* Allow next incoming event to be queued. */
 	atomic_store_rel_int(&priv->fn_sent, 0);
 
 	/* Check whether a pending connect operation has completed */
 	if (priv->flags & KSF_CONNECTING) {
 		if ((error = so->so_error) != 0) {
 			so->so_error = 0;
 			so->so_state &= ~SS_ISCONNECTING;
 		}
 		if (!(so->so_state & SS_ISCONNECTING)) {
 			NG_MKMESSAGE(response, NGM_KSOCKET_COOKIE,
 			    NGM_KSOCKET_CONNECT, sizeof(int32_t), M_NOWAIT);
 			if (response != NULL) {
 				response->header.flags |= NGF_RESP;
 				response->header.token = priv->response_token;
 				*(int32_t *)response->data = error;
 				/*
 				 * send an async "response" message
 				 * to the node that set us up
 				 * (if it still exists)
 				 */
 				NG_SEND_MSG_ID(error, node,
 				    response, priv->response_addr, 0);
 			}
 			priv->flags &= ~KSF_CONNECTING;
 		}
 	}
 
 	/* Check whether a pending accept operation has completed */
 	if (priv->flags & KSF_ACCEPTING)
 		(void )ng_ksocket_accept(priv);
 
 	/*
 	 * If we don't have a hook, we must handle data events later.  When
 	 * the hook gets created and is connected, this upcall function
 	 * will be called again.
 	 */
 	if (priv->hook == NULL)
 		return;
 
 	/* Read and forward available mbufs. */
 	while (1) {
 		struct uio uio;
 		struct sockaddr *sa;
 		struct mbuf *m;
 		int flags;
 
 		/* Try to get next packet from socket. */
 		uio.uio_td = NULL;
 		uio.uio_resid = IP_MAXPACKET;
 		flags = MSG_DONTWAIT;
 		sa = NULL;
 		if ((error = soreceive(so, (so->so_state & SS_ISCONNECTED) ?
 		    NULL : &sa, &uio, &m, NULL, &flags)) != 0)
 			break;
 
 		/* See if we got anything. */
 		if (flags & MSG_TRUNC) {
 			m_freem(m);
 			m = NULL;
 		}
 		if (m == NULL) {
 			if (sa != NULL)
 				free(sa, M_SONAME);
 			break;
 		}
 
 		KASSERT(m->m_nextpkt == NULL, ("%s: nextpkt", __func__));
 
 		/*
 		 * Stream sockets do not have packet boundaries, so
 		 * we have to allocate a header mbuf and attach the
 		 * stream of data to it.
 		 */
 		if (so->so_type == SOCK_STREAM) {
 			struct mbuf *mh;
 
 			mh = m_gethdr(M_NOWAIT, MT_DATA);
 			if (mh == NULL) {
 				m_freem(m);
 				if (sa != NULL)
 					free(sa, M_SONAME);
 				break;
 			}
 
 			mh->m_next = m;
 			for (; m; m = m->m_next)
 				mh->m_pkthdr.len += m->m_len;
 			m = mh;
 		}
 
 		/* Put peer's socket address (if any) into a tag */
 		if (sa != NULL) {
 			struct sa_tag	*stag;
 
 			stag = (struct sa_tag *)m_tag_alloc(NGM_KSOCKET_COOKIE,
 			    NG_KSOCKET_TAG_SOCKADDR, sizeof(ng_ID_t) +
 			    sa->sa_len, M_NOWAIT);
 			if (stag == NULL) {
 				free(sa, M_SONAME);
 				goto sendit;
 			}
 			bcopy(sa, &stag->sa, sa->sa_len);
 			free(sa, M_SONAME);
 			stag->id = NG_NODE_ID(node);
 			m_tag_prepend(m, &stag->tag);
 		}
 
 sendit:		/* Forward data with optional peer sockaddr as packet tag */
 		NG_SEND_DATA_ONLY(error, priv->hook, m);
 	}
 
 	/*
 	 * If the peer has closed the connection, forward a 0-length mbuf
 	 * to indicate end-of-file.
 	 */
 	if (so->so_rcv.sb_state & SBS_CANTRCVMORE &&
 	    !(priv->flags & KSF_EOFSEEN)) {
 		struct mbuf *m;
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m != NULL)
 			NG_SEND_DATA_ONLY(error, priv->hook, m);
 		priv->flags |= KSF_EOFSEEN;
 	}
 }
 
 static int
 ng_ksocket_accept(priv_p priv)
 {
 	struct socket *const head = priv->so;
 	struct socket *so;
 	struct sockaddr *sa = NULL;
 	struct ng_mesg *resp;
 	struct ng_ksocket_accept *resp_data;
 	node_p node;
 	priv_p priv2;
 	int len;
 	int error;
 
 	SOLISTEN_LOCK(head);
 	error = solisten_dequeue(head, &so, SOCK_NONBLOCK);
 	if (error == EWOULDBLOCK) {
 		priv->flags |= KSF_ACCEPTING;
 		return (error);
 	}
 	priv->flags &= ~KSF_ACCEPTING;
 	if (error)
 		return (error);
 
 	if ((error = soaccept(so, &sa)) != 0)
 		return (error);
 
 	len = OFFSETOF(struct ng_ksocket_accept, addr);
 	if (sa != NULL)
 		len += sa->sa_len;
 
 	NG_MKMESSAGE(resp, NGM_KSOCKET_COOKIE, NGM_KSOCKET_ACCEPT, len,
 	    M_NOWAIT);
 	if (resp == NULL) {
 		soclose(so);
 		goto out;
 	}
 	resp->header.flags |= NGF_RESP;
 	resp->header.token = priv->response_token;
 
 	/* Clone a ksocket node to wrap the new socket */
 	error = ng_make_node_common(&ng_ksocket_typestruct, &node);
 	if (error) {
 		free(resp, M_NETGRAPH);
 		soclose(so);
 		goto out;
 	}
 
 	if (ng_ksocket_constructor(node) != 0) {
 		NG_NODE_UNREF(node);
 		free(resp, M_NETGRAPH);
 		soclose(so);
 		goto out;
 	}
 
 	priv2 = NG_NODE_PRIVATE(node);
 	priv2->so = so;
 	priv2->flags |= KSF_CLONED | KSF_EMBRYONIC;
 
 	/*
 	 * Insert the cloned node into a list of embryonic children
 	 * on the parent node.  When a hook is created on the cloned
 	 * node it will be removed from this list.  When the parent
 	 * is destroyed it will destroy any embryonic children it has.
 	 */
 	LIST_INSERT_HEAD(&priv->embryos, priv2, siblings);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	soupcall_set(so, SO_RCV, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 	SOCKBUF_LOCK(&so->so_snd);
 	soupcall_set(so, SO_SND, ng_ksocket_incoming, node);
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/* Fill in the response data and send it or return it to the caller */
 	resp_data = (struct ng_ksocket_accept *)resp->data;
 	resp_data->nodeid = NG_NODE_ID(node);
 	if (sa != NULL)
 		bcopy(sa, &resp_data->addr, sa->sa_len);
 	NG_SEND_MSG_ID(error, node, resp, priv->response_addr, 0);
 
 out:
 	if (sa != NULL)
 		free(sa, M_SONAME);
 
 	return (0);
 }
 
 /*
  * Parse out either an integer value or an alias.
  */
 static int
 ng_ksocket_parse(const struct ng_ksocket_alias *aliases,
 	const char *s, int family)
 {
 	int k, val;
 	char *eptr;
 
 	/* Try aliases */
 	for (k = 0; aliases[k].name != NULL; k++) {
 		if (strcmp(s, aliases[k].name) == 0
 		    && aliases[k].family == family)
 			return aliases[k].value;
 	}
 
 	/* Try parsing as a number */
 	val = (int)strtoul(s, &eptr, 10);
 	if (val < 0 || *eptr != '\0')
 		return (-1);
 	return (val);
 }
diff --git a/sys/netgraph/ng_socket.c b/sys/netgraph/ng_socket.c
index a7a70b0c35a6..852d51af296a 100644
--- a/sys/netgraph/ng_socket.c
+++ b/sys/netgraph/ng_socket.c
@@ -1,1225 +1,1208 @@
 /*
  * ng_socket.c
  */
 
 /*-
  * Copyright (c) 1996-1999 Whistle Communications, Inc.
  * All rights reserved.
  *
  * Subject to the following obligations and disclaimer of warranty, use and
  * redistribution of this software, in source or object code forms, with or
  * without modifications are expressly permitted by Whistle Communications;
  * provided, however, that:
  * 1. Any and all reproductions of the source or object code must include the
  *    copyright notice above and the following disclaimer of warranties; and
  * 2. No rights are granted, in any manner or form, to use Whistle
  *    Communications, Inc. trademarks, including the mark "WHISTLE
  *    COMMUNICATIONS" on advertising, endorsements, or otherwise except as
  *    such appears in the above copyright notice or in the software.
  *
  * THIS SOFTWARE IS BEING PROVIDED BY WHISTLE COMMUNICATIONS "AS IS", AND
  * TO THE MAXIMUM EXTENT PERMITTED BY LAW, WHISTLE COMMUNICATIONS MAKES NO
  * REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED, REGARDING THIS SOFTWARE,
  * INCLUDING WITHOUT LIMITATION, ANY AND ALL IMPLIED WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT.
  * WHISTLE COMMUNICATIONS DOES NOT WARRANT, GUARANTEE, OR MAKE ANY
  * REPRESENTATIONS REGARDING THE USE OF, OR THE RESULTS OF THE USE OF THIS
  * SOFTWARE IN TERMS OF ITS CORRECTNESS, ACCURACY, RELIABILITY OR OTHERWISE.
  * IN NO EVENT SHALL WHISTLE COMMUNICATIONS BE LIABLE FOR ANY DAMAGES
  * RESULTING FROM OR ARISING OUT OF ANY USE OF THIS SOFTWARE, INCLUDING
  * WITHOUT LIMITATION, ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
  * PUNITIVE, OR CONSEQUENTIAL DAMAGES, PROCUREMENT OF SUBSTITUTE GOODS OR
  * SERVICES, LOSS OF USE, DATA OR PROFITS, HOWEVER CAUSED AND UNDER ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF WHISTLE COMMUNICATIONS IS ADVISED OF THE POSSIBILITY
  * OF SUCH DAMAGE.
  *
  * Author: Julian Elischer <julian@freebsd.org>
  *
  * $FreeBSD$
  * $Whistle: ng_socket.c,v 1.28 1999/11/01 09:24:52 julian Exp $
  */
 
 /*
  * Netgraph socket nodes
  *
  * There are two types of netgraph sockets, control and data.
  * Control sockets have a netgraph node, but data sockets are
  * parasitic on control sockets, and have no node of their own.
  */
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/hash.h>
 #include <sys/kernel.h>
 #include <sys/linker.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/epoch.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #include <net/vnet.h>
 
 #include <netgraph/ng_message.h>
 #include <netgraph/netgraph.h>
 #include <netgraph/ng_socketvar.h>
 #include <netgraph/ng_socket.h>
 
 #ifdef NG_SEPARATE_MALLOC
 static MALLOC_DEFINE(M_NETGRAPH_PATH, "netgraph_path", "netgraph path info");
 static MALLOC_DEFINE(M_NETGRAPH_SOCK, "netgraph_sock", "netgraph socket info");
 #else
 #define M_NETGRAPH_PATH M_NETGRAPH
 #define M_NETGRAPH_SOCK M_NETGRAPH
 #endif
 
 /*
  * It's Ascii-art time!
  *   +-------------+   +-------------+
  *   |socket  (ctl)|   |socket (data)|
  *   +-------------+   +-------------+
  *          ^                 ^
  *          |                 |
  *          v                 v
  *    +-----------+     +-----------+
  *    |pcb   (ctl)|     |pcb  (data)|
  *    +-----------+     +-----------+
  *          ^                 ^
  *          |                 |
  *          v                 v
  *      +--------------------------+
  *      |   Socket type private    |
  *      |       data               |
  *      +--------------------------+
  *                   ^
  *                   |
  *                   v
  *           +----------------+
  *           | struct ng_node |
  *           +----------------+
  */
 
 /* Netgraph node methods */
 static ng_constructor_t	ngs_constructor;
 static ng_rcvmsg_t	ngs_rcvmsg;
 static ng_shutdown_t	ngs_shutdown;
 static ng_newhook_t	ngs_newhook;
 static ng_connect_t	ngs_connect;
 static ng_findhook_t	ngs_findhook;
 static ng_rcvdata_t	ngs_rcvdata;
 static ng_disconnect_t	ngs_disconnect;
 
 /* Internal methods */
 static int	ng_attach_data(struct socket *so);
 static int	ng_attach_cntl(struct socket *so);
 static int	ng_attach_common(struct socket *so, int type);
 static void	ng_detach_common(struct ngpcb *pcbp, int type);
 static void	ng_socket_free_priv(struct ngsock *priv);
 static int	ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp);
 static int	ng_bind(struct sockaddr *nam, struct ngpcb *pcbp);
 
 static int	ngs_mod_event(module_t mod, int event, void *data);
 static void	ng_socket_item_applied(void *context, int error);
 
 /* Netgraph type descriptor */
 static struct ng_type typestruct = {
 	.version =	NG_ABI_VERSION,
 	.name =		NG_SOCKET_NODE_TYPE,
 	.mod_event =	ngs_mod_event,
 	.constructor =	ngs_constructor,
 	.rcvmsg =	ngs_rcvmsg,
 	.shutdown =	ngs_shutdown,
 	.newhook =	ngs_newhook,
 	.connect =	ngs_connect,
 	.findhook =	ngs_findhook,
 	.rcvdata =	ngs_rcvdata,
 	.disconnect =	ngs_disconnect,
 };
 NETGRAPH_INIT_ORDERED(socket, &typestruct, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
 
 /* Buffer space */
 static u_long ngpdg_sendspace = 20 * 1024;	/* really max datagram size */
 SYSCTL_ULONG(_net_graph, OID_AUTO, maxdgram, CTLFLAG_RW,
     &ngpdg_sendspace , 0, "Maximum outgoing Netgraph datagram size");
 static u_long ngpdg_recvspace = 20 * 1024;
 SYSCTL_ULONG(_net_graph, OID_AUTO, recvspace, CTLFLAG_RW,
     &ngpdg_recvspace , 0, "Maximum space for incoming Netgraph datagrams");
 
 /* List of all sockets (for netstat -f netgraph) */
 static LIST_HEAD(, ngpcb) ngsocklist;
 
 static struct mtx	ngsocketlist_mtx;
 
 #define sotongpcb(so) ((struct ngpcb *)(so)->so_pcb)
 
 /* If getting unexplained errors returned, set this to "kdb_enter("X"); */
 #ifndef TRAP_ERROR
 #define TRAP_ERROR
 #endif
 
 struct hookpriv {
 	LIST_ENTRY(hookpriv)	next;
 	hook_p			hook;
 };
 LIST_HEAD(ngshash, hookpriv);
 
 /* Per-node private data */
 struct ngsock {
 	struct ng_node	*node;		/* the associated netgraph node */
 	struct ngpcb	*datasock;	/* optional data socket */
 	struct ngpcb	*ctlsock;	/* optional control socket */
 	struct ngshash	*hash;		/* hash for hook names */
 	u_long		hmask;		/* hash mask */
 	int	flags;
 	int	refs;
 	struct mtx	mtx;		/* mtx to wait on */
 	int		error;		/* place to store error */
 };
 
 #define	NGS_FLAG_NOLINGER	1	/* close with last hook */
 
 /***************************************************************
 	Control sockets
 ***************************************************************/
 
 static int
 ngc_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 	int error;
 
 	error = priv_check(td, PRIV_NETGRAPH_CONTROL);
 	if (error)
 		return (error);
 	if (pcbp != NULL)
 		return (EISCONN);
 	return (ng_attach_cntl(so));
 }
 
 static void
 ngc_detach(struct socket *so)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 
 	KASSERT(pcbp != NULL, ("ngc_detach: pcbp == NULL"));
 	ng_detach_common(pcbp, NG_CONTROL);
 }
 
 static int
 ngc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 	 struct mbuf *control, struct thread *td)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 	struct ngsock *const priv = NG_NODE_PRIVATE(pcbp->sockdata->node);
 	struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr;
 	struct ng_mesg *msg;
 	struct mbuf *m0;
 	item_p item;
 	char *path = NULL;
 	int len, error = 0;
 	struct ng_apply_info apply;
 
 	if (control) {
 		error = EINVAL;
 		goto release;
 	}
 
 	/* Require destination as there may be >= 1 hooks on this node. */
 	if (addr == NULL) {
 		error = EDESTADDRREQ;
 		goto release;
 	}
 
 	if (sap->sg_len > NG_NODESIZ + offsetof(struct sockaddr_ng, sg_data)) {
 		error = EINVAL;
 		goto release;
 	}
 
 	/*
 	 * Allocate an expendable buffer for the path, chop off
 	 * the sockaddr header, and make sure it's NUL terminated.
 	 */
 	len = sap->sg_len - offsetof(struct sockaddr_ng, sg_data);
 	path = malloc(len + 1, M_NETGRAPH_PATH, M_WAITOK);
 	bcopy(sap->sg_data, path, len);
 	path[len] = '\0';
 
 	/*
 	 * Move the actual message out of mbufs into a linear buffer.
 	 * Start by adding up the size of the data. (could use mh_len?)
 	 */
 	for (len = 0, m0 = m; m0 != NULL; m0 = m0->m_next)
 		len += m0->m_len;
 
 	/*
 	 * Move the data into a linear buffer as well.
 	 * Messages are not delivered in mbufs.
 	 */
 	msg = malloc(len + 1, M_NETGRAPH_MSG, M_WAITOK);
 	m_copydata(m, 0, len, (char *)msg);
 
 	if (msg->header.version != NG_VERSION) {
 		free(msg, M_NETGRAPH_MSG);
 		error = EINVAL;
 		goto release;
 	}
 
 	/*
 	 * Hack alert!
 	 * We look into the message and if it mkpeers a node of unknown type, we
 	 * try to load it. We need to do this now, in syscall thread, because if
 	 * message gets queued and applied later we will get panic.
 	 */
 	if (msg->header.typecookie == NGM_GENERIC_COOKIE &&
 	    msg->header.cmd == NGM_MKPEER) {
 		struct ngm_mkpeer *const mkp = (struct ngm_mkpeer *) msg->data;
 
 		if (ng_findtype(mkp->type) == NULL) {
 			char filename[NG_TYPESIZ + 3];
 			int fileid;
 
 			/* Not found, try to load it as a loadable module. */
 			snprintf(filename, sizeof(filename), "ng_%s",
 			    mkp->type);
 			error = kern_kldload(curthread, filename, &fileid);
 			if (error != 0) {
 				free(msg, M_NETGRAPH_MSG);
 				goto release;
 			}
 
 			/* See if type has been loaded successfully. */
 			if (ng_findtype(mkp->type) == NULL) {
 				free(msg, M_NETGRAPH_MSG);
 				(void)kern_kldunload(curthread, fileid,
 				    LINKER_UNLOAD_NORMAL);
 				error =  ENXIO;
 				goto release;
 			}
 		}
 	}
 
 	item = ng_package_msg(msg, NG_WAITOK);
 	if ((error = ng_address_path((pcbp->sockdata->node), item, path, 0))
 	    != 0) {
 #ifdef TRACE_MESSAGES
 		printf("ng_address_path: errx=%d\n", error);
 #endif
 		goto release;
 	}
 
 #ifdef TRACE_MESSAGES
 	printf("[%x]:<---------[socket]: c=<%d>cmd=%x(%s) f=%x #%d (%s)\n",
 		item->el_dest->nd_ID,
 		msg->header.typecookie,
 		msg->header.cmd,
 		msg->header.cmdstr,
 		msg->header.flags,
 		msg->header.token,
 		item->el_dest->nd_type->name);
 #endif
 	SAVE_LINE(item);
 	/*
 	 * We do not want to return from syscall until the item
 	 * is processed by destination node. We register callback
 	 * on the item, which will update priv->error when item
 	 * was applied.
 	 * If ng_snd_item() has queued item, we sleep until
 	 * callback wakes us up.
 	 */
 	bzero(&apply, sizeof(apply));
 	apply.apply = ng_socket_item_applied;
 	apply.context = priv;
 	item->apply = &apply;
 	priv->error = -1;
 
 	error = ng_snd_item(item, 0);
 
 	mtx_lock(&priv->mtx);
 	if (priv->error == -1)
 		msleep(priv, &priv->mtx, 0, "ngsock", 0);
 	mtx_unlock(&priv->mtx);
 	KASSERT(priv->error != -1,
 	    ("ng_socket: priv->error wasn't updated"));
 	error = priv->error;
 
 release:
 	if (path != NULL)
 		free(path, M_NETGRAPH_PATH);
 	if (control != NULL)
 		m_freem(control);
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 static int
 ngc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 
 	if (pcbp == NULL)
 		return (EINVAL);
 	return (ng_bind(nam, pcbp));
 }
 
 static int
 ngc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	/*
 	 * At this time refuse to do this.. it used to
 	 * do something but it was undocumented and not used.
 	 */
 	printf("program tried to connect control socket to remote node\n");
 	return (EINVAL);
 }
 
 /***************************************************************
 	Data sockets
 ***************************************************************/
 
 static int
 ngd_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 
 	if (pcbp != NULL)
 		return (EISCONN);
 	return (ng_attach_data(so));
 }
 
 static void
 ngd_detach(struct socket *so)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 
 	KASSERT(pcbp != NULL, ("ngd_detach: pcbp == NULL"));
 	ng_detach_common(pcbp, NG_DATA);
 }
 
 static int
 ngd_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 	 struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct ngpcb *const pcbp = sotongpcb(so);
 	struct sockaddr_ng *const sap = (struct sockaddr_ng *) addr;
 	int	len, error;
 	hook_p  hook = NULL;
 	item_p	item;
 	char	hookname[NG_HOOKSIZ];
 
 	if ((pcbp == NULL) || (control != NULL)) {
 		error = EINVAL;
 		goto release;
 	}
 	if (pcbp->sockdata == NULL) {
 		error = ENOTCONN;
 		goto release;
 	}
 
 	if (sap == NULL) {
 		len = 0;		/* Make compiler happy. */
 	} else {
 		if (sap->sg_len > NG_NODESIZ +
 		    offsetof(struct sockaddr_ng, sg_data)) {
 			error = EINVAL;
 			goto release;
 		}
 		len = sap->sg_len - offsetof(struct sockaddr_ng, sg_data);
 	}
 
 	/*
 	 * If the user used any of these ways to not specify an address
 	 * then handle specially.
 	 */
 	if ((sap == NULL) || (len <= 0) || (*sap->sg_data == '\0')) {
 		if (NG_NODE_NUMHOOKS(pcbp->sockdata->node) != 1) {
 			error = EDESTADDRREQ;
 			goto release;
 		}
 		/*
 		 * If exactly one hook exists, just use it.
 		 * Special case to allow write(2) to work on an ng_socket.
 		 */
 		hook = LIST_FIRST(&pcbp->sockdata->node->nd_hooks);
 	} else {
 		if (len >= NG_HOOKSIZ) {
 			error = EINVAL;
 			goto release;
 		}
 
 		/*
 		 * chop off the sockaddr header, and make sure it's NUL
 		 * terminated
 		 */
 		bcopy(sap->sg_data, hookname, len);
 		hookname[len] = '\0';
 
 		/* Find the correct hook from 'hookname' */
 		hook = ng_findhook(pcbp->sockdata->node, hookname);
 		if (hook == NULL) {
 			error = EHOSTUNREACH;
 			goto release;
 		}
 	}
 
 	/* Send data. */
 	item = ng_package_data(m, NG_WAITOK);
 	m = NULL;
 	NET_EPOCH_ENTER(et);
 	NG_FWD_ITEM_HOOK(error, item, hook);
 	NET_EPOCH_EXIT(et);
 
 release:
 	if (control != NULL)
 		m_freem(control);
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 static int
 ngd_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct ngpcb *const pcbp = sotongpcb(so);
 
 	if (pcbp == NULL)
 		return (EINVAL);
 	return (ng_connect_data(nam, pcbp));
 }
 
 /*
  * Used for both data and control sockets
  */
 static int
 ng_getsockaddr(struct socket *so, struct sockaddr **addr)
 {
 	struct ngpcb *pcbp;
 	struct sockaddr_ng *sg;
 	int sg_len;
 	int error = 0;
 
 	pcbp = sotongpcb(so);
 	if ((pcbp == NULL) || (pcbp->sockdata == NULL))
 		/* XXXGL: can this still happen? */
 		return (EINVAL);
 
 	sg_len = sizeof(struct sockaddr_ng) + NG_NODESIZ -
 	    sizeof(sg->sg_data);
 	sg = malloc(sg_len, M_SONAME, M_WAITOK | M_ZERO);
 
 	mtx_lock(&pcbp->sockdata->mtx);
 	if (pcbp->sockdata->node != NULL) {
 		node_p node = pcbp->sockdata->node;
 
 		if (NG_NODE_HAS_NAME(node))
 			bcopy(NG_NODE_NAME(node), sg->sg_data,
 			    strlen(NG_NODE_NAME(node)));
 		mtx_unlock(&pcbp->sockdata->mtx);
 
 		sg->sg_len = sg_len;
 		sg->sg_family = AF_NETGRAPH;
 		*addr = (struct sockaddr *)sg;
 	} else {
 		mtx_unlock(&pcbp->sockdata->mtx);
 		free(sg, M_SONAME);
 		error = EINVAL;
 	}
 
 	return (error);
 }
 
 /*
  * Attach a socket to it's protocol specific partner.
  * For a control socket, actually create a netgraph node and attach
  * to it as well.
  */
 
 static int
 ng_attach_cntl(struct socket *so)
 {
 	struct ngsock *priv;
 	struct ngpcb *pcbp;
 	node_p node;
 	int error;
 
 	/* Setup protocol control block */
 	if ((error = ng_attach_common(so, NG_CONTROL)) != 0)
 		return (error);
 	pcbp = sotongpcb(so);
 
 	/* Make the generic node components */
 	if ((error = ng_make_node_common(&typestruct, &node)) != 0) {
 		ng_detach_common(pcbp, NG_CONTROL);
 		return (error);
 	}
 
 	/*
 	 * Allocate node private info and hash. We start
 	 * with 16 hash entries, however we may grow later
 	 * in ngs_newhook(). We can't predict how much hooks
 	 * does this node plan to have.
 	 */
 	priv = malloc(sizeof(*priv), M_NETGRAPH_SOCK, M_WAITOK | M_ZERO);
 	priv->hash = hashinit(16, M_NETGRAPH_SOCK, &priv->hmask);
 
 	/* Initialize mutex. */
 	mtx_init(&priv->mtx, "ng_socket", NULL, MTX_DEF);
 
 	/* Link the pcb the private data. */
 	priv->ctlsock = pcbp;
 	pcbp->sockdata = priv;
 	priv->refs++;
 	priv->node = node;
 	pcbp->node_id = node->nd_ID;	/* hint for netstat(1) */
 
 	/* Link the node and the private data. */
 	NG_NODE_SET_PRIVATE(priv->node, priv);
 	NG_NODE_REF(priv->node);
 	priv->refs++;
 
 	return (0);
 }
 
 static int
 ng_attach_data(struct socket *so)
 {
 	return (ng_attach_common(so, NG_DATA));
 }
 
 /*
  * Set up a socket protocol control block.
  * This code is shared between control and data sockets.
  */
 static int
 ng_attach_common(struct socket *so, int type)
 {
 	struct ngpcb *pcbp;
 	int error;
 
 	/* Standard socket setup stuff. */
 	error = soreserve(so, ngpdg_sendspace, ngpdg_recvspace);
 	if (error)
 		return (error);
 
 	/* Allocate the pcb. */
 	pcbp = malloc(sizeof(struct ngpcb), M_PCB, M_WAITOK | M_ZERO);
 	pcbp->type = type;
 
 	/* Link the pcb and the socket. */
 	so->so_pcb = (caddr_t)pcbp;
 	pcbp->ng_socket = so;
 
 	/* Add the socket to linked list */
 	mtx_lock(&ngsocketlist_mtx);
 	LIST_INSERT_HEAD(&ngsocklist, pcbp, socks);
 	mtx_unlock(&ngsocketlist_mtx);
 	return (0);
 }
 
 /*
  * Disassociate the socket from it's protocol specific
  * partner. If it's attached to a node's private data structure,
  * then unlink from that too. If we were the last socket attached to it,
  * then shut down the entire node. Shared code for control and data sockets.
  */
 static void
 ng_detach_common(struct ngpcb *pcbp, int which)
 {
 	struct ngsock *priv = pcbp->sockdata;
 
 	if (priv != NULL) {
 		mtx_lock(&priv->mtx);
 
 		switch (which) {
 		case NG_CONTROL:
 			priv->ctlsock = NULL;
 			break;
 		case NG_DATA:
 			priv->datasock = NULL;
 			break;
 		default:
 			panic("%s", __func__);
 		}
 		pcbp->sockdata = NULL;
 		pcbp->node_id = 0;
 
 		ng_socket_free_priv(priv);
 	}
 
 	pcbp->ng_socket->so_pcb = NULL;
 	mtx_lock(&ngsocketlist_mtx);
 	LIST_REMOVE(pcbp, socks);
 	mtx_unlock(&ngsocketlist_mtx);
 	free(pcbp, M_PCB);
 }
 
 /*
  * Remove a reference from node private data.
  */
 static void
 ng_socket_free_priv(struct ngsock *priv)
 {
 	mtx_assert(&priv->mtx, MA_OWNED);
 
 	priv->refs--;
 
 	if (priv->refs == 0) {
 		mtx_destroy(&priv->mtx);
 		hashdestroy(priv->hash, M_NETGRAPH_SOCK, priv->hmask);
 		free(priv, M_NETGRAPH_SOCK);
 		return;
 	}
 
 	if ((priv->refs == 1) && (priv->node != NULL)) {
 		node_p node = priv->node;
 
 		priv->node = NULL;
 		mtx_unlock(&priv->mtx);
 		NG_NODE_UNREF(node);
 		ng_rmnode_self(node);
 	} else
 		mtx_unlock(&priv->mtx);
 }
 
 /*
  * Connect the data socket to a named control socket node.
  */
 static int
 ng_connect_data(struct sockaddr *nam, struct ngpcb *pcbp)
 {
 	struct sockaddr_ng *sap;
 	node_p farnode;
 	struct ngsock *priv;
 	int error;
 	item_p item;
 
 	/* If we are already connected, don't do it again. */
 	if (pcbp->sockdata != NULL)
 		return (EISCONN);
 
 	/*
 	 * Find the target (victim) and check it doesn't already have
 	 * a data socket. Also check it is a 'socket' type node.
 	 * Use ng_package_data() and ng_address_path() to do this.
 	 */
 
 	sap = (struct sockaddr_ng *) nam;
 	/* The item will hold the node reference. */
 	item = ng_package_data(NULL, NG_WAITOK);
 
 	if ((error = ng_address_path(NULL, item,  sap->sg_data, 0)))
 		return (error); /* item is freed on failure */
 
 	/*
 	 * Extract node from item and free item. Remember we now have
 	 * a reference on the node. The item holds it for us.
 	 * when we free the item we release the reference.
 	 */
 	farnode = item->el_dest; /* shortcut */
 	if (strcmp(farnode->nd_type->name, NG_SOCKET_NODE_TYPE) != 0) {
 		NG_FREE_ITEM(item); /* drop the reference to the node */
 		return (EINVAL);
 	}
 	priv = NG_NODE_PRIVATE(farnode);
 	if (priv->datasock != NULL) {
 		NG_FREE_ITEM(item);	/* drop the reference to the node */
 		return (EADDRINUSE);
 	}
 
 	/*
 	 * Link the PCB and the private data struct. and note the extra
 	 * reference. Drop the extra reference on the node.
 	 */
 	mtx_lock(&priv->mtx);
 	priv->datasock = pcbp;
 	pcbp->sockdata = priv;
 	pcbp->node_id = priv->node->nd_ID;	/* hint for netstat(1) */
 	priv->refs++;
 	mtx_unlock(&priv->mtx);
 	NG_FREE_ITEM(item);	/* drop the reference to the node */
 	return (0);
 }
 
 /*
  * Binding a socket means giving the corresponding node a name
  */
 static int
 ng_bind(struct sockaddr *nam, struct ngpcb *pcbp)
 {
 	struct ngsock *const priv = pcbp->sockdata;
 	struct sockaddr_ng *const sap = (struct sockaddr_ng *) nam;
 
 	if (priv == NULL) {
 		TRAP_ERROR;
 		return (EINVAL);
 	}
 	if ((sap->sg_len < 4) || (sap->sg_len > (NG_NODESIZ + 2)) ||
 	    (sap->sg_data[0] == '\0') ||
 	    (sap->sg_data[sap->sg_len - 3] != '\0')) {
 		TRAP_ERROR;
 		return (EINVAL);
 	}
 	return (ng_name_node(priv->node, sap->sg_data));
 }
 
 /***************************************************************
 	Netgraph node
 ***************************************************************/
 
 /*
  * You can only create new nodes from the socket end of things.
  */
 static int
 ngs_constructor(node_p nodep)
 {
 	return (EINVAL);
 }
 
 static void
 ngs_rehash(node_p node)
 {
 	struct ngsock *priv = NG_NODE_PRIVATE(node);
 	struct ngshash *new;
 	struct hookpriv *hp;
 	hook_p hook;
 	uint32_t h;
 	u_long hmask;
 
 	new = hashinit_flags((priv->hmask + 1) * 2, M_NETGRAPH_SOCK, &hmask,
 	    HASH_NOWAIT);
 	if (new == NULL)
 		return;
 
 	LIST_FOREACH(hook, &node->nd_hooks, hk_hooks) {
 		hp = NG_HOOK_PRIVATE(hook);
 #ifdef INVARIANTS
 		LIST_REMOVE(hp, next);
 #endif
 		h = hash32_str(NG_HOOK_NAME(hook), HASHINIT) & hmask;
 		LIST_INSERT_HEAD(&new[h], hp, next);
 	}
 
 	hashdestroy(priv->hash, M_NETGRAPH_SOCK, priv->hmask);
 	priv->hash = new;
 	priv->hmask = hmask;
 }
 
 /*
  * We allow any hook to be connected to the node.
  * There is no per-hook private information though.
  */
 static int
 ngs_newhook(node_p node, hook_p hook, const char *name)
 {
 	struct ngsock *const priv = NG_NODE_PRIVATE(node);
 	struct hookpriv *hp;
 	uint32_t h;
 
 	hp = malloc(sizeof(*hp), M_NETGRAPH_SOCK, M_NOWAIT);
 	if (hp == NULL)
 		return (ENOMEM);
 	if (node->nd_numhooks * 2 > priv->hmask)
 		ngs_rehash(node);
 	hp->hook = hook;
 	h = hash32_str(name, HASHINIT) & priv->hmask;
 	LIST_INSERT_HEAD(&priv->hash[h], hp, next);
 	NG_HOOK_SET_PRIVATE(hook, hp);
 
 	return (0);
 }
 
 /*
  * If only one hook, allow read(2) and write(2) to work.
  */
 static int
 ngs_connect(hook_p hook)
 {
 	node_p node = NG_HOOK_NODE(hook);
 	struct ngsock *priv = NG_NODE_PRIVATE(node);
 
 	if ((priv->datasock) && (priv->datasock->ng_socket)) {
 		if (NG_NODE_NUMHOOKS(node) == 1)
 			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
 		else
 			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
 	}
 	return (0);
 }
 
 /* Look up hook by name */
 static hook_p
 ngs_findhook(node_p node, const char *name)
 {
 	struct ngsock *priv = NG_NODE_PRIVATE(node);
 	struct hookpriv *hp;
 	uint32_t h;
 
 	/*
 	 * Microoptimisation for an ng_socket with
 	 * a single hook, which is a common case.
 	 */
 	if (node->nd_numhooks == 1) {
 		hook_p hook;
 
 		hook = LIST_FIRST(&node->nd_hooks);
 
 		if (strcmp(NG_HOOK_NAME(hook), name) == 0)
 			return (hook);
 		else
 			return (NULL);
 	}
 
 	h = hash32_str(name, HASHINIT) & priv->hmask;
 
 	LIST_FOREACH(hp, &priv->hash[h], next)
 		if (strcmp(NG_HOOK_NAME(hp->hook), name) == 0)
 			return (hp->hook);
 
 	return (NULL);
 }
 
 /*
  * Incoming messages get passed up to the control socket.
  * Unless they are for us specifically (socket_type)
  */
 static int
 ngs_rcvmsg(node_p node, item_p item, hook_p lasthook)
 {
 	struct ngsock *const priv = NG_NODE_PRIVATE(node);
 	struct ngpcb *pcbp;
 	struct socket *so;
 	struct sockaddr_ng addr;
 	struct ng_mesg *msg;
 	struct mbuf *m;
 	ng_ID_t	retaddr = NGI_RETADDR(item);
 	int addrlen;
 	int error = 0;
 
 	NGI_GET_MSG(item, msg);
 	NG_FREE_ITEM(item);
 
 	/*
 	 * Grab priv->mtx here to prevent destroying of control socket
 	 * after checking that priv->ctlsock is not NULL.
 	 */
 	mtx_lock(&priv->mtx);
 	pcbp = priv->ctlsock;
 
 	/*
 	 * Only allow mesgs to be passed if we have the control socket.
 	 * Data sockets can only support the generic messages.
 	 */
 	if (pcbp == NULL) {
 		mtx_unlock(&priv->mtx);
 		TRAP_ERROR;
 		NG_FREE_MSG(msg);
 		return (EINVAL);
 	}
 	so = pcbp->ng_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 
 	/* As long as the race is handled, priv->mtx may be unlocked now. */
 	mtx_unlock(&priv->mtx);
 
 #ifdef TRACE_MESSAGES
 	printf("[%x]:---------->[socket]: c=<%d>cmd=%x(%s) f=%x #%d\n",
 		retaddr,
 		msg->header.typecookie,
 		msg->header.cmd,
 		msg->header.cmdstr,
 		msg->header.flags,
 		msg->header.token);
 #endif
 
 	if (msg->header.typecookie == NGM_SOCKET_COOKIE) {
 		switch (msg->header.cmd) {
 		case NGM_SOCK_CMD_NOLINGER:
 			priv->flags |= NGS_FLAG_NOLINGER;
 			break;
 		case NGM_SOCK_CMD_LINGER:
 			priv->flags &= ~NGS_FLAG_NOLINGER;
 			break;
 		default:
 			error = EINVAL;		/* unknown command */
 		}
 		SOCKBUF_UNLOCK(&so->so_rcv);
 
 		/* Free the message and return. */
 		NG_FREE_MSG(msg);
 		return (error);
 	}
 
 	/* Get the return address into a sockaddr. */
 	bzero(&addr, sizeof(addr));
 	addr.sg_len = sizeof(addr);
 	addr.sg_family = AF_NETGRAPH;
 	addrlen = snprintf((char *)&addr.sg_data, sizeof(addr.sg_data),
 	    "[%x]:", retaddr);
 	if (addrlen < 0 || addrlen > sizeof(addr.sg_data)) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		printf("%s: snprintf([%x]) failed - %d\n", __func__, retaddr,
 		    addrlen);
 		NG_FREE_MSG(msg);
 		return (EINVAL);
 	}
 
 	/* Copy the message itself into an mbuf chain. */
 	m = m_devget((caddr_t)msg, sizeof(struct ng_mesg) + msg->header.arglen,
 	    0, NULL, NULL);
 
 	/*
 	 * Here we free the message. We need to do that
 	 * regardless of whether we got mbufs.
 	 */
 	NG_FREE_MSG(msg);
 
 	if (m == NULL) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		TRAP_ERROR;
 		return (ENOBUFS);
 	}
 
 	/* Send it up to the socket. */
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&addr, m,
 	    NULL) == 0) {
 		soroverflow_locked(so);
 		TRAP_ERROR;
 		m_freem(m);
 		return (ENOBUFS);
 	}
 
 	/* sorwakeup_locked () releases the lock internally. */
 	sorwakeup_locked(so);
 
 	return (error);
 }
 
 /*
  * Receive data on a hook
  */
 static int
 ngs_rcvdata(hook_p hook, item_p item)
 {
 	struct ngsock *const priv = NG_NODE_PRIVATE(NG_HOOK_NODE(hook));
 	struct ngpcb *const pcbp = priv->datasock;
 	struct socket *so;
 	struct sockaddr_ng *addr;
 	char *addrbuf[NG_HOOKSIZ + 4];
 	int addrlen;
 	struct mbuf *m;
 
 	NGI_GET_M(item, m);
 	NG_FREE_ITEM(item);
 
 	/* If there is no data socket, black-hole it. */
 	if (pcbp == NULL) {
 		NG_FREE_M(m);
 		return (0);
 	}
 	so = pcbp->ng_socket;
 
 	/* Get the return address into a sockaddr. */
 	addrlen = strlen(NG_HOOK_NAME(hook));	/* <= NG_HOOKSIZ - 1 */
 	addr = (struct sockaddr_ng *) addrbuf;
 	addr->sg_len = addrlen + 3;
 	addr->sg_family = AF_NETGRAPH;
 	bcopy(NG_HOOK_NAME(hook), addr->sg_data, addrlen);
 	addr->sg_data[addrlen] = '\0';
 
 	/* Try to tell the socket which hook it came in on. */
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)addr, m,
 	    NULL) == 0) {
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		m_freem(m);
 		TRAP_ERROR;
 		return (ENOBUFS);
 	}
 
 	/* sorwakeup_locked () releases the lock internally. */
 	sorwakeup_locked(so);
 	return (0);
 }
 
 /*
  * Hook disconnection
  *
  * For this type, removal of the last link destroys the node
  * if the NOLINGER flag is set.
  */
 static int
 ngs_disconnect(hook_p hook)
 {
 	node_p node = NG_HOOK_NODE(hook);
 	struct ngsock *const priv = NG_NODE_PRIVATE(node);
 	struct hookpriv *hp = NG_HOOK_PRIVATE(hook);
 
 	LIST_REMOVE(hp, next);
 	free(hp, M_NETGRAPH_SOCK);
 
 	if ((priv->datasock) && (priv->datasock->ng_socket)) {
 		if (NG_NODE_NUMHOOKS(node) == 1)
 			priv->datasock->ng_socket->so_state |= SS_ISCONNECTED;
 		else
 			priv->datasock->ng_socket->so_state &= ~SS_ISCONNECTED;
 	}
 
 	if ((priv->flags & NGS_FLAG_NOLINGER) &&
 	    (NG_NODE_NUMHOOKS(node) == 0) && (NG_NODE_IS_VALID(node)))
 		ng_rmnode_self(node);
 
 	return (0);
 }
 
 /*
  * Do local shutdown processing.
  * In this case, that involves making sure the socket
  * knows we should be shutting down.
  */
 static int
 ngs_shutdown(node_p node)
 {
 	struct ngsock *const priv = NG_NODE_PRIVATE(node);
 	struct ngpcb *dpcbp, *pcbp;
 
 	mtx_lock(&priv->mtx);
 	dpcbp = priv->datasock;
 	pcbp = priv->ctlsock;
 
 	if (dpcbp != NULL)
 		soisdisconnected(dpcbp->ng_socket);
 
 	if (pcbp != NULL)
 		soisdisconnected(pcbp->ng_socket);
 
 	priv->node = NULL;
 	NG_NODE_SET_PRIVATE(node, NULL);
 	ng_socket_free_priv(priv);
 
 	NG_NODE_UNREF(node);
 	return (0);
 }
 
 static void
 ng_socket_item_applied(void *context, int error)
 {
 	struct ngsock *const priv = (struct ngsock *)context;
 
 	mtx_lock(&priv->mtx);
 	priv->error = error;
 	wakeup(priv);
 	mtx_unlock(&priv->mtx);
 
 }
 
 static	int
 dummy_disconnect(struct socket *so)
 {
 	return (0);
 }
+
 /*
+ * Definitions of protocols supported in the NETGRAPH domain.
  * Control and data socket type descriptors
  *
  * XXXRW: Perhaps _close should do something?
  */
-
-static struct pr_usrreqs ngc_usrreqs = {
-	.pru_attach =		ngc_attach,
-	.pru_bind =		ngc_bind,
-	.pru_connect =		ngc_connect,
-	.pru_detach =		ngc_detach,
-	.pru_disconnect =	dummy_disconnect,
-	.pru_send =		ngc_send,
-	.pru_sockaddr =		ng_getsockaddr,
-};
-
-static struct pr_usrreqs ngd_usrreqs = {
-	.pru_attach =		ngd_attach,
-	.pru_connect =		ngd_connect,
-	.pru_detach =		ngd_detach,
-	.pru_disconnect =	dummy_disconnect,
-	.pru_send =		ngd_send,
-	.pru_sockaddr =		ng_getsockaddr,
-};
-
-/*
- * Definitions of protocols supported in the NETGRAPH domain.
- */
-
-extern struct domain ngdomain;		/* stop compiler warnings */
-
-static struct protosw ngsw[] = {
-{
+static struct protosw ngcontrol_protosw = {
 	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&ngdomain,
 	.pr_protocol =		NG_CONTROL,
 	.pr_flags =		PR_ATOMIC | PR_ADDR /* | PR_RIGHTS */,
-	.pr_usrreqs =		&ngc_usrreqs
-},
-{
+	.pr_attach =		ngc_attach,
+	.pr_bind =		ngc_bind,
+	.pr_connect =		ngc_connect,
+	.pr_detach =		ngc_detach,
+	.pr_disconnect =	dummy_disconnect,
+	.pr_send =		ngc_send,
+	.pr_sockaddr =		ng_getsockaddr,
+};
+static struct protosw ngdata_protosw = {
 	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&ngdomain,
 	.pr_protocol =		NG_DATA,
 	.pr_flags =		PR_ATOMIC | PR_ADDR,
-	.pr_usrreqs =		&ngd_usrreqs
-}
+	.pr_attach =		ngd_attach,
+	.pr_connect =		ngd_connect,
+	.pr_detach =		ngd_detach,
+	.pr_disconnect =	dummy_disconnect,
+	.pr_send =		ngd_send,
+	.pr_sockaddr =		ng_getsockaddr,
 };
 
-struct domain ngdomain = {
+static struct domain ngdomain = {
 	.dom_family =		AF_NETGRAPH,
 	.dom_name =		"netgraph",
-	.dom_protosw =		ngsw,
-	.dom_protoswNPROTOSW =	&ngsw[nitems(ngsw)]
+	.dom_nprotosw =		2,
+	.dom_protosw =		{ &ngcontrol_protosw, &ngdata_protosw },
 };
 
 /*
  * Handle loading and unloading for this node type.
  * This is to handle auxiliary linkages (e.g protocol domain addition).
  */
 static int
 ngs_mod_event(module_t mod, int event, void *data)
 {
 	int error = 0;
 
 	switch (event) {
 	case MOD_LOAD:
 		mtx_init(&ngsocketlist_mtx, "ng_socketlist", NULL, MTX_DEF);
 		break;
 	case MOD_UNLOAD:
 		/* Ensure there are no open netgraph sockets. */
 		if (!LIST_EMPTY(&ngsocklist)) {
 			error = EBUSY;
 			break;
 		}
 #ifdef NOTYET
 		/* Unregister protocol domain XXX can't do this yet.. */
 #endif
 		error = EBUSY;
 		break;
 	default:
 		error = EOPNOTSUPP;
 		break;
 	}
 	return (error);
 }
 
 DOMAIN_SET(ng);
 
 SYSCTL_INT(_net_graph, OID_AUTO, family, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, AF_NETGRAPH, "");
 static SYSCTL_NODE(_net_graph, OID_AUTO, data, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "DATA");
 SYSCTL_INT(_net_graph_data, OID_AUTO, proto, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_DATA, "");
 static SYSCTL_NODE(_net_graph, OID_AUTO, control, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "CONTROL");
 SYSCTL_INT(_net_graph_control, OID_AUTO, proto, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, NG_CONTROL, "");
diff --git a/sys/netinet/in_proto.c b/sys/netinet/in_proto.c
index cac885560a30..35b02d706e72 100644
--- a/sys/netinet/in_proto.c
+++ b/sys/netinet/in_proto.c
@@ -1,303 +1,187 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_proto.c	8.2 (Berkeley) 2/9/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mrouting.h"
 #include "opt_ipsec.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/socket.h>
 #include <sys/domain.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 
 /*
  * While this file provides the domain and protocol switch tables for IPv4, it
  * also provides the sysctl node declarations for net.inet.* often shared with
  * IPv6 for common features or by upper layer protocols.  In case of no IPv4
  * support compile out everything but these sysctl nodes.
  */
 #ifdef INET
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #endif /* INET */
 
 #if defined(INET) || defined(INET6)
 #include <netinet/in.h>
 #endif
 
 #ifdef INET
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/igmp_var.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/ip_encap.h>
 
 /*
  * TCP/IP protocol family: IP, ICMP, UDP, TCP.
  */
 
-static struct pr_usrreqs nousrreqs;
-
 #ifdef SCTP
 #include <netinet/in_pcb.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_var.h>
 #endif
 
-FEATURE(inet, "Internet Protocol version 4");
-
-extern	struct domain inetdomain;
+/* netinet/raw_ip.c */
+extern struct protosw rip_protosw, rsvp_protosw, rawipv4_protosw,
+    rawipv6_protosw, mobile_protosw, etherip_protosw, icmp_protosw,
+    igmp_protosw, gre_protosw, pim_protosw, ripwild_protosw;
+/* netinet/udp_usrreq.c */
+extern struct protosw udp_protosw, udplite_protosw;
 
-/* Spacer for loadable protocols. */
-#define IPPROTOSPACER   			\
-{						\
-	.pr_domain =		&inetdomain,	\
-	.pr_protocol =		PROTO_SPACER,	\
-	.pr_usrreqs =		&nousrreqs	\
-}
-
-struct protosw inetsw[] = {
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_UDP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_CAPATTACH,
-	.pr_ctloutput =		udp_ctloutput,
-	.pr_usrreqs =		&udp_usrreqs
-},
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_TCP,
-	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD|
-				    PR_CAPATTACH,
-	.pr_ctloutput =		tcp_ctloutput,
-	.pr_usrreqs =		&tcp_usrreqs
-},
-#ifdef SCTP
-{
-	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp_usrreqs
-},
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp_usrreqs
-},
-#endif /* SCTP */
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_UDPLITE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_CAPATTACH,
-	.pr_ctloutput =		udp_ctloutput,
-	.pr_usrreqs =		&udp_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_RAW,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_ICMP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_IGMP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_RSVP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_IPV4,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_MOBILE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_ETHERIP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_GRE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-# ifdef INET6
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_IPV6,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-#endif
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_PIM,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-},
-/* Spacer n-times for loadable protocols. */
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-IPPROTOSPACER,
-/* raw wildcard */
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&rip_usrreqs
-},
-};
+FEATURE(inet, "Internet Protocol version 4");
 
 struct domain inetdomain = {
 	.dom_family =		AF_INET,
 	.dom_name =		"internet",
-	.dom_protosw =		inetsw,
-	.dom_protoswNPROTOSW =	&inetsw[nitems(inetsw)],
 	.dom_rtattach =		in_inithead,
 #ifdef VIMAGE
 	.dom_rtdetach =		in_detachhead,
 #endif
 	.dom_ifattach =		in_domifattach,
-	.dom_ifdetach =		in_domifdetach
+	.dom_ifdetach =		in_domifdetach,
+	.dom_nprotosw =		24,
+	.dom_protosw = {
+		&tcp_protosw,
+		&udp_protosw,
+#ifdef SCTP
+		&sctp_seqpacket_protosw,
+		&sctp_stream_protosw,
+#else
+		NULL, NULL,
+#endif
+		&udplite_protosw,
+		&rip_protosw,
+		/*
+		 * XXXGL: it is entirely possible that all below raw-based
+		 * protosw definitions are not needed.  They could have existed
+		 * just to define pr_input, pr_drain, pr_*timo or PR_LASTHDR
+		 * flag, and were never supposed to create a special socket.
+		 */
+		&icmp_protosw,
+		&igmp_protosw,
+		&rsvp_protosw,
+		&rawipv4_protosw,
+		&mobile_protosw,
+		&etherip_protosw,
+		&gre_protosw,
+#ifdef INET6
+		&rawipv6_protosw,
+#else
+		NULL,
+#endif
+		&pim_protosw,
+		/* Spacer 8 times for loadable protocols. XXXGL: why 8? */
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		&ripwild_protosw,
+	},
 };
 
 DOMAIN_SET(inet);
 #endif /* INET */
 
 SYSCTL_NODE(_net, PF_INET, inet, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Internet Family");
 
 SYSCTL_NODE(_net_inet, IPPROTO_IP, ip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP");
 SYSCTL_NODE(_net_inet, IPPROTO_ICMP, icmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "ICMP");
 SYSCTL_NODE(_net_inet, IPPROTO_UDP, udp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "UDP");
 SYSCTL_NODE(_net_inet, IPPROTO_TCP, tcp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP");
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 SYSCTL_NODE(_net_inet, IPPROTO_SCTP, sctp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SCTP");
 #endif
 SYSCTL_NODE(_net_inet, IPPROTO_IGMP, igmp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IGMP");
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 /* XXX no protocol # to use, pick something "reserved" */
 SYSCTL_NODE(_net_inet, 253, ipsec, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPSEC");
 SYSCTL_NODE(_net_inet, IPPROTO_AH, ah, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "AH");
 SYSCTL_NODE(_net_inet, IPPROTO_ESP, esp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "ESP");
 SYSCTL_NODE(_net_inet, IPPROTO_IPCOMP, ipcomp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPCOMP");
 SYSCTL_NODE(_net_inet, IPPROTO_IPIP, ipip, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPIP");
 #endif /* IPSEC */
 SYSCTL_NODE(_net_inet, IPPROTO_RAW, raw, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "RAW");
 SYSCTL_NODE(_net_inet, OID_AUTO, accf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Accept filters");
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index c8a2ad7b4c94..b4730b069fc1 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -1,811 +1,807 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_sctp.h"
 #ifndef INET
 #error "IPDIVERT requires INET"
 #endif
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/kernel.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <net/vnet.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <security/mac/mac_framework.h>
 /*
  * Divert sockets
  */
 
 /*
  * Allocate enough space to hold a full IP packet
  */
 #define	DIVSNDQ		(65536 + 100)
 #define	DIVRCVQ		(65536 + 100)
 
 /*
  * Divert sockets work in conjunction with ipfw or other packet filters,
  * see the divert(4) manpage for features.
  * Packets are selected by the packet filter and tagged with an
  * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by
  * the packet filter) and information on the matching filter rule for
  * subsequent reinjection. The divert_port is used to put the packet
  * on the corresponding divert socket, while the rule number is passed
  * up (at least partially) as the sin_port in the struct sockaddr.
  *
  * Packets written to the divert socket carry in sin_addr a
  * destination address, and in sin_port the number of the filter rule
  * after which to continue processing.
  * If the destination address is INADDR_ANY, the packet is treated as
  * as outgoing and sent to ip_output(); otherwise it is treated as
  * incoming and sent to ip_input().
  * Further, sin_zero carries some information on the interface,
  * which can be used in the reinject -- see comments in the code.
  *
  * On reinjection, processing in ip_input() and ip_output()
  * will be exactly the same as for the original packet, except that
  * packet filter processing will start at the rule number after the one
  * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0
  * will apply the entire ruleset to the packet).
  */
 
 /* Internal variables. */
 VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo);
 #define	V_divcbinfo			VNET(divcbinfo)
 
 static u_long	div_sendspace = DIVSNDQ;	/* XXX sysctl ? */
 static u_long	div_recvspace = DIVRCVQ;	/* XXX sysctl ? */
 
 static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin);
 static int div_output_outbound(int family, struct socket *so, struct mbuf *m);
 
 /*
  * Initialize divert connection block queue.
  */
 INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash");
 
 static void
 div_init(void *arg __unused)
 {
 
 	/*
 	 * XXX We don't use the hash list for divert IP, but it's easier to
 	 * allocate one-entry hash lists than it is to check all over the
 	 * place for hashbase == NULL.
 	 */
 	in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1);
 }
 VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL);
 
 static void
 div_destroy(void *unused __unused)
 {
 
 	in_pcbinfo_destroy(&V_divcbinfo);
 }
 VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL);
 
 static bool
 div_port_match(const struct inpcb *inp, void *v)
 {
 	uint16_t nport = *(uint16_t *)v;
 
 	return (inp->inp_lport == nport);
 }
 
 /*
  * Divert a packet by passing it up to the divert socket at port 'port'.
  */
 static void
 divert_packet(struct mbuf *m, bool incoming)
 {
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	struct ip *ip;
 #endif
 	struct inpcb *inp;
 	struct socket *sa;
 	u_int16_t nport;
 	struct sockaddr_in divsrc;
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo,
 	    INPLOOKUP_RLOCKPCB, div_port_match, &nport);
 	struct m_tag *mtag;
 
 	NET_EPOCH_ASSERT();
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		m_freem(m);
 		return;
 	}
 	/* Assure header */
 	if (m->m_len < sizeof(struct ip) &&
 	    (m = m_pullup(m, sizeof(struct ip))) == NULL)
 		return;
 
 	/* Delayed checksums are currently not compatible with divert. */
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP) {
 		ip = mtod(m, struct ip *);
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 #ifdef INET6
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) {
 		in6_delayed_cksum(m, m->m_pkthdr.len -
 		    sizeof(struct ip6_hdr), sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) {
 		sctp_delayed_cksum(m, sizeof(struct ip6_hdr));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6;
 	}
 #endif
 #endif /* INET6 */
 	bzero(&divsrc, sizeof(divsrc));
 	divsrc.sin_len = sizeof(divsrc);
 	divsrc.sin_family = AF_INET;
 	/* record matching rule, in host format */
 	divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum;
 	/*
 	 * Record receive interface address, if any.
 	 * But only for incoming packets.
 	 */
 	if (incoming) {
 		struct ifaddr *ifa;
 		struct ifnet *ifp;
 
 		/* Sanity check */
 		M_ASSERTPKTHDR(m);
 
 		/* Find IP address for receive interface */
 		ifp = m->m_pkthdr.rcvif;
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family != AF_INET)
 				continue;
 			divsrc.sin_addr =
 			    ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr;
 			break;
 		}
 	}
 	/*
 	 * Record the incoming interface name whenever we have one.
 	 */
 	if (m->m_pkthdr.rcvif) {
 		/*
 		 * Hide the actual interface name in there in the
 		 * sin_zero array. XXX This needs to be moved to a
 		 * different sockaddr type for divert, e.g.
 		 * sockaddr_div with multiple fields like
 		 * sockaddr_dl. Presently we have only 7 bytes
 		 * but that will do for now as most interfaces
 		 * are 4 or less + 2 or less bytes for unit.
 		 * There is probably a faster way of doing this,
 		 * possibly taking it from the sockaddr_dl on the iface.
 		 * This solves the problem of a P2P link and a LAN interface
 		 * having the same address, which can result in the wrong
 		 * interface being assigned to the packet when fed back
 		 * into the divert socket. Theoretically if the daemon saves
 		 * and re-uses the sockaddr_in as suggested in the man pages,
 		 * this iface name will come along for the ride.
 		 * (see div_output for the other half of this.)
 		 */
 		strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname,
 		    sizeof(divsrc.sin_zero));
 	}
 
 	/* Put packet on socket queue, if any */
 	sa = NULL;
 	/* nport is inp_next's context. */
 	nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info));
 	while ((inp = inp_next(&inpi)) != NULL) {
 		sa = inp->inp_socket;
 		SOCKBUF_LOCK(&sa->so_rcv);
 		if (sbappendaddr_locked(&sa->so_rcv,
 		    (struct sockaddr *)&divsrc, m, NULL) == 0) {
 			soroverflow_locked(sa);
 			sa = NULL;	/* force mbuf reclaim below */
 		} else
 			sorwakeup_locked(sa);
 		/* XXX why does only one socket match? */
 		INP_RUNLOCK(inp);
 		break;
 	}
 	if (sa == NULL) {
 		m_freem(m);
 		KMOD_IPSTAT_INC(ips_noproto);
 		KMOD_IPSTAT_DEC(ips_delivered);
         }
 }
 
 /*
  * Deliver packet back into the IP processing machinery.
  *
  * If no address specified, or address is 0.0.0.0, send to ip_output();
  * otherwise, send to ip_input() and mark as having been received on
  * the interface with that address.
  */
 static int
 div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin,
     struct mbuf *control)
 {
 	struct epoch_tracker et;
 	const struct ip *ip;
 	struct m_tag *mtag;
 	struct ipfw_rule_ref *dt;
 	int error, family;
 
 	if (control) {
 		m_freem(control);		/* XXX */
 		control = NULL;
 	}
 
 	if (sin != NULL) {
 		if (sin->sin_family != AF_INET) {
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 		if (sin->sin_len != sizeof(*sin)) {
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 
 	/*
 	 * An mbuf may hasn't come from userland, but we pretend
 	 * that it has.
 	 */
 	m->m_pkthdr.rcvif = NULL;
 	m->m_nextpkt = NULL;
 	M_SETFIB(m, so->so_fibnum);
 
 	mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL);
 	if (mtag == NULL) {
 		/* this should be normal */
 		mtag = m_tag_alloc(MTAG_IPFW_RULE, 0,
 		    sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO);
 		if (mtag == NULL) {
 			m_freem(m);
 			return (ENOBUFS);
 		}
 		m_tag_prepend(m, mtag);
 	}
 	dt = (struct ipfw_rule_ref *)(mtag+1);
 
 	/* Loopback avoidance and state recovery */
 	if (sin) {
 		int i;
 
 		/* set the starting point. We provide a non-zero slot,
 		 * but a non_matching chain_id to skip that info and use
 		 * the rulenum/rule_id.
 		 */
 		dt->slot = 1; /* dummy, chain_id is invalid */
 		dt->chain_id = 0;
 		dt->rulenum = sin->sin_port+1; /* host format ? */
 		dt->rule_id = 0;
 		/* XXX: broken for IPv6 */
 		/*
 		 * Find receive interface with the given name, stuffed
 		 * (if it exists) in the sin_zero[] field.
 		 * The name is user supplied data so don't trust its size
 		 * or that it is zero terminated.
 		 */
 		for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++)
 			;
 		if ( i > 0 && i < sizeof(sin->sin_zero))
 			m->m_pkthdr.rcvif = ifunit(sin->sin_zero);
 	}
 
 	ip = mtod(m, struct ip *);
 	switch (ip->ip_v) {
 	case IPVERSION:
 		family = AF_INET;
 		break;
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		family = AF_INET6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 
 	/* Reinject packet into the system as incoming or outgoing */
 	NET_EPOCH_ENTER(et);
 	if (!sin || sin->sin_addr.s_addr == 0) {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT;
 		error = div_output_outbound(family, so, m);
 	} else {
 		dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN;
 		error = div_output_inbound(family, so, m, sin);
 	}
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Sends mbuf @m to the wire via ip[6]_output().
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_outbound(int family, struct socket *so, struct mbuf *m)
 {
 	struct ip *const ip = mtod(m, struct ip *);
 	struct mbuf *options;
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	INP_RLOCK(inp);
 	switch (family) {
 	case AF_INET:
 		/*
 		 * Don't allow both user specified and setsockopt
 		 * options, and don't allow packet length sizes that
 		 * will crash.
 		 */
 		if ((((ip->ip_hl << 2) != sizeof(struct ip)) &&
 		    inp->inp_options != NULL) ||
 		    ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		break;
 #ifdef INET6
 	case AF_INET6:
 	    {
 		struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *);
 
 		/* Don't allow packet length sizes that will crash */
 		if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		break;
 	    }
 #endif
 	}
 
 	/* Send packet to output processing */
 	KMOD_IPSTAT_INC(ips_rawout);		/* XXX */
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 	/*
 	 * Get ready to inject the packet into ip_output().
 	 * Just in case socket options were specified on the
 	 * divert socket, we duplicate them.  This is done
 	 * to avoid having to hold the PCB locks over the call
 	 * to ip_output(), as doing this results in a number of
 	 * lock ordering complexities.
 	 *
 	 * Note that we set the multicast options argument for
 	 * ip_output() to NULL since it should be invariant that
 	 * they are not present.
 	 */
 	KASSERT(inp->inp_moptions == NULL,
 	    ("multicast options set on a divert socket"));
 	/*
 	 * XXXCSJP: It is unclear to me whether or not it makes
 	 * sense for divert sockets to have options.  However,
 	 * for now we will duplicate them with the INP locks
 	 * held so we can use them in ip_output() without
 	 * requring a reference to the pcb.
 	 */
 	options = NULL;
 	if (inp->inp_options != NULL) {
 		options = m_dup(inp->inp_options, M_NOWAIT);
 		if (options == NULL) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (ENOBUFS);
 		}
 	}
 	INP_RUNLOCK(inp);
 
 	error = 0;
 	switch (family) {
 	case AF_INET:
 		error = ip_output(m, options, NULL,
 		    ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0)
 		    | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL);
 		break;
 #ifdef INET6
 	case AF_INET6:
 		error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL);
 		break;
 #endif
 	}
 	if (options != NULL)
 		m_freem(options);
 
 	return (error);
 }
 
 /*
  * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue.
  *
  * Returns 0 on success or an errno value on failure.  @m is always consumed.
  */
 static int
 div_output_inbound(int family, struct socket *so, struct mbuf *m,
     struct sockaddr_in *sin)
 {
 	const struct ip *ip;
 	struct ifaddr *ifa;
 
 	if (m->m_pkthdr.rcvif == NULL) {
 		/*
 		 * No luck with the name, check by IP address.
 		 * Clear the port and the ifname to make sure
 		 * there are no distractions for ifa_ifwithaddr.
 		 */
 
 		/* XXX: broken for IPv6 */
 		bzero(sin->sin_zero, sizeof(sin->sin_zero));
 		sin->sin_port = 0;
 		ifa = ifa_ifwithaddr((struct sockaddr *) sin);
 		if (ifa == NULL) {
 			m_freem(m);
 			return (EADDRNOTAVAIL);
 		}
 		m->m_pkthdr.rcvif = ifa->ifa_ifp;
 	}
 #ifdef MAC
 	mac_socket_create_mbuf(so, m);
 #endif
 	/* Send packet to input processing via netisr */
 	switch (family) {
 	case AF_INET:
 		ip = mtod(m, struct ip *);
 		/*
 		 * Restore M_BCAST flag when destination address is
 		 * broadcast. It is expected by ip_tryforward().
 		 */
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
 			m->m_flags |= M_MCAST;
 		else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
 			m->m_flags |= M_BCAST;
 		netisr_queue_src(NETISR_IP, (uintptr_t)so, m);
 		break;
 #ifdef INET6
 	case AF_INET6:
 		netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m);
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EINVAL);
 	}
 
 	return (0);
 }
 
 static int
 div_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp  = sotoinpcb(so);
 	KASSERT(inp == NULL, ("div_attach: inp != NULL"));
 	if (td != NULL) {
 		error = priv_check(td, PRIV_NETINET_DIVERT);
 		if (error)
 			return (error);
 	}
 	error = soreserve(so, div_sendspace, div_recvspace);
 	if (error)
 		return error;
 	error = in_pcballoc(so, &V_divcbinfo);
 	if (error)
 		return error;
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->inp_flags |= INP_HDRINCL;
 	INP_WUNLOCK(inp);
 	return 0;
 }
 
 static void
 div_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_detach: inp == NULL"));
 	INP_WLOCK(inp);
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 }
 
 static int
 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_bind: inp == NULL"));
 	/* in_pcbbind assumes that nam is a sockaddr_in
 	 * and in_pcbbind requires a valid address. Since divert
 	 * sockets don't we need to make sure the address is
 	 * filled in properly.
 	 * XXX -- divert should not be abusing in_pcbind
 	 * and should probably have its own family.
 	 */
 	if (nam->sa_family != AF_INET)
 		return EAFNOSUPPORT;
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return EINVAL;
 	((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY;
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_divcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_divcbinfo);
 	INP_WUNLOCK(inp);
 	return error;
 }
 
 static int
 div_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("div_shutdown: inp == NULL"));
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return 0;
 }
 
 static int
 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 
 	/* Packet must have a header (but that's about it) */
 	if (m->m_len < sizeof (struct ip) &&
 	    (m = m_pullup(m, sizeof (struct ip))) == NULL) {
 		KMOD_IPSTAT_INC(ips_toosmall);
 		if (control != NULL)
 			m_freem(control);
 		m_freem(m);
 		return EINVAL;
 	}
 
 	/* Send packet */
 	return div_output(so, m, (struct sockaddr_in *)nam, control);
 }
 
 static int
 div_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != 0)
 		return EPERM;
 
 	if (req->oldptr == 0) {
 		int n;
 
 		n = V_divcbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return 0;
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_divcbinfo.ipi_count;
 	xig.xig_gen = V_divcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return error;
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen) {
 			struct xinpcb xi;
 
 			in_pcbtoxinpcb(inp, &xi);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 			if (error) {
 				INP_RUNLOCK(inp);
 				break;
 			}
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = V_divcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_divcbinfo.ipi_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 #ifdef SYSCTL_NODE
 static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPDIVERT");
 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE,
     NULL, 0, div_pcblist, "S,xinpcb",
     "List of active divert sockets");
 #endif
 
-struct pr_usrreqs div_usrreqs = {
-	.pru_attach =		div_attach,
-	.pru_bind =		div_bind,
-	.pru_control =		in_control,
-	.pru_detach =		div_detach,
-	.pru_peeraddr =		in_getpeeraddr,
-	.pru_send =		div_send,
-	.pru_shutdown =		div_shutdown,
-	.pru_sockaddr =		in_getsockaddr,
-	.pru_sosetlabel =	in_pcbsosetlabel
-};
-
-struct protosw div_protosw = {
+static struct protosw div_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_protocol =		IPPROTO_DIVERT,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&div_usrreqs
+	.pr_attach =		div_attach,
+	.pr_bind =		div_bind,
+	.pr_control =		in_control,
+	.pr_detach =		div_detach,
+	.pr_peeraddr =		in_getpeeraddr,
+	.pr_send =		div_send,
+	.pr_shutdown =		div_shutdown,
+	.pr_sockaddr =		in_getsockaddr,
+	.pr_sosetlabel =	in_pcbsosetlabel
 };
 
 static int
 div_modevent(module_t mod, int type, void *unused)
 {
 	int err = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		/*
 		 * Protocol will be initialized by pf_proto_register().
 		 */
-		err = pf_proto_register(PF_INET, &div_protosw);
+		err = protosw_register(&inetdomain, &div_protosw);
 		if (err != 0)
 			return (err);
 		ip_divert_ptr = divert_packet;
 		break;
 	case MOD_QUIESCE:
 		/*
 		 * IPDIVERT may normally not be unloaded because of the
 		 * potential race conditions.  Tell kldunload we can't be
 		 * unloaded unless the unload is forced.
 		 */
 		err = EPERM;
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * Forced unload.
 		 *
 		 * Module ipdivert can only be unloaded if no sockets are
 		 * connected.  Maybe this can be changed later to forcefully
 		 * disconnect any open sockets.
 		 *
 		 * XXXRW: Note that there is a slight race here, as a new
 		 * socket open request could be spinning on the lock and then
 		 * we destroy the lock.
 		 */
 		INP_INFO_WLOCK(&V_divcbinfo);
 		if (V_divcbinfo.ipi_count != 0) {
 			err = EBUSY;
 			INP_INFO_WUNLOCK(&V_divcbinfo);
 			break;
 		}
 		ip_divert_ptr = NULL;
-		err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW);
+		err = protosw_unregister(&div_protosw);
 		INP_INFO_WUNLOCK(&V_divcbinfo);
 #ifndef VIMAGE
 		div_destroy(NULL);
 #endif
 		break;
 	default:
 		err = EOPNOTSUPP;
 		break;
 	}
 	return err;
 }
 
 static moduledata_t ipdivertmod = {
         "ipdivert",
         div_modevent,
         0
 };
 
 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY);
 MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3);
 MODULE_VERSION(ipdivert, 1);
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index b68a6536c8b8..a502f50ac4df 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -1,1601 +1,1600 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_output.c	8.3 (Berkeley) 1/21/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_mbuf_stress_test.h"
 #include "opt_ratelimit.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 #include "opt_sctp.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/ucred.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_vlan_var.h>
 #include <net/if_llatbl.h>
 #include <net/ethernet.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/rss_config.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_systm.h>
 #include <netinet/ip.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_rss.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 #include <netinet/sctp.h>
 #include <netinet/sctp_crc32.h>
 #endif
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 #ifdef MBUF_STRESS_TEST
 static int mbuf_frag_size = 0;
 SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
 	&mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
 #endif
 
 static void	ip_mloopback(struct ifnet *, const struct mbuf *, int);
 
 extern int in_mcast_loop;
-extern	struct protosw inetsw[];
 
 static inline int
 ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
     struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
 {
 	struct m_tag *fwd_tag = NULL;
 	struct mbuf *m;
 	struct in_addr odst;
 	struct ip *ip;
 	int pflags = PFIL_OUT;
 
 	if (flags & IP_FORWARDING)
 		pflags |= PFIL_FWD;
 
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* Run through list of hooks for output packets. */
 	odst.s_addr = ip->ip_dst.s_addr;
 	switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
 	case PFIL_DROPPED:
 		*error = EACCES;
 		/* FALLTHROUGH */
 	case PFIL_CONSUMED:
 		return 1; /* Finished */
 	case PFIL_PASS:
 		*error = 0;
 	}
 	m = *mp;
 	ip = mtod(m, struct ip *);
 
 	/* See if destination IP address was changed by packet filter. */
 	if (odst.s_addr != ip->ip_dst.s_addr) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		/* If destination is now ourself drop to ip_input(). */
 		if (in_localip(ip->ip_dst)) {
 			m->m_flags |= M_FASTFWD_OURS;
 			if (m->m_pkthdr.rcvif == NULL)
 				m->m_pkthdr.rcvif = V_loif;
 			if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 				m->m_pkthdr.csum_flags |=
 					CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 				m->m_pkthdr.csum_data = 0xffff;
 			}
 			m->m_pkthdr.csum_flags |=
 				CSUM_IP_CHECKED | CSUM_IP_VALID;
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 			if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 				m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 			*error = netisr_queue(NETISR_IP, m);
 			return 1; /* Finished */
 		}
 
 		bzero(dst, sizeof(*dst));
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 
 		return -1; /* Reloop */
 	}
 	/* See if fib was changed by packet filter. */
 	if ((*fibnum) != M_GETFIB(m)) {
 		m->m_flags |= M_SKIP_FIREWALL;
 		*fibnum = M_GETFIB(m);
 		return -1; /* Reloop for FIB change */
 	}
 
 	/* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
 	if (m->m_flags & M_FASTFWD_OURS) {
 		if (m->m_pkthdr.rcvif == NULL)
 			m->m_pkthdr.rcvif = V_loif;
 		if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			m->m_pkthdr.csum_flags |=
 				CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			m->m_pkthdr.csum_data = 0xffff;
 		}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 		if (m->m_pkthdr.csum_flags & CSUM_SCTP)
 			m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
 #endif
 		m->m_pkthdr.csum_flags |=
 			CSUM_IP_CHECKED | CSUM_IP_VALID;
 
 		*error = netisr_queue(NETISR_IP, m);
 		return 1; /* Finished */
 	}
 	/* Or forward to some other address? */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
 		bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
 		m->m_flags |= M_SKIP_FIREWALL;
 		m->m_flags &= ~M_IP_NEXTHOP;
 		m_tag_delete(m, fwd_tag);
 
 		return -1; /* Reloop for CHANGE of dst */
 	}
 
 	return 0;
 }
 
 static int
 ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
     const struct sockaddr *gw, struct route *ro, bool stamp_tag)
 {
 #ifdef KERN_TLS
 	struct ktls_session *tls = NULL;
 #endif
 	struct m_snd_tag *mst;
 	int error;
 
 	MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
 	mst = NULL;
 
 #ifdef KERN_TLS
 	/*
 	 * If this is an unencrypted TLS record, save a reference to
 	 * the record.  This local reference is used to call
 	 * ktls_output_eagain after the mbuf has been freed (thus
 	 * dropping the mbuf's reference) in if_output.
 	 */
 	if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
 		tls = ktls_hold(m->m_next->m_epg_tls);
 		mst = tls->snd_tag;
 
 		/*
 		 * If a TLS session doesn't have a valid tag, it must
 		 * have had an earlier ifp mismatch, so drop this
 		 * packet.
 		 */
 		if (mst == NULL) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 		/*
 		 * Always stamp tags that include NIC ktls.
 		 */
 		stamp_tag = true;
 	}
 #endif
 #ifdef RATELIMIT
 	if (inp != NULL && mst == NULL) {
 		if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
 		    (inp->inp_snd_tag != NULL &&
 		    inp->inp_snd_tag->ifp != ifp))
 			in_pcboutput_txrtlmt(inp, ifp, m);
 
 		if (inp->inp_snd_tag != NULL)
 			mst = inp->inp_snd_tag;
 	}
 #endif
 	if (stamp_tag && mst != NULL) {
 		KASSERT(m->m_pkthdr.rcvif == NULL,
 		    ("trying to add a send tag to a forwarded packet"));
 		if (mst->ifp != ifp) {
 			m_freem(m);
 			error = EAGAIN;
 			goto done;
 		}
 
 		/* stamp send tag on mbuf */
 		m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
 		m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
 	}
 
 	error = (*ifp->if_output)(ifp, m, gw, ro);
 
 done:
 	/* Check for route change invalidating send tags. */
 #ifdef KERN_TLS
 	if (tls != NULL) {
 		if (error == EAGAIN)
 			error = ktls_output_eagain(inp, tls);
 		ktls_free(tls);
 	}
 #endif
 #ifdef RATELIMIT
 	if (error == EAGAIN)
 		in_pcboutput_eagain(inp);
 #endif
 	return (error);
 }
 
 /* rte<>ro_flags translation */
 static inline void
 rt_update_ro_flags(struct route *ro, const struct nhop_object *nh)
 {
 	int nh_flags = nh->nh_flags;
 
 	ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
 
 	ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0;
 	ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0;
 	ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0;
 }
 
 /*
  * IP output.  The packet in mbuf chain m contains a skeletal IP
  * header (with len, off, ttl, proto, tos, src, dst).
  * The mbuf chain containing the packet will be freed.
  * The mbuf opt, if present, will not be freed.
  * If route ro is present and has ro_rt initialized, route lookup would be
  * skipped and ro->ro_rt would be used. If ro is present but ro->ro_rt is NULL,
  * then result of route lookup is stored in ro->ro_rt.
  *
  * In the IP forwarding case, the packet will arrive with options already
  * inserted, so must have a NULL opt pointer.
  */
 int
 ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
     struct ip_moptions *imo, struct inpcb *inp)
 {
 	struct ip *ip;
 	struct ifnet *ifp = NULL;	/* keep compiler happy */
 	struct mbuf *m0;
 	int hlen = sizeof (struct ip);
 	int mtu = 0;
 	int error = 0;
 	int vlan_pcp = -1;
 	struct sockaddr_in *dst;
 	const struct sockaddr *gw;
 	struct in_ifaddr *ia = NULL;
 	struct in_addr src;
 	int isbroadcast;
 	uint16_t ip_len, ip_off;
 	struct route iproute;
 	uint32_t fibnum;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	int no_route_but_check_spd = 0;
 #endif
 
 	M_ASSERTPKTHDR(m);
 	NET_EPOCH_ASSERT();
 
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
 		if ((flags & IP_NODEFAULTFLOWID) == 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 		}
 		if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
 			vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
 			    INP_2PCP_SHIFT;
 #ifdef NUMA
 		m->m_pkthdr.numa_domain = inp->inp_numa_domain;
 #endif
 	}
 
 	if (opt) {
 		int len = 0;
 		m = ip_insertoptions(m, opt, &len);
 		if (len != 0)
 			hlen = len; /* ip->ip_hl is updated above */
 	}
 	ip = mtod(m, struct ip *);
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = hlen >> 2;
 		ip_fillid(ip);
 	} else {
 		/* Header already set, fetch hlen from there */
 		hlen = ip->ip_hl << 2;
 	}
 	if ((flags & IP_FORWARDING) == 0)
 		IPSTAT_INC(ips_localout);
 
 	/*
 	 * dst/gw handling:
 	 *
 	 * gw is readonly but can point either to dst OR rt_gateway,
 	 * therefore we need restore gw if we're redoing lookup.
 	 */
 	fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
 	if (ro == NULL) {
 		ro = &iproute;
 		bzero(ro, sizeof (*ro));
 	}
 	dst = (struct sockaddr_in *)&ro->ro_dst;
 	if (ro->ro_nh == NULL) {
 		dst->sin_family = AF_INET;
 		dst->sin_len = sizeof(*dst);
 		dst->sin_addr = ip->ip_dst;
 	}
 	gw = (const struct sockaddr *)dst;
 again:
 	/*
 	 * Validate route against routing table additions;
 	 * a better/more specific route might have been added.
 	 */
 	if (inp != NULL && ro->ro_nh != NULL)
 		NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
 	/*
 	 * If there is a cached route,
 	 * check that it is to the same destination
 	 * and is still up.  If not, free it and try again.
 	 * The address family should also be checked in case of sharing the
 	 * cache with IPv6.
 	 * Also check whether routing cache needs invalidation.
 	 */
 	if (ro->ro_nh != NULL &&
 	    ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
 	    dst->sin_addr.s_addr != ip->ip_dst.s_addr))
 		RO_INVALIDATE_CACHE(ro);
 	ia = NULL;
 	/*
 	 * If routing to interface only, short circuit routing lookup.
 	 * The use of an all-ones broadcast address implies this; an
 	 * interface is specified by the broadcast address of an interface,
 	 * or the destination address of a ptp interface.
 	 */
 	if (flags & IP_SENDONES) {
 		if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
 						      M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ip->ip_dst.s_addr = INADDR_BROADCAST;
 		dst->sin_addr = ip->ip_dst;
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = 1;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (flags & IP_ROUTETOIF) {
 		if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
 						    M_GETFIB(m)))) == NULL &&
 		    (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
 						M_GETFIB(m)))) == NULL) {
 			IPSTAT_INC(ips_noroute);
 			error = ENETUNREACH;
 			goto bad;
 		}
 		ifp = ia->ia_ifp;
 		mtu = ifp->if_mtu;
 		ip->ip_ttl = 1;
 		isbroadcast = ifp->if_flags & IFF_BROADCAST ?
 		    in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
 		src = IA_SIN(ia)->sin_addr;
 	} else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
 	    imo != NULL && imo->imo_multicast_ifp != NULL) {
 		/*
 		 * Bypass the normal routing lookup for multicast
 		 * packets if the interface is specified.
 		 */
 		ifp = imo->imo_multicast_ifp;
 		mtu = ifp->if_mtu;
 		IFP_TO_IA(ifp, ia);
 		isbroadcast = 0;	/* fool gcc */
 		/* Interface may have no addresses. */
 		if (ia != NULL)
 			src = IA_SIN(ia)->sin_addr;
 		else
 			src.s_addr = INADDR_ANY;
 	} else if (ro != &iproute) {
 		if (ro->ro_nh == NULL) {
 			/*
 			 * We want to do any cloning requested by the link
 			 * layer, as this is probably required in all cases
 			 * for correct operation (as it is for ARP).
 			 */
 			uint32_t flowid;
 			flowid = m->m_pkthdr.flowid;
 			ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
 			    NHR_REF, flowid);
 
 			if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 				/*
 				 * There is no route for this packet, but it is
 				 * possible that a matching SPD entry exists.
 				 */
 				no_route_but_check_spd = 1;
 				goto sendit;
 #endif
 				IPSTAT_INC(ips_noroute);
 				error = EHOSTUNREACH;
 				goto bad;
 			}
 		}
 		struct nhop_object *nh = ro->ro_nh;
 
 		ia = ifatoia(nh->nh_ifa);
 		ifp = nh->nh_ifp;
 		counter_u64_add(nh->nh_pksent, 1);
 		rt_update_ro_flags(ro, nh);
 		if (nh->nh_flags & NHF_GATEWAY)
 			gw = &nh->gw_sa;
 		if (nh->nh_flags & NHF_HOST)
 			isbroadcast = (nh->nh_flags & NHF_BROADCAST);
 		else if ((ifp->if_flags & IFF_BROADCAST) && (gw->sa_family == AF_INET))
 			isbroadcast = in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia);
 		else
 			isbroadcast = 0;
 		mtu = nh->nh_mtu;
 		src = IA_SIN(ia)->sin_addr;
 	} else {
 		struct nhop_object *nh;
 
 		nh = fib4_lookup(M_GETFIB(m), dst->sin_addr, 0, NHR_NONE,
 		    m->m_pkthdr.flowid);
 		if (nh == NULL) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 			/*
 			 * There is no route for this packet, but it is
 			 * possible that a matching SPD entry exists.
 			 */
 			no_route_but_check_spd = 1;
 			goto sendit;
 #endif
 			IPSTAT_INC(ips_noroute);
 			error = EHOSTUNREACH;
 			goto bad;
 		}
 		ifp = nh->nh_ifp;
 		mtu = nh->nh_mtu;
 		rt_update_ro_flags(ro, nh);
 		if (nh->nh_flags & NHF_GATEWAY)
 			gw = &nh->gw_sa;
 		ia = ifatoia(nh->nh_ifa);
 		src = IA_SIN(ia)->sin_addr;
 		isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
 		    (NHF_HOST | NHF_BROADCAST)) ||
 		    ((ifp->if_flags & IFF_BROADCAST) &&
 		    (gw->sa_family == AF_INET) &&
 		    in_ifaddr_broadcast(((const struct sockaddr_in *)gw)->sin_addr, ia)));
 	}
 
 	/* Catch a possible divide by zero later. */
 	KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
 	    __func__, mtu, ro,
 	    (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 		m->m_flags |= M_MCAST;
 		/*
 		 * IP destination address is multicast.  Make sure "gw"
 		 * still points to the address in "ro".  (It may have been
 		 * changed to point to a gateway address, above.)
 		 */
 		gw = (const struct sockaddr *)dst;
 		/*
 		 * See if the caller provided any multicast options
 		 */
 		if (imo != NULL) {
 			ip->ip_ttl = imo->imo_multicast_ttl;
 			if (imo->imo_multicast_vif != -1)
 				ip->ip_src.s_addr =
 				    ip_mcast_src ?
 				    ip_mcast_src(imo->imo_multicast_vif) :
 				    INADDR_ANY;
 		} else
 			ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
 		/*
 		 * Confirm that the outgoing interface supports multicast.
 		 */
 		if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
 			if ((ifp->if_flags & IFF_MULTICAST) == 0) {
 				IPSTAT_INC(ips_noroute);
 				error = ENETUNREACH;
 				goto bad;
 			}
 		}
 		/*
 		 * If source address not specified yet, use address
 		 * of outgoing interface.
 		 */
 		if (ip->ip_src.s_addr == INADDR_ANY)
 			ip->ip_src = src;
 
 		if ((imo == NULL && in_mcast_loop) ||
 		    (imo && imo->imo_multicast_loop)) {
 			/*
 			 * Loop back multicast datagram if not expressly
 			 * forbidden to do so, even if we are not a member
 			 * of the group; ip_input() will filter it later,
 			 * thus deferring a hash lookup and mutex acquisition
 			 * at the expense of a cheap copy using m_copym().
 			 */
 			ip_mloopback(ifp, m, hlen);
 		} else {
 			/*
 			 * If we are acting as a multicast router, perform
 			 * multicast forwarding as if the packet had just
 			 * arrived on the interface to which we are about
 			 * to send.  The multicast forwarding function
 			 * recursively calls this function, using the
 			 * IP_FORWARDING flag to prevent infinite recursion.
 			 *
 			 * Multicasts that are looped back by ip_mloopback(),
 			 * above, will be forwarded by the ip_input() routine,
 			 * if necessary.
 			 */
 			if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
 				/*
 				 * If rsvp daemon is not running, do not
 				 * set ip_moptions. This ensures that the packet
 				 * is multicast and not just sent down one link
 				 * as prescribed by rsvpd.
 				 */
 				if (!V_rsvp_on)
 					imo = NULL;
 				if (ip_mforward &&
 				    ip_mforward(ip, ifp, m, imo) != 0) {
 					m_freem(m);
 					goto done;
 				}
 			}
 		}
 
 		/*
 		 * Multicasts with a time-to-live of zero may be looped-
 		 * back, above, but must not be transmitted on a network.
 		 * Also, multicasts addressed to the loopback interface
 		 * are not sent -- the above call to ip_mloopback() will
 		 * loop back a copy. ip_input() will drop the copy if
 		 * this host does not belong to the destination group on
 		 * the loopback interface.
 		 */
 		if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
 			m_freem(m);
 			goto done;
 		}
 
 		goto sendit;
 	}
 
 	/*
 	 * If the source address is not specified yet, use the address
 	 * of the outoing interface.
 	 */
 	if (ip->ip_src.s_addr == INADDR_ANY)
 		ip->ip_src = src;
 
 	/*
 	 * Look for broadcast address and
 	 * verify user is allowed to send
 	 * such a packet.
 	 */
 	if (isbroadcast) {
 		if ((ifp->if_flags & IFF_BROADCAST) == 0) {
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 		if ((flags & IP_ALLOWBROADCAST) == 0) {
 			error = EACCES;
 			goto bad;
 		}
 		/* don't allow broadcast messages to be fragmented */
 		if (ip_len > mtu) {
 			error = EMSGSIZE;
 			goto bad;
 		}
 		m->m_flags |= M_BCAST;
 	} else {
 		m->m_flags &= ~M_BCAST;
 	}
 
 sendit:
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	if (IPSEC_ENABLED(ipv4)) {
 		if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
 			if (error == EINPROGRESS)
 				error = 0;
 			goto done;
 		}
 	}
 	/*
 	 * Check if there was a route for this packet; return error if not.
 	 */
 	if (no_route_but_check_spd) {
 		IPSTAT_INC(ips_noroute);
 		error = EHOSTUNREACH;
 		goto bad;
 	}
 	/* Update variables that are affected by ipsec4_output(). */
 	ip = mtod(m, struct ip *);
 	hlen = ip->ip_hl << 2;
 #endif /* IPSEC */
 
 	/* Jump over all PFIL processing if hooks are not active. */
 	if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
 		switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
 		    &error)) {
 		case 1: /* Finished */
 			goto done;
 
 		case 0: /* Continue normally */
 			ip = mtod(m, struct ip *);
 			break;
 
 		case -1: /* Need to try again */
 			/* Reset everything for a new round */
 			if (ro != NULL) {
 				RO_NHFREE(ro);
 				ro->ro_prepend = NULL;
 			}
 			gw = (const struct sockaddr *)dst;
 			ip = mtod(m, struct ip *);
 			goto again;
 		}
 	}
 
 	if (vlan_pcp > -1)
 		EVL_APPLY_PRI(m, vlan_pcp);
 
 	/* IN_LOOPBACK must not appear on the wire - RFC1122. */
 	if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
 	    IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
 		if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
 			IPSTAT_INC(ips_badaddr);
 			error = EADDRNOTAVAIL;
 			goto bad;
 		}
 	}
 
 	/* Ensure the packet data is mapped if the interface requires it. */
 	if ((ifp->if_capenable & IFCAP_MEXTPG) == 0) {
 		m = mb_unmapped_to_ext(m);
 		if (m == NULL) {
 			IPSTAT_INC(ips_odropped);
 			error = ENOBUFS;
 			goto bad;
 		}
 	}
 
 	m->m_pkthdr.csum_flags |= CSUM_IP;
 	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
 		in_delayed_cksum(m);
 		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
 		sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
 		m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 
 	/*
 	 * If small enough for interface, or the interface will take
 	 * care of the fragmentation for us, we can just send directly.
 	 * Note that if_vxlan could have requested TSO even though the outer
 	 * frame is UDP.  It is correct to not fragment such datagrams and
 	 * instead just pass them on to the driver.
 	 */
 	if (ip_len <= mtu ||
 	    (m->m_pkthdr.csum_flags & ifp->if_hwassist &
 	    (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
 		ip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
 			ip->ip_sum = in_cksum(m, hlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 
 		/*
 		 * Record statistics for this interface address.
 		 * With CSUM_TSO the byte/packet count will be slightly
 		 * incorrect because we count the IP+TCP headers only
 		 * once instead of for every generated packet.
 		 */
 		if (!(flags & IP_FORWARDING) && ia) {
 			if (m->m_pkthdr.csum_flags &
 			    (CSUM_TSO | CSUM_INNER_TSO))
 				counter_u64_add(ia->ia_ifa.ifa_opackets,
 				    m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
 			else
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 
 			counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
 		}
 #ifdef MBUF_STRESS_TEST
 		if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
 			m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
 #endif
 		/*
 		 * Reset layer specific mbuf flags
 		 * to avoid confusing lower layers.
 		 */
 		m_clrprotoflags(m);
 		IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
 		error = ip_output_send(inp, ifp, m, gw, ro,
 		    (flags & IP_NO_SND_TAG_RL) ? false : true);
 		goto done;
 	}
 
 	/* Balk when DF bit is set or the interface didn't support TSO. */
 	if ((ip_off & IP_DF) ||
 	    (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
 		error = EMSGSIZE;
 		IPSTAT_INC(ips_cantfrag);
 		goto bad;
 	}
 
 	/*
 	 * Too large for interface; fragment if possible. If successful,
 	 * on return, m will point to a list of packets to be sent.
 	 */
 	error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
 	if (error)
 		goto bad;
 	for (; m; m = m0) {
 		m0 = m->m_nextpkt;
 		m->m_nextpkt = 0;
 		if (error == 0) {
 			/* Record statistics for this interface address. */
 			if (ia != NULL) {
 				counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
 				counter_u64_add(ia->ia_ifa.ifa_obytes,
 				    m->m_pkthdr.len);
 			}
 			/*
 			 * Reset layer specific mbuf flags
 			 * to avoid confusing upper layers.
 			 */
 			m_clrprotoflags(m);
 
 			IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
 			    mtod(m, struct ip *), NULL);
 			error = ip_output_send(inp, ifp, m, gw, ro, true);
 		} else
 			m_freem(m);
 	}
 
 	if (error == 0)
 		IPSTAT_INC(ips_fragmented);
 
 done:
 	return (error);
  bad:
 	m_freem(m);
 	goto done;
 }
 
 /*
  * Create a chain of fragments which fit the given mtu. m_frag points to the
  * mbuf to be fragmented; on return it points to the chain with the fragments.
  * Return 0 if no error. If error, m_frag may contain a partially built
  * chain of fragments that should be freed by the caller.
  *
  * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  */
 int
 ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
     u_long if_hwassist_flags)
 {
 	int error = 0;
 	int hlen = ip->ip_hl << 2;
 	int len = (mtu - hlen) & ~7;	/* size of payload in each fragment */
 	int off;
 	struct mbuf *m0 = *m_frag;	/* the original packet		*/
 	int firstlen;
 	struct mbuf **mnext;
 	int nfrags;
 	uint16_t ip_len, ip_off;
 
 	ip_len = ntohs(ip->ip_len);
 	ip_off = ntohs(ip->ip_off);
 
 	if (ip_off & IP_DF) {	/* Fragmentation not allowed */
 		IPSTAT_INC(ips_cantfrag);
 		return EMSGSIZE;
 	}
 
 	/*
 	 * Must be able to put at least 8 bytes per fragment.
 	 */
 	if (len < 8)
 		return EMSGSIZE;
 
 	/*
 	 * If the interface will not calculate checksums on
 	 * fragmented packets, then do it here.
 	 */
 	if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 		in_delayed_cksum(m0);
 		m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 	}
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 	if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
 		sctp_delayed_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
 	}
 #endif
 	if (len > PAGE_SIZE) {
 		/*
 		 * Fragment large datagrams such that each segment
 		 * contains a multiple of PAGE_SIZE amount of data,
 		 * plus headers. This enables a receiver to perform
 		 * page-flipping zero-copy optimizations.
 		 *
 		 * XXX When does this help given that sender and receiver
 		 * could have different page sizes, and also mtu could
 		 * be less than the receiver's page size ?
 		 */
 		int newlen;
 
 		off = MIN(mtu, m0->m_pkthdr.len);
 
 		/*
 		 * firstlen (off - hlen) must be aligned on an
 		 * 8-byte boundary
 		 */
 		if (off < hlen)
 			goto smart_frag_failure;
 		off = ((off - hlen) & ~7) + hlen;
 		newlen = (~PAGE_MASK) & mtu;
 		if ((newlen + sizeof (struct ip)) > mtu) {
 			/* we failed, go back the default */
 smart_frag_failure:
 			newlen = len;
 			off = hlen + len;
 		}
 		len = newlen;
 
 	} else {
 		off = hlen + len;
 	}
 
 	firstlen = off - hlen;
 	mnext = &m0->m_nextpkt;		/* pointer to next packet */
 
 	/*
 	 * Loop through length of segment after first fragment,
 	 * make new header and copy data of each part and link onto chain.
 	 * Here, m0 is the original packet, m is the fragment being created.
 	 * The fragments are linked off the m_nextpkt of the original
 	 * packet, which after processing serves as the first fragment.
 	 */
 	for (nfrags = 1; off < ip_len; off += len, nfrags++) {
 		struct ip *mhip;	/* ip header on the fragment */
 		struct mbuf *m;
 		int mhlen = sizeof (struct ip);
 
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL) {
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * Make sure the complete packet header gets copied
 		 * from the originating mbuf to the newly created
 		 * mbuf. This also ensures that existing firewall
 		 * classification(s), VLAN tags and so on get copied
 		 * to the resulting fragmented packet(s):
 		 */
 		if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
 			m_free(m);
 			error = ENOBUFS;
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		/*
 		 * In the first mbuf, leave room for the link header, then
 		 * copy the original IP header including options. The payload
 		 * goes into an additional mbuf chain returned by m_copym().
 		 */
 		m->m_data += max_linkhdr;
 		mhip = mtod(m, struct ip *);
 		*mhip = *ip;
 		if (hlen > sizeof (struct ip)) {
 			mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
 			mhip->ip_v = IPVERSION;
 			mhip->ip_hl = mhlen >> 2;
 		}
 		m->m_len = mhlen;
 		/* XXX do we need to add ip_off below ? */
 		mhip->ip_off = ((off - hlen) >> 3) + ip_off;
 		if (off + len >= ip_len)
 			len = ip_len - off;
 		else
 			mhip->ip_off |= IP_MF;
 		mhip->ip_len = htons((u_short)(len + mhlen));
 		m->m_next = m_copym(m0, off, len, M_NOWAIT);
 		if (m->m_next == NULL) {	/* copy failed */
 			m_free(m);
 			error = ENOBUFS;	/* ??? */
 			IPSTAT_INC(ips_odropped);
 			goto done;
 		}
 		m->m_pkthdr.len = mhlen + len;
 #ifdef MAC
 		mac_netinet_fragment(m0, m);
 #endif
 		mhip->ip_off = htons(mhip->ip_off);
 		mhip->ip_sum = 0;
 		if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 			mhip->ip_sum = in_cksum(m, mhlen);
 			m->m_pkthdr.csum_flags &= ~CSUM_IP;
 		}
 		*mnext = m;
 		mnext = &m->m_nextpkt;
 	}
 	IPSTAT_ADD(ips_ofragments, nfrags);
 
 	/*
 	 * Update first fragment by trimming what's been copied out
 	 * and updating header.
 	 */
 	m_adj(m0, hlen + firstlen - ip_len);
 	m0->m_pkthdr.len = hlen + firstlen;
 	ip->ip_len = htons((u_short)m0->m_pkthdr.len);
 	ip->ip_off = htons(ip_off | IP_MF);
 	ip->ip_sum = 0;
 	if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
 		ip->ip_sum = in_cksum(m0, hlen);
 		m0->m_pkthdr.csum_flags &= ~CSUM_IP;
 	}
 
 done:
 	*m_frag = m0;
 	return error;
 }
 
 void
 in_delayed_cksum(struct mbuf *m)
 {
 	struct ip *ip;
 	struct udphdr *uh;
 	uint16_t cklen, csum, offset;
 
 	ip = mtod(m, struct ip *);
 	offset = ip->ip_hl << 2 ;
 
 	if (m->m_pkthdr.csum_flags & CSUM_UDP) {
 		/* if udp header is not in the first mbuf copy udplen */
 		if (offset + sizeof(struct udphdr) > m->m_len) {
 			m_copydata(m, offset + offsetof(struct udphdr,
 			    uh_ulen), sizeof(cklen), (caddr_t)&cklen);
 			cklen = ntohs(cklen);
 		} else {
 			uh = (struct udphdr *)mtodo(m, offset);
 			cklen = ntohs(uh->uh_ulen);
 		}
 		csum = in_cksum_skip(m, cklen + offset, offset);
 		if (csum == 0)
 			csum = 0xffff;
 	} else {
 		cklen = ntohs(ip->ip_len);
 		csum = in_cksum_skip(m, cklen, offset);
 	}
 	offset += m->m_pkthdr.csum_data;	/* checksum offset */
 
 	if (offset + sizeof(csum) > m->m_len)
 		m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
 	else
 		*(u_short *)mtodo(m, offset) = csum;
 }
 
 /*
  * IP socket option processing.
  */
 int
 ip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 #ifdef	RSS
 	uint32_t rss_bucket;
 	int retval;
 #endif
 
 	error = optval = 0;
 	if (sopt->sopt_level != IPPROTO_IP) {
 		error = EINVAL;
 
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_dir == SOPT_SET) {
 			switch (sopt->sopt_name) {
 			case SO_REUSEADDR:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEADDR) != 0)
 					inp->inp_flags2 |= INP_REUSEADDR;
 				else
 					inp->inp_flags2 &= ~INP_REUSEADDR;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_REUSEPORT_LB:
 				INP_WLOCK(inp);
 				if ((so->so_options & SO_REUSEPORT_LB) != 0)
 					inp->inp_flags2 |= INP_REUSEPORT_LB;
 				else
 					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
 			case SO_MAX_PACING_RATE:
 #ifdef RATELIMIT
 				INP_WLOCK(inp);
 				inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
 				INP_WUNLOCK(inp);
 				error = 0;
 #else
 				error = EOPNOTSUPP;
 #endif
 				break;
 			default:
 				break;
 			}
 		}
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 #ifdef notyet
 		case IP_RETOPTS:
 #endif
 		{
 			struct mbuf *m;
 			if (sopt->sopt_valsize > MLEN) {
 				error = EMSGSIZE;
 				break;
 			}
 			m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
 			if (m == NULL) {
 				error = ENOBUFS;
 				break;
 			}
 			m->m_len = sopt->sopt_valsize;
 			error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
 					    m->m_len);
 			if (error) {
 				m_free(m);
 				break;
 			}
 			INP_WLOCK(inp);
 			error = ip_pcbopts(inp, sopt->sopt_name, m);
 			INP_WUNLOCK(inp);
 			return (error);
 		}
 
 		case IP_BINDANY:
 			if (sopt->sopt_td != NULL) {
 				error = priv_check(sopt->sopt_td,
 				    PRIV_NETINET_BINDANY);
 				if (error)
 					break;
 			}
 			/* FALLTHROUGH */
 		case IP_BINDMULTI:
 #ifdef	RSS
 		case IP_RSS_LISTEN_BUCKET:
 #endif
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_RECVTOS:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RECVRSSBUCKETID:
 #endif
 		case IP_VLAN_PCP:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos = optval;
 				break;
 
 			case IP_TTL:
 				inp->inp_ip_ttl = optval;
 				break;
 
 			case IP_MINTTL:
 				if (optval >= 0 && optval <= MAXTTL)
 					inp->inp_ip_minttl = optval;
 				else
 					error = EINVAL;
 				break;
 
 #define	OPTSET(bit) do {						\
 	INP_WLOCK(inp);							\
 	if (optval)							\
 		inp->inp_flags |= bit;					\
 	else								\
 		inp->inp_flags &= ~bit;					\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 #define	OPTSET2(bit, val) do {						\
 	INP_WLOCK(inp);							\
 	if (val)							\
 		inp->inp_flags2 |= bit;					\
 	else								\
 		inp->inp_flags2 &= ~bit;				\
 	INP_WUNLOCK(inp);						\
 } while (0)
 
 			case IP_RECVOPTS:
 				OPTSET(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				OPTSET(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				OPTSET(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				OPTSET2(INP_ORIGDSTADDR, optval);
 				break;
 
 			case IP_RECVTTL:
 				OPTSET(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				OPTSET(INP_RECVIF);
 				break;
 
 			case IP_ONESBCAST:
 				OPTSET(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				OPTSET(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				OPTSET(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				OPTSET(INP_RECVTOS);
 				break;
 			case IP_BINDMULTI:
 				OPTSET2(INP_BINDMULTI, optval);
 				break;
 			case IP_RECVFLOWID:
 				OPTSET2(INP_RECVFLOWID, optval);
 				break;
 #ifdef	RSS
 			case IP_RSS_LISTEN_BUCKET:
 				if ((optval >= 0) &&
 				    (optval < rss_getnumbuckets())) {
 					inp->inp_rss_listen_bucket = optval;
 					OPTSET2(INP_RSS_BUCKET_SET, 1);
 				} else {
 					error = EINVAL;
 				}
 				break;
 			case IP_RECVRSSBUCKETID:
 				OPTSET2(INP_RECVRSSBUCKETID, optval);
 				break;
 #endif
 			case IP_VLAN_PCP:
 				if ((optval >= -1) && (optval <=
 				    (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
 					if (optval == -1) {
 						INP_WLOCK(inp);
 						inp->inp_flags2 &=
 						    ~(INP_2PCP_SET |
 						      INP_2PCP_MASK);
 						INP_WUNLOCK(inp);
 					} else {
 						INP_WLOCK(inp);
 						inp->inp_flags2 |=
 						    INP_2PCP_SET;
 						inp->inp_flags2 &=
 						    ~INP_2PCP_MASK;
 						inp->inp_flags2 |=
 						    optval << INP_2PCP_SHIFT;
 						INP_WUNLOCK(inp);
 					}
 				} else
 					error = EINVAL;
 				break;
 			}
 			break;
 #undef OPTSET
 #undef OPTSET2
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_ADD_MEMBERSHIP:
 		case IP_DROP_MEMBERSHIP:
 		case IP_ADD_SOURCE_MEMBERSHIP:
 		case IP_DROP_SOURCE_MEMBERSHIP:
 		case IP_BLOCK_SOURCE:
 		case IP_UNBLOCK_SOURCE:
 		case IP_MSFILTER:
 		case MCAST_JOIN_GROUP:
 		case MCAST_LEAVE_GROUP:
 		case MCAST_JOIN_SOURCE_GROUP:
 		case MCAST_LEAVE_SOURCE_GROUP:
 		case MCAST_BLOCK_SOURCE:
 		case MCAST_UNBLOCK_SOURCE:
 			error = inp_setmoptions(inp, sopt);
 			break;
 
 		case IP_PORTRANGE:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 
 			INP_WLOCK(inp);
 			switch (optval) {
 			case IP_PORTRANGE_DEFAULT:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				break;
 
 			case IP_PORTRANGE_HIGH:
 				inp->inp_flags &= ~(INP_LOWPORT);
 				inp->inp_flags |= INP_HIGHPORT;
 				break;
 
 			case IP_PORTRANGE_LOW:
 				inp->inp_flags &= ~(INP_HIGHPORT);
 				inp->inp_flags |= INP_LOWPORT;
 				break;
 
 			default:
 				error = EINVAL;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_OPTIONS:
 		case IP_RETOPTS:
 			INP_RLOCK(inp);
 			if (inp->inp_options) {
 				struct mbuf *options;
 
 				options = m_copym(inp->inp_options, 0,
 				    M_COPYALL, M_NOWAIT);
 				INP_RUNLOCK(inp);
 				if (options != NULL) {
 					error = sooptcopyout(sopt,
 							     mtod(options, char *),
 							     options->m_len);
 					m_freem(options);
 				} else
 					error = ENOMEM;
 			} else {
 				INP_RUNLOCK(inp);
 				sopt->sopt_valsize = 0;
 			}
 			break;
 
 		case IP_TOS:
 		case IP_TTL:
 		case IP_MINTTL:
 		case IP_RECVOPTS:
 		case IP_RECVRETOPTS:
 		case IP_ORIGDSTADDR:
 		case IP_RECVDSTADDR:
 		case IP_RECVTTL:
 		case IP_RECVIF:
 		case IP_PORTRANGE:
 		case IP_ONESBCAST:
 		case IP_DONTFRAG:
 		case IP_BINDANY:
 		case IP_RECVTOS:
 		case IP_BINDMULTI:
 		case IP_FLOWID:
 		case IP_FLOWTYPE:
 		case IP_RECVFLOWID:
 #ifdef	RSS
 		case IP_RSSBUCKETID:
 		case IP_RECVRSSBUCKETID:
 #endif
 		case IP_VLAN_PCP:
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				optval = inp->inp_ip_tos;
 				break;
 
 			case IP_TTL:
 				optval = inp->inp_ip_ttl;
 				break;
 
 			case IP_MINTTL:
 				optval = inp->inp_ip_minttl;
 				break;
 
 #define	OPTBIT(bit)	(inp->inp_flags & bit ? 1 : 0)
 #define	OPTBIT2(bit)	(inp->inp_flags2 & bit ? 1 : 0)
 
 			case IP_RECVOPTS:
 				optval = OPTBIT(INP_RECVOPTS);
 				break;
 
 			case IP_RECVRETOPTS:
 				optval = OPTBIT(INP_RECVRETOPTS);
 				break;
 
 			case IP_RECVDSTADDR:
 				optval = OPTBIT(INP_RECVDSTADDR);
 				break;
 
 			case IP_ORIGDSTADDR:
 				optval = OPTBIT2(INP_ORIGDSTADDR);
 				break;
 
 			case IP_RECVTTL:
 				optval = OPTBIT(INP_RECVTTL);
 				break;
 
 			case IP_RECVIF:
 				optval = OPTBIT(INP_RECVIF);
 				break;
 
 			case IP_PORTRANGE:
 				if (inp->inp_flags & INP_HIGHPORT)
 					optval = IP_PORTRANGE_HIGH;
 				else if (inp->inp_flags & INP_LOWPORT)
 					optval = IP_PORTRANGE_LOW;
 				else
 					optval = 0;
 				break;
 
 			case IP_ONESBCAST:
 				optval = OPTBIT(INP_ONESBCAST);
 				break;
 			case IP_DONTFRAG:
 				optval = OPTBIT(INP_DONTFRAG);
 				break;
 			case IP_BINDANY:
 				optval = OPTBIT(INP_BINDANY);
 				break;
 			case IP_RECVTOS:
 				optval = OPTBIT(INP_RECVTOS);
 				break;
 			case IP_FLOWID:
 				optval = inp->inp_flowid;
 				break;
 			case IP_FLOWTYPE:
 				optval = inp->inp_flowtype;
 				break;
 			case IP_RECVFLOWID:
 				optval = OPTBIT2(INP_RECVFLOWID);
 				break;
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				retval = rss_hash2bucket(inp->inp_flowid,
 				    inp->inp_flowtype,
 				    &rss_bucket);
 				if (retval == 0)
 					optval = rss_bucket;
 				else
 					error = EINVAL;
 				break;
 			case IP_RECVRSSBUCKETID:
 				optval = OPTBIT2(INP_RECVRSSBUCKETID);
 				break;
 #endif
 			case IP_BINDMULTI:
 				optval = OPTBIT2(INP_BINDMULTI);
 				break;
 			case IP_VLAN_PCP:
 				if (OPTBIT2(INP_2PCP_SET)) {
 					optval = (inp->inp_flags2 &
 					    INP_2PCP_MASK) >> INP_2PCP_SHIFT;
 				} else {
 					optval = -1;
 				}
 				break;
 			}
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		/*
 		 * Multicast socket options are processed by the in_mcast
 		 * module.
 		 */
 		case IP_MULTICAST_IF:
 		case IP_MULTICAST_VIF:
 		case IP_MULTICAST_TTL:
 		case IP_MULTICAST_LOOP:
 		case IP_MSFILTER:
 			error = inp_getmoptions(inp, sopt);
 			break;
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		case IP_IPSEC_POLICY:
 			if (IPSEC_ENABLED(ipv4)) {
 				error = IPSEC_PCBCTL(ipv4, inp, sopt);
 				break;
 			}
 			/* FALLTHROUGH */
 #endif /* IPSEC */
 
 		default:
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 /*
  * Routine called from ip_output() to loop back a copy of an IP multicast
  * packet to the input queue of a specified interface.  Note that this
  * calls the output routine of the loopback "driver", but with an interface
  * pointer that might NOT be a loopback interface -- evil, but easier than
  * replicating that code here.
  */
 static void
 ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
 {
 	struct ip *ip;
 	struct mbuf *copym;
 
 	/*
 	 * Make a deep copy of the packet because we're going to
 	 * modify the pack in order to generate checksums.
 	 */
 	copym = m_dup(m, M_NOWAIT);
 	if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
 		copym = m_pullup(copym, hlen);
 	if (copym != NULL) {
 		/* If needed, compute the checksum and mark it as valid. */
 		if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
 			in_delayed_cksum(copym);
 			copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
 			copym->m_pkthdr.csum_flags |=
 			    CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
 			copym->m_pkthdr.csum_data = 0xffff;
 		}
 		/*
 		 * We don't bother to fragment if the IP length is greater
 		 * than the interface's MTU.  Can this possibly matter?
 		 */
 		ip = mtod(copym, struct ip *);
 		ip->ip_sum = 0;
 		ip->ip_sum = in_cksum(copym, hlen);
 		if_simloop(ifp, copym, AF_INET, 0);
 	}
 }
diff --git a/sys/netinet/ip_var.h b/sys/netinet/ip_var.h
index 7701b64c1be0..237a99a3481c 100644
--- a/sys/netinet/ip_var.h
+++ b/sys/netinet/ip_var.h
@@ -1,301 +1,300 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ip_var.h	8.2 (Berkeley) 1/9/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_IP_VAR_H_
 #define	_NETINET_IP_VAR_H_
 
 #include <sys/epoch.h>
 #include <sys/queue.h>
 #include <sys/types.h>
 
 #include <netinet/in.h>
 
 /*
  * Overlay for ip header used by other protocols (tcp, udp).
  */
 struct ipovly {
 	u_char	ih_x1[9];		/* (unused) */
 	u_char	ih_pr;			/* protocol */
 	u_short	ih_len;			/* protocol length */
 	struct	in_addr ih_src;		/* source internet address */
 	struct	in_addr ih_dst;		/* destination internet address */
 };
 
 #ifdef _KERNEL
 /*
  * Ip reassembly queue structure.  Each fragment
  * being reassembled is attached to one of these structures.
  * They are timed out after ipq_ttl drops to 0, and may also
  * be reclaimed if memory becomes tight.
  */
 struct ipq {
 	TAILQ_ENTRY(ipq) ipq_list;	/* to other reass headers */
 	u_char	ipq_ttl;		/* time for reass q to live */
 	u_char	ipq_p;			/* protocol of this fragment */
 	u_short	ipq_id;			/* sequence id for reassembly */
 	int	ipq_maxoff;		/* total length of packet */
 	struct mbuf *ipq_frags;		/* to ip headers of fragments */
 	struct	in_addr ipq_src,ipq_dst;
 	u_char	ipq_nfrags;		/* # frags in this packet */
 	struct label *ipq_label;	/* MAC label */
 };
 #endif /* _KERNEL */
 
 /*
  * Structure stored in mbuf in inpcb.ip_options
  * and passed to ip_output when ip options are in use.
  * The actual length of the options (including ipopt_dst)
  * is in m_len.
  */
 #define MAX_IPOPTLEN	40
 
 struct ipoption {
 	struct	in_addr ipopt_dst;	/* first-hop dst if source routed */
 	char	ipopt_list[MAX_IPOPTLEN];	/* options proper */
 };
 
 #if defined(_NETINET_IN_VAR_H_) && defined(_KERNEL)
 /*
  * Structure attached to inpcb.ip_moptions and
  * passed to ip_output when IP multicast options are in use.
  * This structure is lazy-allocated.
  */
 struct ip_moptions {
 	struct	ifnet *imo_multicast_ifp; /* ifp for outgoing multicasts */
 	struct in_addr imo_multicast_addr; /* ifindex/addr on MULTICAST_IF */
 	u_long	imo_multicast_vif;	/* vif num outgoing multicasts */
 	u_char	imo_multicast_ttl;	/* TTL for outgoing multicasts */
 	u_char	imo_multicast_loop;	/* 1 => hear sends if a member */
 	struct ip_mfilter_head imo_head; /* group membership list */
 };
 #else
 struct ip_moptions;
 #endif
 
 struct	ipstat {
 	uint64_t ips_total;		/* total packets received */
 	uint64_t ips_badsum;		/* checksum bad */
 	uint64_t ips_tooshort;		/* packet too short */
 	uint64_t ips_toosmall;		/* not enough data */
 	uint64_t ips_badhlen;		/* ip header length < data size */
 	uint64_t ips_badlen;		/* ip length < ip header length */
 	uint64_t ips_fragments;		/* fragments received */
 	uint64_t ips_fragdropped;	/* frags dropped (dups, out of space) */
 	uint64_t ips_fragtimeout;	/* fragments timed out */
 	uint64_t ips_forward;		/* packets forwarded */
 	uint64_t ips_fastforward;	/* packets fast forwarded */
 	uint64_t ips_cantforward;	/* packets rcvd for unreachable dest */
 	uint64_t ips_redirectsent;	/* packets forwarded on same net */
 	uint64_t ips_noproto;		/* unknown or unsupported protocol */
 	uint64_t ips_delivered;		/* datagrams delivered to upper level*/
 	uint64_t ips_localout;		/* total ip packets generated here */
 	uint64_t ips_odropped;		/* lost packets due to nobufs, etc. */
 	uint64_t ips_reassembled;	/* total packets reassembled ok */
 	uint64_t ips_fragmented;	/* datagrams successfully fragmented */
 	uint64_t ips_ofragments;	/* output fragments created */
 	uint64_t ips_cantfrag;		/* don't fragment flag was set, etc. */
 	uint64_t ips_badoptions;		/* error in option processing */
 	uint64_t ips_noroute;		/* packets discarded due to no route */
 	uint64_t ips_badvers;		/* ip version != 4 */
 	uint64_t ips_rawout;		/* total raw ip packets generated */
 	uint64_t ips_toolong;		/* ip length > max ip packet size */
 	uint64_t ips_notmember;		/* multicasts for unregistered grps */
 	uint64_t ips_nogif;		/* no match gif found */
 	uint64_t ips_badaddr;		/* invalid address on header */
 };
 
 #ifdef _KERNEL
 
 #include <sys/counter.h>
 #include <net/vnet.h>
 
 VNET_PCPUSTAT_DECLARE(struct ipstat, ipstat);
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	IPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct ipstat, ipstat, name, (val))
 #define	IPSTAT_SUB(name, val)	IPSTAT_ADD(name, -(val))
 #define	IPSTAT_INC(name)	IPSTAT_ADD(name, 1)
 #define	IPSTAT_DEC(name)	IPSTAT_SUB(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_ipstat_inc(int statnum);
 #define	KMOD_IPSTAT_INC(name)	\
     kmod_ipstat_inc(offsetof(struct ipstat, name) / sizeof(uint64_t))
 void	kmod_ipstat_dec(int statnum);
 #define	KMOD_IPSTAT_DEC(name)	\
     kmod_ipstat_dec(offsetof(struct ipstat, name) / sizeof(uint64_t))
 
 /* flags passed to ip_output as last parameter */
 #define	IP_FORWARDING		0x1		/* most of ip header exists */
 #define	IP_RAWOUTPUT		0x2		/* raw ip header exists */
 #define	IP_SENDONES		0x4		/* send all-ones broadcast */
 #define	IP_SENDTOIF		0x8		/* send on specific ifnet */
 #define IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
 #define	IP_NODEFAULTFLOWID	0x40		/* Don't set the flowid from inp */
 #define IP_NO_SND_TAG_RL	0x80		/* Don't send down the ratelimit tag */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define IP_HDR_ALIGNED_P(ip)	1
 #else
 #define IP_HDR_ALIGNED_P(ip)	((((intptr_t) (ip)) & 3) == 0)
 #endif
 
 struct ip;
 struct inpcb;
 struct route;
 struct sockopt;
 struct inpcbinfo;
 
 VNET_DECLARE(int, ip_defttl);			/* default IP ttl */
 VNET_DECLARE(int, ipforwarding);		/* ip forwarding */
 VNET_DECLARE(int, ipsendredirects);
 #ifdef IPSTEALTH
 VNET_DECLARE(int, ipstealth);			/* stealth forwarding */
 #endif
 VNET_DECLARE(struct socket *, ip_rsvpd);	/* reservation protocol daemon*/
 VNET_DECLARE(struct socket *, ip_mrouter);	/* multicast routing daemon */
 extern int	(*legal_vif_num)(int);
 extern u_long	(*ip_mcast_src)(int);
 VNET_DECLARE(int, rsvp_on);
 VNET_DECLARE(int, drop_redirect);
-extern struct	pr_usrreqs rip_usrreqs;
 
 #define	V_ip_id			VNET(ip_id)
 #define	V_ip_defttl		VNET(ip_defttl)
 #define	V_ipforwarding		VNET(ipforwarding)
 #define	V_ipsendredirects	VNET(ipsendredirects)
 #ifdef IPSTEALTH
 #define	V_ipstealth		VNET(ipstealth)
 #endif
 #define	V_ip_rsvpd		VNET(ip_rsvpd)
 #define	V_ip_mrouter		VNET(ip_mrouter)
 #define	V_rsvp_on		VNET(rsvp_on)
 #define	V_drop_redirect		VNET(drop_redirect)
 
 void	inp_freemoptions(struct ip_moptions *);
 int	inp_getmoptions(struct inpcb *, struct sockopt *);
 int	inp_setmoptions(struct inpcb *, struct sockopt *);
 
 int	ip_ctloutput(struct socket *, struct sockopt *sopt);
 int	ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
 	    u_long if_hwassist_flags);
 void	ip_forward(struct mbuf *m, int srcrt);
 extern int
 	(*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 	    struct ip_moptions *);
 int	ip_output(struct mbuf *,
 	    struct mbuf *, struct route *, int, struct ip_moptions *,
 	    struct inpcb *);
 struct mbuf *
 	ip_reass(struct mbuf *);
 void	ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
 	    struct mbuf *);
 void	ip_fillid(struct ip *);
 int	rip_ctloutput(struct socket *, struct sockopt *);
 void	rip_ctlinput(int, struct sockaddr *, void *);
 int	rip_input(struct mbuf **, int *, int);
 int	ipip_input(struct mbuf **, int *, int);
 int	rsvp_input(struct mbuf **, int *, int);
 
 int	ip_rsvp_init(struct socket *);
 int	ip_rsvp_done(void);
 extern int	(*ip_rsvp_vif)(struct socket *, struct sockopt *);
 extern void	(*ip_rsvp_force_done)(struct socket *);
 extern int	(*rsvp_input_p)(struct mbuf **, int *, int);
 
 VNET_DECLARE(struct pfil_head *, inet_pfil_head);
 #define	V_inet_pfil_head	VNET(inet_pfil_head)
 #define	PFIL_INET_NAME		"inet"
 
 void	in_delayed_cksum(struct mbuf *m);
 
 /* Hooks for ipfw, dummynet, divert etc. Most are declared in raw_ip.c */
 /*
  * Reference to an ipfw or packet filter rule that can be carried
  * outside critical sections.
  * A rule is identified by rulenum:rule_id which is ordered.
  * In version chain_id the rule can be found in slot 'slot', so
  * we don't need a lookup if chain_id == chain->id.
  *
  * On exit from the firewall this structure refers to the rule after
  * the matching one (slot points to the new rule; rulenum:rule_id-1
  * is the matching rule), and additional info (e.g. info often contains
  * the insn argument or tablearg in the low 16 bits, in host format).
  * On entry, the structure is valid if slot>0, and refers to the starting
  * rules. 'info' contains the reason for reinject, e.g. divert port,
  * divert direction, and so on.
  */
 struct ipfw_rule_ref {
 	uint32_t	slot;		/* slot for matching rule	*/
 	uint32_t	rulenum;	/* matching rule number		*/
 	uint32_t	rule_id;	/* matching rule id		*/
 	uint32_t	chain_id;	/* ruleset id			*/
 	uint32_t	info;		/* see below			*/
 };
 
 enum {
 	IPFW_INFO_MASK	= 0x0000ffff,
 	IPFW_INFO_OUT	= 0x00000000,	/* outgoing, just for convenience */
 	IPFW_INFO_IN	= 0x80000000,	/* incoming, overloads dir */
 	IPFW_ONEPASS	= 0x40000000,	/* One-pass, do not reinject */
 	IPFW_IS_MASK	= 0x30000000,	/* which source ? */
 	IPFW_IS_DIVERT	= 0x20000000,
 	IPFW_IS_DUMMYNET =0x10000000,
 	IPFW_IS_PIPE	= 0x08000000,	/* pipe=1, queue = 0 */
 };
 #define MTAG_IPFW	1148380143	/* IPFW-tagged cookie */
 #define MTAG_IPFW_RULE	1262273568	/* rule reference */
 #define	MTAG_IPFW_CALL	1308397630	/* call stack */
 
 struct ip_fw_args;
 typedef int	(*ip_fw_chk_ptr_t)(struct ip_fw_args *args);
 typedef int	(*ip_fw_ctl_ptr_t)(struct sockopt *);
 VNET_DECLARE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr);
 #define	V_ip_fw_ctl_ptr		VNET(ip_fw_ctl_ptr)
 
 /* Divert hooks. */
 extern void	(*ip_divert_ptr)(struct mbuf *m, bool incoming);
 /* ng_ipfw hooks -- XXX make it the same as divert and dummynet */
 extern int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
 extern int	(*ip_dn_ctl_ptr)(struct sockopt *);
 extern int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
 #endif /* _KERNEL */
 
 #endif /* !_NETINET_IP_VAR_H_ */
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 4da408080794..0e7e75b1fe99 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -1,1088 +1,1137 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.7 (Berkeley) 5/15/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/eventhandler.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/rwlock.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/route/route_ctl.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_mroute.h>
 #include <netinet/ip_icmp.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/stdarg.h>
 #include <security/mac/mac_framework.h>
 
 extern ipproto_input_t *ip_protox[];
 
 VNET_DEFINE(int, ip_defttl) = IPDEFTTL;
 SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(ip_defttl), 0,
     "Maximum TTL on IP packets");
 
 VNET_DEFINE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo		VNET(ripcbinfo)
 
 /*
  * Control and data hooks for ipfw, dummynet, divert and so on.
  * The data hooks are not used here but it is convenient
  * to keep them all in one place.
  */
 VNET_DEFINE(ip_fw_chk_ptr_t, ip_fw_chk_ptr) = NULL;
 VNET_DEFINE(ip_fw_ctl_ptr_t, ip_fw_ctl_ptr) = NULL;
 
 int	(*ip_dn_ctl_ptr)(struct sockopt *);
 int	(*ip_dn_io_ptr)(struct mbuf **, struct ip_fw_args *);
 void	(*ip_divert_ptr)(struct mbuf *, bool);
 int	(*ng_ipfw_input_p)(struct mbuf **, struct ip_fw_args *, bool);
 
 #ifdef INET
 /*
  * Hooks for multicast routing. They all default to NULL, so leave them not
  * initialized and rely on BSS being set to 0.
  */
 
 /*
  * The socket used to communicate with the multicast routing daemon.
  */
 VNET_DEFINE(struct socket *, ip_mrouter);
 
 /*
  * The various mrouter and rsvp functions.
  */
 int (*ip_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip_mrouter_done)(void);
 int (*ip_mforward)(struct ip *, struct ifnet *, struct mbuf *,
 		   struct ip_moptions *);
 int (*mrt_ioctl)(u_long, caddr_t, int);
 int (*legal_vif_num)(int);
 u_long (*ip_mcast_src)(int);
 
 int (*rsvp_input_p)(struct mbuf **, int *, int);
 int (*ip_rsvp_vif)(struct socket *, struct sockopt *);
 void (*ip_rsvp_force_done)(struct socket *);
 #endif /* INET */
 
-extern	struct protosw inetsw[];
-
 u_long	rip_sendspace = 9216;
 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, maxdgram, CTLFLAG_RW,
     &rip_sendspace, 0, "Maximum outgoing raw IP datagram size");
 
 u_long	rip_recvspace = 9216;
 SYSCTL_ULONG(_net_inet_raw, OID_AUTO, recvspace, CTLFLAG_RW,
     &rip_recvspace, 0, "Maximum space for incoming raw IP datagrams");
 
 /*
  * Hash functions
  */
 
 #define INP_PCBHASH_RAW_SIZE	256
 #define INP_PCBHASH_RAW(proto, laddr, faddr, mask) \
         (((proto) + (laddr) + (faddr)) % (mask) + 1)
 
 #ifdef INET
 static void
 rip_inshash(struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbhead *pcbhash;
 	int hash;
 
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	if (inp->inp_ip_p != 0 &&
 	    inp->inp_laddr.s_addr != INADDR_ANY &&
 	    inp->inp_faddr.s_addr != INADDR_ANY) {
 		hash = INP_PCBHASH_RAW(inp->inp_ip_p, inp->inp_laddr.s_addr,
 		    inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask);
 	} else
 		hash = 0;
 	pcbhash = &pcbinfo->ipi_hashbase[hash];
 	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 }
 
 static void
 rip_delhash(struct inpcb *inp)
 {
 
 	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
 	CK_LIST_REMOVE(inp, inp_hash);
 }
 #endif /* INET */
 
 INPCBSTORAGE_DEFINE(ripcbstor, "rawinp", "ripcb", "rip", "riphash");
 
 static void
 rip_init(void *arg __unused)
 {
 
 	in_pcbinfo_init(&V_ripcbinfo, &ripcbstor, INP_PCBHASH_RAW_SIZE, 1);
 }
 VNET_SYSINIT(rip_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, rip_init, NULL);
 
 #ifdef VIMAGE
 static void
 rip_destroy(void *unused __unused)
 {
 
 	in_pcbinfo_destroy(&V_ripcbinfo);
 }
 VNET_SYSUNINIT(raw_ip, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, rip_destroy, NULL);
 #endif
 
 #ifdef INET
 static int
 rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m,
     struct sockaddr_in *ripsrc)
 {
 	struct socket *so = inp->inp_socket;
 	struct mbuf *n, *opts = NULL;
 
 	INP_LOCK_ASSERT(inp);
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* check AH/ESP integrity. */
 	if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0)
 		return (0);
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, m) != 0)
 		return (0);
 #endif
 	/* Check the minimum TTL for socket. */
 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
 		return (0);
 
 	if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
 		return (0);
 
 	if ((inp->inp_flags & INP_CONTROLOPTS) ||
 	    (so->so_options & (SO_TIMESTAMP | SO_BINTIME)))
 		ip_savecontrol(inp, &opts, ip, n);
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv,
 	    (struct sockaddr *)ripsrc, n, opts) == 0) {
 		soroverflow_locked(so);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		return (0);
 	}
 	sorwakeup_locked(so);
 
 	return (1);
 }
 
 struct rip_inp_match_ctx {
 	struct ip *ip;
 	int proto;
 };
 
 static bool
 rip_inp_match1(const struct inpcb *inp, void *v)
 {
 	struct rip_inp_match_ctx *ctx = v;
 
 	if (inp->inp_ip_p != ctx->proto)
 		return (false);
 #ifdef INET6
 	/* XXX inp locking */
 	if ((inp->inp_vflag & INP_IPV4) == 0)
 		return (false);
 #endif
 	if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr)
 		return (false);
 	if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr)
 		return (false);
 	return (true);
 }
 
 static bool
 rip_inp_match2(const struct inpcb *inp, void *v)
 {
 	struct rip_inp_match_ctx *ctx = v;
 
 	if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto)
 		return (false);
 #ifdef INET6
 	/* XXX inp locking */
 	if ((inp->inp_vflag & INP_IPV4) == 0)
 		return (false);
 #endif
 	if (!in_nullhost(inp->inp_laddr) &&
 	    !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst))
 		return (false);
 	if (!in_nullhost(inp->inp_faddr) &&
 	    !in_hosteq(inp->inp_faddr, ctx->ip->ip_src))
 		return (false);
 	return (true);
 }
 
 /*
  * Setup generic address and protocol structures for raw_input routine, then
  * pass them along with mbuf chain.
  */
 int
 rip_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct rip_inp_match_ctx ctx = {
 		.ip = mtod(*mp, struct ip *),
 		.proto = proto,
 	};
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
 	    INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx);
 	struct ifnet *ifp;
 	struct mbuf *m = *mp;
 	struct inpcb *inp;
 	struct sockaddr_in ripsrc;
 	int appended;
 
 	*mp = NULL;
 	appended = 0;
 
 	bzero(&ripsrc, sizeof(ripsrc));
 	ripsrc.sin_len = sizeof(ripsrc);
 	ripsrc.sin_family = AF_INET;
 	ripsrc.sin_addr = ctx.ip->ip_src;
 
 	ifp = m->m_pkthdr.rcvif;
 
 	inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr,
 	    ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask);
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (jailed_without_vnet(inp->inp_cred) &&
 		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) {
 			/*
 			 * XXX: If faddr was bound to multicast group,
 			 * jailed raw socket will drop datagram.
 			 */
 			continue;
 		}
 		appended += rip_append(inp, ctx.ip, m, &ripsrc);
 	}
 
 	inpi.hash = 0;
 	inpi.match = rip_inp_match2;
 	MPASS(inpi.inp == NULL);
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (jailed_without_vnet(inp->inp_cred) &&
 		    !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) &&
 		    prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0)
 			/*
 			 * Allow raw socket in jail to receive multicast;
 			 * assume process had PRIV_NETINET_RAW at attach,
 			 * and fall through into normal filter path if so.
 			 */
 			continue;
 		/*
 		 * If this raw socket has multicast state, and we
 		 * have received a multicast, check if this socket
 		 * should receive it, as multicast filtering is now
 		 * the responsibility of the transport layer.
 		 */
 		if (inp->inp_moptions != NULL &&
 		    IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) {
 			/*
 			 * If the incoming datagram is for IGMP, allow it
 			 * through unconditionally to the raw socket.
 			 *
 			 * In the case of IGMPv2, we may not have explicitly
 			 * joined the group, and may have set IFF_ALLMULTI
 			 * on the interface. imo_multi_filter() may discard
 			 * control traffic we actually need to see.
 			 *
 			 * Userland multicast routing daemons should continue
 			 * filter the control traffic appropriately.
 			 */
 			int blocked;
 
 			blocked = MCAST_PASS;
 			if (proto != IPPROTO_IGMP) {
 				struct sockaddr_in group;
 
 				bzero(&group, sizeof(struct sockaddr_in));
 				group.sin_len = sizeof(struct sockaddr_in);
 				group.sin_family = AF_INET;
 				group.sin_addr = ctx.ip->ip_dst;
 
 				blocked = imo_multi_filter(inp->inp_moptions,
 				    ifp,
 				    (struct sockaddr *)&group,
 				    (struct sockaddr *)&ripsrc);
 			}
 
 			if (blocked != MCAST_PASS) {
 				IPSTAT_INC(ips_notmember);
 				continue;
 			}
 		}
 		appended += rip_append(inp, ctx.ip, m, &ripsrc);
 	}
 	if (appended == 0 && ip_protox[ctx.ip->ip_p] == rip_input) {
 		IPSTAT_INC(ips_noproto);
 		IPSTAT_DEC(ips_delivered);
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0);
 	} else
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 /*
  * Generate IP header and pass packet to ip_output.  Tack on options user may
  * have setup with control call.
  */
 static int
 rip_send(struct socket *so, int pruflags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct ip *ip;
 	struct inpcb *inp;
 	in_addr_t *dst;
 	int error, flags, cnt, hlen;
 	u_char opttype, optlen, *cp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_send: inp == NULL"));
 
 	if (control != NULL) {
 		m_freem(control);
 		control = NULL;
 	}
 
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			error = EISCONN;
 			m_freem(m);
 			return (error);
 		}
 		dst = &inp->inp_faddr.s_addr;
 	} else {
 		if (nam == NULL)
 			error = ENOTCONN;
 		else if (nam->sa_family != AF_INET)
 			error = EAFNOSUPPORT;
 		else if (nam->sa_len != sizeof(struct sockaddr_in))
 			error = EINVAL;
 		else
 			error = 0;
 		if (error != 0) {
 			m_freem(m);
 			return (error);
 		}
 		dst = &((struct sockaddr_in *)nam)->sin_addr.s_addr;
 	}
 
 	flags = ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) |
 	    IP_ALLOWBROADCAST;
 
 	/*
 	 * If the user handed us a complete IP packet, use it.  Otherwise,
 	 * allocate an mbuf for a header and fill it in.
 	 */
 	if ((inp->inp_flags & INP_HDRINCL) == 0) {
 		if (m->m_pkthdr.len + sizeof(struct ip) > IP_MAXPACKET) {
 			m_freem(m);
 			return(EMSGSIZE);
 		}
 		M_PREPEND(m, sizeof(struct ip), M_NOWAIT);
 		if (m == NULL)
 			return(ENOBUFS);
 
 		INP_RLOCK(inp);
 		ip = mtod(m, struct ip *);
 		ip->ip_tos = inp->inp_ip_tos;
 		if (inp->inp_flags & INP_DONTFRAG)
 			ip->ip_off = htons(IP_DF);
 		else
 			ip->ip_off = htons(0);
 		ip->ip_p = inp->inp_ip_p;
 		ip->ip_len = htons(m->m_pkthdr.len);
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst.s_addr = *dst;
 #ifdef ROUTE_MPATH
 		if (CALC_FLOWID_OUTBOUND) {
 			uint32_t hash_type, hash_val;
 
 			hash_val = fib4_calc_software_hash(ip->ip_src,
 			    ip->ip_dst, 0, 0, ip->ip_p, &hash_type);
 			m->m_pkthdr.flowid = hash_val;
 			M_HASHTYPE_SET(m, hash_type);
 			flags |= IP_NODEFAULTFLOWID;
 		}
 #endif
 		if (jailed(inp->inp_cred)) {
 			/*
 			 * prison_local_ip4() would be good enough but would
 			 * let a source of INADDR_ANY pass, which we do not
 			 * want to see from jails.
 			 */
 			if (ip->ip_src.s_addr == INADDR_ANY) {
 				NET_EPOCH_ENTER(et);
 				error = in_pcbladdr(inp, &ip->ip_dst,
 				    &ip->ip_src, inp->inp_cred);
 				NET_EPOCH_EXIT(et);
 			} else {
 				error = prison_local_ip4(inp->inp_cred,
 				    &ip->ip_src);
 			}
 			if (error != 0) {
 				INP_RUNLOCK(inp);
 				m_freem(m);
 				return (error);
 			}
 		}
 		ip->ip_ttl = inp->inp_ip_ttl;
 	} else {
 		if (m->m_pkthdr.len > IP_MAXPACKET) {
 			m_freem(m);
 			return (EMSGSIZE);
 		}
 		if (m->m_pkthdr.len < sizeof(*ip)) {
 			m_freem(m);
 			return (EINVAL);
 		}
 		m = m_pullup(m, sizeof(*ip));
 		if (m == NULL)
 			return (ENOMEM);
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
 		if (m->m_len < hlen) {
 			m = m_pullup(m, hlen);
 			if (m == NULL)
 				return (EINVAL);
 			ip = mtod(m, struct ip *);
 		}
 #ifdef ROUTE_MPATH
 		if (CALC_FLOWID_OUTBOUND) {
 			uint32_t hash_type, hash_val;
 
 			hash_val = fib4_calc_software_hash(ip->ip_dst,
 			    ip->ip_src, 0, 0, ip->ip_p, &hash_type);
 			m->m_pkthdr.flowid = hash_val;
 			M_HASHTYPE_SET(m, hash_type);
 			flags |= IP_NODEFAULTFLOWID;
 		}
 #endif
 		INP_RLOCK(inp);
 		/*
 		 * Don't allow both user specified and setsockopt options,
 		 * and don't allow packet length sizes that will crash.
 		 */
 		if ((hlen < sizeof (*ip))
 		    || ((hlen > sizeof (*ip)) && inp->inp_options)
 		    || (ntohs(ip->ip_len) != m->m_pkthdr.len)) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (EINVAL);
 		}
 		error = prison_check_ip4(inp->inp_cred, &ip->ip_src);
 		if (error != 0) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (error);
 		}
 		/*
 		 * Don't allow IP options which do not have the required
 		 * structure as specified in section 3.1 of RFC 791 on
 		 * pages 15-23.
 		 */
 		cp = (u_char *)(ip + 1);
 		cnt = hlen - sizeof (struct ip);
 		for (; cnt > 0; cnt -= optlen, cp += optlen) {
 			opttype = cp[IPOPT_OPTVAL];
 			if (opttype == IPOPT_EOL)
 				break;
 			if (opttype == IPOPT_NOP) {
 				optlen = 1;
 				continue;
 			}
 			if (cnt < IPOPT_OLEN + sizeof(u_char)) {
 				INP_RUNLOCK(inp);
 				m_freem(m);
 				return (EINVAL);
 			}
 			optlen = cp[IPOPT_OLEN];
 			if (optlen < IPOPT_OLEN + sizeof(u_char) ||
 			    optlen > cnt) {
 				INP_RUNLOCK(inp);
 				m_freem(m);
 				return (EINVAL);
 			}
 		}
 		/*
 		 * This doesn't allow application to specify ID of zero,
 		 * but we got this limitation from the beginning of history.
 		 */
 		if (ip->ip_id == 0)
 			ip_fillid(ip);
 
 		/*
 		 * XXX prevent ip_output from overwriting header fields.
 		 */
 		flags |= IP_RAWOUTPUT;
 		IPSTAT_INC(ips_rawout);
 	}
 
 	if (inp->inp_flags & INP_ONESBCAST)
 		flags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	NET_EPOCH_ENTER(et);
 	error = ip_output(m, inp->inp_options, NULL, flags,
 	    inp->inp_moptions, inp);
 	NET_EPOCH_EXIT(et);
 	INP_RUNLOCK(inp);
 	return (error);
 }
 
 /*
  * Raw IP socket option processing.
  *
  * IMPORTANT NOTE regarding access control: Traditionally, raw sockets could
  * only be created by a privileged process, and as such, socket option
  * operations to manage system properties on any raw socket were allowed to
  * take place without explicit additional access control checks.  However,
  * raw sockets can now also be created in jail(), and therefore explicit
  * checks are now required.  Likewise, raw sockets can be used by a process
  * after it gives up privilege, so some caution is required.  For options
  * passed down to the IP layer via ip_ctloutput(), checks are assumed to be
  * performed in ip_ctloutput() and therefore no check occurs here.
  * Unilaterally checking priv_check() here breaks normal IP socket option
  * operations on raw sockets.
  *
  * When adding new socket options here, make sure to add access control
  * checks here as necessary.
  *
  * XXX-BZ inp locking?
  */
 int
 rip_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp = sotoinpcb(so);
 	int	error, optval;
 
 	if (sopt->sopt_level != IPPROTO_IP) {
 		if ((sopt->sopt_level == SOL_SOCKET) &&
 		    (sopt->sopt_name == SO_SETFIB)) {
 			inp->inp_inc.inc_fibnum = so->so_fibnum;
 			return (0);
 		}
 		return (EINVAL);
 	}
 
 	error = 0;
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			optval = inp->inp_flags & INP_HDRINCL;
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 
 		case IP_FW3:	/* generic ipfw v.3 functions */
 		case IP_FW_ADD:	/* ADD actually returns the body... */
 		case IP_FW_GET:
 		case IP_FW_TABLE_GETSIZE:
 		case IP_FW_TABLE_LIST:
 		case IP_FW_NAT_GET_CONFIG:
 		case IP_FW_NAT_GET_LOG:
 			if (V_ip_fw_ctl_ptr != NULL)
 				error = V_ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
 		case IP_DUMMYNET_GET:
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break ;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_get ? ip_mrouter_get(so, sopt) :
 				EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case IP_HDRINCL:
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 					    sizeof optval);
 			if (error)
 				break;
 			if (optval)
 				inp->inp_flags |= INP_HDRINCL;
 			else
 				inp->inp_flags &= ~INP_HDRINCL;
 			break;
 
 		case IP_FW3:	/* generic ipfw v.3 functions */
 		case IP_FW_ADD:
 		case IP_FW_DEL:
 		case IP_FW_FLUSH:
 		case IP_FW_ZERO:
 		case IP_FW_RESETLOG:
 		case IP_FW_TABLE_ADD:
 		case IP_FW_TABLE_DEL:
 		case IP_FW_TABLE_FLUSH:
 		case IP_FW_NAT_CFG:
 		case IP_FW_NAT_DEL:
 			if (V_ip_fw_ctl_ptr != NULL)
 				error = V_ip_fw_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT;
 			break;
 
 		case IP_DUMMYNET3:	/* generic dummynet v.3 functions */
 		case IP_DUMMYNET_CONFIGURE:
 		case IP_DUMMYNET_DEL:
 		case IP_DUMMYNET_FLUSH:
 			if (ip_dn_ctl_ptr != NULL)
 				error = ip_dn_ctl_ptr(sopt);
 			else
 				error = ENOPROTOOPT ;
 			break ;
 
 		case IP_RSVP_ON:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_init(so);
 			break;
 
 		case IP_RSVP_OFF:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_done();
 			break;
 
 		case IP_RSVP_VIF_ON:
 		case IP_RSVP_VIF_OFF:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_rsvp_vif ?
 				ip_rsvp_vif(so, sopt) : EINVAL;
 			break;
 
 		case MRT_INIT:
 		case MRT_DONE:
 		case MRT_ADD_VIF:
 		case MRT_DEL_VIF:
 		case MRT_ADD_MFC:
 		case MRT_DEL_MFC:
 		case MRT_VERSION:
 		case MRT_ASSERT:
 		case MRT_API_SUPPORT:
 		case MRT_API_CONFIG:
 		case MRT_ADD_BW_UPCALL:
 		case MRT_DEL_BW_UPCALL:
 			error = priv_check(curthread, PRIV_NETINET_MROUTE);
 			if (error != 0)
 				return (error);
 			error = ip_mrouter_set ? ip_mrouter_set(so, sopt) :
 					EOPNOTSUPP;
 			break;
 
 		default:
 			error = ip_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 void
 rip_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	switch (cmd) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	case PRC_MSGSIZE:
 		if (IPSEC_ENABLED(ipv4))
 			IPSEC_CTLINPUT(ipv4, cmd, sa, vip);
 		break;
 #endif
 	}
 }
 
 static int
 rip_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("rip_attach: inp != NULL"));
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error)
 		return (error);
 	if (proto >= IPPROTO_MAX || proto < 0)
 		return EPROTONOSUPPORT;
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error)
 		return (error);
 	error = in_pcballoc(so, &V_ripcbinfo);
 	if (error)
 		return (error);
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = proto;
 	inp->inp_ip_ttl = V_ip_defttl;
 	INP_HASH_WLOCK(&V_ripcbinfo);
 	rip_inshash(inp);
 	INP_HASH_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static void
 rip_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
 	    ("rip_detach: not closed"));
 
 	/* Disable mrouter first */
 	if (so == V_ip_mrouter && ip_mrouter_done)
 		ip_mrouter_done();
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_ripcbinfo);
 	rip_delhash(inp);
 	INP_HASH_WUNLOCK(&V_ripcbinfo);
 
 	if (ip_rsvp_force_done)
 		ip_rsvp_force_done(so);
 	if (so == V_ip_rsvpd)
 		ip_rsvp_done();
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 }
 
 static void
 rip_dodisconnect(struct socket *so, struct inpcb *inp)
 {
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = inp->inp_pcbinfo;
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	rip_delhash(inp);
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	rip_inshash(inp);
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 }
 
 static void
 rip_abort(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_abort: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 }
 
 static void
 rip_close(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_close: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 }
 
 static int
 rip_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 
 	if ((so->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_disconnect: inp == NULL"));
 
 	rip_dodisconnect(so, inp);
 	return (0);
 }
 
 static int
 rip_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 	int error;
 
 	if (nam->sa_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 
 	error = prison_check_ip4(td->td_ucred, &addr->sin_addr);
 	if (error != 0)
 		return (error);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_bind: inp == NULL"));
 
 	if (CK_STAILQ_EMPTY(&V_ifnet) ||
 	    (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK) ||
 	    (addr->sin_addr.s_addr &&
 	     (inp->inp_flags & INP_BINDANY) == 0 &&
 	     ifa_ifwithaddr_check((struct sockaddr *)addr) == 0))
 		return (EADDRNOTAVAIL);
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_ripcbinfo);
 	rip_delhash(inp);
 	inp->inp_laddr = addr->sin_addr;
 	rip_inshash(inp);
 	INP_HASH_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in *addr = (struct sockaddr_in *)nam;
 	struct inpcb *inp;
 
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if (CK_STAILQ_EMPTY(&V_ifnet))
 		return (EADDRNOTAVAIL);
 	if (addr->sin_family != AF_INET && addr->sin_family != AF_IMPLINK)
 		return (EAFNOSUPPORT);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_connect: inp == NULL"));
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(&V_ripcbinfo);
 	rip_delhash(inp);
 	inp->inp_faddr = addr->sin_addr;
 	rip_inshash(inp);
 	INP_HASH_WUNLOCK(&V_ripcbinfo);
 	soisconnected(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip_shutdown: inp == NULL"));
 
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 #endif /* INET */
 
 static int
 rip_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	if (req->oldptr == 0) {
 		int n;
 
 		n = V_ripcbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_ripcbinfo.ipi_count;
 	xig.xig_gen = V_ripcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			struct xinpcb xi;
 
 			in_pcbtoxinpcb(inp, &xi);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 			if (error) {
 				INP_RUNLOCK(inp);
 				break;
 			}
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		xig.xig_gen = V_ripcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_ripcbinfo.ipi_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_raw, OID_AUTO/*XXX*/, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     rip_pcblist, "S,xinpcb",
     "List of active raw IP sockets");
 
 #ifdef INET
-struct pr_usrreqs rip_usrreqs = {
-	.pru_abort =		rip_abort,
-	.pru_attach =		rip_attach,
-	.pru_bind =		rip_bind,
-	.pru_connect =		rip_connect,
-	.pru_control =		in_control,
-	.pru_detach =		rip_detach,
-	.pru_disconnect =	rip_disconnect,
-	.pru_peeraddr =		in_getpeeraddr,
-	.pru_send =		rip_send,
-	.pru_shutdown =		rip_shutdown,
-	.pru_sockaddr =		in_getsockaddr,
-	.pru_sosetlabel =	in_pcbsosetlabel,
-	.pru_close =		rip_close,
+/*
+ * See comment in in_proto.c containing "protosw definitions are not needed".
+ */
+#define	RAW_PROTOSW							\
+	.pr_type =		SOCK_RAW,				\
+	.pr_flags =		PR_ATOMIC|PR_ADDR,			\
+	.pr_ctloutput =		rip_ctloutput,				\
+	.pr_abort =		rip_abort,				\
+	.pr_attach =		rip_attach,				\
+	.pr_bind =		rip_bind,				\
+	.pr_connect =		rip_connect,				\
+	.pr_control =		in_control,				\
+	.pr_detach =		rip_detach,				\
+	.pr_disconnect =	rip_disconnect,				\
+	.pr_peeraddr =		in_getpeeraddr,				\
+	.pr_send =		rip_send,				\
+	.pr_shutdown =		rip_shutdown,				\
+	.pr_sockaddr =		in_getsockaddr,				\
+	.pr_sosetlabel =	in_pcbsosetlabel,			\
+	.pr_close =		rip_close
+
+struct protosw rip_protosw = {
+	.pr_protocol = IPPROTO_RAW,
+	RAW_PROTOSW
+};
+struct protosw icmp_protosw = {
+	.pr_protocol =	IPPROTO_ICMP,
+	RAW_PROTOSW
+};
+struct protosw igmp_protosw = {
+	.pr_protocol =	IPPROTO_IGMP,
+	RAW_PROTOSW
+};
+struct protosw rsvp_protosw = {
+	.pr_protocol =	IPPROTO_RSVP,
+	RAW_PROTOSW
+};
+struct protosw rawipv4_protosw = {
+	.pr_protocol =	IPPROTO_IPV4,
+	RAW_PROTOSW
+};
+struct protosw mobile_protosw = {
+	.pr_protocol =	IPPROTO_MOBILE,
+	RAW_PROTOSW
+};
+struct protosw etherip_protosw = {
+	.pr_protocol =	IPPROTO_ETHERIP,
+	RAW_PROTOSW
+};
+struct protosw gre_protosw = {
+	.pr_protocol =	IPPROTO_GRE,
+	RAW_PROTOSW
+};
+#ifdef INET6
+struct protosw rawipv6_protosw = {
+	.pr_protocol =	IPPROTO_IPV6,
+	RAW_PROTOSW
+};
+#endif
+struct protosw pim_protosw = {
+	.pr_protocol =	IPPROTO_PIM,
+	RAW_PROTOSW
+};
+struct protosw ripwild_protosw = {
+	RAW_PROTOSW
 };
 #endif /* INET */
diff --git a/sys/netinet/sctp_module.c b/sys/netinet/sctp_module.c
index ba0d585bd541..a4f294d72e00 100644
--- a/sys/netinet/sctp_module.c
+++ b/sys/netinet/sctp_module.c
@@ -1,181 +1,137 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2019-2020 The FreeBSD Foundation
  *
  * This software was developed by Mark Johnston under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
  * met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 
 #include <netinet/in.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp_var.h>
 #include <netinet/sctp_os_bsd.h>
 
 #include <netinet6/ip6_var.h>
 #include <netinet6/sctp6_var.h>
 
-#ifdef INET
-extern struct domain inetdomain;
-
-struct protosw sctp_stream_protosw = {
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp_usrreqs,
-};
-
-struct protosw sctp_seqpacket_protosw = {
-	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp_usrreqs,
-};
-#endif
-
-#ifdef INET6
-extern struct domain inet6domain;
-
-struct protosw sctp6_stream_protosw = {
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp6_usrreqs,
-};
-
-struct protosw sctp6_seqpacket_protosw = {
-	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp6_usrreqs,
-};
-#endif
-
 static int
 sctp_module_load(void)
 {
 	int error;
 
 #ifdef INET
-	error = pf_proto_register(PF_INET, &sctp_stream_protosw);
+	error = protosw_register(&inetdomain, &sctp_stream_protosw);
 	if (error != 0)
 		return (error);
-	error = pf_proto_register(PF_INET, &sctp_seqpacket_protosw);
+	error = protosw_register(&inetdomain, &sctp_seqpacket_protosw);
 	if (error != 0)
 		return (error);
 	error = ipproto_register(IPPROTO_SCTP, sctp_input, sctp_ctlinput);
 	if (error != 0)
 		return (error);
 #endif
 #ifdef INET6
-	error = pf_proto_register(PF_INET6, &sctp6_stream_protosw);
+	error = protosw_register(&inet6domain, &sctp6_stream_protosw);
 	if (error != 0)
 		return (error);
-	error = pf_proto_register(PF_INET6, &sctp6_seqpacket_protosw);
+	error = protosw_register(&inet6domain, &sctp6_seqpacket_protosw);
 	if (error != 0)
 		return (error);
 	error = ip6proto_register(IPPROTO_SCTP, sctp6_input, sctp6_ctlinput);
 	if (error != 0)
 		return (error);
 #endif
 	error = sctp_syscalls_init();
 	if (error != 0)
 		return (error);
 	return (0);
 }
 
 static int __unused
 sctp_module_unload(void)
 {
 
 	(void)sctp_syscalls_uninit();
 
 #ifdef INET
 	(void)ipproto_unregister(IPPROTO_SCTP);
-	(void)pf_proto_unregister(PF_INET, IPPROTO_SCTP, SOCK_STREAM);
-	(void)pf_proto_unregister(PF_INET, IPPROTO_SCTP, SOCK_SEQPACKET);
+	(void)protosw_unregister(&sctp_seqpacket_protosw);
+	(void)protosw_unregister(&sctp_stream_protosw);
 #endif
 #ifdef INET6
 	(void)ip6proto_unregister(IPPROTO_SCTP);
-	(void)pf_proto_unregister(PF_INET6, IPPROTO_SCTP, SOCK_STREAM);
-	(void)pf_proto_unregister(PF_INET6, IPPROTO_SCTP, SOCK_SEQPACKET);
+	(void)protosw_unregister(&sctp6_seqpacket_protosw);
+	(void)protosw_unregister(&sctp6_stream_protosw);
 #endif
 	return (0);
 }
 
 static int
 sctp_modload(struct module *module, int cmd, void *arg)
 {
 	int error;
 
 	switch (cmd) {
 	case MOD_LOAD:
 		error = sctp_module_load();
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * Unloading SCTP is currently unsupported.  Currently, SCTP
 		 * iterator threads are not stopped during unload.
 		 */
 		error = EOPNOTSUPP;
 		break;
 	default:
 		error = 0;
 		break;
 	}
 	return (error);
 }
 
 static moduledata_t sctp_mod = {
 	"sctp",
 	&sctp_modload,
 	NULL,
 };
 
 DECLARE_MODULE(sctp, sctp_mod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY);
 MODULE_VERSION(sctp, 1);
diff --git a/sys/netinet/sctp_usrreq.c b/sys/netinet/sctp_usrreq.c
index 285fbcfbf512..58a9d610b003 100644
--- a/sys/netinet/sctp_usrreq.c
+++ b/sys/netinet/sctp_usrreq.c
@@ -1,7549 +1,7562 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #include <sys/proc.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_var.h>
 #ifdef INET6
 #include <netinet6/sctp6_var.h>
 #endif
 #include <netinet/sctp_sysctl.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_uio.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_indata.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_bsd_addr.h>
 #include <netinet/udp.h>
 #include <sys/eventhandler.h>
 
 extern const struct sctp_cc_functions sctp_cc_functions[];
 extern const struct sctp_ss_functions sctp_ss_functions[];
 
 static void
 sctp_init(void *arg SCTP_UNUSED)
 {
 	u_long sb_max_adj;
 
 	/* Initialize and modify the sysctled variables */
 	sctp_init_sysctls();
 	if ((nmbclusters / 8) > SCTP_ASOC_MAX_CHUNKS_ON_QUEUE)
 		SCTP_BASE_SYSCTL(sctp_max_chunks_on_queue) = (nmbclusters / 8);
 	/*
 	 * Allow a user to take no more than 1/2 the number of clusters or
 	 * the SB_MAX, whichever is smaller, for the send window.
 	 */
 	sb_max_adj = (u_long)((u_quad_t)(SB_MAX) * MCLBYTES / (MSIZE + MCLBYTES));
 	SCTP_BASE_SYSCTL(sctp_sendspace) = min(sb_max_adj,
 	    (((uint32_t)nmbclusters / 2) * MCLBYTES));
 	/*
 	 * Now for the recv window, should we take the same amount? or
 	 * should I do 1/2 the SB_MAX instead in the SB_MAX min above. For
 	 * now I will just copy.
 	 */
 	SCTP_BASE_SYSCTL(sctp_recvspace) = SCTP_BASE_SYSCTL(sctp_sendspace);
 	SCTP_BASE_VAR(first_time) = 0;
 	SCTP_BASE_VAR(sctp_pcb_initialized) = 0;
 	sctp_pcb_init();
 #if defined(SCTP_PACKET_LOGGING)
 	SCTP_BASE_VAR(packet_log_writers) = 0;
 	SCTP_BASE_VAR(packet_log_end) = 0;
 	memset(&SCTP_BASE_VAR(packet_log_buffer), 0, SCTP_PACKET_LOG_SIZE);
 #endif
 	SCTP_BASE_VAR(eh_tag) = EVENTHANDLER_REGISTER(rt_addrmsg,
 	    sctp_addr_change_event_handler, NULL, EVENTHANDLER_PRI_FIRST);
 }
 
 VNET_SYSINIT(sctp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, sctp_init, NULL);
 
 #ifdef VIMAGE
 static void
 sctp_finish(void *unused __unused)
 {
 	EVENTHANDLER_DEREGISTER(rt_addrmsg, SCTP_BASE_VAR(eh_tag));
 	sctp_pcb_finish();
 }
 
 VNET_SYSUNINIT(sctp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, sctp_finish, NULL);
 #endif
 
 void
 sctp_pathmtu_adjustment(struct sctp_tcb *stcb, uint32_t mtu, bool resend)
 {
 	struct sctp_association *asoc;
 	struct sctp_tmit_chunk *chk;
 	uint32_t overhead;
 
 	asoc = &stcb->asoc;
 	KASSERT(mtu < asoc->smallest_mtu,
 	    ("Currently only reducing association MTU %u supported (MTU %u)",
 	    asoc->smallest_mtu, mtu));
 	asoc->smallest_mtu = mtu;
 	if (stcb->sctp_ep->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 		overhead = SCTP_MIN_OVERHEAD;
 	} else {
 		overhead = SCTP_MIN_V4_OVERHEAD;
 	}
 	if (asoc->idata_supported) {
 		if (sctp_auth_is_required_chunk(SCTP_IDATA, asoc->peer_auth_chunks)) {
 			overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
 		}
 	} else {
 		if (sctp_auth_is_required_chunk(SCTP_DATA, asoc->peer_auth_chunks)) {
 			overhead += sctp_get_auth_chunk_len(asoc->peer_hmac_id);
 		}
 	}
 	KASSERT(overhead % 4 == 0,
 	    ("overhead (%u) not a multiple of 4", overhead));
 	TAILQ_FOREACH(chk, &asoc->send_queue, sctp_next) {
 		if (((uint32_t)chk->send_size + overhead) > mtu) {
 			chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
 		}
 	}
 	TAILQ_FOREACH(chk, &asoc->sent_queue, sctp_next) {
 		if (((uint32_t)chk->send_size + overhead) > mtu) {
 			chk->flags |= CHUNK_FLAGS_FRAGMENT_OK;
 			if (resend && chk->sent < SCTP_DATAGRAM_RESEND) {
 				/*
 				 * If requested, mark the chunk for
 				 * immediate resend, since we sent it being
 				 * too big.
 				 */
 				sctp_flight_size_decrease(chk);
 				sctp_total_flight_decrease(stcb, chk);
 				chk->sent = SCTP_DATAGRAM_RESEND;
 				sctp_ucount_incr(asoc->sent_queue_retran_cnt);
 				chk->rec.data.doing_fast_retransmit = 0;
 				if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_FLIGHT_LOGGING_ENABLE) {
 					sctp_misc_ints(SCTP_FLIGHT_LOG_DOWN_PMTU,
 					    chk->whoTo->flight_size,
 					    chk->book_size,
 					    (uint32_t)(uintptr_t)chk->whoTo,
 					    chk->rec.data.tsn);
 				}
 				/* Clear any time, so NO RTT is being done. */
 				if (chk->do_rtt == 1) {
 					chk->do_rtt = 0;
 					chk->whoTo->rto_needed = 1;
 				}
 			}
 		}
 	}
 }
 
 #ifdef INET
 void
 sctp_notify(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_nets *net,
     uint8_t icmp_type,
     uint8_t icmp_code,
     uint16_t ip_len,
     uint32_t next_mtu)
 {
 	int timer_stopped;
 
 	if (icmp_type != ICMP_UNREACH) {
 		/* We only care about unreachable */
 		SCTP_TCB_UNLOCK(stcb);
 		return;
 	}
 	if ((icmp_code == ICMP_UNREACH_NET) ||
 	    (icmp_code == ICMP_UNREACH_HOST) ||
 	    (icmp_code == ICMP_UNREACH_NET_UNKNOWN) ||
 	    (icmp_code == ICMP_UNREACH_HOST_UNKNOWN) ||
 	    (icmp_code == ICMP_UNREACH_ISOLATED) ||
 	    (icmp_code == ICMP_UNREACH_NET_PROHIB) ||
 	    (icmp_code == ICMP_UNREACH_HOST_PROHIB) ||
 	    (icmp_code == ICMP_UNREACH_FILTER_PROHIB)) {
 		/* Mark the net unreachable. */
 		if (net->dest_state & SCTP_ADDR_REACHABLE) {
 			/* OK, that destination is NOT reachable. */
 			net->dest_state &= ~SCTP_ADDR_REACHABLE;
 			net->dest_state &= ~SCTP_ADDR_PF;
 			sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
 			    stcb, 0,
 			    (void *)net, SCTP_SO_NOT_LOCKED);
 		}
 		SCTP_TCB_UNLOCK(stcb);
 	} else if ((icmp_code == ICMP_UNREACH_PROTOCOL) ||
 	    (icmp_code == ICMP_UNREACH_PORT)) {
 		/* Treat it like an ABORT. */
 		sctp_abort_notification(stcb, true, false, 0, NULL, SCTP_SO_NOT_LOCKED);
 		(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2);
 		/* no need to unlock here, since the TCB is gone */
 	} else if (icmp_code == ICMP_UNREACH_NEEDFRAG) {
 		if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
 			SCTP_TCB_UNLOCK(stcb);
 			return;
 		}
 		/* Find the next (smaller) MTU */
 		if (next_mtu == 0) {
 			/*
 			 * Old type router that does not tell us what the
 			 * next MTU is. Rats we will have to guess (in a
 			 * educated fashion of course).
 			 */
 			next_mtu = sctp_get_prev_mtu(ip_len);
 		}
 		/* Stop the PMTU timer. */
 		if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 			timer_stopped = 1;
 			sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 			    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
 		} else {
 			timer_stopped = 0;
 		}
 		/* Update the path MTU. */
 		if (net->port) {
 			next_mtu -= sizeof(struct udphdr);
 		}
 		if (net->mtu > next_mtu) {
 			net->mtu = next_mtu;
 			if (net->port) {
 				sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu + sizeof(struct udphdr));
 			} else {
 				sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu);
 			}
 		}
 		/* Update the association MTU */
 		if (stcb->asoc.smallest_mtu > next_mtu) {
 			sctp_pathmtu_adjustment(stcb, next_mtu, true);
 		}
 		/* Finally, start the PMTU timer if it was running before. */
 		if (timer_stopped) {
 			sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
 		}
 		SCTP_TCB_UNLOCK(stcb);
 	} else {
 		SCTP_TCB_UNLOCK(stcb);
 	}
 }
 
 void
 sctp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	struct ip *outer_ip;
 	struct ip *inner_ip;
 	struct sctphdr *sh;
 	struct icmp *icmp;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	struct sctp_init_chunk *ch;
 	struct sockaddr_in src, dst;
 
 	if (sa->sa_family != AF_INET ||
 	    ((struct sockaddr_in *)sa)->sin_addr.s_addr == INADDR_ANY) {
 		return;
 	}
 	if (PRC_IS_REDIRECT(cmd)) {
 		vip = NULL;
 	} else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) {
 		return;
 	}
 	if (vip != NULL) {
 		inner_ip = (struct ip *)vip;
 		icmp = (struct icmp *)((caddr_t)inner_ip -
 		    (sizeof(struct icmp) - sizeof(struct ip)));
 		outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
 		sh = (struct sctphdr *)((caddr_t)inner_ip + (inner_ip->ip_hl << 2));
 		memset(&src, 0, sizeof(struct sockaddr_in));
 		src.sin_family = AF_INET;
 		src.sin_len = sizeof(struct sockaddr_in);
 		src.sin_port = sh->src_port;
 		src.sin_addr = inner_ip->ip_src;
 		memset(&dst, 0, sizeof(struct sockaddr_in));
 		dst.sin_family = AF_INET;
 		dst.sin_len = sizeof(struct sockaddr_in);
 		dst.sin_port = sh->dest_port;
 		dst.sin_addr = inner_ip->ip_dst;
 		/*
 		 * 'dst' holds the dest of the packet that failed to be
 		 * sent. 'src' holds our local endpoint address. Thus we
 		 * reverse the dst and the src in the lookup.
 		 */
 		inp = NULL;
 		net = NULL;
 		stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
 		    (struct sockaddr *)&src,
 		    &inp, &net, 1,
 		    SCTP_DEFAULT_VRFID);
 		if ((stcb != NULL) &&
 		    (net != NULL) &&
 		    (inp != NULL)) {
 			/* Check the verification tag */
 			if (ntohl(sh->v_tag) != 0) {
 				/*
 				 * This must be the verification tag used
 				 * for sending out packets. We don't
 				 * consider packets reflecting the
 				 * verification tag.
 				 */
 				if (ntohl(sh->v_tag) != stcb->asoc.peer_vtag) {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			} else {
 				if (ntohs(outer_ip->ip_len) >=
 				    sizeof(struct ip) +
 				    8 + (inner_ip->ip_hl << 2) + 20) {
 					/*
 					 * In this case we can check if we
 					 * got an INIT chunk and if the
 					 * initiate tag matches.
 					 */
 					ch = (struct sctp_init_chunk *)(sh + 1);
 					if ((ch->ch.chunk_type != SCTP_INITIATION) ||
 					    (ntohl(ch->init.initiate_tag) != stcb->asoc.my_vtag)) {
 						SCTP_TCB_UNLOCK(stcb);
 						return;
 					}
 				} else {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			}
 			sctp_notify(inp, stcb, net,
 			    icmp->icmp_type,
 			    icmp->icmp_code,
 			    ntohs(inner_ip->ip_len),
 			    (uint32_t)ntohs(icmp->icmp_nextmtu));
 		} else {
 			if ((stcb == NULL) && (inp != NULL)) {
 				/* reduce ref-count */
 				SCTP_INP_WLOCK(inp);
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_WUNLOCK(inp);
 			}
 			if (stcb) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 		}
 	}
 	return;
 }
 #endif
 
 static int
 sctp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct sctp_inpcb *inp;
 	struct sctp_nets *net;
 	struct sctp_tcb *stcb;
 	int error;
 	uint32_t vrf_id;
 
 	/* FIX, for non-bsd is this right? */
 	vrf_id = SCTP_DEFAULT_VRFID;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 
 	if (error)
 		return (error);
 
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	stcb = sctp_findassociation_addr_sa(sintosa(&addrs[1]),
 	    sintosa(&addrs[0]),
 	    &inp, &net, 1, vrf_id);
 	if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) {
 		if ((inp != NULL) && (stcb == NULL)) {
 			/* reduce ref-count */
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			goto cred_can_cont;
 		}
 
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 		error = ENOENT;
 		goto out;
 	}
 	SCTP_TCB_UNLOCK(stcb);
 	/*
 	 * We use the write lock here, only since in the error leg we need
 	 * it. If we used RLOCK, then we would have to
 	 * wlock/decr/unlock/rlock. Which in theory could create a hole.
 	 * Better to use higher wlock.
 	 */
 	SCTP_INP_WLOCK(inp);
 cred_can_cont:
 	error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket);
 	if (error) {
 		SCTP_INP_WUNLOCK(inp);
 		goto out;
 	}
 	cru2x(inp->sctp_socket->so_cred, &xuc);
 	SCTP_INP_WUNLOCK(inp);
 	error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 out:
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_sctp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, sctp_getcred, "S,ucred",
     "Get the ucred of a SCTP connection");
 
 #ifdef INET
 static void
 sctp_abort(struct socket *so)
 {
 	struct epoch_tracker et;
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		return;
 	}
 
 	SCTP_INP_WLOCK(inp);
 	NET_EPOCH_ENTER(et);
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 17);
 #endif
 	if (((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0)) {
 		inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP;
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 16);
 #endif
 		SCTP_INP_WUNLOCK(inp);
 		sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 		    SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
 		SOCK_LOCK(so);
 		KASSERT(!SOLISTENING(so),
 		    ("sctp_abort: called on listening socket %p", so));
 		SCTP_SB_CLEAR(so->so_snd);
 		SCTP_SB_CLEAR(so->so_rcv);
 		/* Now null out the reference, we are completely detached. */
 		so->so_pcb = NULL;
 		SOCK_UNLOCK(so);
 	} else {
 		SCTP_INP_WUNLOCK(inp);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 static int
 sctp_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED)
 {
 	struct sctp_inpcb *inp;
 	struct inpcb *ip_inp;
 	int error;
 	uint32_t vrf_id = SCTP_DEFAULT_VRFID;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp != NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace));
 		if (error) {
 			return (error);
 		}
 	}
 	error = sctp_inpcb_alloc(so, vrf_id);
 	if (error) {
 		return (error);
 	}
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	SCTP_INP_WLOCK(inp);
 	inp->sctp_flags &= ~SCTP_PCB_FLAGS_BOUND_V6;	/* I'm not v6! */
 	ip_inp = &inp->ip_inp.inp;
 	ip_inp->inp_vflag |= INP_IPV4;
 	ip_inp->inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
 	SCTP_INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 sctp_bind(struct socket *so, struct sockaddr *addr, struct thread *p)
 {
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	if (addr != NULL) {
 		if ((addr->sa_family != AF_INET) ||
 		    (addr->sa_len != sizeof(struct sockaddr_in))) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 	}
 	return (sctp_inpcb_bind(so, addr, NULL, p));
 }
 
 #endif
 void
 sctp_close(struct socket *so)
 {
 	struct epoch_tracker et;
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL)
 		return;
 
 	/*
 	 * Inform all the lower layer assoc that we are done.
 	 */
 	SCTP_INP_WLOCK(inp);
 	NET_EPOCH_ENTER(et);
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 17);
 #endif
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
 		inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP;
 		if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) ||
 		    (SCTP_SBAVAIL(&so->so_rcv) > 0)) {
 #ifdef SCTP_LOG_CLOSING
 			sctp_log_closing(inp, NULL, 13);
 #endif
 			SCTP_INP_WUNLOCK(inp);
 			sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 			    SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
 		} else {
 #ifdef SCTP_LOG_CLOSING
 			sctp_log_closing(inp, NULL, 14);
 #endif
 			SCTP_INP_WUNLOCK(inp);
 			sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_GRACEFUL_CLOSE,
 			    SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
 		}
 		/*
 		 * The socket is now detached, no matter what the state of
 		 * the SCTP association.
 		 */
 		SOCK_LOCK(so);
 		if (!SOLISTENING(so)) {
 			SCTP_SB_CLEAR(so->so_snd);
 			SCTP_SB_CLEAR(so->so_rcv);
 		}
 		/* Now null out the reference, we are completely detached. */
 		so->so_pcb = NULL;
 		SOCK_UNLOCK(so);
 	} else {
 		SCTP_INP_WUNLOCK(inp);
 	}
 	NET_EPOCH_EXIT(et);
 }
 
 int
 sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *p);
 
 int
 sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *p)
 {
 	struct sctp_inpcb *inp;
 	int error;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		if (control) {
 			sctp_m_freem(control);
 			control = NULL;
 		}
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		sctp_m_freem(m);
 		return (EINVAL);
 	}
 	/* Got to have an to address if we are NOT a connected socket */
 	if ((addr == NULL) &&
 	    ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE))) {
 		goto connected_type;
 	}
 
 	error = 0;
 	if (addr == NULL) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EDESTADDRREQ);
 		error = EDESTADDRREQ;
 	} else if (addr->sa_family != AF_INET) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT);
 		error = EAFNOSUPPORT;
 	} else if (addr->sa_len != sizeof(struct sockaddr_in)) {
 		SCTP_LTRACE_ERR_RET_PKT(m, inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		error = EINVAL;
 	}
 	if (error != 0) {
 		sctp_m_freem(m);
 		if (control) {
 			sctp_m_freem(control);
 			control = NULL;
 		}
 		return (error);
 	}
 connected_type:
 	/* now what about control */
 	if (control) {
 		if (inp->control) {
 			sctp_m_freem(inp->control);
 			inp->control = NULL;
 		}
 		inp->control = control;
 	}
 	/* Place the data */
 	if (inp->pkt) {
 		SCTP_BUF_NEXT(inp->pkt_last) = m;
 		inp->pkt_last = m;
 	} else {
 		inp->pkt_last = inp->pkt = m;
 	}
 	if (
 	/* FreeBSD uses a flag passed */
 	    ((flags & PRUS_MORETOCOME) == 0)
 	    ) {
 		/*
 		 * note with the current version this code will only be used
 		 * by OpenBSD-- NetBSD, FreeBSD, and MacOS have methods for
 		 * re-defining sosend to use the sctp_sosend. One can
 		 * optionally switch back to this code (by changing back the
 		 * definitions) but this is not advisable. This code is used
 		 * by FreeBSD when sending a file with sendfile() though.
 		 */
 		struct epoch_tracker et;
 		int ret;
 
 		NET_EPOCH_ENTER(et);
 		ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags);
 		NET_EPOCH_EXIT(et);
 		inp->pkt = NULL;
 		inp->control = NULL;
 		return (ret);
 	} else {
 		return (0);
 	}
 }
 
 int
 sctp_disconnect(struct socket *so)
 {
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
 		return (ENOTCONN);
 	}
 	SCTP_INP_RLOCK(inp);
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		if (LIST_EMPTY(&inp->sctp_asoc_list)) {
 			/* No connection */
 			SCTP_INP_RUNLOCK(inp);
 			return (0);
 		} else {
 			struct epoch_tracker et;
 			struct sctp_association *asoc;
 			struct sctp_tcb *stcb;
 
 			stcb = LIST_FIRST(&inp->sctp_asoc_list);
 			if (stcb == NULL) {
 				SCTP_INP_RUNLOCK(inp);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			SCTP_TCB_LOCK(stcb);
 			asoc = &stcb->asoc;
 			if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 				/* We are about to be freed, out of here */
 				SCTP_TCB_UNLOCK(stcb);
 				SCTP_INP_RUNLOCK(inp);
 				return (0);
 			}
 			NET_EPOCH_ENTER(et);
 			if (((so->so_options & SO_LINGER) && (so->so_linger == 0)) ||
 			    (SCTP_SBAVAIL(&so->so_rcv) > 0)) {
 				if (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) {
 					/* Left with Data unread */
 					struct mbuf *op_err;
 
 					op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 					sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED);
 					SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 				}
 				SCTP_INP_RUNLOCK(inp);
 				if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 				    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 					SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 				}
 				(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 				    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_3);
 				/* No unlock tcb assoc is gone */
 				NET_EPOCH_EXIT(et);
 				return (0);
 			}
 			if (TAILQ_EMPTY(&asoc->send_queue) &&
 			    TAILQ_EMPTY(&asoc->sent_queue) &&
 			    (asoc->stream_queue_cnt == 0)) {
 				/* there is nothing queued to send, so done */
 				if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 					goto abort_anyway;
 				}
 				if ((SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_SENT) &&
 				    (SCTP_GET_STATE(stcb) != SCTP_STATE_SHUTDOWN_ACK_SENT)) {
 					/* only send SHUTDOWN 1st time thru */
 					struct sctp_nets *netp;
 
 					if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 					    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 					}
 					SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT);
 					sctp_stop_timers_for_shutdown(stcb);
 					if (stcb->asoc.alternate) {
 						netp = stcb->asoc.alternate;
 					} else {
 						netp = stcb->asoc.primary_destination;
 					}
 					sctp_send_shutdown(stcb, netp);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
 					    stcb->sctp_ep, stcb, netp);
 					sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD,
 					    stcb->sctp_ep, stcb, NULL);
 					sctp_chunk_output(stcb->sctp_ep, stcb, SCTP_OUTPUT_FROM_T3, SCTP_SO_LOCKED);
 				}
 			} else {
 				/*
 				 * we still got (or just got) data to send,
 				 * so set SHUTDOWN_PENDING
 				 */
 				/*
 				 * XXX sockets draft says that SCTP_EOF
 				 * should be sent with no data. currently,
 				 * we will allow user data to be sent first
 				 * and move to SHUTDOWN-PENDING
 				 */
 				SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING);
 				sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL);
 				if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 					SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT);
 				}
 				if (TAILQ_EMPTY(&asoc->send_queue) &&
 				    TAILQ_EMPTY(&asoc->sent_queue) &&
 				    (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
 					struct mbuf *op_err;
 
 			abort_anyway:
 					op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 					stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_4;
 					sctp_send_abort_tcb(stcb, op_err, SCTP_SO_LOCKED);
 					SCTP_STAT_INCR_COUNTER32(sctps_aborted);
 					if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) ||
 					    (SCTP_GET_STATE(stcb) == SCTP_STATE_SHUTDOWN_RECEIVED)) {
 						SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 					}
 					SCTP_INP_RUNLOCK(inp);
 					(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 					    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_5);
 					NET_EPOCH_EXIT(et);
 					return (0);
 				} else {
 					sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
 				}
 			}
 			soisdisconnecting(so);
 			NET_EPOCH_EXIT(et);
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			return (0);
 		}
 		/* not reached */
 	} else {
 		/* UDP model does not support this */
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 		return (EOPNOTSUPP);
 	}
 }
 
 int
 sctp_flush(struct socket *so, int how)
 {
 	/*
 	 * We will just clear out the values and let subsequent close clear
 	 * out the data, if any. Note if the user did a shutdown(SHUT_RD)
 	 * they will not be able to read the data, the socket will block
 	 * that from happening.
 	 */
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	SCTP_INP_RLOCK(inp);
 	/* For the 1 to many model this does nothing */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
 		SCTP_INP_RUNLOCK(inp);
 		return (0);
 	}
 	SCTP_INP_RUNLOCK(inp);
 	if ((how == PRU_FLUSH_RD) || (how == PRU_FLUSH_RDWR)) {
 		/*
 		 * First make sure the sb will be happy, we don't use these
 		 * except maybe the count
 		 */
 		SCTP_INP_WLOCK(inp);
 		SCTP_INP_READ_LOCK(inp);
 		inp->sctp_flags |= SCTP_PCB_FLAGS_SOCKET_CANT_READ;
 		SCTP_INP_READ_UNLOCK(inp);
 		SCTP_INP_WUNLOCK(inp);
 		SOCK_LOCK(so);
 		KASSERT(!SOLISTENING(so),
 		    ("sctp_flush: called on listening socket %p", so));
 		SCTP_SB_CLEAR(so->so_rcv);
 		SOCK_UNLOCK(so);
 	}
 	if ((how == PRU_FLUSH_WR) || (how == PRU_FLUSH_RDWR)) {
 		/*
 		 * First make sure the sb will be happy, we don't use these
 		 * except maybe the count
 		 */
 		SOCK_LOCK(so);
 		KASSERT(!SOLISTENING(so),
 		    ("sctp_flush: called on listening socket %p", so));
 		SCTP_SB_CLEAR(so->so_snd);
 		SOCK_UNLOCK(so);
 	}
 	return (0);
 }
 
 int
 sctp_shutdown(struct socket *so)
 {
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	SCTP_INP_RLOCK(inp);
 	/* For UDP model this is a invalid call */
 	if (!((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL))) {
 		/* Restore the flags that the soshutdown took away. */
 		SOCKBUF_LOCK(&so->so_rcv);
 		so->so_rcv.sb_state &= ~SBS_CANTRCVMORE;
 		SOCKBUF_UNLOCK(&so->so_rcv);
 		/* This proc will wakeup for read and do nothing (I hope) */
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 		return (EOPNOTSUPP);
 	} else {
 		/*
 		 * Ok, if we reach here its the TCP model and it is either a
 		 * SHUT_WR or SHUT_RDWR. This means we put the shutdown flag
 		 * against it.
 		 */
 		struct epoch_tracker et;
 		struct sctp_tcb *stcb;
 		struct sctp_association *asoc;
 		struct sctp_nets *netp;
 
 		if ((so->so_state &
 		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
 			SCTP_INP_RUNLOCK(inp);
 			return (ENOTCONN);
 		}
 		socantsendmore(so);
 
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		if (stcb == NULL) {
 			/*
 			 * Ok, we hit the case that the shutdown call was
 			 * made after an abort or something. Nothing to do
 			 * now.
 			 */
 			SCTP_INP_RUNLOCK(inp);
 			return (0);
 		}
 		SCTP_TCB_LOCK(stcb);
 		asoc = &stcb->asoc;
 		if (asoc->state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			return (0);
 		}
 		if ((SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_WAIT) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_COOKIE_ECHOED) &&
 		    (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN)) {
 			/*
 			 * If we are not in or before ESTABLISHED, there is
 			 * no protocol action required.
 			 */
 			SCTP_TCB_UNLOCK(stcb);
 			SCTP_INP_RUNLOCK(inp);
 			return (0);
 		}
 		NET_EPOCH_ENTER(et);
 		if (stcb->asoc.alternate) {
 			netp = stcb->asoc.alternate;
 		} else {
 			netp = stcb->asoc.primary_destination;
 		}
 		if ((SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) &&
 		    TAILQ_EMPTY(&asoc->send_queue) &&
 		    TAILQ_EMPTY(&asoc->sent_queue) &&
 		    (asoc->stream_queue_cnt == 0)) {
 			if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 				goto abort_anyway;
 			}
 			/* there is nothing queued to send, so I'm done... */
 			SCTP_STAT_DECR_GAUGE32(sctps_currestab);
 			SCTP_SET_STATE(stcb, SCTP_STATE_SHUTDOWN_SENT);
 			sctp_stop_timers_for_shutdown(stcb);
 			sctp_send_shutdown(stcb, netp);
 			sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWN,
 			    stcb->sctp_ep, stcb, netp);
 		} else {
 			/*
 			 * We still got (or just got) data to send, so set
 			 * SHUTDOWN_PENDING.
 			 */
 			SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_SHUTDOWN_PENDING);
 			if ((*asoc->ss_functions.sctp_ss_is_user_msgs_incomplete) (stcb, asoc)) {
 				SCTP_ADD_SUBSTATE(stcb, SCTP_STATE_PARTIAL_MSG_LEFT);
 			}
 			if (TAILQ_EMPTY(&asoc->send_queue) &&
 			    TAILQ_EMPTY(&asoc->sent_queue) &&
 			    (asoc->state & SCTP_STATE_PARTIAL_MSG_LEFT)) {
 				struct mbuf *op_err;
 
 		abort_anyway:
 				op_err = sctp_generate_cause(SCTP_CAUSE_USER_INITIATED_ABT, "");
 				stcb->sctp_ep->last_abort_code = SCTP_FROM_SCTP_USRREQ + SCTP_LOC_6;
 				SCTP_INP_RUNLOCK(inp);
 				sctp_abort_an_association(stcb->sctp_ep, stcb,
 				    op_err, false, SCTP_SO_LOCKED);
 				NET_EPOCH_EXIT(et);
 				return (0);
 			}
 		}
 		sctp_timer_start(SCTP_TIMER_TYPE_SHUTDOWNGUARD, stcb->sctp_ep, stcb, NULL);
 		/*
 		 * XXX: Why do this in the case where we have still data
 		 * queued?
 		 */
 		sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_CLOSING, SCTP_SO_LOCKED);
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_INP_RUNLOCK(inp);
 		NET_EPOCH_EXIT(et);
 		return (0);
 	}
 }
 
 /*
  * copies a "user" presentable address and removes embedded scope, etc.
  * returns 0 on success, 1 on error
  */
 static uint32_t
 sctp_fill_user_address(struct sockaddr *dst, struct sockaddr *src)
 {
 #ifdef INET6
 	struct sockaddr_in6 lsa6;
 
 	src = (struct sockaddr *)sctp_recover_scope((struct sockaddr_in6 *)src,
 	    &lsa6);
 #endif
 	memcpy(dst, src, src->sa_len);
 	return (0);
 }
 
 static size_t
 sctp_fill_up_addresses_vrf(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     size_t limit,
     struct sockaddr *addr,
     uint32_t vrf_id)
 {
 	struct sctp_ifn *sctp_ifn;
 	struct sctp_ifa *sctp_ifa;
 	size_t actual;
 	int loopback_scope;
 #if defined(INET)
 	int ipv4_local_scope, ipv4_addr_legal;
 #endif
 #if defined(INET6)
 	int local_scope, site_scope, ipv6_addr_legal;
 #endif
 	struct sctp_vrf *vrf;
 
 	SCTP_IPI_ADDR_LOCK_ASSERT();
 	actual = 0;
 	if (limit == 0)
 		return (actual);
 
 	if (stcb) {
 		/* Turn on all the appropriate scope */
 		loopback_scope = stcb->asoc.scope.loopback_scope;
 #if defined(INET)
 		ipv4_local_scope = stcb->asoc.scope.ipv4_local_scope;
 		ipv4_addr_legal = stcb->asoc.scope.ipv4_addr_legal;
 #endif
 #if defined(INET6)
 		local_scope = stcb->asoc.scope.local_scope;
 		site_scope = stcb->asoc.scope.site_scope;
 		ipv6_addr_legal = stcb->asoc.scope.ipv6_addr_legal;
 #endif
 	} else {
 		/* Use generic values for endpoints. */
 		loopback_scope = 1;
 #if defined(INET)
 		ipv4_local_scope = 1;
 #endif
 #if defined(INET6)
 		local_scope = 1;
 		site_scope = 1;
 #endif
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 #if defined(INET6)
 			ipv6_addr_legal = 1;
 #endif
 #if defined(INET)
 			if (SCTP_IPV6_V6ONLY(inp)) {
 				ipv4_addr_legal = 0;
 			} else {
 				ipv4_addr_legal = 1;
 			}
 #endif
 		} else {
 #if defined(INET6)
 			ipv6_addr_legal = 0;
 #endif
 #if defined(INET)
 			ipv4_addr_legal = 1;
 #endif
 		}
 	}
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		return (0);
 	}
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 			if ((loopback_scope == 0) &&
 			    SCTP_IFN_IS_IFT_LOOP(sctp_ifn)) {
 				/* Skip loopback if loopback_scope not set */
 				continue;
 			}
 			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 				if (stcb) {
 					/*
 					 * For the BOUND-ALL case, the list
 					 * associated with a TCB is Always
 					 * considered a reverse list.. i.e.
 					 * it lists addresses that are NOT
 					 * part of the association. If this
 					 * is one of those we must skip it.
 					 */
 					if (sctp_is_addr_restricted(stcb,
 					    sctp_ifa)) {
 						continue;
 					}
 				}
 				switch (sctp_ifa->address.sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 					if (ipv4_addr_legal) {
 						struct sockaddr_in *sin;
 
 						sin = &sctp_ifa->address.sin;
 						if (sin->sin_addr.s_addr == 0) {
 							/*
 							 * we skip
 							 * unspecified
 							 * addresses
 							 */
 							continue;
 						}
 						if (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 						    &sin->sin_addr) != 0) {
 							continue;
 						}
 						if ((ipv4_local_scope == 0) &&
 						    (IN4_ISPRIVATE_ADDRESS(&sin->sin_addr))) {
 							continue;
 						}
 #ifdef INET6
 						if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 							if (actual + sizeof(struct sockaddr_in6) > limit) {
 								return (actual);
 							}
 							in6_sin_2_v4mapsin6(sin, (struct sockaddr_in6 *)addr);
 							((struct sockaddr_in6 *)addr)->sin6_port = inp->sctp_lport;
 							addr = (struct sockaddr *)((caddr_t)addr + sizeof(struct sockaddr_in6));
 							actual += sizeof(struct sockaddr_in6);
 						} else {
 #endif
 							if (actual + sizeof(struct sockaddr_in) > limit) {
 								return (actual);
 							}
 							memcpy(addr, sin, sizeof(struct sockaddr_in));
 							((struct sockaddr_in *)addr)->sin_port = inp->sctp_lport;
 							addr = (struct sockaddr *)((caddr_t)addr + sizeof(struct sockaddr_in));
 							actual += sizeof(struct sockaddr_in);
 #ifdef INET6
 						}
 #endif
 					} else {
 						continue;
 					}
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					if (ipv6_addr_legal) {
 						struct sockaddr_in6 *sin6;
 
 						sin6 = &sctp_ifa->address.sin6;
 						if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 							/*
 							 * we skip
 							 * unspecified
 							 * addresses
 							 */
 							continue;
 						}
 						if (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 						    &sin6->sin6_addr) != 0) {
 							continue;
 						}
 						if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) {
 							if (local_scope == 0)
 								continue;
 							if (sin6->sin6_scope_id == 0) {
 								if (sa6_recoverscope(sin6) != 0)
 									/*
 									 *
 									 * bad
 									 * link
 									 *
 									 * local
 									 *
 									 * address
 									 */
 									continue;
 							}
 						}
 						if ((site_scope == 0) &&
 						    (IN6_IS_ADDR_SITELOCAL(&sin6->sin6_addr))) {
 							continue;
 						}
 						if (actual + sizeof(struct sockaddr_in6) > limit) {
 							return (actual);
 						}
 						memcpy(addr, sin6, sizeof(struct sockaddr_in6));
 						((struct sockaddr_in6 *)addr)->sin6_port = inp->sctp_lport;
 						addr = (struct sockaddr *)((caddr_t)addr + sizeof(struct sockaddr_in6));
 						actual += sizeof(struct sockaddr_in6);
 					} else {
 						continue;
 					}
 					break;
 #endif
 				default:
 					/* TSNH */
 					break;
 				}
 			}
 		}
 	} else {
 		struct sctp_laddr *laddr;
 		size_t sa_len;
 
 		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 			if (stcb) {
 				if (sctp_is_addr_restricted(stcb, laddr->ifa)) {
 					continue;
 				}
 			}
 			sa_len = laddr->ifa->address.sa.sa_len;
 			if (actual + sa_len > limit) {
 				return (actual);
 			}
 			if (sctp_fill_user_address(addr, &laddr->ifa->address.sa))
 				continue;
 			switch (laddr->ifa->address.sa.sa_family) {
 #ifdef INET
 			case AF_INET:
 				((struct sockaddr_in *)addr)->sin_port = inp->sctp_lport;
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				((struct sockaddr_in6 *)addr)->sin6_port = inp->sctp_lport;
 				break;
 #endif
 			default:
 				/* TSNH */
 				break;
 			}
 			addr = (struct sockaddr *)((caddr_t)addr + sa_len);
 			actual += sa_len;
 		}
 	}
 	return (actual);
 }
 
 static size_t
 sctp_fill_up_addresses(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     size_t limit,
     struct sockaddr *addr)
 {
 	size_t size;
 
 	SCTP_IPI_ADDR_RLOCK();
 	/* fill up addresses for the endpoint's default vrf */
 	size = sctp_fill_up_addresses_vrf(inp, stcb, limit, addr,
 	    inp->def_vrf_id);
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (size);
 }
 
 static size_t
 sctp_max_size_addresses_vrf(struct sctp_inpcb *inp, uint32_t vrf_id)
 {
 	struct sctp_vrf *vrf;
 	size_t size;
 
 	/*
 	 * In both sub-set bound an bound_all cases we return the size of
 	 * the maximum number of addresses that you could get. In reality
 	 * the sub-set bound may have an exclusion list for a given TCB or
 	 * in the bound-all case a TCB may NOT include the loopback or other
 	 * addresses as well.
 	 */
 	SCTP_IPI_ADDR_LOCK_ASSERT();
 	vrf = sctp_find_vrf(vrf_id);
 	if (vrf == NULL) {
 		return (0);
 	}
 	size = 0;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		struct sctp_ifn *sctp_ifn;
 		struct sctp_ifa *sctp_ifa;
 
 		LIST_FOREACH(sctp_ifn, &vrf->ifnlist, next_ifn) {
 			LIST_FOREACH(sctp_ifa, &sctp_ifn->ifalist, next_ifa) {
 				/* Count them if they are the right type */
 				switch (sctp_ifa->address.sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 #ifdef INET6
 					if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
 						size += sizeof(struct sockaddr_in6);
 					else
 						size += sizeof(struct sockaddr_in);
 #else
 					size += sizeof(struct sockaddr_in);
 #endif
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					size += sizeof(struct sockaddr_in6);
 					break;
 #endif
 				default:
 					break;
 				}
 			}
 		}
 	} else {
 		struct sctp_laddr *laddr;
 
 		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 			switch (laddr->ifa->address.sa.sa_family) {
 #ifdef INET
 			case AF_INET:
 #ifdef INET6
 				if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4))
 					size += sizeof(struct sockaddr_in6);
 				else
 					size += sizeof(struct sockaddr_in);
 #else
 				size += sizeof(struct sockaddr_in);
 #endif
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				size += sizeof(struct sockaddr_in6);
 				break;
 #endif
 			default:
 				break;
 			}
 		}
 	}
 	return (size);
 }
 
 static size_t
 sctp_max_size_addresses(struct sctp_inpcb *inp)
 {
 	size_t size;
 
 	SCTP_IPI_ADDR_RLOCK();
 	/* Maximum size of all addresses for the endpoint's default VRF */
 	size = sctp_max_size_addresses_vrf(inp, inp->def_vrf_id);
 	SCTP_IPI_ADDR_RUNLOCK();
 	return (size);
 }
 
 static int
 sctp_do_connect_x(struct socket *so, struct sctp_inpcb *inp, void *optval,
     size_t optsize, void *p, int delay)
 {
 	int error;
 	int creat_lock_on = 0;
 	struct sctp_tcb *stcb = NULL;
 	struct sockaddr *sa;
 	unsigned int num_v6 = 0, num_v4 = 0, *totaddrp, totaddr;
 	uint32_t vrf_id;
 	sctp_assoc_t *a_id;
 
 	SCTPDBG(SCTP_DEBUG_PCB1, "Connectx called\n");
 
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
 		/* We are already connected AND the TCP model */
 		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
 		return (EADDRINUSE);
 	}
 
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
 	    (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) {
 		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 		SCTP_INP_RLOCK(inp);
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		SCTP_INP_RUNLOCK(inp);
 	}
 	if (stcb) {
 		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 		return (EALREADY);
 	}
 	SCTP_INP_INCR_REF(inp);
 	SCTP_ASOC_CREATE_LOCK(inp);
 	creat_lock_on = 1;
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
 		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT);
 		error = EFAULT;
 		goto out_now;
 	}
 	totaddrp = (unsigned int *)optval;
 	totaddr = *totaddrp;
 	sa = (struct sockaddr *)(totaddrp + 1);
 	error = sctp_connectx_helper_find(inp, sa, totaddr, &num_v4, &num_v6, (unsigned int)(optsize - sizeof(int)));
 	if (error != 0) {
 		/* Already have or am bring up an association */
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		creat_lock_on = 0;
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 		goto out_now;
 	}
 #ifdef INET6
 	if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
 	    (num_v6 > 0)) {
 		error = EINVAL;
 		goto out_now;
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 	    (num_v4 > 0)) {
 		if (SCTP_IPV6_V6ONLY(inp)) {
 			/*
 			 * if IPV6_V6ONLY flag, ignore connections destined
 			 * to a v4 addr or v4-mapped addr
 			 */
 			SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 			goto out_now;
 		}
 	}
 #endif				/* INET6 */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
 		/* Bind a ephemeral port */
 		error = sctp_inpcb_bind(so, NULL, NULL, p);
 		if (error) {
 			goto out_now;
 		}
 	}
 
 	/* FIX ME: do we want to pass in a vrf on the connect call? */
 	vrf_id = inp->def_vrf_id;
 
 	/* We are GOOD to go */
 	stcb = sctp_aloc_assoc_connected(inp, sa, &error, 0, 0, vrf_id,
 	    inp->sctp_ep.pre_open_stream_count,
 	    inp->sctp_ep.port,
 	    (struct thread *)p,
 	    SCTP_INITIALIZE_AUTH_PARAMS);
 	if (stcb == NULL) {
 		/* Gak! no memory */
 		goto out_now;
 	}
 	SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 	/* move to second address */
 	switch (sa->sa_family) {
 #ifdef INET
 	case AF_INET:
 		sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in));
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		sa = (struct sockaddr *)((caddr_t)sa + sizeof(struct sockaddr_in6));
 		break;
 #endif
 	default:
 		break;
 	}
 
 	error = 0;
 	sctp_connectx_helper_add(stcb, sa, (totaddr - 1), &error);
 	/* Fill in the return id */
 	if (error) {
 		goto out_now;
 	}
 	a_id = (sctp_assoc_t *)optval;
 	*a_id = sctp_get_associd(stcb);
 
 	if (delay) {
 		/* doing delayed connection */
 		stcb->asoc.delayed_connection = 1;
 		sctp_timer_start(SCTP_TIMER_TYPE_INIT, inp, stcb, stcb->asoc.primary_destination);
 	} else {
 		(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 		sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 	}
 	SCTP_TCB_UNLOCK(stcb);
 out_now:
 	if (creat_lock_on) {
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 	}
 	SCTP_INP_DECR_REF(inp);
 	return (error);
 }
 
 #define SCTP_FIND_STCB(inp, stcb, assoc_id) { \
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||\
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) { \
 		SCTP_INP_RLOCK(inp); \
 		stcb = LIST_FIRST(&inp->sctp_asoc_list); \
 		if (stcb) { \
 			SCTP_TCB_LOCK(stcb); \
 		} \
 		SCTP_INP_RUNLOCK(inp); \
 	} else if (assoc_id > SCTP_ALL_ASSOC) { \
 		stcb = sctp_findassociation_ep_asocid(inp, assoc_id, 1); \
 		if (stcb == NULL) { \
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT); \
 			error = ENOENT; \
 			break; \
 		} \
 	} else { \
 		stcb = NULL; \
 	} \
 }
 
 #define SCTP_CHECK_AND_CAST(destp, srcp, type, size) {\
 	if (size < sizeof(type)) { \
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL); \
 		error = EINVAL; \
 		break; \
 	} else { \
 		destp = (type *)srcp; \
 	} \
 }
 
 static int
 sctp_getopt(struct socket *so, int optname, void *optval, size_t *optsize,
     void *p)
 {
 	struct sctp_inpcb *inp = NULL;
 	int error, val = 0;
 	struct sctp_tcb *stcb = NULL;
 
 	if (optval == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return EINVAL;
 	}
 	error = 0;
 
 	switch (optname) {
 	case SCTP_NODELAY:
 	case SCTP_AUTOCLOSE:
 	case SCTP_EXPLICIT_EOR:
 	case SCTP_AUTO_ASCONF:
 	case SCTP_DISABLE_FRAGMENTS:
 	case SCTP_I_WANT_MAPPED_V4_ADDR:
 	case SCTP_USE_EXT_RCVINFO:
 		SCTP_INP_RLOCK(inp);
 		switch (optname) {
 		case SCTP_DISABLE_FRAGMENTS:
 			val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NO_FRAGMENT);
 			break;
 		case SCTP_I_WANT_MAPPED_V4_ADDR:
 			val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4);
 			break;
 		case SCTP_AUTO_ASCONF:
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 				/* only valid for bound all sockets */
 				val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTO_ASCONF);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				goto flags_out;
 			}
 			break;
 		case SCTP_EXPLICIT_EOR:
 			val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXPLICIT_EOR);
 			break;
 		case SCTP_NODELAY:
 			val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NODELAY);
 			break;
 		case SCTP_USE_EXT_RCVINFO:
 			val = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_EXT_RCVINFO);
 			break;
 		case SCTP_AUTOCLOSE:
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTOCLOSE))
 				val = sctp_ticks_to_secs(inp->sctp_ep.auto_close_time);
 			else
 				val = 0;
 			break;
 
 		default:
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
 			error = ENOPROTOOPT;
 		}		/* end switch (sopt->sopt_name) */
 		if (*optsize < sizeof(val)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 		}
 flags_out:
 		SCTP_INP_RUNLOCK(inp);
 		if (error == 0) {
 			/* return the option value */
 			*(int *)optval = val;
 			*optsize = sizeof(val);
 		}
 		break;
 	case SCTP_GET_PACKET_LOG:
 		{
 #ifdef  SCTP_PACKET_LOGGING
 			uint8_t *target;
 			int ret;
 
 			SCTP_CHECK_AND_CAST(target, optval, uint8_t, *optsize);
 			ret = sctp_copy_out_packet_log(target, (int)*optsize);
 			*optsize = ret;
 #else
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 			error = EOPNOTSUPP;
 #endif
 			break;
 		}
 	case SCTP_REUSE_PORT:
 		{
 			uint32_t *value;
 
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
 				/* Can't do this for a 1-m socket */
 				error = EINVAL;
 				break;
 			}
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			*value = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE);
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_PARTIAL_DELIVERY_POINT:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			*value = inp->partial_delivery_point;
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_FRAGMENT_INTERLEAVE:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) {
 				if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS)) {
 					*value = SCTP_FRAG_LEVEL_2;
 				} else {
 					*value = SCTP_FRAG_LEVEL_1;
 				}
 			} else {
 				*value = SCTP_FRAG_LEVEL_0;
 			}
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_INTERLEAVING_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.idata_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					if (inp->idata_supported) {
 						av->assoc_value = 1;
 					} else {
 						av->assoc_value = 0;
 					}
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_CMT_ON_OFF:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				av->assoc_value = stcb->asoc.sctp_cmt_on_off;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_cmt_on_off;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_PLUGGABLE_CC:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				av->assoc_value = stcb->asoc.congestion_control_module;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_ep.sctp_default_cc_module;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_CC_OPTION:
 		{
 			struct sctp_cc_option *cc_opt;
 
 			SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, *optsize);
 			SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id);
 			if (stcb == NULL) {
 				error = EINVAL;
 			} else {
 				if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) {
 					error = ENOTSUP;
 				} else {
 					error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 0, cc_opt);
 					*optsize = sizeof(struct sctp_cc_option);
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			break;
 		}
 	case SCTP_PLUGGABLE_SS:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				av->assoc_value = stcb->asoc.stream_scheduling_module;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_ep.sctp_default_ss_module;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_SS_VALUE:
 		{
 			struct sctp_stream_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				if ((av->stream_id >= stcb->asoc.streamoutcnt) ||
 				    (stcb->asoc.ss_functions.sctp_ss_get_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id],
 				    &av->stream_value) < 0)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				} else {
 					*optsize = sizeof(struct sctp_stream_value);
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				/*
 				 * Can't get stream value without
 				 * association
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			break;
 		}
 	case SCTP_GET_ADDR_LEN:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			error = EINVAL;
 #ifdef INET
 			if (av->assoc_value == AF_INET) {
 				av->assoc_value = sizeof(struct sockaddr_in);
 				error = 0;
 			}
 #endif
 #ifdef INET6
 			if (av->assoc_value == AF_INET6) {
 				av->assoc_value = sizeof(struct sockaddr_in6);
 				error = 0;
 			}
 #endif
 			if (error) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 			} else {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_GET_ASSOC_NUMBER:
 		{
 			uint32_t *value, cnt;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			SCTP_INP_RLOCK(inp);
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 				/* Can't do this for a 1-1 socket */
 				error = EINVAL;
 				SCTP_INP_RUNLOCK(inp);
 				break;
 			}
 			cnt = 0;
 			LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 				cnt++;
 			}
 			SCTP_INP_RUNLOCK(inp);
 			*value = cnt;
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_GET_ASSOC_ID_LIST:
 		{
 			struct sctp_assoc_ids *ids;
 			uint32_t at;
 			size_t limit;
 
 			SCTP_CHECK_AND_CAST(ids, optval, struct sctp_assoc_ids, *optsize);
 			SCTP_INP_RLOCK(inp);
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 				/* Can't do this for a 1-1 socket */
 				error = EINVAL;
 				SCTP_INP_RUNLOCK(inp);
 				break;
 			}
 			at = 0;
 			limit = (*optsize - sizeof(uint32_t)) / sizeof(sctp_assoc_t);
 			LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 				if (at < limit) {
 					ids->gaids_assoc_id[at++] = sctp_get_associd(stcb);
 					if (at == 0) {
 						error = EINVAL;
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else {
 					error = EINVAL;
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 			SCTP_INP_RUNLOCK(inp);
 			if (error == 0) {
 				ids->gaids_number_of_ids = at;
 				*optsize = ((at * sizeof(sctp_assoc_t)) + sizeof(uint32_t));
 			}
 			break;
 		}
 	case SCTP_CONTEXT:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.context;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_context;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_VRF_ID:
 		{
 			uint32_t *default_vrfid;
 
 			SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, *optsize);
 			*default_vrfid = inp->def_vrf_id;
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_GET_ASOC_VRF:
 		{
 			struct sctp_assoc_value *id;
 
 			SCTP_CHECK_AND_CAST(id, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, id->assoc_id);
 			if (stcb == NULL) {
 				error = EINVAL;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 			} else {
 				id->assoc_value = stcb->asoc.vrf_id;
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_GET_VRF_IDS:
 		{
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 			error = EOPNOTSUPP;
 			break;
 		}
 	case SCTP_GET_NONCE_VALUES:
 		{
 			struct sctp_get_nonce_values *gnv;
 
 			SCTP_CHECK_AND_CAST(gnv, optval, struct sctp_get_nonce_values, *optsize);
 			SCTP_FIND_STCB(inp, stcb, gnv->gn_assoc_id);
 
 			if (stcb) {
 				gnv->gn_peers_tag = stcb->asoc.peer_vtag;
 				gnv->gn_local_tag = stcb->asoc.my_vtag;
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_get_nonce_values);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
 				error = ENOTCONN;
 			}
 			break;
 		}
 	case SCTP_DELAYED_SACK:
 		{
 			struct sctp_sack_info *sack;
 
 			SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id);
 			if (stcb) {
 				sack->sack_delay = stcb->asoc.delayed_ack;
 				sack->sack_freq = stcb->asoc.sack_freq;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (sack->sack_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					sack->sack_delay = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV]);
 					sack->sack_freq = inp->sctp_ep.sctp_sack_freq;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_sack_info);
 			}
 			break;
 		}
 	case SCTP_GET_SNDBUF_USE:
 		{
 			struct sctp_sockstat *ss;
 
 			SCTP_CHECK_AND_CAST(ss, optval, struct sctp_sockstat, *optsize);
 			SCTP_FIND_STCB(inp, stcb, ss->ss_assoc_id);
 
 			if (stcb) {
 				ss->ss_total_sndbuf = stcb->asoc.total_output_queue_size;
 				ss->ss_total_recv_buf = (stcb->asoc.size_on_reasm_queue +
 				    stcb->asoc.size_on_all_streams);
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_sockstat);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
 				error = ENOTCONN;
 			}
 			break;
 		}
 	case SCTP_MAX_BURST:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.max_burst;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_ep.max_burst;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_MAXSEG:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.sctp_frag_point;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->sctp_frag_point;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_GET_STAT_LOG:
 		error = sctp_fill_stat_log(optval, optsize);
 		break;
 	case SCTP_EVENTS:
 		{
 			struct sctp_event_subscribe *events;
 
 			SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, *optsize);
 			memset(events, 0, sizeof(struct sctp_event_subscribe));
 			SCTP_INP_RLOCK(inp);
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT))
 				events->sctp_data_io_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT))
 				events->sctp_association_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT))
 				events->sctp_address_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT))
 				events->sctp_send_failure_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR))
 				events->sctp_peer_error_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT))
 				events->sctp_shutdown_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT))
 				events->sctp_partial_delivery_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT))
 				events->sctp_adaptation_layer_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT))
 				events->sctp_authentication_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT))
 				events->sctp_sender_dry_event = 1;
 
 			if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT))
 				events->sctp_stream_reset_event = 1;
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(struct sctp_event_subscribe);
 			break;
 		}
 	case SCTP_ADAPTATION_LAYER:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 
 			SCTP_INP_RLOCK(inp);
 			*value = inp->sctp_ep.adaptation_layer_indicator;
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_SET_INITIAL_DBG_SEQ:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			SCTP_INP_RLOCK(inp);
 			*value = inp->sctp_ep.initial_sequence_debug;
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_GET_LOCAL_ADDR_SIZE:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			SCTP_INP_RLOCK(inp);
 			*value = (uint32_t)sctp_max_size_addresses(inp);
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(uint32_t);
 			break;
 		}
 	case SCTP_GET_REMOTE_ADDR_SIZE:
 		{
 			uint32_t *value;
 			struct sctp_nets *net;
 			size_t size;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, *optsize);
 			/* FIXME MT: change to sctp_assoc_value? */
 			SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t)*value);
 
 			if (stcb != NULL) {
 				size = 0;
 				/* Count the sizes */
 				TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 					switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 					case AF_INET:
 #ifdef INET6
 						if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 							size += sizeof(struct sockaddr_in6);
 						} else {
 							size += sizeof(struct sockaddr_in);
 						}
 #else
 						size += sizeof(struct sockaddr_in);
 #endif
 						break;
 #endif
 #ifdef INET6
 					case AF_INET6:
 						size += sizeof(struct sockaddr_in6);
 						break;
 #endif
 					default:
 						break;
 					}
 				}
 				SCTP_TCB_UNLOCK(stcb);
 				*value = (uint32_t)size;
 				*optsize = sizeof(uint32_t);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((sctp_assoc_t)*value <= SCTP_ALL_ASSOC)) {
 					error = EINVAL;
 				} else {
 					error = ENOENT;
 				}
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 			}
 			break;
 		}
 	case SCTP_GET_PEER_ADDRESSES:
 		/*
 		 * Get the address information, an array is passed in to
 		 * fill up we pack it.
 		 */
 		{
 			size_t cpsz, left;
 			struct sockaddr *addr;
 			struct sctp_nets *net;
 			struct sctp_getaddresses *saddr;
 
 			SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize);
 			SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id);
 
 			if (stcb != NULL) {
 				left = *optsize - offsetof(struct sctp_getaddresses, addr);
 				*optsize = offsetof(struct sctp_getaddresses, addr);
 				addr = &saddr->addr[0].sa;
 
 				TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 					switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 					case AF_INET:
 #ifdef INET6
 						if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 							cpsz = sizeof(struct sockaddr_in6);
 						} else {
 							cpsz = sizeof(struct sockaddr_in);
 						}
 #else
 						cpsz = sizeof(struct sockaddr_in);
 #endif
 						break;
 #endif
 #ifdef INET6
 					case AF_INET6:
 						cpsz = sizeof(struct sockaddr_in6);
 						break;
 #endif
 					default:
 						cpsz = 0;
 						break;
 					}
 					if (cpsz == 0) {
 						break;
 					}
 					if (left < cpsz) {
 						/* not enough room. */
 						break;
 					}
 #if defined(INET) && defined(INET6)
 					if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) &&
 					    (net->ro._l_addr.sa.sa_family == AF_INET)) {
 						/* Must map the address */
 						in6_sin_2_v4mapsin6(&net->ro._l_addr.sin,
 						    (struct sockaddr_in6 *)addr);
 					} else {
 						memcpy(addr, &net->ro._l_addr, cpsz);
 					}
 #else
 					memcpy(addr, &net->ro._l_addr, cpsz);
 #endif
 					((struct sockaddr_in *)addr)->sin_port = stcb->rport;
 
 					addr = (struct sockaddr *)((caddr_t)addr + cpsz);
 					left -= cpsz;
 					*optsize += cpsz;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (saddr->sget_assoc_id <= SCTP_ALL_ASSOC)) {
 					error = EINVAL;
 				} else {
 					error = ENOENT;
 				}
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 			}
 			break;
 		}
 	case SCTP_GET_LOCAL_ADDRESSES:
 		{
 			size_t limit, actual;
 			struct sctp_getaddresses *saddr;
 
 			SCTP_CHECK_AND_CAST(saddr, optval, struct sctp_getaddresses, *optsize);
 			SCTP_FIND_STCB(inp, stcb, saddr->sget_assoc_id);
 
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 			    ((saddr->sget_assoc_id == SCTP_CURRENT_ASSOC) ||
 			    (saddr->sget_assoc_id == SCTP_ALL_ASSOC))) {
 				error = EINVAL;
 			} else {
 				limit = *optsize - offsetof(struct sctp_getaddresses, addr);
 				actual = sctp_fill_up_addresses(inp, stcb, limit, &saddr->addr[0].sa);
 				*optsize = offsetof(struct sctp_getaddresses, addr) + actual;
 			}
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			break;
 		}
 	case SCTP_PEER_ADDR_PARAMS:
 		{
 			struct sctp_paddrparams *paddrp;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, *optsize);
 			SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (paddrp->spp_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&paddrp->spp_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&paddrp->spp_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&paddrp->spp_address;
 			}
 #else
 			addr = (struct sockaddr *)&paddrp->spp_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 
 			if (stcb != NULL) {
 				/* Applies to the specific association */
 				paddrp->spp_flags = 0;
 				if (net != NULL) {
 					paddrp->spp_hbinterval = net->heart_beat_delay;
 					paddrp->spp_pathmaxrxt = net->failure_threshold;
 					paddrp->spp_pathmtu = net->mtu;
 					switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 					case AF_INET:
 						paddrp->spp_pathmtu -= SCTP_MIN_V4_OVERHEAD;
 						break;
 #endif
 #ifdef INET6
 					case AF_INET6:
 						paddrp->spp_pathmtu -= SCTP_MIN_OVERHEAD;
 						break;
 #endif
 					default:
 						break;
 					}
 					/* get flags for HB */
 					if (net->dest_state & SCTP_ADDR_NOHB) {
 						paddrp->spp_flags |= SPP_HB_DISABLE;
 					} else {
 						paddrp->spp_flags |= SPP_HB_ENABLE;
 					}
 					/* get flags for PMTU */
 					if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
 						paddrp->spp_flags |= SPP_PMTUD_DISABLE;
 					} else {
 						paddrp->spp_flags |= SPP_PMTUD_ENABLE;
 					}
 					if (net->dscp & 0x01) {
 						paddrp->spp_dscp = net->dscp & 0xfc;
 						paddrp->spp_flags |= SPP_DSCP;
 					}
 #ifdef INET6
 					if ((net->ro._l_addr.sa.sa_family == AF_INET6) &&
 					    (net->flowlabel & 0x80000000)) {
 						paddrp->spp_ipv6_flowlabel = net->flowlabel & 0x000fffff;
 						paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
 					}
 #endif
 				} else {
 					/*
 					 * No destination so return default
 					 * value
 					 */
 					paddrp->spp_pathmaxrxt = stcb->asoc.def_net_failure;
 					paddrp->spp_pathmtu = stcb->asoc.default_mtu;
 					if (stcb->asoc.default_dscp & 0x01) {
 						paddrp->spp_dscp = stcb->asoc.default_dscp & 0xfc;
 						paddrp->spp_flags |= SPP_DSCP;
 					}
 #ifdef INET6
 					if (stcb->asoc.default_flowlabel & 0x80000000) {
 						paddrp->spp_ipv6_flowlabel = stcb->asoc.default_flowlabel & 0x000fffff;
 						paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
 					}
 #endif
 					/* default settings should be these */
 					if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) {
 						paddrp->spp_flags |= SPP_HB_DISABLE;
 					} else {
 						paddrp->spp_flags |= SPP_HB_ENABLE;
 					}
 					if (sctp_stcb_is_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) {
 						paddrp->spp_flags |= SPP_PMTUD_DISABLE;
 					} else {
 						paddrp->spp_flags |= SPP_PMTUD_ENABLE;
 					}
 					paddrp->spp_hbinterval = stcb->asoc.heart_beat_delay;
 				}
 				paddrp->spp_assoc_id = sctp_get_associd(stcb);
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC))) {
 					/* Use endpoint defaults */
 					SCTP_INP_RLOCK(inp);
 					paddrp->spp_pathmaxrxt = inp->sctp_ep.def_net_failure;
 					paddrp->spp_hbinterval = sctp_ticks_to_msecs(inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT]);
 					paddrp->spp_assoc_id = SCTP_FUTURE_ASSOC;
 					/* get inp's default */
 					if (inp->sctp_ep.default_dscp & 0x01) {
 						paddrp->spp_dscp = inp->sctp_ep.default_dscp & 0xfc;
 						paddrp->spp_flags |= SPP_DSCP;
 					}
 #ifdef INET6
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) &&
 					    (inp->sctp_ep.default_flowlabel & 0x80000000)) {
 						paddrp->spp_ipv6_flowlabel = inp->sctp_ep.default_flowlabel & 0x000fffff;
 						paddrp->spp_flags |= SPP_IPV6_FLOWLABEL;
 					}
 #endif
 					paddrp->spp_pathmtu = inp->sctp_ep.default_mtu;
 
 					if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT)) {
 						paddrp->spp_flags |= SPP_HB_ENABLE;
 					} else {
 						paddrp->spp_flags |= SPP_HB_DISABLE;
 					}
 					if (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD)) {
 						paddrp->spp_flags |= SPP_PMTUD_ENABLE;
 					} else {
 						paddrp->spp_flags |= SPP_PMTUD_DISABLE;
 					}
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_paddrparams);
 			}
 			break;
 		}
 	case SCTP_GET_PEER_ADDR_INFO:
 		{
 			struct sctp_paddrinfo *paddri;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(paddri, optval, struct sctp_paddrinfo, *optsize);
 			SCTP_FIND_STCB(inp, stcb, paddri->spinfo_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (paddri->spinfo_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&paddri->spinfo_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&paddri->spinfo_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&paddri->spinfo_address;
 			}
 #else
 			addr = (struct sockaddr *)&paddri->spinfo_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 
 			if ((stcb != NULL) && (net != NULL)) {
 				if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
 					/* It's unconfirmed */
 					paddri->spinfo_state = SCTP_UNCONFIRMED;
 				} else if (net->dest_state & SCTP_ADDR_REACHABLE) {
 					/* It's active */
 					paddri->spinfo_state = SCTP_ACTIVE;
 				} else {
 					/* It's inactive */
 					paddri->spinfo_state = SCTP_INACTIVE;
 				}
 				paddri->spinfo_cwnd = net->cwnd;
 				paddri->spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT;
 				paddri->spinfo_rto = net->RTO;
 				paddri->spinfo_assoc_id = sctp_get_associd(stcb);
 				paddri->spinfo_mtu = net->mtu;
 				switch (addr->sa_family) {
 #if defined(INET)
 				case AF_INET:
 					paddri->spinfo_mtu -= SCTP_MIN_V4_OVERHEAD;
 					break;
 #endif
 #if defined(INET6)
 				case AF_INET6:
 					paddri->spinfo_mtu -= SCTP_MIN_OVERHEAD;
 					break;
 #endif
 				default:
 					break;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_paddrinfo);
 			} else {
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 			}
 			break;
 		}
 	case SCTP_PCB_STATUS:
 		{
 			struct sctp_pcbinfo *spcb;
 
 			SCTP_CHECK_AND_CAST(spcb, optval, struct sctp_pcbinfo, *optsize);
 			sctp_fill_pcbinfo(spcb);
 			*optsize = sizeof(struct sctp_pcbinfo);
 			break;
 		}
 	case SCTP_STATUS:
 		{
 			struct sctp_nets *net;
 			struct sctp_status *sstat;
 
 			SCTP_CHECK_AND_CAST(sstat, optval, struct sctp_status, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sstat->sstat_assoc_id);
 
 			if (stcb == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			sstat->sstat_state = sctp_map_assoc_state(stcb->asoc.state);
 			sstat->sstat_assoc_id = sctp_get_associd(stcb);
 			sstat->sstat_rwnd = stcb->asoc.peers_rwnd;
 			sstat->sstat_unackdata = stcb->asoc.sent_queue_cnt;
 			/*
 			 * We can't include chunks that have been passed to
 			 * the socket layer. Only things in queue.
 			 */
 			sstat->sstat_penddata = (stcb->asoc.cnt_on_reasm_queue +
 			    stcb->asoc.cnt_on_all_streams);
 			sstat->sstat_instrms = stcb->asoc.streamincnt;
 			sstat->sstat_outstrms = stcb->asoc.streamoutcnt;
 			sstat->sstat_fragmentation_point = sctp_get_frag_point(stcb);
 			net = stcb->asoc.primary_destination;
 			if (net != NULL) {
 				memcpy(&sstat->sstat_primary.spinfo_address,
 				    &net->ro._l_addr,
 				    ((struct sockaddr *)(&net->ro._l_addr))->sa_len);
 				((struct sockaddr_in *)&sstat->sstat_primary.spinfo_address)->sin_port = stcb->rport;
 				/*
 				 * Again the user can get info from
 				 * sctp_constants.h for what the state of
 				 * the network is.
 				 */
 				if (net->dest_state & SCTP_ADDR_UNCONFIRMED) {
 					/* It's unconfirmed */
 					sstat->sstat_primary.spinfo_state = SCTP_UNCONFIRMED;
 				} else if (net->dest_state & SCTP_ADDR_REACHABLE) {
 					/* It's active */
 					sstat->sstat_primary.spinfo_state = SCTP_ACTIVE;
 				} else {
 					/* It's inactive */
 					sstat->sstat_primary.spinfo_state = SCTP_INACTIVE;
 				}
 				sstat->sstat_primary.spinfo_cwnd = net->cwnd;
 				sstat->sstat_primary.spinfo_srtt = net->lastsa >> SCTP_RTT_SHIFT;
 				sstat->sstat_primary.spinfo_rto = net->RTO;
 				sstat->sstat_primary.spinfo_mtu = net->mtu;
 				switch (stcb->asoc.primary_destination->ro._l_addr.sa.sa_family) {
 #if defined(INET)
 				case AF_INET:
 					sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_V4_OVERHEAD;
 					break;
 #endif
 #if defined(INET6)
 				case AF_INET6:
 					sstat->sstat_primary.spinfo_mtu -= SCTP_MIN_OVERHEAD;
 					break;
 #endif
 				default:
 					break;
 				}
 			} else {
 				memset(&sstat->sstat_primary, 0, sizeof(struct sctp_paddrinfo));
 			}
 			sstat->sstat_primary.spinfo_assoc_id = sctp_get_associd(stcb);
 			SCTP_TCB_UNLOCK(stcb);
 			*optsize = sizeof(struct sctp_status);
 			break;
 		}
 	case SCTP_RTOINFO:
 		{
 			struct sctp_rtoinfo *srto;
 
 			SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, *optsize);
 			SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id);
 
 			if (stcb) {
 				srto->srto_initial = stcb->asoc.initial_rto;
 				srto->srto_max = stcb->asoc.maxrto;
 				srto->srto_min = stcb->asoc.minrto;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (srto->srto_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					srto->srto_initial = inp->sctp_ep.initial_rto;
 					srto->srto_max = inp->sctp_ep.sctp_maxrto;
 					srto->srto_min = inp->sctp_ep.sctp_minrto;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_rtoinfo);
 			}
 			break;
 		}
 	case SCTP_TIMEOUTS:
 		{
 			struct sctp_timeouts *stimo;
 
 			SCTP_CHECK_AND_CAST(stimo, optval, struct sctp_timeouts, *optsize);
 			SCTP_FIND_STCB(inp, stcb, stimo->stimo_assoc_id);
 
 			if (stcb) {
 				stimo->stimo_init = stcb->asoc.timoinit;
 				stimo->stimo_data = stcb->asoc.timodata;
 				stimo->stimo_sack = stcb->asoc.timosack;
 				stimo->stimo_shutdown = stcb->asoc.timoshutdown;
 				stimo->stimo_heartbeat = stcb->asoc.timoheartbeat;
 				stimo->stimo_cookie = stcb->asoc.timocookie;
 				stimo->stimo_shutdownack = stcb->asoc.timoshutdownack;
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_timeouts);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			break;
 		}
 	case SCTP_ASSOCINFO:
 		{
 			struct sctp_assocparams *sasoc;
 
 			SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id);
 
 			if (stcb) {
 				sasoc->sasoc_cookie_life = sctp_ticks_to_msecs(stcb->asoc.cookie_life);
 				sasoc->sasoc_asocmaxrxt = stcb->asoc.max_send_times;
 				sasoc->sasoc_number_peer_destinations = stcb->asoc.numnets;
 				sasoc->sasoc_peer_rwnd = stcb->asoc.peers_rwnd;
 				sasoc->sasoc_local_rwnd = stcb->asoc.my_rwnd;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					sasoc->sasoc_cookie_life = sctp_ticks_to_msecs(inp->sctp_ep.def_cookie_life);
 					sasoc->sasoc_asocmaxrxt = inp->sctp_ep.max_send_times;
 					sasoc->sasoc_number_peer_destinations = 0;
 					sasoc->sasoc_peer_rwnd = 0;
 					sasoc->sasoc_local_rwnd = (uint32_t)sbspace(&inp->sctp_socket->so_rcv);
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assocparams);
 			}
 			break;
 		}
 	case SCTP_DEFAULT_SEND_PARAM:
 		{
 			struct sctp_sndrcvinfo *s_info;
 
 			SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, *optsize);
 			SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id);
 
 			if (stcb) {
 				memcpy(s_info, &stcb->asoc.def_send, sizeof(stcb->asoc.def_send));
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					memcpy(s_info, &inp->def_send, sizeof(inp->def_send));
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_sndrcvinfo);
 			}
 			break;
 		}
 	case SCTP_INITMSG:
 		{
 			struct sctp_initmsg *sinit;
 
 			SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, *optsize);
 			SCTP_INP_RLOCK(inp);
 			sinit->sinit_num_ostreams = inp->sctp_ep.pre_open_stream_count;
 			sinit->sinit_max_instreams = inp->sctp_ep.max_open_streams_intome;
 			sinit->sinit_max_attempts = inp->sctp_ep.max_init_times;
 			sinit->sinit_max_init_timeo = inp->sctp_ep.initial_init_rto_max;
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(struct sctp_initmsg);
 			break;
 		}
 	case SCTP_PRIMARY_ADDR:
 		/* we allow a "get" operation on this */
 		{
 			struct sctp_setprim *ssp;
 
 			SCTP_CHECK_AND_CAST(ssp, optval, struct sctp_setprim, *optsize);
 			SCTP_FIND_STCB(inp, stcb, ssp->ssp_assoc_id);
 
 			if (stcb) {
 				union sctp_sockstore *addr;
 
 				addr = &stcb->asoc.primary_destination->ro._l_addr;
 				switch (addr->sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 #ifdef INET6
 					if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_NEEDS_MAPPED_V4)) {
 						in6_sin_2_v4mapsin6(&addr->sin,
 						    (struct sockaddr_in6 *)&ssp->ssp_addr);
 					} else {
 						memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in));
 					}
 #else
 					memcpy(&ssp->ssp_addr, &addr->sin, sizeof(struct sockaddr_in));
 #endif
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					memcpy(&ssp->ssp_addr, &addr->sin6, sizeof(struct sockaddr_in6));
 					break;
 #endif
 				default:
 					break;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 				*optsize = sizeof(struct sctp_setprim);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			break;
 		}
 	case SCTP_HMAC_IDENT:
 		{
 			struct sctp_hmacalgo *shmac;
 			sctp_hmaclist_t *hmaclist;
 			size_t size;
 			int i;
 
 			SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, *optsize);
 
 			SCTP_INP_RLOCK(inp);
 			hmaclist = inp->sctp_ep.local_hmacs;
 			if (hmaclist == NULL) {
 				/* no HMACs to return */
 				*optsize = sizeof(*shmac);
 				SCTP_INP_RUNLOCK(inp);
 				break;
 			}
 			/* is there room for all of the hmac ids? */
 			size = sizeof(*shmac) + (hmaclist->num_algo *
 			    sizeof(shmac->shmac_idents[0]));
 			if (*optsize < size) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_INP_RUNLOCK(inp);
 				break;
 			}
 			/* copy in the list */
 			shmac->shmac_number_of_idents = hmaclist->num_algo;
 			for (i = 0; i < hmaclist->num_algo; i++) {
 				shmac->shmac_idents[i] = hmaclist->hmac[i];
 			}
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = size;
 			break;
 		}
 	case SCTP_AUTH_ACTIVE_KEY:
 		{
 			struct sctp_authkeyid *scact;
 
 			SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, *optsize);
 			SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id);
 
 			if (stcb) {
 				/* get the active key on the assoc */
 				scact->scact_keynumber = stcb->asoc.authinfo.active_keyid;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (scact->scact_assoc_id == SCTP_FUTURE_ASSOC))) {
 					/* get the endpoint active key */
 					SCTP_INP_RLOCK(inp);
 					scact->scact_keynumber = inp->sctp_ep.default_keyid;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_authkeyid);
 			}
 			break;
 		}
 	case SCTP_LOCAL_AUTH_CHUNKS:
 		{
 			struct sctp_authchunks *sac;
 			sctp_auth_chklist_t *chklist = NULL;
 			size_t size = 0;
 
 			SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id);
 
 			if (stcb) {
 				/* get off the assoc */
 				chklist = stcb->asoc.local_auth_chunks;
 				/* is there enough space? */
 				size = sctp_auth_get_chklist_size(chklist);
 				if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
 					error = EINVAL;
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				} else {
 					/* copy in the chunks */
 					(void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
 					sac->gauth_number_of_chunks = (uint32_t)size;
 					*optsize = sizeof(struct sctp_authchunks) + size;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (sac->gauth_assoc_id == SCTP_FUTURE_ASSOC))) {
 					/* get off the endpoint */
 					SCTP_INP_RLOCK(inp);
 					chklist = inp->sctp_ep.local_auth_chunks;
 					/* is there enough space? */
 					size = sctp_auth_get_chklist_size(chklist);
 					if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
 						error = EINVAL;
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					} else {
 						/* copy in the chunks */
 						(void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
 						sac->gauth_number_of_chunks = (uint32_t)size;
 						*optsize = sizeof(struct sctp_authchunks) + size;
 					}
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_PEER_AUTH_CHUNKS:
 		{
 			struct sctp_authchunks *sac;
 			sctp_auth_chklist_t *chklist = NULL;
 			size_t size = 0;
 
 			SCTP_CHECK_AND_CAST(sac, optval, struct sctp_authchunks, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sac->gauth_assoc_id);
 
 			if (stcb) {
 				/* get off the assoc */
 				chklist = stcb->asoc.peer_auth_chunks;
 				/* is there enough space? */
 				size = sctp_auth_get_chklist_size(chklist);
 				if (*optsize < (sizeof(struct sctp_authchunks) + size)) {
 					error = EINVAL;
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				} else {
 					/* copy in the chunks */
 					(void)sctp_serialize_auth_chunks(chklist, sac->gauth_chunks);
 					sac->gauth_number_of_chunks = (uint32_t)size;
 					*optsize = sizeof(struct sctp_authchunks) + size;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 			}
 			break;
 		}
 	case SCTP_EVENT:
 		{
 			struct sctp_event *event;
 			uint32_t event_type;
 
 			SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, *optsize);
 			SCTP_FIND_STCB(inp, stcb, event->se_assoc_id);
 
 			switch (event->se_type) {
 			case SCTP_ASSOC_CHANGE:
 				event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT;
 				break;
 			case SCTP_PEER_ADDR_CHANGE:
 				event_type = SCTP_PCB_FLAGS_RECVPADDREVNT;
 				break;
 			case SCTP_REMOTE_ERROR:
 				event_type = SCTP_PCB_FLAGS_RECVPEERERR;
 				break;
 			case SCTP_SEND_FAILED:
 				event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT;
 				break;
 			case SCTP_SHUTDOWN_EVENT:
 				event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT;
 				break;
 			case SCTP_ADAPTATION_INDICATION:
 				event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT;
 				break;
 			case SCTP_PARTIAL_DELIVERY_EVENT:
 				event_type = SCTP_PCB_FLAGS_PDAPIEVNT;
 				break;
 			case SCTP_AUTHENTICATION_EVENT:
 				event_type = SCTP_PCB_FLAGS_AUTHEVNT;
 				break;
 			case SCTP_STREAM_RESET_EVENT:
 				event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT;
 				break;
 			case SCTP_SENDER_DRY_EVENT:
 				event_type = SCTP_PCB_FLAGS_DRYEVNT;
 				break;
 			case SCTP_NOTIFICATIONS_STOPPED_EVENT:
 				event_type = 0;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP);
 				error = ENOTSUP;
 				break;
 			case SCTP_ASSOC_RESET_EVENT:
 				event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT;
 				break;
 			case SCTP_STREAM_CHANGE_EVENT:
 				event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT;
 				break;
 			case SCTP_SEND_FAILED_EVENT:
 				event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT;
 				break;
 			default:
 				event_type = 0;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			if (event_type > 0) {
 				if (stcb) {
 					event->se_on = sctp_stcb_is_feature_on(inp, stcb, event_type);
 				} else {
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 					    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 					    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 					    (event->se_assoc_id == SCTP_FUTURE_ASSOC))) {
 						SCTP_INP_RLOCK(inp);
 						event->se_on = sctp_is_feature_on(inp, event_type);
 						SCTP_INP_RUNLOCK(inp);
 					} else {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 				}
 			}
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_event);
 			}
 			break;
 		}
 	case SCTP_RECVRCVINFO:
 		if (*optsize < sizeof(int)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 		} else {
 			SCTP_INP_RLOCK(inp);
 			*(int *)optval = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO);
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(int);
 		}
 		break;
 	case SCTP_RECVNXTINFO:
 		if (*optsize < sizeof(int)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 		} else {
 			SCTP_INP_RLOCK(inp);
 			*(int *)optval = sctp_is_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO);
 			SCTP_INP_RUNLOCK(inp);
 			*optsize = sizeof(int);
 		}
 		break;
 	case SCTP_DEFAULT_SNDINFO:
 		{
 			struct sctp_sndinfo *info;
 
 			SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, *optsize);
 			SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id);
 
 			if (stcb) {
 				info->snd_sid = stcb->asoc.def_send.sinfo_stream;
 				info->snd_flags = stcb->asoc.def_send.sinfo_flags;
 				info->snd_flags &= 0xfff0;
 				info->snd_ppid = stcb->asoc.def_send.sinfo_ppid;
 				info->snd_context = stcb->asoc.def_send.sinfo_context;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (info->snd_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					info->snd_sid = inp->def_send.sinfo_stream;
 					info->snd_flags = inp->def_send.sinfo_flags;
 					info->snd_flags &= 0xfff0;
 					info->snd_ppid = inp->def_send.sinfo_ppid;
 					info->snd_context = inp->def_send.sinfo_context;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_sndinfo);
 			}
 			break;
 		}
 	case SCTP_DEFAULT_PRINFO:
 		{
 			struct sctp_default_prinfo *info;
 
 			SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, *optsize);
 			SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id);
 
 			if (stcb) {
 				info->pr_policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags);
 				info->pr_value = stcb->asoc.def_send.sinfo_timetolive;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (info->pr_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					info->pr_policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags);
 					info->pr_value = inp->def_send.sinfo_timetolive;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_default_prinfo);
 			}
 			break;
 		}
 	case SCTP_PEER_ADDR_THLDS:
 		{
 			struct sctp_paddrthlds *thlds;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, *optsize);
 			SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (thlds->spt_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&thlds->spt_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&thlds->spt_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&thlds->spt_address;
 			}
 #else
 			addr = (struct sockaddr *)&thlds->spt_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 
 			if (stcb != NULL) {
 				if (net != NULL) {
 					thlds->spt_pathmaxrxt = net->failure_threshold;
 					thlds->spt_pathpfthld = net->pf_threshold;
 					thlds->spt_pathcpthld = 0xffff;
 				} else {
 					thlds->spt_pathmaxrxt = stcb->asoc.def_net_failure;
 					thlds->spt_pathpfthld = stcb->asoc.def_net_pf_threshold;
 					thlds->spt_pathcpthld = 0xffff;
 				}
 				thlds->spt_assoc_id = sctp_get_associd(stcb);
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC))) {
 					/* Use endpoint defaults */
 					SCTP_INP_RLOCK(inp);
 					thlds->spt_pathmaxrxt = inp->sctp_ep.def_net_failure;
 					thlds->spt_pathpfthld = inp->sctp_ep.def_net_pf_threshold;
 					thlds->spt_pathcpthld = 0xffff;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_paddrthlds);
 			}
 			break;
 		}
 	case SCTP_REMOTE_UDP_ENCAPS_PORT:
 		{
 			struct sctp_udpencaps *encaps;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, *optsize);
 			SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (encaps->sue_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&encaps->sue_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&encaps->sue_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&encaps->sue_address;
 			}
 #else
 			addr = (struct sockaddr *)&encaps->sue_address;
 #endif
 			if (stcb) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						error = EINVAL;
 						SCTP_TCB_UNLOCK(stcb);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 
 			if (stcb != NULL) {
 				if (net) {
 					encaps->sue_port = net->port;
 				} else {
 					encaps->sue_port = stcb->asoc.port;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					encaps->sue_port = inp->sctp_ep.port;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_udpencaps);
 			}
 			break;
 		}
 	case SCTP_ECN_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.ecn_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->ecn_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_PR_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.prsctp_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->prsctp_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_AUTH_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.auth_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->auth_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_ASCONF_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.asconf_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->asconf_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_RECONFIG_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.reconfig_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->reconfig_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_NRSACK_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.nrsack_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->nrsack_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_PKTDROP_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.pktdrop_supported;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->pktdrop_supported;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_ENABLE_STREAM_RESET:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = (uint32_t)stcb->asoc.local_strreset_support;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = (uint32_t)inp->local_strreset_support;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	case SCTP_PR_STREAM_STATUS:
 		{
 			struct sctp_prstatus *sprstat;
 			uint16_t sid;
 			uint16_t policy;
 
 			SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id);
 
 			sid = sprstat->sprstat_sid;
 			policy = sprstat->sprstat_policy;
 #if defined(SCTP_DETAILED_STR_STATS)
 			if ((stcb != NULL) &&
 			    (sid < stcb->asoc.streamoutcnt) &&
 			    (policy != SCTP_PR_SCTP_NONE) &&
 			    ((policy <= SCTP_PR_SCTP_MAX) ||
 			    (policy == SCTP_PR_SCTP_ALL))) {
 				if (policy == SCTP_PR_SCTP_ALL) {
 					sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0];
 					sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0];
 				} else {
 					sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[policy];
 					sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[policy];
 				}
 #else
 			if ((stcb != NULL) &&
 			    (sid < stcb->asoc.streamoutcnt) &&
 			    (policy == SCTP_PR_SCTP_ALL)) {
 				sprstat->sprstat_abandoned_unsent = stcb->asoc.strmout[sid].abandoned_unsent[0];
 				sprstat->sprstat_abandoned_sent = stcb->asoc.strmout[sid].abandoned_sent[0];
 #endif
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_prstatus);
 			}
 			break;
 		}
 	case SCTP_PR_ASSOC_STATUS:
 		{
 			struct sctp_prstatus *sprstat;
 			uint16_t policy;
 
 			SCTP_CHECK_AND_CAST(sprstat, optval, struct sctp_prstatus, *optsize);
 			SCTP_FIND_STCB(inp, stcb, sprstat->sprstat_assoc_id);
 
 			policy = sprstat->sprstat_policy;
 			if ((stcb != NULL) &&
 			    (policy != SCTP_PR_SCTP_NONE) &&
 			    ((policy <= SCTP_PR_SCTP_MAX) ||
 			    (policy == SCTP_PR_SCTP_ALL))) {
 				if (policy == SCTP_PR_SCTP_ALL) {
 					sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[0];
 					sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[0];
 				} else {
 					sprstat->sprstat_abandoned_unsent = stcb->asoc.abandoned_unsent[policy];
 					sprstat->sprstat_abandoned_sent = stcb->asoc.abandoned_sent[policy];
 				}
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_prstatus);
 			}
 			break;
 		}
 	case SCTP_MAX_CWND:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, *optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				av->assoc_value = stcb->asoc.max_cwnd;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					av->assoc_value = inp->max_cwnd;
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			if (error == 0) {
 				*optsize = sizeof(struct sctp_assoc_value);
 			}
 			break;
 		}
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
 		error = ENOPROTOOPT;
 		break;
 	}			/* end switch (sopt->sopt_name) */
 	if (error) {
 		*optsize = 0;
 	}
 	return (error);
 }
 
 static int
 sctp_setopt(struct socket *so, int optname, void *optval, size_t optsize,
     void *p)
 {
 	int error, set_opt;
 	uint32_t *mopt;
 	struct sctp_tcb *stcb = NULL;
 	struct sctp_inpcb *inp = NULL;
 	uint32_t vrf_id;
 
 	if (optval == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	vrf_id = inp->def_vrf_id;
 
 	error = 0;
 	switch (optname) {
 	case SCTP_NODELAY:
 	case SCTP_AUTOCLOSE:
 	case SCTP_AUTO_ASCONF:
 	case SCTP_EXPLICIT_EOR:
 	case SCTP_DISABLE_FRAGMENTS:
 	case SCTP_USE_EXT_RCVINFO:
 	case SCTP_I_WANT_MAPPED_V4_ADDR:
 		/* copy in the option value */
 		SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize);
 		set_opt = 0;
 		if (error)
 			break;
 		switch (optname) {
 		case SCTP_DISABLE_FRAGMENTS:
 			set_opt = SCTP_PCB_FLAGS_NO_FRAGMENT;
 			break;
 		case SCTP_AUTO_ASCONF:
 			/*
 			 * NOTE: we don't really support this flag
 			 */
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 				/* only valid for bound all sockets */
 				if ((SCTP_BASE_SYSCTL(sctp_auto_asconf) == 0) &&
 				    (*mopt != 0)) {
 					/* forbidden by admin */
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EPERM);
 					return (EPERM);
 				}
 				set_opt = SCTP_PCB_FLAGS_AUTO_ASCONF;
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			break;
 		case SCTP_EXPLICIT_EOR:
 			set_opt = SCTP_PCB_FLAGS_EXPLICIT_EOR;
 			break;
 		case SCTP_USE_EXT_RCVINFO:
 			set_opt = SCTP_PCB_FLAGS_EXT_RCVINFO;
 			break;
 		case SCTP_I_WANT_MAPPED_V4_ADDR:
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 				set_opt = SCTP_PCB_FLAGS_NEEDS_MAPPED_V4;
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			break;
 		case SCTP_NODELAY:
 			set_opt = SCTP_PCB_FLAGS_NODELAY;
 			break;
 		case SCTP_AUTOCLOSE:
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 			    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			set_opt = SCTP_PCB_FLAGS_AUTOCLOSE;
 			/*
 			 * The value is in ticks. Note this does not effect
 			 * old associations, only new ones.
 			 */
 			inp->sctp_ep.auto_close_time = sctp_secs_to_ticks(*mopt);
 			break;
 		}
 		SCTP_INP_WLOCK(inp);
 		if (*mopt != 0) {
 			sctp_feature_on(inp, set_opt);
 		} else {
 			sctp_feature_off(inp, set_opt);
 		}
 		SCTP_INP_WUNLOCK(inp);
 		break;
 	case SCTP_REUSE_PORT:
 		{
 			SCTP_CHECK_AND_CAST(mopt, optval, uint32_t, optsize);
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) == 0) {
 				/* Can't set it after we are bound */
 				error = EINVAL;
 				break;
 			}
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE)) {
 				/* Can't do this for a 1-m socket */
 				error = EINVAL;
 				break;
 			}
 			if (optval)
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE);
 			else
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE);
 			break;
 		}
 	case SCTP_PARTIAL_DELIVERY_POINT:
 		{
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize);
 			if (*value > SCTP_SB_LIMIT_RCV(so)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			inp->partial_delivery_point = *value;
 			break;
 		}
 	case SCTP_FRAGMENT_INTERLEAVE:
 		/* not yet until we re-write sctp_recvmsg() */
 		{
 			uint32_t *level;
 
 			SCTP_CHECK_AND_CAST(level, optval, uint32_t, optsize);
 			if (*level == SCTP_FRAG_LEVEL_2) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 			} else if (*level == SCTP_FRAG_LEVEL_1) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 			} else if (*level == SCTP_FRAG_LEVEL_0) {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE);
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS);
 
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			break;
 		}
 	case SCTP_INTERLEAVING_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->idata_supported = 0;
 					} else {
 						if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_FRAG_INTERLEAVE)) &&
 						    (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_INTERLEAVE_STRMS))) {
 							inp->idata_supported = 1;
 						} else {
 							/*
 							 * Must have Frag
 							 * interleave and
 							 * stream interleave
 							 * on
 							 */
 							SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 							error = EINVAL;
 						}
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_CMT_ON_OFF:
 		if (SCTP_BASE_SYSCTL(sctp_cmt_on_off)) {
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			if (av->assoc_value > SCTP_CMT_MAX) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				stcb->asoc.sctp_cmt_on_off = av->assoc_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_cmt_on_off = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.sctp_cmt_on_off = av->assoc_value;
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 		} else {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
 			error = ENOPROTOOPT;
 		}
 		break;
 	case SCTP_PLUGGABLE_CC:
 		{
 			struct sctp_assoc_value *av;
 			struct sctp_nets *net;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			if ((av->assoc_value != SCTP_CC_RFC2581) &&
 			    (av->assoc_value != SCTP_CC_HSTCP) &&
 			    (av->assoc_value != SCTP_CC_HTCP) &&
 			    (av->assoc_value != SCTP_CC_RTCC)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value];
 				stcb->asoc.congestion_control_module = av->assoc_value;
 				if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) {
 					TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 						stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
 					}
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_ep.sctp_default_cc_module = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.cc_functions = sctp_cc_functions[av->assoc_value];
 						stcb->asoc.congestion_control_module = av->assoc_value;
 						if (stcb->asoc.cc_functions.sctp_set_initial_cc_param != NULL) {
 							TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 								stcb->asoc.cc_functions.sctp_set_initial_cc_param(stcb, net);
 							}
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_CC_OPTION:
 		{
 			struct sctp_cc_option *cc_opt;
 
 			SCTP_CHECK_AND_CAST(cc_opt, optval, struct sctp_cc_option, optsize);
 			SCTP_FIND_STCB(inp, stcb, cc_opt->aid_value.assoc_id);
 			if (stcb == NULL) {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (cc_opt->aid_value.assoc_id == SCTP_CURRENT_ASSOC)) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						if (stcb->asoc.cc_functions.sctp_cwnd_socket_option) {
 							(*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1, cc_opt);
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					error = EINVAL;
 				}
 			} else {
 				if (stcb->asoc.cc_functions.sctp_cwnd_socket_option == NULL) {
 					error = ENOTSUP;
 				} else {
 					error = (*stcb->asoc.cc_functions.sctp_cwnd_socket_option) (stcb, 1,
 					    cc_opt);
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			break;
 		}
 	case SCTP_PLUGGABLE_SS:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			if ((av->assoc_value != SCTP_SS_DEFAULT) &&
 			    (av->assoc_value != SCTP_SS_ROUND_ROBIN) &&
 			    (av->assoc_value != SCTP_SS_ROUND_ROBIN_PACKET) &&
 			    (av->assoc_value != SCTP_SS_PRIORITY) &&
 			    (av->assoc_value != SCTP_SS_FAIR_BANDWITH) &&
 			    (av->assoc_value != SCTP_SS_FIRST_COME)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, true);
 				stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value];
 				stcb->asoc.stream_scheduling_module = av->assoc_value;
 				stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc);
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_ep.sctp_default_ss_module = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.ss_functions.sctp_ss_clear(stcb, &stcb->asoc, true);
 						stcb->asoc.ss_functions = sctp_ss_functions[av->assoc_value];
 						stcb->asoc.stream_scheduling_module = av->assoc_value;
 						stcb->asoc.ss_functions.sctp_ss_init(stcb, &stcb->asoc);
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_SS_VALUE:
 		{
 			struct sctp_stream_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_stream_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				if ((av->stream_id >= stcb->asoc.streamoutcnt) ||
 				    (stcb->asoc.ss_functions.sctp_ss_set_value(stcb, &stcb->asoc, &stcb->asoc.strmout[av->stream_id],
 				    av->stream_value) < 0)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_CURRENT_ASSOC)) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						if (av->stream_id < stcb->asoc.streamoutcnt) {
 							stcb->asoc.ss_functions.sctp_ss_set_value(stcb,
 							    &stcb->asoc,
 							    &stcb->asoc.strmout[av->stream_id],
 							    av->stream_value);
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				} else {
 					/*
 					 * Can't set stream value without
 					 * association
 					 */
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_CLR_STAT_LOG:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 		error = EOPNOTSUPP;
 		break;
 	case SCTP_CONTEXT:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				stcb->asoc.context = av->assoc_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_context = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.context = av->assoc_value;
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_VRF_ID:
 		{
 			uint32_t *default_vrfid;
 
 			SCTP_CHECK_AND_CAST(default_vrfid, optval, uint32_t, optsize);
 			if (*default_vrfid > SCTP_MAX_VRF_ID) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			inp->def_vrf_id = *default_vrfid;
 			break;
 		}
 	case SCTP_DEL_VRF_ID:
 		{
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 			error = EOPNOTSUPP;
 			break;
 		}
 	case SCTP_ADD_VRF_ID:
 		{
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 			error = EOPNOTSUPP;
 			break;
 		}
 	case SCTP_DELAYED_SACK:
 		{
 			struct sctp_sack_info *sack;
 
 			SCTP_CHECK_AND_CAST(sack, optval, struct sctp_sack_info, optsize);
 			SCTP_FIND_STCB(inp, stcb, sack->sack_assoc_id);
 			if (sack->sack_delay) {
 				if (sack->sack_delay > SCTP_MAX_SACK_DELAY) {
 					error = EINVAL;
 					if (stcb != NULL) {
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					break;
 				}
 			}
 			if (stcb) {
 				if (sack->sack_delay) {
 					stcb->asoc.delayed_ack = sack->sack_delay;
 				}
 				if (sack->sack_freq) {
 					stcb->asoc.sack_freq = sack->sack_freq;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((sack->sack_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (sack->sack_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					if (sack->sack_delay) {
 						inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_RECV] = sctp_msecs_to_ticks(sack->sack_delay);
 					}
 					if (sack->sack_freq) {
 						inp->sctp_ep.sctp_sack_freq = sack->sack_freq;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((sack->sack_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (sack->sack_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						if (sack->sack_delay) {
 							stcb->asoc.delayed_ack = sack->sack_delay;
 						}
 						if (sack->sack_freq) {
 							stcb->asoc.sack_freq = sack->sack_freq;
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_AUTH_CHUNK:
 		{
 			struct sctp_authchunk *sauth;
 
 			SCTP_CHECK_AND_CAST(sauth, optval, struct sctp_authchunk, optsize);
 
 			SCTP_INP_WLOCK(inp);
 			if (sctp_auth_add_chunk(sauth->sauth_chunk, inp->sctp_ep.local_auth_chunks)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			} else {
 				inp->auth_supported = 1;
 			}
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 	case SCTP_AUTH_KEY:
 		{
 			struct sctp_authkey *sca;
 			struct sctp_keyhead *shared_keys;
 			sctp_sharedkey_t *shared_key;
 			sctp_key_t *key = NULL;
 			size_t size;
 
 			SCTP_CHECK_AND_CAST(sca, optval, struct sctp_authkey, optsize);
 			if (sca->sca_keylength == 0) {
 				size = optsize - sizeof(struct sctp_authkey);
 			} else {
 				if (sca->sca_keylength + sizeof(struct sctp_authkey) <= optsize) {
 					size = sca->sca_keylength;
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 			}
 			SCTP_FIND_STCB(inp, stcb, sca->sca_assoc_id);
 
 			if (stcb) {
 				shared_keys = &stcb->asoc.shared_keys;
 				/* clear the cached keys for this key id */
 				sctp_clear_cachedkeys(stcb, sca->sca_keynumber);
 				/*
 				 * create the new shared key and
 				 * insert/replace it
 				 */
 				if (size > 0) {
 					key = sctp_set_key(sca->sca_key, (uint32_t)size);
 					if (key == NULL) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 						error = ENOMEM;
 						SCTP_TCB_UNLOCK(stcb);
 						break;
 					}
 				}
 				shared_key = sctp_alloc_sharedkey();
 				if (shared_key == NULL) {
 					sctp_free_key(key);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 					error = ENOMEM;
 					SCTP_TCB_UNLOCK(stcb);
 					break;
 				}
 				shared_key->key = key;
 				shared_key->keyid = sca->sca_keynumber;
 				error = sctp_insert_sharedkey(shared_keys, shared_key);
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((sca->sca_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (sca->sca_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					shared_keys = &inp->sctp_ep.shared_keys;
 					/*
 					 * clear the cached keys on all
 					 * assocs for this key id
 					 */
 					sctp_clear_cachedkeys_ep(inp, sca->sca_keynumber);
 					/*
 					 * create the new shared key and
 					 * insert/replace it
 					 */
 					if (size > 0) {
 						key = sctp_set_key(sca->sca_key, (uint32_t)size);
 						if (key == NULL) {
 							SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 							error = ENOMEM;
 							SCTP_INP_WUNLOCK(inp);
 							break;
 						}
 					}
 					shared_key = sctp_alloc_sharedkey();
 					if (shared_key == NULL) {
 						sctp_free_key(key);
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 						error = ENOMEM;
 						SCTP_INP_WUNLOCK(inp);
 						break;
 					}
 					shared_key->key = key;
 					shared_key->keyid = sca->sca_keynumber;
 					error = sctp_insert_sharedkey(shared_keys, shared_key);
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((sca->sca_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (sca->sca_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						shared_keys = &stcb->asoc.shared_keys;
 						/*
 						 * clear the cached keys for
 						 * this key id
 						 */
 						sctp_clear_cachedkeys(stcb, sca->sca_keynumber);
 						/*
 						 * create the new shared key
 						 * and insert/replace it
 						 */
 						if (size > 0) {
 							key = sctp_set_key(sca->sca_key, (uint32_t)size);
 							if (key == NULL) {
 								SCTP_TCB_UNLOCK(stcb);
 								continue;
 							}
 						}
 						shared_key = sctp_alloc_sharedkey();
 						if (shared_key == NULL) {
 							sctp_free_key(key);
 							SCTP_TCB_UNLOCK(stcb);
 							continue;
 						}
 						shared_key->key = key;
 						shared_key->keyid = sca->sca_keynumber;
 						error = sctp_insert_sharedkey(shared_keys, shared_key);
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_HMAC_IDENT:
 		{
 			struct sctp_hmacalgo *shmac;
 			sctp_hmaclist_t *hmaclist;
 			uint16_t hmacid;
 			uint32_t i;
 
 			SCTP_CHECK_AND_CAST(shmac, optval, struct sctp_hmacalgo, optsize);
 			if ((optsize < sizeof(struct sctp_hmacalgo) + shmac->shmac_number_of_idents * sizeof(uint16_t)) ||
 			    (shmac->shmac_number_of_idents > 0xffff)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 
 			hmaclist = sctp_alloc_hmaclist((uint16_t)shmac->shmac_number_of_idents);
 			if (hmaclist == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 				error = ENOMEM;
 				break;
 			}
 			for (i = 0; i < shmac->shmac_number_of_idents; i++) {
 				hmacid = shmac->shmac_idents[i];
 				if (sctp_auth_add_hmacid(hmaclist, hmacid)) {
 					 /* invalid HMACs were found */ ;
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					sctp_free_hmaclist(hmaclist);
 					goto sctp_set_hmac_done;
 				}
 			}
 			for (i = 0; i < hmaclist->num_algo; i++) {
 				if (hmaclist->hmac[i] == SCTP_AUTH_HMAC_ID_SHA1) {
 					/* already in list */
 					break;
 				}
 			}
 			if (i == hmaclist->num_algo) {
 				/* not found in list */
 				sctp_free_hmaclist(hmaclist);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			/* set it on the endpoint */
 			SCTP_INP_WLOCK(inp);
 			if (inp->sctp_ep.local_hmacs)
 				sctp_free_hmaclist(inp->sctp_ep.local_hmacs);
 			inp->sctp_ep.local_hmacs = hmaclist;
 			SCTP_INP_WUNLOCK(inp);
 	sctp_set_hmac_done:
 			break;
 		}
 	case SCTP_AUTH_ACTIVE_KEY:
 		{
 			struct sctp_authkeyid *scact;
 
 			SCTP_CHECK_AND_CAST(scact, optval, struct sctp_authkeyid, optsize);
 			SCTP_FIND_STCB(inp, stcb, scact->scact_assoc_id);
 
 			/* set the active key on the right place */
 			if (stcb) {
 				/* set the active key on the assoc */
 				if (sctp_auth_setactivekey(stcb,
 				    scact->scact_keynumber)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL,
 					    SCTP_FROM_SCTP_USRREQ,
 					    EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((scact->scact_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (scact->scact_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					if (sctp_auth_setactivekey_ep(inp, scact->scact_keynumber)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((scact->scact_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (scact->scact_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						sctp_auth_setactivekey(stcb, scact->scact_keynumber);
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_AUTH_DELETE_KEY:
 		{
 			struct sctp_authkeyid *scdel;
 
 			SCTP_CHECK_AND_CAST(scdel, optval, struct sctp_authkeyid, optsize);
 			SCTP_FIND_STCB(inp, stcb, scdel->scact_assoc_id);
 
 			/* delete the key from the right place */
 			if (stcb) {
 				if (sctp_delete_sharedkey(stcb, scdel->scact_keynumber)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((scdel->scact_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (scdel->scact_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					if (sctp_delete_sharedkey_ep(inp, scdel->scact_keynumber)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((scdel->scact_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (scdel->scact_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						sctp_delete_sharedkey(stcb, scdel->scact_keynumber);
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_AUTH_DEACTIVATE_KEY:
 		{
 			struct sctp_authkeyid *keyid;
 
 			SCTP_CHECK_AND_CAST(keyid, optval, struct sctp_authkeyid, optsize);
 			SCTP_FIND_STCB(inp, stcb, keyid->scact_assoc_id);
 
 			/* deactivate the key from the right place */
 			if (stcb) {
 				if (sctp_deact_sharedkey(stcb, keyid->scact_keynumber)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((keyid->scact_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (keyid->scact_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					if (sctp_deact_sharedkey_ep(inp, keyid->scact_keynumber)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((keyid->scact_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (keyid->scact_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						sctp_deact_sharedkey(stcb, keyid->scact_keynumber);
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_ENABLE_STREAM_RESET:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			if (av->assoc_value & (~SCTP_ENABLE_VALUE_MASK)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 			if (stcb) {
 				stcb->asoc.local_strreset_support = (uint8_t)av->assoc_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->local_strreset_support = (uint8_t)av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.local_strreset_support = (uint8_t)av->assoc_value;
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_RESET_STREAMS:
 		{
 			struct sctp_reset_streams *strrst;
 			int i, send_out = 0;
 			int send_in = 0;
 
 			SCTP_CHECK_AND_CAST(strrst, optval, struct sctp_reset_streams, optsize);
 			SCTP_FIND_STCB(inp, stcb, strrst->srs_assoc_id);
 			if (stcb == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 				break;
 			}
 			if (stcb->asoc.reconfig_supported == 0) {
 				/*
 				 * Peer does not support the chunk type.
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 				error = EOPNOTSUPP;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (sizeof(struct sctp_reset_streams) +
 			    strrst->srs_number_streams * sizeof(uint16_t) > optsize) {
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (strrst->srs_flags & SCTP_STREAM_RESET_INCOMING) {
 				send_in = 1;
 				if (stcb->asoc.stream_reset_outstanding) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 					error = EALREADY;
 					SCTP_TCB_UNLOCK(stcb);
 					break;
 				}
 			}
 			if (strrst->srs_flags & SCTP_STREAM_RESET_OUTGOING) {
 				send_out = 1;
 			}
 			if ((strrst->srs_number_streams > SCTP_MAX_STREAMS_AT_ONCE_RESET) && send_in) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOMEM);
 				error = ENOMEM;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if ((send_in == 0) && (send_out == 0)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			for (i = 0; i < strrst->srs_number_streams; i++) {
 				if ((send_in) &&
 				    (strrst->srs_stream_list[i] >= stcb->asoc.streamincnt)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 				if ((send_out) &&
 				    (strrst->srs_stream_list[i] >= stcb->asoc.streamoutcnt)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 			}
 			if (error) {
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (send_out) {
 				int cnt;
 				uint16_t strm;
 
 				if (strrst->srs_number_streams) {
 					for (i = 0, cnt = 0; i < strrst->srs_number_streams; i++) {
 						strm = strrst->srs_stream_list[i];
 						if (stcb->asoc.strmout[strm].state == SCTP_STREAM_OPEN) {
 							stcb->asoc.strmout[strm].state = SCTP_STREAM_RESET_PENDING;
 							cnt++;
 						}
 					}
 				} else {
 					/* Its all */
 					for (i = 0, cnt = 0; i < stcb->asoc.streamoutcnt; i++) {
 						if (stcb->asoc.strmout[i].state == SCTP_STREAM_OPEN) {
 							stcb->asoc.strmout[i].state = SCTP_STREAM_RESET_PENDING;
 							cnt++;
 						}
 					}
 				}
 			}
 			if (send_in) {
 				error = sctp_send_str_reset_req(stcb, strrst->srs_number_streams,
 				    strrst->srs_stream_list,
 				    send_in, 0, 0, 0, 0, 0);
 			} else {
 				error = sctp_send_stream_reset_out_if_possible(stcb, SCTP_SO_LOCKED);
 			}
 			if (error == 0) {
 				sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
 			} else {
 				/*
 				 * For outgoing streams don't report any
 				 * problems in sending the request to the
 				 * application. XXX: Double check resetting
 				 * incoming streams.
 				 */
 				error = 0;
 			}
 			SCTP_TCB_UNLOCK(stcb);
 			break;
 		}
 	case SCTP_ADD_STREAMS:
 		{
 			struct sctp_add_streams *stradd;
 			uint8_t addstream = 0;
 			uint16_t add_o_strmcnt = 0;
 			uint16_t add_i_strmcnt = 0;
 
 			SCTP_CHECK_AND_CAST(stradd, optval, struct sctp_add_streams, optsize);
 			SCTP_FIND_STCB(inp, stcb, stradd->sas_assoc_id);
 			if (stcb == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 				break;
 			}
 			if (stcb->asoc.reconfig_supported == 0) {
 				/*
 				 * Peer does not support the chunk type.
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 				error = EOPNOTSUPP;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (stcb->asoc.stream_reset_outstanding) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 				error = EALREADY;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if ((stradd->sas_outstrms == 0) &&
 			    (stradd->sas_instrms == 0)) {
 				error = EINVAL;
 				goto skip_stuff;
 			}
 			if (stradd->sas_outstrms) {
 				addstream = 1;
 				/* We allocate here */
 				add_o_strmcnt = stradd->sas_outstrms;
 				if ((((int)add_o_strmcnt) + ((int)stcb->asoc.streamoutcnt)) > 0x0000ffff) {
 					/* You can't have more than 64k */
 					error = EINVAL;
 					goto skip_stuff;
 				}
 			}
 			if (stradd->sas_instrms) {
 				int cnt;
 
 				addstream |= 2;
 				/*
 				 * We allocate inside
 				 * sctp_send_str_reset_req()
 				 */
 				add_i_strmcnt = stradd->sas_instrms;
 				cnt = add_i_strmcnt;
 				cnt += stcb->asoc.streamincnt;
 				if (cnt > 0x0000ffff) {
 					/* You can't have more than 64k */
 					error = EINVAL;
 					goto skip_stuff;
 				}
 				if (cnt > (int)stcb->asoc.max_inbound_streams) {
 					/* More than you are allowed */
 					error = EINVAL;
 					goto skip_stuff;
 				}
 			}
 			error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 0, addstream, add_o_strmcnt, add_i_strmcnt, 0);
 			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
 	skip_stuff:
 			SCTP_TCB_UNLOCK(stcb);
 			break;
 		}
 	case SCTP_RESET_ASSOC:
 		{
 			int i;
 			uint32_t *value;
 
 			SCTP_CHECK_AND_CAST(value, optval, uint32_t, optsize);
 			SCTP_FIND_STCB(inp, stcb, (sctp_assoc_t)*value);
 			if (stcb == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 				break;
 			}
 			if (stcb->asoc.reconfig_supported == 0) {
 				/*
 				 * Peer does not support the chunk type.
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 				error = EOPNOTSUPP;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (SCTP_GET_STATE(stcb) != SCTP_STATE_OPEN) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			if (stcb->asoc.stream_reset_outstanding) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 				error = EALREADY;
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			/*
 			 * Is there any data pending in the send or sent
 			 * queues?
 			 */
 			if (!TAILQ_EMPTY(&stcb->asoc.send_queue) ||
 			    !TAILQ_EMPTY(&stcb->asoc.sent_queue)) {
 		busy_out:
 				error = EBUSY;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				SCTP_TCB_UNLOCK(stcb);
 				break;
 			}
 			/* Do any streams have data queued? */
 			for (i = 0; i < stcb->asoc.streamoutcnt; i++) {
 				if (!TAILQ_EMPTY(&stcb->asoc.strmout[i].outqueue)) {
 					goto busy_out;
 				}
 			}
 			error = sctp_send_str_reset_req(stcb, 0, NULL, 0, 1, 0, 0, 0, 0);
 			sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_STRRST_REQ, SCTP_SO_LOCKED);
 			SCTP_TCB_UNLOCK(stcb);
 			break;
 		}
 	case SCTP_CONNECT_X:
 		if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 			break;
 		}
 		error = sctp_do_connect_x(so, inp, optval, optsize, p, 0);
 		break;
 	case SCTP_CONNECT_X_DELAYED:
 		if (optsize < (sizeof(int) + sizeof(struct sockaddr_in))) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 			error = EINVAL;
 			break;
 		}
 		error = sctp_do_connect_x(so, inp, optval, optsize, p, 1);
 		break;
 	case SCTP_CONNECT_X_COMPLETE:
 		{
 			struct sockaddr *sa;
 
 			/* FIXME MT: check correct? */
 			SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize);
 
 			/* find tcb */
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 				SCTP_INP_RLOCK(inp);
 				stcb = LIST_FIRST(&inp->sctp_asoc_list);
 				if (stcb) {
 					SCTP_TCB_LOCK(stcb);
 				}
 				SCTP_INP_RUNLOCK(inp);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, sa, NULL, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 
 			if (stcb == NULL) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 				error = ENOENT;
 				break;
 			}
 			if (stcb->asoc.delayed_connection == 1) {
 				stcb->asoc.delayed_connection = 0;
 				(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 				sctp_timer_stop(SCTP_TIMER_TYPE_INIT, inp, stcb,
 				    stcb->asoc.primary_destination,
 				    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_8);
 				sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 			} else {
 				/*
 				 * already expired or did not use delayed
 				 * connectx
 				 */
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 				error = EALREADY;
 			}
 			SCTP_TCB_UNLOCK(stcb);
 			break;
 		}
 	case SCTP_MAX_BURST:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				stcb->asoc.max_burst = av->assoc_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_ep.max_burst = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((av->assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (av->assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.max_burst = av->assoc_value;
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_MAXSEG:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				stcb->asoc.sctp_frag_point = av->assoc_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_frag_point = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_EVENTS:
 		{
 			struct sctp_event_subscribe *events;
 
 			SCTP_CHECK_AND_CAST(events, optval, struct sctp_event_subscribe, optsize);
 
 			SCTP_INP_WLOCK(inp);
 			if (events->sctp_data_io_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVDATAIOEVNT);
 			}
 
 			if (events->sctp_association_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVASSOCEVNT);
 			}
 
 			if (events->sctp_address_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPADDREVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPADDREVNT);
 			}
 
 			if (events->sctp_send_failure_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
 			}
 
 			if (events->sctp_peer_error_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVPEERERR);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVPEERERR);
 			}
 
 			if (events->sctp_shutdown_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
 			}
 
 			if (events->sctp_partial_delivery_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_PDAPIEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_PDAPIEVNT);
 			}
 
 			if (events->sctp_adaptation_layer_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
 			}
 
 			if (events->sctp_authentication_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_AUTHEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_AUTHEVNT);
 			}
 
 			if (events->sctp_sender_dry_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_DRYEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_DRYEVNT);
 			}
 
 			if (events->sctp_stream_reset_event) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
 			}
 
 			LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 				SCTP_TCB_LOCK(stcb);
 				if (events->sctp_association_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVASSOCEVNT);
 				}
 				if (events->sctp_address_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPADDREVNT);
 				}
 				if (events->sctp_send_failure_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSENDFAILEVNT);
 				}
 				if (events->sctp_peer_error_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVPEERERR);
 				}
 				if (events->sctp_shutdown_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT);
 				}
 				if (events->sctp_partial_delivery_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_PDAPIEVNT);
 				}
 				if (events->sctp_adaptation_layer_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_ADAPTATIONEVNT);
 				}
 				if (events->sctp_authentication_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_AUTHEVNT);
 				}
 				if (events->sctp_sender_dry_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DRYEVNT);
 				}
 				if (events->sctp_stream_reset_event) {
 					sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
 				} else {
 					sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_STREAM_RESETEVNT);
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			/*
 			 * Send up the sender dry event only for 1-to-1
 			 * style sockets.
 			 */
 			if (events->sctp_sender_dry_event) {
 				if (((inp->sctp_flags & (SCTP_PCB_FLAGS_TCPTYPE | SCTP_PCB_FLAGS_IN_TCPPOOL)) != 0) &&
 				    !SCTP_IS_LISTENING(inp)) {
 					stcb = LIST_FIRST(&inp->sctp_asoc_list);
 					if (stcb != NULL) {
 						SCTP_TCB_LOCK(stcb);
 						if (TAILQ_EMPTY(&stcb->asoc.send_queue) &&
 						    TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
 						    (stcb->asoc.stream_queue_cnt == 0)) {
 							sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED);
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 				}
 			}
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 	case SCTP_ADAPTATION_LAYER:
 		{
 			struct sctp_setadaptation *adap_bits;
 
 			SCTP_CHECK_AND_CAST(adap_bits, optval, struct sctp_setadaptation, optsize);
 			SCTP_INP_WLOCK(inp);
 			inp->sctp_ep.adaptation_layer_indicator = adap_bits->ssb_adaptation_ind;
 			inp->sctp_ep.adaptation_layer_indicator_provided = 1;
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 #ifdef SCTP_DEBUG
 	case SCTP_SET_INITIAL_DBG_SEQ:
 		{
 			uint32_t *vvv;
 
 			SCTP_CHECK_AND_CAST(vvv, optval, uint32_t, optsize);
 			SCTP_INP_WLOCK(inp);
 			inp->sctp_ep.initial_sequence_debug = *vvv;
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 #endif
 	case SCTP_DEFAULT_SEND_PARAM:
 		{
 			struct sctp_sndrcvinfo *s_info;
 
 			SCTP_CHECK_AND_CAST(s_info, optval, struct sctp_sndrcvinfo, optsize);
 			SCTP_FIND_STCB(inp, stcb, s_info->sinfo_assoc_id);
 
 			if (stcb) {
 				if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) {
 					memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send)));
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((s_info->sinfo_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					memcpy(&inp->def_send, s_info, min(optsize, sizeof(inp->def_send)));
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((s_info->sinfo_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (s_info->sinfo_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						if (s_info->sinfo_stream < stcb->asoc.streamoutcnt) {
 							memcpy(&stcb->asoc.def_send, s_info, min(optsize, sizeof(stcb->asoc.def_send)));
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_PEER_ADDR_PARAMS:
 		{
 			struct sctp_paddrparams *paddrp;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(paddrp, optval, struct sctp_paddrparams, optsize);
 			SCTP_FIND_STCB(inp, stcb, paddrp->spp_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (paddrp->spp_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&paddrp->spp_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&paddrp->spp_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&paddrp->spp_address;
 			}
 #else
 			addr = (struct sockaddr *)&paddrp->spp_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr,
 				    &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 			/* sanity checks */
 			if ((paddrp->spp_flags & SPP_HB_ENABLE) && (paddrp->spp_flags & SPP_HB_DISABLE)) {
 				if (stcb)
 					SCTP_TCB_UNLOCK(stcb);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 
 			if ((paddrp->spp_flags & SPP_PMTUD_ENABLE) && (paddrp->spp_flags & SPP_PMTUD_DISABLE)) {
 				if (stcb)
 					SCTP_TCB_UNLOCK(stcb);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			if ((paddrp->spp_flags & SPP_PMTUD_DISABLE) &&
 			    (paddrp->spp_pathmtu > 0) &&
 			    ((paddrp->spp_pathmtu < SCTP_SMALLEST_PMTU) ||
 			    (paddrp->spp_pathmtu > SCTP_LARGEST_PMTU))) {
 				if (stcb)
 					SCTP_TCB_UNLOCK(stcb);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 
 			if (stcb != NULL) {
 				/************************TCB SPECIFIC SET ******************/
 				if (net != NULL) {
 					/************************NET SPECIFIC SET ******************/
 					if (paddrp->spp_flags & SPP_HB_DISABLE) {
 						if (((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) &&
 						    ((net->dest_state & SCTP_ADDR_NOHB) == 0)) {
 							sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 							    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_9);
 						}
 						net->dest_state |= SCTP_ADDR_NOHB;
 					}
 					if (paddrp->spp_flags & SPP_HB_ENABLE) {
 						if (paddrp->spp_hbinterval) {
 							net->heart_beat_delay = paddrp->spp_hbinterval;
 						} else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
 							net->heart_beat_delay = 0;
 						}
 						sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 						    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_10);
 						sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
 						net->dest_state &= ~SCTP_ADDR_NOHB;
 					}
 					if (paddrp->spp_flags & SPP_HB_DEMAND) {
 						if (SCTP_GET_STATE(stcb) == SCTP_STATE_OPEN) {
 							sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
 							sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED);
 							sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
 						}
 					}
 					if (paddrp->spp_flags & SPP_PMTUD_DISABLE) {
 						if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 							sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 							    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_11);
 						}
 						net->dest_state |= SCTP_ADDR_NO_PMTUD;
 						if (paddrp->spp_pathmtu > 0) {
 							net->mtu = paddrp->spp_pathmtu;
 							switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 							case AF_INET:
 								net->mtu += SCTP_MIN_V4_OVERHEAD;
 								break;
 #endif
 #ifdef INET6
 							case AF_INET6:
 								net->mtu += SCTP_MIN_OVERHEAD;
 								break;
 #endif
 							default:
 								break;
 							}
 							if (net->mtu < stcb->asoc.smallest_mtu) {
 								sctp_pathmtu_adjustment(stcb, net->mtu, true);
 							}
 						}
 					}
 					if (paddrp->spp_flags & SPP_PMTUD_ENABLE) {
 						if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 							sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
 						}
 						net->dest_state &= ~SCTP_ADDR_NO_PMTUD;
 					}
 					if (paddrp->spp_pathmaxrxt > 0) {
 						if (net->dest_state & SCTP_ADDR_PF) {
 							if (net->error_count > paddrp->spp_pathmaxrxt) {
 								net->dest_state &= ~SCTP_ADDR_PF;
 							}
 						} else {
 							if ((net->error_count <= paddrp->spp_pathmaxrxt) &&
 							    (net->error_count > net->pf_threshold)) {
 								net->dest_state |= SCTP_ADDR_PF;
 								sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
 								sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
 								    stcb->sctp_ep, stcb, net,
 								    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_12);
 								sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
 							}
 						}
 						if (net->dest_state & SCTP_ADDR_REACHABLE) {
 							if (net->error_count > paddrp->spp_pathmaxrxt) {
 								net->dest_state &= ~SCTP_ADDR_REACHABLE;
 								sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
 							}
 						} else {
 							if (net->error_count <= paddrp->spp_pathmaxrxt) {
 								net->dest_state |= SCTP_ADDR_REACHABLE;
 								sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
 							}
 						}
 						net->failure_threshold = paddrp->spp_pathmaxrxt;
 					}
 					if (paddrp->spp_flags & SPP_DSCP) {
 						net->dscp = paddrp->spp_dscp & 0xfc;
 						net->dscp |= 0x01;
 					}
 #ifdef INET6
 					if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) {
 						if (net->ro._l_addr.sa.sa_family == AF_INET6) {
 							net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff;
 							net->flowlabel |= 0x80000000;
 						}
 					}
 #endif
 				} else {
 					/************************ASSOC ONLY -- NO NET SPECIFIC SET ******************/
 					if (paddrp->spp_pathmaxrxt > 0) {
 						stcb->asoc.def_net_failure = paddrp->spp_pathmaxrxt;
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if (net->dest_state & SCTP_ADDR_PF) {
 								if (net->error_count > paddrp->spp_pathmaxrxt) {
 									net->dest_state &= ~SCTP_ADDR_PF;
 								}
 							} else {
 								if ((net->error_count <= paddrp->spp_pathmaxrxt) &&
 								    (net->error_count > net->pf_threshold)) {
 									net->dest_state |= SCTP_ADDR_PF;
 									sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
 									sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
 									    stcb->sctp_ep, stcb, net,
 									    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_13);
 									sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
 								}
 							}
 							if (net->dest_state & SCTP_ADDR_REACHABLE) {
 								if (net->error_count > paddrp->spp_pathmaxrxt) {
 									net->dest_state &= ~SCTP_ADDR_REACHABLE;
 									sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
 								}
 							} else {
 								if (net->error_count <= paddrp->spp_pathmaxrxt) {
 									net->dest_state |= SCTP_ADDR_REACHABLE;
 									sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
 								}
 							}
 							net->failure_threshold = paddrp->spp_pathmaxrxt;
 						}
 					}
 					if (paddrp->spp_flags & SPP_HB_ENABLE) {
 						if (paddrp->spp_hbinterval != 0) {
 							stcb->asoc.heart_beat_delay = paddrp->spp_hbinterval;
 						} else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
 							stcb->asoc.heart_beat_delay = 0;
 						}
 						/* Turn back on the timer */
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if (paddrp->spp_hbinterval != 0) {
 								net->heart_beat_delay = paddrp->spp_hbinterval;
 							} else if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
 								net->heart_beat_delay = 0;
 							}
 							if (net->dest_state & SCTP_ADDR_NOHB) {
 								net->dest_state &= ~SCTP_ADDR_NOHB;
 							}
 							sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net,
 							    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_14);
 							sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, inp, stcb, net);
 						}
 						sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
 					}
 					if (paddrp->spp_flags & SPP_HB_DISABLE) {
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if ((net->dest_state & SCTP_ADDR_NOHB) == 0) {
 								net->dest_state |= SCTP_ADDR_NOHB;
 								if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
 									sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
 									    inp, stcb, net,
 									    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_15);
 								}
 							}
 						}
 						sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
 					}
 					if (paddrp->spp_flags & SPP_PMTUD_DISABLE) {
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 								sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 								    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_16);
 							}
 							net->dest_state |= SCTP_ADDR_NO_PMTUD;
 							if (paddrp->spp_pathmtu > 0) {
 								net->mtu = paddrp->spp_pathmtu;
 								switch (net->ro._l_addr.sa.sa_family) {
 #ifdef INET
 								case AF_INET:
 									net->mtu += SCTP_MIN_V4_OVERHEAD;
 									break;
 #endif
 #ifdef INET6
 								case AF_INET6:
 									net->mtu += SCTP_MIN_OVERHEAD;
 									break;
 #endif
 								default:
 									break;
 								}
 								if (net->mtu < stcb->asoc.smallest_mtu) {
 									sctp_pathmtu_adjustment(stcb, net->mtu, true);
 								}
 							}
 						}
 						if (paddrp->spp_pathmtu > 0) {
 							stcb->asoc.default_mtu = paddrp->spp_pathmtu;
 						}
 						sctp_stcb_feature_on(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD);
 					}
 					if (paddrp->spp_flags & SPP_PMTUD_ENABLE) {
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if (!SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 								sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
 							}
 							net->dest_state &= ~SCTP_ADDR_NO_PMTUD;
 						}
 						stcb->asoc.default_mtu = 0;
 						sctp_stcb_feature_off(inp, stcb, SCTP_PCB_FLAGS_DO_NOT_PMTUD);
 					}
 					if (paddrp->spp_flags & SPP_DSCP) {
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							net->dscp = paddrp->spp_dscp & 0xfc;
 							net->dscp |= 0x01;
 						}
 						stcb->asoc.default_dscp = paddrp->spp_dscp & 0xfc;
 						stcb->asoc.default_dscp |= 0x01;
 					}
 #ifdef INET6
 					if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) {
 						TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 							if (net->ro._l_addr.sa.sa_family == AF_INET6) {
 								net->flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff;
 								net->flowlabel |= 0x80000000;
 							}
 						}
 						stcb->asoc.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff;
 						stcb->asoc.default_flowlabel |= 0x80000000;
 					}
 #endif
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				/************************NO TCB, SET TO default stuff ******************/
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (paddrp->spp_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					/*
 					 * For the TOS/FLOWLABEL stuff you
 					 * set it with the options on the
 					 * socket
 					 */
 					if (paddrp->spp_pathmaxrxt > 0) {
 						inp->sctp_ep.def_net_failure = paddrp->spp_pathmaxrxt;
 					}
 
 					if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO)
 						inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0;
 					else if (paddrp->spp_hbinterval != 0) {
 						if (paddrp->spp_hbinterval > SCTP_MAX_HB_INTERVAL)
 							paddrp->spp_hbinterval = SCTP_MAX_HB_INTERVAL;
 						inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(paddrp->spp_hbinterval);
 					}
 
 					if (paddrp->spp_flags & SPP_HB_ENABLE) {
 						if (paddrp->spp_flags & SPP_HB_TIME_IS_ZERO) {
 							inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = 0;
 						} else if (paddrp->spp_hbinterval) {
 							inp->sctp_ep.sctp_timeoutticks[SCTP_TIMER_HEARTBEAT] = sctp_msecs_to_ticks(paddrp->spp_hbinterval);
 						}
 						sctp_feature_off(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
 					} else if (paddrp->spp_flags & SPP_HB_DISABLE) {
 						sctp_feature_on(inp, SCTP_PCB_FLAGS_DONOT_HEARTBEAT);
 					}
 					if (paddrp->spp_flags & SPP_PMTUD_ENABLE) {
 						inp->sctp_ep.default_mtu = 0;
 						sctp_feature_off(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD);
 					} else if (paddrp->spp_flags & SPP_PMTUD_DISABLE) {
 						if (paddrp->spp_pathmtu > 0) {
 							inp->sctp_ep.default_mtu = paddrp->spp_pathmtu;
 						}
 						sctp_feature_on(inp, SCTP_PCB_FLAGS_DO_NOT_PMTUD);
 					}
 					if (paddrp->spp_flags & SPP_DSCP) {
 						inp->sctp_ep.default_dscp = paddrp->spp_dscp & 0xfc;
 						inp->sctp_ep.default_dscp |= 0x01;
 					}
 #ifdef INET6
 					if (paddrp->spp_flags & SPP_IPV6_FLOWLABEL) {
 						if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 							inp->sctp_ep.default_flowlabel = paddrp->spp_ipv6_flowlabel & 0x000fffff;
 							inp->sctp_ep.default_flowlabel |= 0x80000000;
 						}
 					}
 #endif
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_RTOINFO:
 		{
 			struct sctp_rtoinfo *srto;
 			uint32_t new_init, new_min, new_max;
 
 			SCTP_CHECK_AND_CAST(srto, optval, struct sctp_rtoinfo, optsize);
 			SCTP_FIND_STCB(inp, stcb, srto->srto_assoc_id);
 
 			if (stcb) {
 				if (srto->srto_initial)
 					new_init = srto->srto_initial;
 				else
 					new_init = stcb->asoc.initial_rto;
 				if (srto->srto_max)
 					new_max = srto->srto_max;
 				else
 					new_max = stcb->asoc.maxrto;
 				if (srto->srto_min)
 					new_min = srto->srto_min;
 				else
 					new_min = stcb->asoc.minrto;
 				if ((new_min <= new_init) && (new_init <= new_max)) {
 					stcb->asoc.initial_rto = new_init;
 					stcb->asoc.maxrto = new_max;
 					stcb->asoc.minrto = new_min;
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (srto->srto_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (srto->srto_initial)
 						new_init = srto->srto_initial;
 					else
 						new_init = inp->sctp_ep.initial_rto;
 					if (srto->srto_max)
 						new_max = srto->srto_max;
 					else
 						new_max = inp->sctp_ep.sctp_maxrto;
 					if (srto->srto_min)
 						new_min = srto->srto_min;
 					else
 						new_min = inp->sctp_ep.sctp_minrto;
 					if ((new_min <= new_init) && (new_init <= new_max)) {
 						inp->sctp_ep.initial_rto = new_init;
 						inp->sctp_ep.sctp_maxrto = new_max;
 						inp->sctp_ep.sctp_minrto = new_min;
 					} else {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_ASSOCINFO:
 		{
 			struct sctp_assocparams *sasoc;
 
 			SCTP_CHECK_AND_CAST(sasoc, optval, struct sctp_assocparams, optsize);
 			SCTP_FIND_STCB(inp, stcb, sasoc->sasoc_assoc_id);
 			if (sasoc->sasoc_cookie_life > 0) {
 				/* boundary check the cookie life */
 				if (sasoc->sasoc_cookie_life < SCTP_MIN_COOKIE_LIFE) {
 					sasoc->sasoc_cookie_life = SCTP_MIN_COOKIE_LIFE;
 				}
 				if (sasoc->sasoc_cookie_life > SCTP_MAX_COOKIE_LIFE) {
 					sasoc->sasoc_cookie_life = SCTP_MAX_COOKIE_LIFE;
 				}
 			}
 			if (stcb) {
 				if (sasoc->sasoc_asocmaxrxt > 0) {
 					stcb->asoc.max_send_times = sasoc->sasoc_asocmaxrxt;
 				}
 				if (sasoc->sasoc_cookie_life > 0) {
 					stcb->asoc.cookie_life = sctp_msecs_to_ticks(sasoc->sasoc_cookie_life);
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (sasoc->sasoc_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (sasoc->sasoc_asocmaxrxt > 0) {
 						inp->sctp_ep.max_send_times = sasoc->sasoc_asocmaxrxt;
 					}
 					if (sasoc->sasoc_cookie_life > 0) {
 						inp->sctp_ep.def_cookie_life = sctp_msecs_to_ticks(sasoc->sasoc_cookie_life);
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_INITMSG:
 		{
 			struct sctp_initmsg *sinit;
 
 			SCTP_CHECK_AND_CAST(sinit, optval, struct sctp_initmsg, optsize);
 			SCTP_INP_WLOCK(inp);
 			if (sinit->sinit_num_ostreams)
 				inp->sctp_ep.pre_open_stream_count = sinit->sinit_num_ostreams;
 
 			if (sinit->sinit_max_instreams)
 				inp->sctp_ep.max_open_streams_intome = sinit->sinit_max_instreams;
 
 			if (sinit->sinit_max_attempts)
 				inp->sctp_ep.max_init_times = sinit->sinit_max_attempts;
 
 			if (sinit->sinit_max_init_timeo)
 				inp->sctp_ep.initial_init_rto_max = sinit->sinit_max_init_timeo;
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 	case SCTP_PRIMARY_ADDR:
 		{
 			struct sctp_setprim *spa;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(spa, optval, struct sctp_setprim, optsize);
 			SCTP_FIND_STCB(inp, stcb, spa->ssp_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (spa->ssp_addr.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&spa->ssp_addr;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&spa->ssp_addr;
 				}
 			} else {
 				addr = (struct sockaddr *)&spa->ssp_addr;
 			}
 #else
 			addr = (struct sockaddr *)&spa->ssp_addr;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr,
 				    &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 
 			if ((stcb != NULL) && (net != NULL)) {
 				if (net != stcb->asoc.primary_destination) {
 					if ((net->dest_state & SCTP_ADDR_UNCONFIRMED) == 0) {
 						/* Ok we need to set it */
 						if (sctp_set_primary_addr(stcb, (struct sockaddr *)NULL, net) == 0) {
 							if ((stcb->asoc.alternate) &&
 							    ((net->dest_state & SCTP_ADDR_PF) == 0) &&
 							    (net->dest_state & SCTP_ADDR_REACHABLE)) {
 								sctp_free_remote_addr(stcb->asoc.alternate);
 								stcb->asoc.alternate = NULL;
 							}
 						} else {
 							SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 							error = EINVAL;
 						}
 					} else {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					}
 				}
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			if (stcb != NULL) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 			break;
 		}
 	case SCTP_SET_DYNAMIC_PRIMARY:
 		{
 			union sctp_sockstore *ss;
 
 			error = priv_check(curthread,
 			    PRIV_NETINET_RESERVEDPORT);
 			if (error)
 				break;
 
 			SCTP_CHECK_AND_CAST(ss, optval, union sctp_sockstore, optsize);
 			/* SUPER USER CHECK? */
 			error = sctp_dynamic_set_primary(&ss->sa, vrf_id);
 			break;
 		}
 	case SCTP_SET_PEER_PRIMARY_ADDR:
 		{
 			struct sctp_setpeerprim *sspp;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(sspp, optval, struct sctp_setpeerprim, optsize);
 			SCTP_FIND_STCB(inp, stcb, sspp->sspp_assoc_id);
 			if (stcb != NULL) {
 				struct sctp_ifa *ifa;
 
 #if defined(INET) && defined(INET6)
 				if (sspp->sspp_addr.ss_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)&sspp->sspp_addr;
 					if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 						in6_sin6_2_sin(&sin_store, sin6);
 						addr = (struct sockaddr *)&sin_store;
 					} else {
 						addr = (struct sockaddr *)&sspp->sspp_addr;
 					}
 				} else {
 					addr = (struct sockaddr *)&sspp->sspp_addr;
 				}
 #else
 				addr = (struct sockaddr *)&sspp->sspp_addr;
 #endif
 				ifa = sctp_find_ifa_by_addr(addr, stcb->asoc.vrf_id, SCTP_ADDR_NOT_LOCKED);
 				if (ifa == NULL) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					goto out_of_it;
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
 					/*
 					 * Must validate the ifa found is in
 					 * our ep
 					 */
 					struct sctp_laddr *laddr;
 					int found = 0;
 
 					LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 						if (laddr->ifa == NULL) {
 							SCTPDBG(SCTP_DEBUG_OUTPUT1, "%s: NULL ifa\n",
 							    __func__);
 							continue;
 						}
 						if ((sctp_is_addr_restricted(stcb, laddr->ifa)) &&
 						    (!sctp_is_addr_pending(stcb, laddr->ifa))) {
 							continue;
 						}
 						if (laddr->ifa == ifa) {
 							found = 1;
 							break;
 						}
 					}
 					if (!found) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 						goto out_of_it;
 					}
 				} else {
 					switch (addr->sa_family) {
 #ifdef INET
 					case AF_INET:
 						{
 							struct sockaddr_in *sin;
 
 							sin = (struct sockaddr_in *)addr;
 							if (prison_check_ip4(inp->ip_inp.inp.inp_cred,
 							    &sin->sin_addr) != 0) {
 								SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 								error = EINVAL;
 								goto out_of_it;
 							}
 							break;
 						}
 #endif
 #ifdef INET6
 					case AF_INET6:
 						{
 							struct sockaddr_in6 *sin6;
 
 							sin6 = (struct sockaddr_in6 *)addr;
 							if (prison_check_ip6(inp->ip_inp.inp.inp_cred,
 							    &sin6->sin6_addr) != 0) {
 								SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 								error = EINVAL;
 								goto out_of_it;
 							}
 							break;
 						}
 #endif
 					default:
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 						goto out_of_it;
 					}
 				}
 				if (sctp_set_primary_ip_address_sa(stcb, addr) != 0) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				sctp_chunk_output(inp, stcb, SCTP_OUTPUT_FROM_SOCKOPT, SCTP_SO_LOCKED);
 		out_of_it:
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 			}
 			break;
 		}
 	case SCTP_BINDX_ADD_ADDR:
 		{
 			struct sockaddr *sa;
 			struct thread *td;
 
 			td = (struct thread *)p;
 			SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize);
 #ifdef INET
 			if (sa->sa_family == AF_INET) {
 				if (optsize < sizeof(struct sockaddr_in)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 				if (td != NULL &&
 				    (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)sa)->sin_addr)))) {
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			} else
 #endif
 #ifdef INET6
 			if (sa->sa_family == AF_INET6) {
 				if (optsize < sizeof(struct sockaddr_in6)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 				if (td != NULL &&
 				    (error = prison_local_ip6(td->td_ucred,
 				    &(((struct sockaddr_in6 *)sa)->sin6_addr),
 				    (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			} else
 #endif
 			{
 				error = EAFNOSUPPORT;
 				break;
 			}
 			sctp_bindx_add_address(so, inp, sa, vrf_id, &error, p);
 			break;
 		}
 	case SCTP_BINDX_REM_ADDR:
 		{
 			struct sockaddr *sa;
 			struct thread *td;
 
 			td = (struct thread *)p;
 
 			SCTP_CHECK_AND_CAST(sa, optval, struct sockaddr, optsize);
 #ifdef INET
 			if (sa->sa_family == AF_INET) {
 				if (optsize < sizeof(struct sockaddr_in)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 				if (td != NULL &&
 				    (error = prison_local_ip4(td->td_ucred, &(((struct sockaddr_in *)sa)->sin_addr)))) {
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			} else
 #endif
 #ifdef INET6
 			if (sa->sa_family == AF_INET6) {
 				if (optsize < sizeof(struct sockaddr_in6)) {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 					break;
 				}
 				if (td != NULL &&
 				    (error = prison_local_ip6(td->td_ucred,
 				    &(((struct sockaddr_in6 *)sa)->sin6_addr),
 				    (SCTP_IPV6_V6ONLY(inp) != 0))) != 0) {
 					SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			} else
 #endif
 			{
 				error = EAFNOSUPPORT;
 				break;
 			}
 			sctp_bindx_delete_address(inp, sa, vrf_id, &error);
 			break;
 		}
 	case SCTP_EVENT:
 		{
 			struct sctp_event *event;
 			uint32_t event_type;
 
 			SCTP_CHECK_AND_CAST(event, optval, struct sctp_event, optsize);
 			SCTP_FIND_STCB(inp, stcb, event->se_assoc_id);
 			switch (event->se_type) {
 			case SCTP_ASSOC_CHANGE:
 				event_type = SCTP_PCB_FLAGS_RECVASSOCEVNT;
 				break;
 			case SCTP_PEER_ADDR_CHANGE:
 				event_type = SCTP_PCB_FLAGS_RECVPADDREVNT;
 				break;
 			case SCTP_REMOTE_ERROR:
 				event_type = SCTP_PCB_FLAGS_RECVPEERERR;
 				break;
 			case SCTP_SEND_FAILED:
 				event_type = SCTP_PCB_FLAGS_RECVSENDFAILEVNT;
 				break;
 			case SCTP_SHUTDOWN_EVENT:
 				event_type = SCTP_PCB_FLAGS_RECVSHUTDOWNEVNT;
 				break;
 			case SCTP_ADAPTATION_INDICATION:
 				event_type = SCTP_PCB_FLAGS_ADAPTATIONEVNT;
 				break;
 			case SCTP_PARTIAL_DELIVERY_EVENT:
 				event_type = SCTP_PCB_FLAGS_PDAPIEVNT;
 				break;
 			case SCTP_AUTHENTICATION_EVENT:
 				event_type = SCTP_PCB_FLAGS_AUTHEVNT;
 				break;
 			case SCTP_STREAM_RESET_EVENT:
 				event_type = SCTP_PCB_FLAGS_STREAM_RESETEVNT;
 				break;
 			case SCTP_SENDER_DRY_EVENT:
 				event_type = SCTP_PCB_FLAGS_DRYEVNT;
 				break;
 			case SCTP_NOTIFICATIONS_STOPPED_EVENT:
 				event_type = 0;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP);
 				error = ENOTSUP;
 				break;
 			case SCTP_ASSOC_RESET_EVENT:
 				event_type = SCTP_PCB_FLAGS_ASSOC_RESETEVNT;
 				break;
 			case SCTP_STREAM_CHANGE_EVENT:
 				event_type = SCTP_PCB_FLAGS_STREAM_CHANGEEVNT;
 				break;
 			case SCTP_SEND_FAILED_EVENT:
 				event_type = SCTP_PCB_FLAGS_RECVNSENDFAILEVNT;
 				break;
 			default:
 				event_type = 0;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			if (event_type > 0) {
 				if (stcb) {
 					if (event->se_on) {
 						sctp_stcb_feature_on(inp, stcb, event_type);
 						if (event_type == SCTP_PCB_FLAGS_DRYEVNT) {
 							if (TAILQ_EMPTY(&stcb->asoc.send_queue) &&
 							    TAILQ_EMPTY(&stcb->asoc.sent_queue) &&
 							    (stcb->asoc.stream_queue_cnt == 0)) {
 								sctp_ulp_notify(SCTP_NOTIFY_SENDER_DRY, stcb, 0, NULL, SCTP_SO_LOCKED);
 							}
 						}
 					} else {
 						sctp_stcb_feature_off(inp, stcb, event_type);
 					}
 					SCTP_TCB_UNLOCK(stcb);
 				} else {
 					/*
 					 * We don't want to send up a storm
 					 * of events, so return an error for
 					 * sender dry events
 					 */
 					if ((event_type == SCTP_PCB_FLAGS_DRYEVNT) &&
 					    (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 					    ((event->se_assoc_id == SCTP_ALL_ASSOC) ||
 					    (event->se_assoc_id == SCTP_CURRENT_ASSOC))) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTSUP);
 						error = ENOTSUP;
 						break;
 					}
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 					    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 					    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 					    ((event->se_assoc_id == SCTP_FUTURE_ASSOC) ||
 					    (event->se_assoc_id == SCTP_ALL_ASSOC)))) {
 						SCTP_INP_WLOCK(inp);
 						if (event->se_on) {
 							sctp_feature_on(inp, event_type);
 						} else {
 							sctp_feature_off(inp, event_type);
 						}
 						SCTP_INP_WUNLOCK(inp);
 					}
 					if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 					    ((event->se_assoc_id == SCTP_CURRENT_ASSOC) ||
 					    (event->se_assoc_id == SCTP_ALL_ASSOC))) {
 						SCTP_INP_RLOCK(inp);
 						LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 							SCTP_TCB_LOCK(stcb);
 							if (event->se_on) {
 								sctp_stcb_feature_on(inp, stcb, event_type);
 							} else {
 								sctp_stcb_feature_off(inp, stcb, event_type);
 							}
 							SCTP_TCB_UNLOCK(stcb);
 						}
 						SCTP_INP_RUNLOCK(inp);
 					}
 				}
 			} else {
 				if (stcb) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 			}
 			break;
 		}
 	case SCTP_RECVRCVINFO:
 		{
 			int *onoff;
 
 			SCTP_CHECK_AND_CAST(onoff, optval, int, optsize);
 			SCTP_INP_WLOCK(inp);
 			if (*onoff != 0) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVRCVINFO);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVRCVINFO);
 			}
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 	case SCTP_RECVNXTINFO:
 		{
 			int *onoff;
 
 			SCTP_CHECK_AND_CAST(onoff, optval, int, optsize);
 			SCTP_INP_WLOCK(inp);
 			if (*onoff != 0) {
 				sctp_feature_on(inp, SCTP_PCB_FLAGS_RECVNXTINFO);
 			} else {
 				sctp_feature_off(inp, SCTP_PCB_FLAGS_RECVNXTINFO);
 			}
 			SCTP_INP_WUNLOCK(inp);
 			break;
 		}
 	case SCTP_DEFAULT_SNDINFO:
 		{
 			struct sctp_sndinfo *info;
 			uint16_t policy;
 
 			SCTP_CHECK_AND_CAST(info, optval, struct sctp_sndinfo, optsize);
 			SCTP_FIND_STCB(inp, stcb, info->snd_assoc_id);
 
 			if (stcb) {
 				if (info->snd_sid < stcb->asoc.streamoutcnt) {
 					stcb->asoc.def_send.sinfo_stream = info->snd_sid;
 					policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags);
 					stcb->asoc.def_send.sinfo_flags = info->snd_flags;
 					stcb->asoc.def_send.sinfo_flags |= policy;
 					stcb->asoc.def_send.sinfo_ppid = info->snd_ppid;
 					stcb->asoc.def_send.sinfo_context = info->snd_context;
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((info->snd_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (info->snd_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->def_send.sinfo_stream = info->snd_sid;
 					policy = PR_SCTP_POLICY(inp->def_send.sinfo_flags);
 					inp->def_send.sinfo_flags = info->snd_flags;
 					inp->def_send.sinfo_flags |= policy;
 					inp->def_send.sinfo_ppid = info->snd_ppid;
 					inp->def_send.sinfo_context = info->snd_context;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((info->snd_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (info->snd_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						if (info->snd_sid < stcb->asoc.streamoutcnt) {
 							stcb->asoc.def_send.sinfo_stream = info->snd_sid;
 							policy = PR_SCTP_POLICY(stcb->asoc.def_send.sinfo_flags);
 							stcb->asoc.def_send.sinfo_flags = info->snd_flags;
 							stcb->asoc.def_send.sinfo_flags |= policy;
 							stcb->asoc.def_send.sinfo_ppid = info->snd_ppid;
 							stcb->asoc.def_send.sinfo_context = info->snd_context;
 						}
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_DEFAULT_PRINFO:
 		{
 			struct sctp_default_prinfo *info;
 
 			SCTP_CHECK_AND_CAST(info, optval, struct sctp_default_prinfo, optsize);
 			SCTP_FIND_STCB(inp, stcb, info->pr_assoc_id);
 
 			if (info->pr_policy > SCTP_PR_SCTP_MAX) {
 				if (stcb) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				break;
 			}
 			if (stcb) {
 				stcb->asoc.def_send.sinfo_flags &= 0xfff0;
 				stcb->asoc.def_send.sinfo_flags |= info->pr_policy;
 				stcb->asoc.def_send.sinfo_timetolive = info->pr_value;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((info->pr_assoc_id == SCTP_FUTURE_ASSOC) ||
 				    (info->pr_assoc_id == SCTP_ALL_ASSOC)))) {
 					SCTP_INP_WLOCK(inp);
 					inp->def_send.sinfo_flags &= 0xfff0;
 					inp->def_send.sinfo_flags |= info->pr_policy;
 					inp->def_send.sinfo_timetolive = info->pr_value;
 					SCTP_INP_WUNLOCK(inp);
 				}
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    ((info->pr_assoc_id == SCTP_CURRENT_ASSOC) ||
 				    (info->pr_assoc_id == SCTP_ALL_ASSOC))) {
 					SCTP_INP_RLOCK(inp);
 					LIST_FOREACH(stcb, &inp->sctp_asoc_list, sctp_tcblist) {
 						SCTP_TCB_LOCK(stcb);
 						stcb->asoc.def_send.sinfo_flags &= 0xfff0;
 						stcb->asoc.def_send.sinfo_flags |= info->pr_policy;
 						stcb->asoc.def_send.sinfo_timetolive = info->pr_value;
 						SCTP_TCB_UNLOCK(stcb);
 					}
 					SCTP_INP_RUNLOCK(inp);
 				}
 			}
 			break;
 		}
 	case SCTP_PEER_ADDR_THLDS:
 		/* Applies to the specific association */
 		{
 			struct sctp_paddrthlds *thlds;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(thlds, optval, struct sctp_paddrthlds, optsize);
 			SCTP_FIND_STCB(inp, stcb, thlds->spt_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (thlds->spt_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&thlds->spt_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&thlds->spt_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&thlds->spt_address;
 			}
 #else
 			addr = (struct sockaddr *)&thlds->spt_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr,
 				    &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 			if (thlds->spt_pathcpthld != 0xffff) {
 				if (stcb != NULL) {
 					SCTP_TCB_UNLOCK(stcb);
 				}
 				error = EINVAL;
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				break;
 			}
 			if (stcb != NULL) {
 				if (net != NULL) {
 					net->failure_threshold = thlds->spt_pathmaxrxt;
 					net->pf_threshold = thlds->spt_pathpfthld;
 					if (net->dest_state & SCTP_ADDR_PF) {
 						if ((net->error_count > net->failure_threshold) ||
 						    (net->error_count <= net->pf_threshold)) {
 							net->dest_state &= ~SCTP_ADDR_PF;
 						}
 					} else {
 						if ((net->error_count > net->pf_threshold) &&
 						    (net->error_count <= net->failure_threshold)) {
 							net->dest_state |= SCTP_ADDR_PF;
 							sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
 							sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
 							    stcb->sctp_ep, stcb, net,
 							    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_17);
 							sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
 						}
 					}
 					if (net->dest_state & SCTP_ADDR_REACHABLE) {
 						if (net->error_count > net->failure_threshold) {
 							net->dest_state &= ~SCTP_ADDR_REACHABLE;
 							sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
 						}
 					} else {
 						if (net->error_count <= net->failure_threshold) {
 							net->dest_state |= SCTP_ADDR_REACHABLE;
 							sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
 						}
 					}
 				} else {
 					TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 						net->failure_threshold = thlds->spt_pathmaxrxt;
 						net->pf_threshold = thlds->spt_pathpfthld;
 						if (net->dest_state & SCTP_ADDR_PF) {
 							if ((net->error_count > net->failure_threshold) ||
 							    (net->error_count <= net->pf_threshold)) {
 								net->dest_state &= ~SCTP_ADDR_PF;
 							}
 						} else {
 							if ((net->error_count > net->pf_threshold) &&
 							    (net->error_count <= net->failure_threshold)) {
 								net->dest_state |= SCTP_ADDR_PF;
 								sctp_send_hb(stcb, net, SCTP_SO_LOCKED);
 								sctp_timer_stop(SCTP_TIMER_TYPE_HEARTBEAT,
 								    stcb->sctp_ep, stcb, net,
 								    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_18);
 								sctp_timer_start(SCTP_TIMER_TYPE_HEARTBEAT, stcb->sctp_ep, stcb, net);
 							}
 						}
 						if (net->dest_state & SCTP_ADDR_REACHABLE) {
 							if (net->error_count > net->failure_threshold) {
 								net->dest_state &= ~SCTP_ADDR_REACHABLE;
 								sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN, stcb, 0, net, SCTP_SO_LOCKED);
 							}
 						} else {
 							if (net->error_count <= net->failure_threshold) {
 								net->dest_state |= SCTP_ADDR_REACHABLE;
 								sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_UP, stcb, 0, net, SCTP_SO_LOCKED);
 							}
 						}
 					}
 					stcb->asoc.def_net_failure = thlds->spt_pathmaxrxt;
 					stcb->asoc.def_net_pf_threshold = thlds->spt_pathpfthld;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (thlds->spt_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_ep.def_net_failure = thlds->spt_pathmaxrxt;
 					inp->sctp_ep.def_net_pf_threshold = thlds->spt_pathpfthld;
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_REMOTE_UDP_ENCAPS_PORT:
 		{
 			struct sctp_udpencaps *encaps;
 			struct sctp_nets *net;
 			struct sockaddr *addr;
 #if defined(INET) && defined(INET6)
 			struct sockaddr_in sin_store;
 #endif
 
 			SCTP_CHECK_AND_CAST(encaps, optval, struct sctp_udpencaps, optsize);
 			SCTP_FIND_STCB(inp, stcb, encaps->sue_assoc_id);
 
 #if defined(INET) && defined(INET6)
 			if (encaps->sue_address.ss_family == AF_INET6) {
 				struct sockaddr_in6 *sin6;
 
 				sin6 = (struct sockaddr_in6 *)&encaps->sue_address;
 				if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 					in6_sin6_2_sin(&sin_store, sin6);
 					addr = (struct sockaddr *)&sin_store;
 				} else {
 					addr = (struct sockaddr *)&encaps->sue_address;
 				}
 			} else {
 				addr = (struct sockaddr *)&encaps->sue_address;
 			}
 #else
 			addr = (struct sockaddr *)&encaps->sue_address;
 #endif
 			if (stcb != NULL) {
 				net = sctp_findnet(stcb, addr);
 			} else {
 				/*
 				 * We increment here since
 				 * sctp_findassociation_ep_addr() wil do a
 				 * decrement if it finds the stcb as long as
 				 * the locked tcb (last argument) is NOT a
 				 * TCB.. aka NULL.
 				 */
 				net = NULL;
 				SCTP_INP_INCR_REF(inp);
 				stcb = sctp_findassociation_ep_addr(&inp, addr, &net, NULL, NULL);
 				if (stcb == NULL) {
 					SCTP_INP_DECR_REF(inp);
 				}
 			}
 			if ((stcb != NULL) && (net == NULL)) {
 #ifdef INET
 				if (addr->sa_family == AF_INET) {
 					struct sockaddr_in *sin;
 
 					sin = (struct sockaddr_in *)addr;
 					if (sin->sin_addr.s_addr != INADDR_ANY) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 #ifdef INET6
 				if (addr->sa_family == AF_INET6) {
 					struct sockaddr_in6 *sin6;
 
 					sin6 = (struct sockaddr_in6 *)addr;
 					if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						SCTP_TCB_UNLOCK(stcb);
 						error = EINVAL;
 						break;
 					}
 				} else
 #endif
 				{
 					error = EAFNOSUPPORT;
 					SCTP_TCB_UNLOCK(stcb);
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 					break;
 				}
 			}
 
 			if (stcb != NULL) {
 				if (net != NULL) {
 					net->port = encaps->sue_port;
 				} else {
 					stcb->asoc.port = encaps->sue_port;
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (encaps->sue_assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					inp->sctp_ep.port = encaps->sue_port;
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_ECN_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->ecn_supported = 0;
 					} else {
 						inp->ecn_supported = 1;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_PR_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->prsctp_supported = 0;
 					} else {
 						inp->prsctp_supported = 1;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_AUTH_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					if ((av->assoc_value == 0) &&
 					    (inp->asconf_supported == 1)) {
 						/*
 						 * AUTH is required for
 						 * ASCONF
 						 */
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					} else {
 						SCTP_INP_WLOCK(inp);
 						if (av->assoc_value == 0) {
 							inp->auth_supported = 0;
 						} else {
 							inp->auth_supported = 1;
 						}
 						SCTP_INP_WUNLOCK(inp);
 					}
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_ASCONF_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					if ((av->assoc_value != 0) &&
 					    (inp->auth_supported == 0)) {
 						/*
 						 * AUTH is required for
 						 * ASCONF
 						 */
 						SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 						error = EINVAL;
 					} else {
 						SCTP_INP_WLOCK(inp);
 						if (av->assoc_value == 0) {
 							inp->asconf_supported = 0;
 							sctp_auth_delete_chunk(SCTP_ASCONF,
 							    inp->sctp_ep.local_auth_chunks);
 							sctp_auth_delete_chunk(SCTP_ASCONF_ACK,
 							    inp->sctp_ep.local_auth_chunks);
 						} else {
 							inp->asconf_supported = 1;
 							sctp_auth_add_chunk(SCTP_ASCONF,
 							    inp->sctp_ep.local_auth_chunks);
 							sctp_auth_add_chunk(SCTP_ASCONF_ACK,
 							    inp->sctp_ep.local_auth_chunks);
 						}
 						SCTP_INP_WUNLOCK(inp);
 					}
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_RECONFIG_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->reconfig_supported = 0;
 					} else {
 						inp->reconfig_supported = 1;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_NRSACK_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->nrsack_supported = 0;
 					} else {
 						inp->nrsack_supported = 1;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_PKTDROP_SUPPORTED:
 		{
 			struct sctp_assoc_value *av;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				error = EINVAL;
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					if (av->assoc_value == 0) {
 						inp->pktdrop_supported = 0;
 					} else {
 						inp->pktdrop_supported = 1;
 					}
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	case SCTP_MAX_CWND:
 		{
 			struct sctp_assoc_value *av;
 			struct sctp_nets *net;
 
 			SCTP_CHECK_AND_CAST(av, optval, struct sctp_assoc_value, optsize);
 			SCTP_FIND_STCB(inp, stcb, av->assoc_id);
 
 			if (stcb) {
 				stcb->asoc.max_cwnd = av->assoc_value;
 				if (stcb->asoc.max_cwnd > 0) {
 					TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 						if ((net->cwnd > stcb->asoc.max_cwnd) &&
 						    (net->cwnd > (net->mtu - sizeof(struct sctphdr)))) {
 							net->cwnd = stcb->asoc.max_cwnd;
 							if (net->cwnd < (net->mtu - sizeof(struct sctphdr))) {
 								net->cwnd = net->mtu - sizeof(struct sctphdr);
 							}
 						}
 					}
 				}
 				SCTP_TCB_UNLOCK(stcb);
 			} else {
 				if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) ||
 				    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) ||
 				    ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) &&
 				    (av->assoc_id == SCTP_FUTURE_ASSOC))) {
 					SCTP_INP_WLOCK(inp);
 					inp->max_cwnd = av->assoc_value;
 					SCTP_INP_WUNLOCK(inp);
 				} else {
 					SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 					error = EINVAL;
 				}
 			}
 			break;
 		}
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOPROTOOPT);
 		error = ENOPROTOOPT;
 		break;
 	}			/* end switch (opt) */
 	return (error);
 }
 
 int
 sctp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct epoch_tracker et;
 	struct sctp_inpcb *inp;
 	void *optval = NULL;
 	void *p;
 	size_t optsize = 0;
 	int error = 0;
 
 	if ((sopt->sopt_level == SOL_SOCKET) &&
 	    (sopt->sopt_name == SO_SETFIB)) {
 		inp = (struct sctp_inpcb *)so->so_pcb;
 		if (inp == NULL) {
 			SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS);
 			return (EINVAL);
 		}
 		SCTP_INP_WLOCK(inp);
 		inp->fibnum = so->so_fibnum;
 		SCTP_INP_WUNLOCK(inp);
 		return (0);
 	}
 	if (sopt->sopt_level != IPPROTO_SCTP) {
 		/* wrong proto level... send back up to IP */
 #ifdef INET6
 		if (INP_CHECK_SOCKAF(so, AF_INET6))
 			error = ip6_ctloutput(so, sopt);
 #endif				/* INET6 */
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 			error = ip_ctloutput(so, sopt);
 #endif
 		return (error);
 	}
 	optsize = sopt->sopt_valsize;
 	if (optsize > SCTP_SOCKET_OPTION_LIMIT) {
 		SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS);
 		return (ENOBUFS);
 	}
 	if (optsize) {
 		SCTP_MALLOC(optval, void *, optsize, SCTP_M_SOCKOPT);
 		if (optval == NULL) {
 			SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOBUFS);
 			return (ENOBUFS);
 		}
 		error = sooptcopyin(sopt, optval, optsize, optsize);
 		if (error) {
 			SCTP_FREE(optval, SCTP_M_SOCKOPT);
 			goto out;
 		}
 	}
 	p = (void *)sopt->sopt_td;
 	if (sopt->sopt_dir == SOPT_SET) {
 		NET_EPOCH_ENTER(et);
 		error = sctp_setopt(so, sopt->sopt_name, optval, optsize, p);
 		NET_EPOCH_EXIT(et);
 	} else if (sopt->sopt_dir == SOPT_GET) {
 		error = sctp_getopt(so, sopt->sopt_name, optval, &optsize, p);
 	} else {
 		SCTP_LTRACE_ERR_RET(so->so_pcb, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		error = EINVAL;
 	}
 	if ((error == 0) && (optval != NULL)) {
 		error = sooptcopyout(sopt, optval, optsize);
 		SCTP_FREE(optval, SCTP_M_SOCKOPT);
 	} else if (optval != NULL) {
 		SCTP_FREE(optval, SCTP_M_SOCKOPT);
 	}
 out:
 	return (error);
 }
 
 #ifdef INET
 static int
 sctp_connect(struct socket *so, struct sockaddr *addr, struct thread *p)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	int create_lock_on = 0;
 	uint32_t vrf_id;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb = NULL;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		/* I made the same as TCP since we are not setup? */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	if (addr == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return EINVAL;
 	}
 
 	switch (addr->sa_family) {
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			sin6 = (struct sockaddr_in6 *)addr;
 			if (p != NULL && (error = prison_remote_ip6(p->td_ucred, &sin6->sin6_addr)) != 0) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				return (error);
 			}
 			break;
 		}
 #endif
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			if (addr->sa_len != sizeof(struct sockaddr_in)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			sin = (struct sockaddr_in *)addr;
 			if (p != NULL && (error = prison_remote_ip4(p->td_ucred, &sin->sin_addr)) != 0) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 				return (error);
 			}
 			break;
 		}
 #endif
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EAFNOSUPPORT);
 		return (EAFNOSUPPORT);
 	}
 	SCTP_INP_INCR_REF(inp);
 	SCTP_ASOC_CREATE_LOCK(inp);
 	create_lock_on = 1;
 	NET_EPOCH_ENTER(et);
 
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE)) {
 		/* Should I really unlock ? */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EFAULT);
 		error = EFAULT;
 		goto out_now;
 	}
 #ifdef INET6
 	if (((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) &&
 	    (addr->sa_family == AF_INET6)) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		error = EINVAL;
 		goto out_now;
 	}
 #endif
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
 		/* Bind a ephemeral port */
 		error = sctp_inpcb_bind(so, NULL, NULL, p);
 		if (error) {
 			goto out_now;
 		}
 	}
 	/* Now do we connect? */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL) &&
 	    (sctp_is_feature_off(inp, SCTP_PCB_FLAGS_PORTREUSE))) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		error = EINVAL;
 		goto out_now;
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
 		/* We are already connected AND the TCP model */
 		SCTP_LTRACE_ERR_RET(inp, stcb, NULL, SCTP_FROM_SCTP_USRREQ, EADDRINUSE);
 		error = EADDRINUSE;
 		goto out_now;
 	}
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 		SCTP_INP_RLOCK(inp);
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		SCTP_INP_RUNLOCK(inp);
 	} else {
 		/*
 		 * We increment here since sctp_findassociation_ep_addr()
 		 * will do a decrement if it finds the stcb as long as the
 		 * locked tcb (last argument) is NOT a TCB.. aka NULL.
 		 */
 		SCTP_INP_INCR_REF(inp);
 		stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL);
 		if (stcb == NULL) {
 			SCTP_INP_DECR_REF(inp);
 		} else {
 			SCTP_TCB_UNLOCK(stcb);
 		}
 	}
 	if (stcb != NULL) {
 		/* Already have or am bring up an association */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EALREADY);
 		error = EALREADY;
 		goto out_now;
 	}
 
 	vrf_id = inp->def_vrf_id;
 	/* We are GOOD to go */
 	stcb = sctp_aloc_assoc_connected(inp, addr, &error, 0, 0, vrf_id,
 	    inp->sctp_ep.pre_open_stream_count,
 	    inp->sctp_ep.port, p,
 	    SCTP_INITIALIZE_AUTH_PARAMS);
 	if (stcb == NULL) {
 		/* Gak! no memory */
 		goto out_now;
 	}
 	SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 	(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 
 	sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 	SCTP_TCB_UNLOCK(stcb);
 out_now:
 	NET_EPOCH_EXIT(et);
 	if (create_lock_on) {
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 	}
 	SCTP_INP_DECR_REF(inp);
 	return (error);
 }
 #endif
 
 int
 sctp_listen(struct socket *so, int backlog, struct thread *p)
 {
 	/*
 	 * Note this module depends on the protocol processing being called
 	 * AFTER any socket level flags and backlog are applied to the
 	 * socket. The traditional way that the socket flags are applied is
 	 * AFTER protocol processing. We have made a change to the
 	 * sys/kern/uipc_socket.c module to reverse this but this MUST be in
 	 * place if the socket API for SCTP is to work properly.
 	 */
 
 	int error = 0;
 	struct sctp_inpcb *inp;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		/* I made the same as TCP since we are not setup? */
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	if (sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) {
 		/* See if we have a listener */
 		struct sctp_inpcb *tinp;
 		union sctp_sockstore store;
 
 		if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) == 0) {
 			/* not bound all */
 			struct sctp_laddr *laddr;
 
 			LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 				memcpy(&store, &laddr->ifa->address, sizeof(store));
 				switch (store.sa.sa_family) {
 #ifdef INET
 				case AF_INET:
 					store.sin.sin_port = inp->sctp_lport;
 					break;
 #endif
 #ifdef INET6
 				case AF_INET6:
 					store.sin6.sin6_port = inp->sctp_lport;
 					break;
 #endif
 				default:
 					break;
 				}
 				tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id);
 				if (tinp && (tinp != inp) &&
 				    ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) &&
 				    ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
 				    (SCTP_IS_LISTENING(tinp))) {
 					/*
 					 * we have a listener already and
 					 * its not this inp.
 					 */
 					SCTP_INP_DECR_REF(tinp);
 					return (EADDRINUSE);
 				} else if (tinp) {
 					SCTP_INP_DECR_REF(tinp);
 				}
 			}
 		} else {
 			/* Setup a local addr bound all */
 			memset(&store, 0, sizeof(store));
 #ifdef INET6
 			if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) {
 				store.sa.sa_family = AF_INET6;
 				store.sa.sa_len = sizeof(struct sockaddr_in6);
 			}
 #endif
 #ifdef INET
 			if ((inp->sctp_flags & SCTP_PCB_FLAGS_BOUND_V6) == 0) {
 				store.sa.sa_family = AF_INET;
 				store.sa.sa_len = sizeof(struct sockaddr_in);
 			}
 #endif
 			switch (store.sa.sa_family) {
 #ifdef INET
 			case AF_INET:
 				store.sin.sin_port = inp->sctp_lport;
 				break;
 #endif
 #ifdef INET6
 			case AF_INET6:
 				store.sin6.sin6_port = inp->sctp_lport;
 				break;
 #endif
 			default:
 				break;
 			}
 			tinp = sctp_pcb_findep(&store.sa, 0, 0, inp->def_vrf_id);
 			if (tinp && (tinp != inp) &&
 			    ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_ALLGONE) == 0) &&
 			    ((tinp->sctp_flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
 			    (SCTP_IS_LISTENING(tinp))) {
 				/*
 				 * we have a listener already and its not
 				 * this inp.
 				 */
 				SCTP_INP_DECR_REF(tinp);
 				return (EADDRINUSE);
 			} else if (tinp) {
 				SCTP_INP_DECR_REF(tinp);
 			}
 		}
 	}
 	SCTP_INP_INFO_WLOCK();
 	SCTP_INP_WLOCK(inp);
 #ifdef SCTP_LOCK_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LOCK_LOGGING_ENABLE) {
 		sctp_log_lock(inp, (struct sctp_tcb *)NULL, SCTP_LOG_LOCK_SOCK);
 	}
 #endif
 	if ((sctp_is_feature_on(inp, SCTP_PCB_FLAGS_PORTREUSE)) &&
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_IN_TCPPOOL)) {
 		/*
 		 * The unlucky case - We are in the tcp pool with this guy.
 		 * - Someone else is in the main inp slot. - We must move
 		 * this guy (the listener) to the main slot - We must then
 		 * move the guy that was listener to the TCP Pool.
 		 */
 		if (sctp_swap_inpcb_for_listen(inp)) {
 			error = EADDRINUSE;
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 			goto out;
 		}
 	}
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error) {
 		SOCK_UNLOCK(so);
 		goto out;
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
 		SOCK_UNLOCK(so);
 		solisten_proto_abort(so);
 		error = EADDRINUSE;
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 		goto out;
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    ((inp->sctp_flags & SCTP_PCB_FLAGS_WAS_CONNECTED) ||
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_WAS_ABORTED))) {
 		SOCK_UNLOCK(so);
 		solisten_proto_abort(so);
 		error = EINVAL;
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, error);
 		goto out;
 	}
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) {
 		if ((error = sctp_inpcb_bind_locked(inp, NULL, NULL, p))) {
 			SOCK_UNLOCK(so);
 			solisten_proto_abort(so);
 			/* bind error, probably perm */
 			goto out;
 		}
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) == 0) {
 		solisten_proto(so, backlog);
 		SOCK_UNLOCK(so);
 		inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING;
 	} else {
 		solisten_proto_abort(so);
 		SOCK_UNLOCK(so);
 		if (backlog > 0) {
 			inp->sctp_flags |= SCTP_PCB_FLAGS_ACCEPTING;
 		} else {
 			inp->sctp_flags &= ~SCTP_PCB_FLAGS_ACCEPTING;
 		}
 	}
 out:
 	SCTP_INP_WUNLOCK(inp);
 	SCTP_INP_INFO_WUNLOCK();
 	return (error);
 }
 
 static int sctp_defered_wakeup_cnt = 0;
 
 int
 sctp_accept(struct socket *so, struct sockaddr **addr)
 {
 	struct sctp_tcb *stcb;
 	struct sctp_inpcb *inp;
 	union sctp_sockstore store;
 #ifdef INET6
 	int error;
 #endif
 	inp = (struct sctp_inpcb *)so->so_pcb;
 
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	SCTP_INP_WLOCK(inp);
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_UDPTYPE) {
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EOPNOTSUPP);
 		return (EOPNOTSUPP);
 	}
 	if (so->so_state & SS_ISDISCONNECTED) {
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ECONNABORTED);
 		return (ECONNABORTED);
 	}
 	stcb = LIST_FIRST(&inp->sctp_asoc_list);
 	if (stcb == NULL) {
 		SCTP_INP_WUNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	SCTP_TCB_LOCK(stcb);
 	store = stcb->asoc.primary_destination->ro._l_addr;
 	SCTP_CLEAR_SUBSTATE(stcb, SCTP_STATE_IN_ACCEPT_QUEUE);
 	/* Wake any delayed sleep action */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_DONT_WAKE) {
 		inp->sctp_flags &= ~SCTP_PCB_FLAGS_DONT_WAKE;
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEOUTPUT) {
 			inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEOUTPUT;
 			SOCKBUF_LOCK(&inp->sctp_socket->so_snd);
 			if (sowriteable(inp->sctp_socket)) {
 				sowwakeup_locked(inp->sctp_socket);
 			} else {
 				SOCKBUF_UNLOCK(&inp->sctp_socket->so_snd);
 			}
 		}
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_WAKEINPUT) {
 			inp->sctp_flags &= ~SCTP_PCB_FLAGS_WAKEINPUT;
 			SOCKBUF_LOCK(&inp->sctp_socket->so_rcv);
 			if (soreadable(inp->sctp_socket)) {
 				sctp_defered_wakeup_cnt++;
 				sorwakeup_locked(inp->sctp_socket);
 			} else {
 				SOCKBUF_UNLOCK(&inp->sctp_socket->so_rcv);
 			}
 		}
 	}
 	SCTP_INP_WUNLOCK(inp);
 	if (stcb->asoc.state & SCTP_STATE_ABOUT_TO_BE_FREED) {
 		sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 		    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_19);
 	} else {
 		SCTP_TCB_UNLOCK(stcb);
 	}
 	switch (store.sa.sa_family) {
 #ifdef INET
 	case AF_INET:
 		{
 			struct sockaddr_in *sin;
 
 			SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
 			if (sin == NULL)
 				return (ENOMEM);
 			sin->sin_family = AF_INET;
 			sin->sin_len = sizeof(*sin);
 			sin->sin_port = store.sin.sin_port;
 			sin->sin_addr = store.sin.sin_addr;
 			*addr = (struct sockaddr *)sin;
 			break;
 		}
 #endif
 #ifdef INET6
 	case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
 			if (sin6 == NULL)
 				return (ENOMEM);
 			sin6->sin6_family = AF_INET6;
 			sin6->sin6_len = sizeof(*sin6);
 			sin6->sin6_port = store.sin6.sin6_port;
 			sin6->sin6_addr = store.sin6.sin6_addr;
 			if ((error = sa6_recoverscope(sin6)) != 0) {
 				SCTP_FREE_SONAME(sin6);
 				return (error);
 			}
 			*addr = (struct sockaddr *)sin6;
 			break;
 		}
 #endif
 	default:
 		/* TSNH */
 		break;
 	}
 	return (0);
 }
 
 #ifdef INET
 int
 sctp_ingetaddr(struct socket *so, struct sockaddr **addr)
 {
 	struct sockaddr_in *sin;
 	uint32_t vrf_id;
 	struct sctp_inpcb *inp;
 	struct sctp_ifa *sctp_ifa;
 
 	/*
 	 * Do the malloc first in case it blocks.
 	 */
 	SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
 	if (sin == NULL)
 		return (ENOMEM);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (!inp) {
 		SCTP_FREE_SONAME(sin);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	SCTP_INP_RLOCK(inp);
 	sin->sin_port = inp->sctp_lport;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 			struct sctp_tcb *stcb;
 			struct sockaddr_in *sin_a;
 			struct sctp_nets *net;
 			int fnd;
 
 			stcb = LIST_FIRST(&inp->sctp_asoc_list);
 			if (stcb == NULL) {
 				goto notConn;
 			}
 			fnd = 0;
 			sin_a = NULL;
 			SCTP_TCB_LOCK(stcb);
 			TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 				sin_a = (struct sockaddr_in *)&net->ro._l_addr;
 				if (sin_a == NULL)
 					/* this will make coverity happy */
 					continue;
 
 				if (sin_a->sin_family == AF_INET) {
 					fnd = 1;
 					break;
 				}
 			}
 			if ((!fnd) || (sin_a == NULL)) {
 				/* punt */
 				SCTP_TCB_UNLOCK(stcb);
 				goto notConn;
 			}
 
 			vrf_id = inp->def_vrf_id;
 			sctp_ifa = sctp_source_address_selection(inp,
 			    stcb,
 			    (sctp_route_t *)&net->ro,
 			    net, 0, vrf_id);
 			if (sctp_ifa) {
 				sin->sin_addr = sctp_ifa->address.sin.sin_addr;
 				sctp_free_ifa(sctp_ifa);
 			}
 			SCTP_TCB_UNLOCK(stcb);
 		} else {
 			/* For the bound all case you get back 0 */
 	notConn:
 			sin->sin_addr.s_addr = 0;
 		}
 
 	} else {
 		/* Take the first IPv4 address in the list */
 		struct sctp_laddr *laddr;
 		int fnd = 0;
 
 		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 			if (laddr->ifa->address.sa.sa_family == AF_INET) {
 				struct sockaddr_in *sin_a;
 
 				sin_a = &laddr->ifa->address.sin;
 				sin->sin_addr = sin_a->sin_addr;
 				fnd = 1;
 				break;
 			}
 		}
 		if (!fnd) {
 			SCTP_FREE_SONAME(sin);
 			SCTP_INP_RUNLOCK(inp);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 			return (ENOENT);
 		}
 	}
 	SCTP_INP_RUNLOCK(inp);
 	(*addr) = (struct sockaddr *)sin;
 	return (0);
 }
 
 int
 sctp_peeraddr(struct socket *so, struct sockaddr **addr)
 {
 	struct sockaddr_in *sin;
 	int fnd;
 	struct sockaddr_in *sin_a;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 
 	/* Do the malloc first in case it blocks. */
 	SCTP_MALLOC_SONAME(sin, struct sockaddr_in *, sizeof *sin);
 	if (sin == NULL)
 		return (ENOMEM);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if ((inp == NULL) ||
 	    ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) {
 		/* UDP type and listeners will drop out here */
 		SCTP_FREE_SONAME(sin);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOTCONN);
 		return (ENOTCONN);
 	}
 	SCTP_INP_RLOCK(inp);
 	stcb = LIST_FIRST(&inp->sctp_asoc_list);
 	if (stcb) {
 		SCTP_TCB_LOCK(stcb);
 	}
 	SCTP_INP_RUNLOCK(inp);
 	if (stcb == NULL) {
 		SCTP_FREE_SONAME(sin);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, EINVAL);
 		return (ECONNRESET);
 	}
 	fnd = 0;
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sin_a = (struct sockaddr_in *)&net->ro._l_addr;
 		if (sin_a->sin_family == AF_INET) {
 			fnd = 1;
 			sin->sin_port = stcb->rport;
 			sin->sin_addr = sin_a->sin_addr;
 			break;
 		}
 	}
 	SCTP_TCB_UNLOCK(stcb);
 	if (!fnd) {
 		/* No IPv4 address */
 		SCTP_FREE_SONAME(sin);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP_USRREQ, ENOENT);
 		return (ENOENT);
 	}
 	(*addr) = (struct sockaddr *)sin;
 	return (0);
 }
 
-struct pr_usrreqs sctp_usrreqs = {
-	.pru_abort = sctp_abort,
-	.pru_accept = sctp_accept,
-	.pru_attach = sctp_attach,
-	.pru_bind = sctp_bind,
-	.pru_connect = sctp_connect,
-	.pru_control = in_control,
-	.pru_close = sctp_close,
-	.pru_detach = sctp_close,
-	.pru_sopoll = sopoll_generic,
-	.pru_flush = sctp_flush,
-	.pru_disconnect = sctp_disconnect,
-	.pru_listen = sctp_listen,
-	.pru_peeraddr = sctp_peeraddr,
-	.pru_send = sctp_sendm,
-	.pru_shutdown = sctp_shutdown,
-	.pru_sockaddr = sctp_ingetaddr,
-	.pru_sosend = sctp_sosend,
-	.pru_soreceive = sctp_soreceive
+#define	SCTP_PROTOSW						\
+	.pr_protocol =	IPPROTO_SCTP,				\
+	.pr_ctloutput =	sctp_ctloutput,				\
+	.pr_abort =	sctp_abort,				\
+	.pr_accept =	sctp_accept,				\
+	.pr_attach =	sctp_attach,				\
+	.pr_bind =	sctp_bind,				\
+	.pr_connect =	sctp_connect,				\
+	.pr_control =	in_control,				\
+	.pr_close =	sctp_close,				\
+	.pr_detach =	sctp_close,				\
+	.pr_sopoll =	sopoll_generic,				\
+	.pr_flush =	sctp_flush,				\
+	.pr_disconnect = sctp_disconnect,			\
+	.pr_listen =	sctp_listen,				\
+	.pr_peeraddr =	sctp_peeraddr,				\
+	.pr_send =	sctp_sendm,				\
+	.pr_shutdown =	sctp_shutdown,				\
+	.pr_sockaddr =	sctp_ingetaddr,				\
+	.pr_sosend =	sctp_sosend,				\
+	.pr_soreceive =	sctp_soreceive				\
+
+struct protosw sctp_seqpacket_protosw = {
+	.pr_type =	SOCK_SEQPACKET,
+	.pr_flags =	PR_WANTRCVD,
+	SCTP_PROTOSW
+};
+
+struct protosw sctp_stream_protosw = {
+	.pr_type =      SOCK_STREAM,
+	.pr_flags =	PR_CONNREQUIRED | PR_WANTRCVD,
+	SCTP_PROTOSW
 };
 #endif
diff --git a/sys/netinet/sctp_var.h b/sys/netinet/sctp_var.h
index 3bff09adb367..3675ba4443a4 100644
--- a/sys/netinet/sctp_var.h
+++ b/sys/netinet/sctp_var.h
@@ -1,348 +1,348 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2008, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET_SCTP_VAR_H_
 #define _NETINET_SCTP_VAR_H_
 
 #include <netinet/sctp_uio.h>
 
 #if defined(_KERNEL) || defined(__Userspace__)
 
-extern struct pr_usrreqs sctp_usrreqs;
+extern struct protosw sctp_seqpacket_protosw, sctp_stream_protosw;
 
 #define sctp_feature_on(inp, feature)  (inp->sctp_features |= feature)
 #define sctp_feature_off(inp, feature) (inp->sctp_features &= ~feature)
 #define sctp_is_feature_on(inp, feature) ((inp->sctp_features & feature) == feature)
 #define sctp_is_feature_off(inp, feature) ((inp->sctp_features & feature) == 0)
 
 #define sctp_stcb_feature_on(inp, stcb, feature) {\
 	if (stcb) { \
 		stcb->asoc.sctp_features |= feature; \
 	} else if (inp) { \
 		inp->sctp_features |= feature; \
 	} \
 }
 #define sctp_stcb_feature_off(inp, stcb, feature) {\
 	if (stcb) { \
 		stcb->asoc.sctp_features &= ~feature; \
 	} else if (inp) { \
 		inp->sctp_features &= ~feature; \
 	} \
 }
 #define sctp_stcb_is_feature_on(inp, stcb, feature) \
 	(((stcb != NULL) && \
 	  ((stcb->asoc.sctp_features & feature) == feature)) || \
 	 ((stcb == NULL) && (inp != NULL) && \
 	  ((inp->sctp_features & feature) == feature)))
 #define sctp_stcb_is_feature_off(inp, stcb, feature) \
 	(((stcb != NULL) && \
 	  ((stcb->asoc.sctp_features & feature) == 0)) || \
 	 ((stcb == NULL) && (inp != NULL) && \
 	  ((inp->sctp_features & feature) == 0)) || \
 	 ((stcb == NULL) && (inp == NULL)))
 
 /* managing mobility_feature in inpcb (by micchie) */
 #define sctp_mobility_feature_on(inp, feature)  (inp->sctp_mobility_features |= feature)
 #define sctp_mobility_feature_off(inp, feature) (inp->sctp_mobility_features &= ~feature)
 #define sctp_is_mobility_feature_on(inp, feature) (inp->sctp_mobility_features & feature)
 #define sctp_is_mobility_feature_off(inp, feature) ((inp->sctp_mobility_features & feature) == 0)
 
 #define sctp_maxspace(sb) (max((sb)->sb_hiwat,SCTP_MINIMAL_RWND))
 
 #define	sctp_sbspace(asoc, sb) ((long) ((sctp_maxspace(sb) > (asoc)->sb_cc) ? (sctp_maxspace(sb) - (asoc)->sb_cc) : 0))
 
 #define	sctp_sbspace_failedmsgs(sb) ((long) ((sctp_maxspace(sb) > SCTP_SBAVAIL(sb)) ? (sctp_maxspace(sb) - SCTP_SBAVAIL(sb)) : 0))
 
 #define sctp_sbspace_sub(a,b) (((a) > (b)) ? ((a) - (b)) : 0)
 
 /*
  * I tried to cache the readq entries at one point. But the reality
  * is that it did not add any performance since this meant we had to
  * lock the STCB on read. And at that point once you have to do an
  * extra lock, it really does not matter if the lock is in the ZONE
  * stuff or in our code. Note that this same problem would occur with
  * an mbuf cache as well so it is not really worth doing, at least
  * right now :-D
  */
 #ifdef INVARIANTS
 #define sctp_free_a_readq(_stcb, _readq) { \
 	if ((_readq)->on_strm_q) \
 		panic("On strm q stcb:%p readq:%p", (_stcb), (_readq)); \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
 	SCTP_DECR_READQ_COUNT(); \
 }
 #else
 #define sctp_free_a_readq(_stcb, _readq) { \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_readq), (_readq)); \
 	SCTP_DECR_READQ_COUNT(); \
 }
 #endif
 
 #define sctp_alloc_a_readq(_stcb, _readq) { \
 	(_readq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_readq), struct sctp_queued_to_read); \
 	if ((_readq)) { \
 	     SCTP_INCR_READQ_COUNT(); \
 	} \
 }
 
 #define sctp_free_a_strmoq(_stcb, _strmoq, _so_locked) { \
 	if ((_strmoq)->holds_key_ref) { \
 		sctp_auth_key_release(stcb, sp->auth_keyid, _so_locked); \
 		(_strmoq)->holds_key_ref = 0; \
 	} \
 	SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_strmoq), (_strmoq)); \
 	SCTP_DECR_STRMOQ_COUNT(); \
 }
 
 #define sctp_alloc_a_strmoq(_stcb, _strmoq) { \
 	(_strmoq) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_strmoq), struct sctp_stream_queue_pending); \
 	if ((_strmoq)) { \
 		memset(_strmoq, 0, sizeof(struct sctp_stream_queue_pending)); \
 		SCTP_INCR_STRMOQ_COUNT(); \
 		(_strmoq)->holds_key_ref = 0; \
 	} \
 }
 
 #define sctp_free_a_chunk(_stcb, _chk, _so_locked) { \
 	if ((_chk)->holds_key_ref) {\
 		sctp_auth_key_release((_stcb), (_chk)->auth_keyid, _so_locked); \
 		(_chk)->holds_key_ref = 0; \
 	} \
 	if (_stcb) { \
 		SCTP_TCB_LOCK_ASSERT((_stcb)); \
 		if ((_chk)->whoTo) { \
 			sctp_free_remote_addr((_chk)->whoTo); \
 			(_chk)->whoTo = NULL; \
 		} \
 		if (((_stcb)->asoc.free_chunk_cnt > SCTP_BASE_SYSCTL(sctp_asoc_free_resc_limit)) || \
 		    (SCTP_BASE_INFO(ipi_free_chunks) > SCTP_BASE_SYSCTL(sctp_system_free_resc_limit))) { \
 			SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
 			SCTP_DECR_CHK_COUNT(); \
 		} else { \
 			TAILQ_INSERT_TAIL(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
 			(_stcb)->asoc.free_chunk_cnt++; \
 			atomic_add_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
 		} \
 	} else { \
 		SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_chunk), (_chk)); \
 		SCTP_DECR_CHK_COUNT(); \
 	} \
 }
 
 #define sctp_alloc_a_chunk(_stcb, _chk) { \
 	if (TAILQ_EMPTY(&(_stcb)->asoc.free_chunks)) { \
 		(_chk) = SCTP_ZONE_GET(SCTP_BASE_INFO(ipi_zone_chunk), struct sctp_tmit_chunk); \
 		if ((_chk)) { \
 			SCTP_INCR_CHK_COUNT(); \
 			(_chk)->whoTo = NULL; \
 			(_chk)->holds_key_ref = 0; \
 		} \
 	} else { \
 		(_chk) = TAILQ_FIRST(&(_stcb)->asoc.free_chunks); \
 		TAILQ_REMOVE(&(_stcb)->asoc.free_chunks, (_chk), sctp_next); \
 		atomic_subtract_int(&SCTP_BASE_INFO(ipi_free_chunks), 1); \
 		(_chk)->holds_key_ref = 0; \
 		SCTP_STAT_INCR(sctps_cached_chk); \
 		(_stcb)->asoc.free_chunk_cnt--; \
 	} \
 }
 
 #define sctp_free_remote_addr(__net) { \
 	if ((__net)) {  \
 		if (SCTP_DECREMENT_AND_CHECK_REFCOUNT(&(__net)->ref_count)) { \
 			RO_NHFREE(&(__net)->ro); \
 			if ((__net)->src_addr_selected) { \
 				sctp_free_ifa((__net)->ro._s_addr); \
 				(__net)->ro._s_addr = NULL; \
 			} \
 			(__net)->src_addr_selected = 0; \
 			(__net)->dest_state &= ~SCTP_ADDR_REACHABLE; \
 			SCTP_ZONE_FREE(SCTP_BASE_INFO(ipi_zone_net), (__net)); \
 			SCTP_DECR_RADDR_COUNT(); \
 		} \
 	} \
 }
 
 #define sctp_sbfree(ctl, stcb, sb, m) { \
 	SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_cc, SCTP_BUF_LEN((m))); \
 	SCTP_SAVE_ATOMIC_DECREMENT(&(sb)->sb_mbcnt, MSIZE); \
 	if (((ctl)->do_not_ref_stcb == 0) && stcb) {\
 		SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \
 		SCTP_SAVE_ATOMIC_DECREMENT(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
 	} \
 	if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
 	    SCTP_BUF_TYPE(m) != MT_OOBDATA) \
 		atomic_subtract_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \
 }
 
 #define sctp_sballoc(stcb, sb, m) { \
 	atomic_add_int(&(sb)->sb_cc,SCTP_BUF_LEN((m))); \
 	atomic_add_int(&(sb)->sb_mbcnt, MSIZE); \
 	if (stcb) { \
 		atomic_add_int(&(stcb)->asoc.sb_cc, SCTP_BUF_LEN((m))); \
 		atomic_add_int(&(stcb)->asoc.my_rwnd_control_len, MSIZE); \
 	} \
 	if (SCTP_BUF_TYPE(m) != MT_DATA && SCTP_BUF_TYPE(m) != MT_HEADER && \
 	    SCTP_BUF_TYPE(m) != MT_OOBDATA) \
 		atomic_add_int(&(sb)->sb_ctl,SCTP_BUF_LEN((m))); \
 }
 
 #define sctp_ucount_incr(val) { \
 	val++; \
 }
 
 #define sctp_ucount_decr(val) { \
 	if (val > 0) { \
 		val--; \
 	} else { \
 		val = 0; \
 	} \
 }
 
 #define sctp_mbuf_crush(data) do { \
 	struct mbuf *_m; \
 	_m = (data); \
 	while (_m && (SCTP_BUF_LEN(_m) == 0)) { \
 		(data)  = SCTP_BUF_NEXT(_m); \
 		SCTP_BUF_NEXT(_m) = NULL; \
 		sctp_m_free(_m); \
 		_m = (data); \
 	} \
 } while (0)
 
 #define sctp_flight_size_decrease(tp1) do { \
 	if (tp1->whoTo->flight_size >= tp1->book_size) \
 		tp1->whoTo->flight_size -= tp1->book_size; \
 	else \
 		tp1->whoTo->flight_size = 0; \
 } while (0)
 
 #define sctp_flight_size_increase(tp1) do { \
 	(tp1)->whoTo->flight_size += (tp1)->book_size; \
 } while (0)
 
 #ifdef SCTP_FS_SPEC_LOG
 #define sctp_total_flight_decrease(stcb, tp1) do { \
 	if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
 		stcb->asoc.fs_index = 0;\
 	stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.tsn; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].incr = 0; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].decr = 1; \
 	stcb->asoc.fs_index++; \
 	tp1->window_probe = 0; \
 	if (stcb->asoc.total_flight >= tp1->book_size) { \
 		stcb->asoc.total_flight -= tp1->book_size; \
 		if (stcb->asoc.total_flight_count > 0) \
 			stcb->asoc.total_flight_count--; \
 	} else { \
 		stcb->asoc.total_flight = 0; \
 		stcb->asoc.total_flight_count = 0; \
 	} \
 } while (0)
 
 #define sctp_total_flight_increase(stcb, tp1) do { \
 	if (stcb->asoc.fs_index > SCTP_FS_SPEC_LOG_SIZE) \
 		stcb->asoc.fs_index = 0;\
 	stcb->asoc.fslog[stcb->asoc.fs_index].total_flight = stcb->asoc.total_flight; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].tsn = tp1->rec.data.tsn; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].book = tp1->book_size; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].sent = tp1->sent; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].incr = 1; \
 	stcb->asoc.fslog[stcb->asoc.fs_index].decr = 0; \
 	stcb->asoc.fs_index++; \
 	(stcb)->asoc.total_flight_count++; \
 	(stcb)->asoc.total_flight += (tp1)->book_size; \
 } while (0)
 
 #else
 
 #define sctp_total_flight_decrease(stcb, tp1) do { \
 	tp1->window_probe = 0; \
 	if (stcb->asoc.total_flight >= tp1->book_size) { \
 		stcb->asoc.total_flight -= tp1->book_size; \
 		if (stcb->asoc.total_flight_count > 0) \
 			stcb->asoc.total_flight_count--; \
 	} else { \
 		stcb->asoc.total_flight = 0; \
 		stcb->asoc.total_flight_count = 0; \
 	} \
 } while (0)
 
 #define sctp_total_flight_increase(stcb, tp1) do { \
 	(stcb)->asoc.total_flight_count++; \
 	(stcb)->asoc.total_flight += (tp1)->book_size; \
 } while (0)
 
 #endif
 
 #define SCTP_PF_ENABLED(_net) (_net->pf_threshold < _net->failure_threshold)
 #define SCTP_NET_IS_PF(_net) (_net->pf_threshold < _net->error_count)
 
 struct sctp_nets;
 struct sctp_inpcb;
 struct sctp_tcb;
 struct sctphdr;
 
 void sctp_close(struct socket *so);
 int sctp_disconnect(struct socket *so);
 void sctp_ctlinput(int, struct sockaddr *, void *);
 int sctp_ctloutput(struct socket *, struct sockopt *);
 void sctp_input_with_port(struct mbuf *, int, uint16_t);
 int sctp_input(struct mbuf **, int *, int);
 void sctp_pathmtu_adjustment(struct sctp_tcb *, uint32_t, bool);
 void
 sctp_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *,
     uint8_t, uint8_t, uint16_t, uint32_t);
 int sctp_flush(struct socket *, int);
 int sctp_shutdown(struct socket *);
 int
 sctp_bindx(struct socket *, int, struct sockaddr_storage *,
     int, int, struct proc *);
 
 /* can't use sctp_assoc_t here */
 int sctp_peeloff(struct socket *, struct socket *, int, caddr_t, int *);
 int sctp_ingetaddr(struct socket *, struct sockaddr **);
 int sctp_peeraddr(struct socket *, struct sockaddr **);
 int sctp_listen(struct socket *, int, struct thread *);
 int sctp_accept(struct socket *, struct sockaddr **);
 
 #endif				/* _KERNEL */
 
 #endif				/* !_NETINET_SCTP_VAR_H_ */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 51a2e23db6ef..1c04de080623 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4154 +1,4154 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_subr.c	8.2 (Berkeley) 5/24/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
 #endif
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/khelp.h>
 #endif
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
 #include <sys/qmath.h>
 #include <sys/stats.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
 
 #include <vm/uma.h>
 
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_icmp.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/nd6.h>
 #endif
 
 #include <netinet/tcp.h>
 #ifdef INVARIANTS
 #define TCPSTATES
 #endif
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcp_syncache.h>
 #include <netinet/tcp_hpts.h>
 #include <netinet/cc/cc.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
 #endif
 #include <netinet/tcpip.h>
 #include <netinet/tcp_fastopen.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #ifdef TCPDEBUG
 #include <netinet/tcp_debug.h>
 #endif
 #ifdef INET6
 #include <netinet6/ip6protosw.h>
 #endif
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 #include <crypto/siphash/siphash.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, tcp_mssdflt) = TCP_MSS;
 #ifdef INET6
 VNET_DEFINE(int, tcp_v6mssdflt) = TCP6_MSS;
 #endif
 
 #ifdef NETFLIX_EXP_DETECTION
 /*  Sack attack detection thresholds and such */
 SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack_attack,
     CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Sack Attack detection thresholds");
 int32_t tcp_force_detection = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, force_detection,
     CTLFLAG_RW,
     &tcp_force_detection, 0,
     "Do we force detection even if the INP has it off?");
 int32_t tcp_sack_to_ack_thresh = 700;	/* 70 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sack_to_ack_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_ack_thresh, 700,
     "Percentage of sacks to acks we must see above (10.1 percent is 101)?");
 int32_t tcp_sack_to_move_thresh = 600;	/* 60 % */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, move_thresh,
     CTLFLAG_RW,
     &tcp_sack_to_move_thresh, 600,
     "Percentage of sack moves we must see above (10.1 percent is 101)");
 int32_t tcp_restoral_thresh = 650;	/* 65 % (sack:2:ack -5%) */
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, restore_thresh,
     CTLFLAG_RW,
     &tcp_restoral_thresh, 550,
     "Percentage of sack to ack percentage we must see below to restore(10.1 percent is 101)");
 int32_t tcp_sad_decay_val = 800;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, decay_per,
     CTLFLAG_RW,
     &tcp_sad_decay_val, 800,
     "The decay percentage (10.1 percent equals 101 )");
 int32_t tcp_map_minimum = 500;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, nummaps,
     CTLFLAG_RW,
     &tcp_map_minimum, 500,
     "Number of Map enteries before we start detection");
 int32_t tcp_attack_on_turns_on_logging = 0;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, attacks_logged,
     CTLFLAG_RW,
     &tcp_attack_on_turns_on_logging, 0,
    "When we have a positive hit on attack, do we turn on logging?");
 int32_t tcp_sad_pacing_interval = 2000;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_pacing_int,
     CTLFLAG_RW,
     &tcp_sad_pacing_interval, 2000,
     "What is the minimum pacing interval for a classified attacker?");
 
 int32_t tcp_sad_low_pps = 100;
 SYSCTL_INT(_net_inet_tcp_sack_attack, OID_AUTO, sad_low_pps,
     CTLFLAG_RW,
     &tcp_sad_low_pps, 100,
     "What is the input pps that below which we do not decay?");
 #endif
 uint32_t tcp_ack_war_time_window = 1000;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow,
     CTLFLAG_RW,
     &tcp_ack_war_time_window, 1000,
    "If the tcp_stack does ack-war prevention how many milliseconds are in its time window?");
 uint32_t tcp_ack_war_cnt = 5;
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt,
     CTLFLAG_RW,
     &tcp_ack_war_cnt, 5,
    "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?");
 
 struct rwlock tcp_function_lock;
 
 static int
 sysctl_net_inet_tcp_mss_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_MSSDFLT, mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_mssdflt), 0, &sysctl_net_inet_tcp_mss_check, "I",
     "Default TCP Maximum Segment Size");
 
 #ifdef INET6
 static int
 sysctl_net_inet_tcp_mss_v6_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_v6mssdflt;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if (new < TCP_MINMSS)
 			error = EINVAL;
 		else
 			V_tcp_v6mssdflt = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_V6MSSDFLT, v6mssdflt,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_v6mssdflt), 0, &sysctl_net_inet_tcp_mss_v6_check, "I",
    "Default TCP Maximum Segment Size for IPv6");
 #endif /* INET6 */
 
 /*
  * Minimum MSS we accept and use. This prevents DoS attacks where
  * we are forced to a ridiculous low MSS like 20 and send hundreds
  * of packets instead of one. The effect scales with the available
  * bandwidth and quickly saturates the CPU and network interface
  * with packet generation and sending. Set to zero to disable MINMSS
  * checking. This setting prevents us from sending too small packets.
  */
 VNET_DEFINE(int, tcp_minmss) = TCP_MINMSS;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, minmss, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_minmss), 0,
     "Minimum TCP Maximum Segment Size");
 
 VNET_DEFINE(int, tcp_do_rfc1323) = 1;
 SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_do_rfc1323), 0,
     "Enable rfc1323 (high performance TCP) extensions");
 
 /*
  * As of June 2021, several TCP stacks violate RFC 7323 from September 2014.
  * Some stacks negotiate TS, but never send them after connection setup. Some
  * stacks negotiate TS, but don't send them when sending keep-alive segments.
  * These include modern widely deployed TCP stacks.
  * Therefore tolerating violations for now...
  */
 VNET_DEFINE(int, tcp_tolerate_missing_ts) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tolerate_missing_ts, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_tolerate_missing_ts), 0,
     "Tolerate missing TCP timestamps");
 
 VNET_DEFINE(int, tcp_ts_offset_per_conn) = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, ts_offset_per_conn, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_ts_offset_per_conn), 0,
     "Initialize TCP timestamps per connection instead of per host pair");
 
 /* How many connections are pacing */
 static volatile uint32_t number_of_tcp_connections_pacing = 0;
 static uint32_t shadow_num_connections = 0;
 
 static int tcp_pacing_limit = 10000;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW,
     &tcp_pacing_limit, 1000,
     "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD,
     &shadow_num_connections, 0, "Number of TCP connections being paced");
 
 static int	tcp_log_debug = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW,
     &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
 
 static int	tcp_tcbhashsize;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
 
 static int	do_tcpdrain = 1;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_tcpdrain, CTLFLAG_RW, &do_tcpdrain, 0,
     "Enable tcp_drain routine for extra help when low on mbufs");
 
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_VNET | CTLFLAG_RD,
     &VNET_NAME(tcbinfo.ipi_count), 0, "Number of active PCBs");
 
 VNET_DEFINE_STATIC(int, icmp_may_rst) = 1;
 #define	V_icmp_may_rst			VNET(icmp_may_rst)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, icmp_may_rst, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(icmp_may_rst), 0,
     "Certain ICMP unreachable messages may abort connections in SYN_SENT");
 
 VNET_DEFINE_STATIC(int, tcp_isn_reseed_interval) = 0;
 #define	V_tcp_isn_reseed_interval	VNET(tcp_isn_reseed_interval)
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(tcp_isn_reseed_interval), 0,
     "Seconds between reseeding of ISN secret");
 
 static int	tcp_soreceive_stream;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, soreceive_stream, CTLFLAG_RDTUN,
     &tcp_soreceive_stream, 0, "Using soreceive_stream for TCP sockets");
 
 VNET_DEFINE(uma_zone_t, sack_hole_zone);
 #define	V_sack_hole_zone		VNET(sack_hole_zone)
 VNET_DEFINE(uint32_t, tcp_map_entries_limit) = 0;	/* unlimited */
 static int
 sysctl_net_inet_tcp_map_limit_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t new;
 
 	new = V_tcp_map_entries_limit;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		/* only allow "0" and value > minimum */
 		if (new > 0 && new < TCP_MIN_MAP_ENTRIES_LIMIT)
 			error = EINVAL;
 		else
 			V_tcp_map_entries_limit = new;
 	}
 	return (error);
 }
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, map_limit,
     CTLFLAG_VNET | CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     &VNET_NAME(tcp_map_entries_limit), 0,
     &sysctl_net_inet_tcp_map_limit_check, "IU",
     "Total sendmap entries limit");
 
 VNET_DEFINE(uint32_t, tcp_map_split_limit) = 0;	/* unlimited */
 SYSCTL_UINT(_net_inet_tcp, OID_AUTO, split_limit, CTLFLAG_VNET | CTLFLAG_RW,
      &VNET_NAME(tcp_map_split_limit), 0,
     "Total sendmap split entries limit");
 
 #ifdef TCP_HHOOK
 VNET_DEFINE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST+1]);
 #endif
 
 #define TS_OFFSET_SECRET_LENGTH SIPHASH_KEY_LENGTH
 VNET_DEFINE_STATIC(u_char, ts_offset_secret[TS_OFFSET_SECRET_LENGTH]);
 #define	V_ts_offset_secret	VNET(ts_offset_secret)
 
 static int	tcp_default_fb_init(struct tcpcb *tp);
 static void	tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged);
 static int	tcp_default_handoff_ok(struct tcpcb *tp);
 static struct inpcb *tcp_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static struct inpcb *tcp_mtudisc(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    const void *ip4hdr, const void *ip6hdr);
 
 static struct tcp_function_block tcp_def_funcblk = {
 	.tfb_tcp_block_name = "freebsd",
 	.tfb_tcp_output = tcp_default_output,
 	.tfb_tcp_do_segment = tcp_do_segment,
 	.tfb_tcp_ctloutput = tcp_default_ctloutput,
 	.tfb_tcp_handoff_ok = tcp_default_handoff_ok,
 	.tfb_tcp_fb_init = tcp_default_fb_init,
 	.tfb_tcp_fb_fini = tcp_default_fb_fini,
 };
 
 static int tcp_fb_cnt = 0;
 struct tcp_funchead t_functions;
 static struct tcp_function_block *tcp_func_set_ptr = &tcp_def_funcblk;
 
 void
 tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp)
 {
 	TCPSTAT_INC(tcps_dsack_count);
 	tp->t_dsack_pack++;
 	if (tlp == 0) {
 		if (SEQ_GT(end, start)) {
 			tp->t_dsack_bytes += (end - start);
 			TCPSTAT_ADD(tcps_dsack_bytes, (end - start));
 		} else {
 			tp->t_dsack_tlp_bytes += (start - end);
 			TCPSTAT_ADD(tcps_dsack_bytes, (start - end));
 		}
 	} else {
 		if (SEQ_GT(end, start)) {
 			tp->t_dsack_bytes += (end - start);
 			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (end - start));
 		} else {
 			tp->t_dsack_tlp_bytes += (start - end);
 			TCPSTAT_ADD(tcps_dsack_tlp_bytes, (start - end));
 		}
 	}
 }
 
 static struct tcp_function_block *
 find_tcp_functions_locked(struct tcp_function_set *fs)
 {
 	struct tcp_function *f;
 	struct tcp_function_block *blk=NULL;
 
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		if (strcmp(f->tf_name, fs->function_set_name) == 0) {
 			blk = f->tf_fb;
 			break;
 		}
 	}
 	return(blk);
 }
 
 static struct tcp_function_block *
 find_tcp_fb_locked(struct tcp_function_block *blk, struct tcp_function **s)
 {
 	struct tcp_function_block *rblk=NULL;
 	struct tcp_function *f;
 
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		if (f->tf_fb == blk) {
 			rblk = blk;
 			if (s) {
 				*s = f;
 			}
 			break;
 		}
 	}
 	return (rblk);
 }
 
 struct tcp_function_block *
 find_and_ref_tcp_functions(struct tcp_function_set *fs)
 {
 	struct tcp_function_block *blk;
 
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(fs);
 	if (blk)
 		refcount_acquire(&blk->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return(blk);
 }
 
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *blk)
 {
 	struct tcp_function_block *rblk;
 
 	rw_rlock(&tcp_function_lock);
 	rblk = find_tcp_fb_locked(blk, NULL);
 	if (rblk)
 		refcount_acquire(&rblk->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return(rblk);
 }
 
 /* Find a matching alias for the given tcp_function_block. */
 int
 find_tcp_function_alias(struct tcp_function_block *blk,
     struct tcp_function_set *fs)
 {
 	struct tcp_function *f;
 	int found;
 
 	found = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		if ((f->tf_fb == blk) &&
 		    (strncmp(f->tf_name, blk->tfb_tcp_block_name,
 		        TCP_FUNCTION_NAME_LEN_MAX) != 0)) {
 			/* Matching function block with different name. */
 			strncpy(fs->function_set_name, f->tf_name,
 			    TCP_FUNCTION_NAME_LEN_MAX);
 			found = 1;
 			break;
 		}
 	}
 	/* Null terminate the string appropriately. */
 	if (found) {
 		fs->function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
 	} else {
 		fs->function_set_name[0] = '\0';
 	}
 	rw_runlock(&tcp_function_lock);
 	return (found);
 }
 
 static struct tcp_function_block *
 find_and_ref_tcp_default_fb(void)
 {
 	struct tcp_function_block *rblk;
 
 	rw_rlock(&tcp_function_lock);
 	rblk = tcp_func_set_ptr;
 	refcount_acquire(&rblk->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	return (rblk);
 }
 
 void
 tcp_switch_back_to_default(struct tcpcb *tp)
 {
 	struct tcp_function_block *tfb;
 
 	KASSERT(tp->t_fb != &tcp_def_funcblk,
 	    ("%s: called by the built-in default stack", __func__));
 
 	/*
 	 * Release the old stack. This function will either find a new one
 	 * or panic.
 	 */
 	if (tp->t_fb->tfb_tcp_fb_fini != NULL)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 	refcount_release(&tp->t_fb->tfb_refcnt);
 
 	/*
 	 * Now, we'll find a new function block to use.
 	 * Start by trying the current user-selected
 	 * default, unless this stack is the user-selected
 	 * default.
 	 */
 	tfb = find_and_ref_tcp_default_fb();
 	if (tfb == tp->t_fb) {
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Does the stack accept this connection? */
 	if (tfb != NULL && tfb->tfb_tcp_handoff_ok != NULL &&
 	    (*tfb->tfb_tcp_handoff_ok)(tp)) {
 		refcount_release(&tfb->tfb_refcnt);
 		tfb = NULL;
 	}
 	/* Try to use that stack. */
 	if (tfb != NULL) {
 		/* Initialize the new stack. If it succeeds, we are done. */
 		tp->t_fb = tfb;
 		if (tp->t_fb->tfb_tcp_fb_init == NULL ||
 		    (*tp->t_fb->tfb_tcp_fb_init)(tp) == 0)
 			return;
 
 		/*
 		 * Initialization failed. Release the reference count on
 		 * the stack.
 		 */
 		refcount_release(&tfb->tfb_refcnt);
 	}
 
 	/*
 	 * If that wasn't feasible, use the built-in default
 	 * stack which is not allowed to reject anyone.
 	 */
 	tfb = find_and_ref_tcp_fb(&tcp_def_funcblk);
 	if (tfb == NULL) {
 		/* there always should be a default */
 		panic("Can't refer to tcp_def_funcblk");
 	}
 	if (tfb->tfb_tcp_handoff_ok != NULL) {
 		if ((*tfb->tfb_tcp_handoff_ok) (tp)) {
 			/* The default stack cannot say no */
 			panic("Default stack rejects a new session?");
 		}
 	}
 	tp->t_fb = tfb;
 	if (tp->t_fb->tfb_tcp_fb_init != NULL &&
 	    (*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 		/* The default stack cannot fail */
 		panic("Default stack initialization failed");
 	}
 }
 
 static bool
 tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
     const struct sockaddr *sa, void *ctx)
 {
 	struct ip *iph;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 #endif
 	struct udphdr *uh;
 	struct tcphdr *th;
 	int thlen;
 	uint16_t port;
 
 	TCPSTAT_INC(tcps_tunneled_pkts);
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		/* Can't handle one that is not a pkt hdr */
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	thlen = sizeof(struct tcphdr);
 	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
 	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
 		TCPSTAT_INC(tcps_tunneled_errs);
 		goto out;
 	}
 	iph = mtod(m, struct ip *);
 	uh = (struct udphdr *)((caddr_t)iph + off);
 	th = (struct tcphdr *)(uh + 1);
 	thlen = th->th_off << 2;
 	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
 		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
 		if (m == NULL) {
 			TCPSTAT_INC(tcps_tunneled_errs);
 			goto out;
 		} else {
 			iph = mtod(m, struct ip *);
 			uh = (struct udphdr *)((caddr_t)iph + off);
 			th = (struct tcphdr *)(uh + 1);
 		}
 	}
 	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
 	bcopy(th, uh, m->m_len - off);
 	m->m_len -= sizeof(struct udphdr);
 	m->m_pkthdr.len -= sizeof(struct udphdr);
 	/*
 	 * We use the same algorithm for
 	 * both UDP and TCP for c-sum. So
 	 * the code in tcp_input will skip
 	 * the checksum. So we do nothing
 	 * with the flag (m->m_pkthdr.csum_flags).
 	 */
 	switch (iph->ip_v) {
 #ifdef INET
 	case IPVERSION:
 		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
 		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 #ifdef INET6
 	case IPV6_VERSION >> 4:
 		ip6 = mtod(m, struct ip6_hdr *);
 		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
 		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
 		break;
 #endif
 	default:
 		goto out;
 		break;
 	}
 	return (true);
 out:
 	m_freem(m);
 
 	return (true);
 }
 
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
 	int error=ENOENT;
 	struct tcp_function_set fs;
 	struct tcp_function_block *blk;
 
 	memset(&fs, 0, sizeof(fs));
 	rw_rlock(&tcp_function_lock);
 	blk = find_tcp_fb_locked(tcp_func_set_ptr, NULL);
 	if (blk) {
 		/* Found him */
 		strcpy(fs.function_set_name, blk->tfb_tcp_block_name);
 		fs.pcbcnt = blk->tfb_refcnt;
 	}
 	rw_runlock(&tcp_function_lock);
 	error = sysctl_handle_string(oidp, fs.function_set_name,
 				     sizeof(fs.function_set_name), req);
 
 	/* Check for error or no change */
 	if (error != 0 || req->newptr == NULL)
 		return(error);
 
 	rw_wlock(&tcp_function_lock);
 	blk = find_tcp_functions_locked(&fs);
 	if ((blk == NULL) ||
 	    (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) {
 		error = ENOENT;
 		goto done;
 	}
 	tcp_func_set_ptr = blk;
 done:
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_default,
     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_default_tcp_functions, "A",
     "Set/get the default TCP functions");
 
 static int
 sysctl_net_inet_list_available(SYSCTL_HANDLER_ARGS)
 {
 	int error, cnt, linesz;
 	struct tcp_function *f;
 	char *buffer, *cp;
 	size_t bufsz, outsz;
 	bool alias;
 
 	cnt = 0;
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		cnt++;
 	}
 	rw_runlock(&tcp_function_lock);
 
 	bufsz = (cnt+2) * ((TCP_FUNCTION_NAME_LEN_MAX * 2) + 13) + 1;
 	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
 
 	error = 0;
 	cp = buffer;
 
 	linesz = snprintf(cp, bufsz, "\n%-32s%c %-32s %s\n", "Stack", 'D',
 	    "Alias", "PCB count");
 	cp += linesz;
 	bufsz -= linesz;
 	outsz = linesz;
 
 	rw_rlock(&tcp_function_lock);
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 		alias = (f->tf_name != f->tf_fb->tfb_tcp_block_name);
 		linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n",
 		    f->tf_fb->tfb_tcp_block_name,
 		    (f->tf_fb == tcp_func_set_ptr) ? '*' : ' ',
 		    alias ? f->tf_name : "-",
 		    f->tf_fb->tfb_refcnt);
 		if (linesz >= bufsz) {
 			error = EOVERFLOW;
 			break;
 		}
 		cp += linesz;
 		bufsz -= linesz;
 		outsz += linesz;
 	}
 	rw_runlock(&tcp_function_lock);
 	if (error == 0)
 		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
 	free(buffer, M_TEMP);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, sysctl_net_inet_list_available, "A",
     "list available TCP Function sets");
 
 VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
 
 #ifdef INET
 VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
 #define	V_udp4_tun_socket	VNET(udp4_tun_socket)
 #endif
 #ifdef INET6
 VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
 #define	V_udp6_tun_socket	VNET(udp6_tun_socket)
 #endif
 
 static void
 tcp_over_udp_stop(void)
 {
 	/*
 	 * This function assumes sysctl caller holds inp_rinfo_lock()
 	 * for writing!
 	 */
 #ifdef INET
 	if (V_udp4_tun_socket != NULL) {
 		soclose(V_udp4_tun_socket);
 		V_udp4_tun_socket = NULL;
 	}
 #endif
 #ifdef INET6
 	if (V_udp6_tun_socket != NULL) {
 		soclose(V_udp6_tun_socket);
 		V_udp6_tun_socket = NULL;
 	}
 #endif
 }
 
 static int
 tcp_over_udp_start(void)
 {
 	uint16_t port;
 	int ret;
 #ifdef INET
 	struct sockaddr_in sin;
 #endif
 #ifdef INET6
 	struct sockaddr_in6 sin6;
 #endif
 	/*
 	 * This function assumes sysctl caller holds inp_info_rlock()
 	 * for writing!
 	 */
 	port = V_tcp_udp_tunneling_port;
 	if (ntohs(port) == 0) {
 		/* Must have a port set */
 		return (EINVAL);
 	}
 #ifdef INET
 	if (V_udp4_tun_socket != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET6
 	if (V_udp6_tun_socket != NULL) {
 		/* Already running -- must stop first */
 		return (EALREADY);
 	}
 #endif
 #ifdef INET
 	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
 	    tcp_recv_udp_tunneled_packet,
 	    tcp_ctlinput_viaudp,
 	    NULL))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin, 0, sizeof(struct sockaddr_in));
 	sin.sin_len = sizeof(struct sockaddr_in);
 	sin.sin_family = AF_INET;
 	sin.sin_port = htons(port);
 	if ((ret = sobind(V_udp4_tun_socket,
 	    (struct sockaddr *)&sin, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 #endif
 #ifdef INET6
 	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
 	    SOCK_DGRAM, IPPROTO_UDP,
 	    curthread->td_ucred, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Call the special UDP hook. */
 	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
 	    tcp_recv_udp_tunneled_packet,
 	    tcp6_ctlinput_viaudp,
 	    NULL))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 	/* Ok, we have a socket, bind it to the port. */
 	memset(&sin6, 0, sizeof(struct sockaddr_in6));
 	sin6.sin6_len = sizeof(struct sockaddr_in6);
 	sin6.sin6_family = AF_INET6;
 	sin6.sin6_port = htons(port);
 	if ((ret = sobind(V_udp6_tun_socket,
 	    (struct sockaddr *)&sin6, curthread))) {
 		tcp_over_udp_stop();
 		return (ret);
 	}
 #endif
 	return (0);
 }
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	uint32_t old, new;
 
 	old = V_tcp_udp_tunneling_port;
 	new = old;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if ((error == 0) &&
 	    (req->newptr != NULL)) {
 		if ((new < TCP_TUNNELING_PORT_MIN) ||
 		    (new > TCP_TUNNELING_PORT_MAX)) {
 			error = EINVAL;
 		} else {
 			V_tcp_udp_tunneling_port = new;
 			if (old != 0) {
 				tcp_over_udp_stop();
 			}
 			if (new != 0) {
 				error = tcp_over_udp_start();
 			}
 		}
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_port),
     0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
     "Tunneling port for tcp over udp");
 
 VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
 
 static int
 sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
 {
 	int error, new;
 
 	new = V_tcp_udp_tunneling_overhead;
 	error = sysctl_handle_int(oidp, &new, 0, req);
 	if (error == 0 && req->newptr) {
 		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
 		    (new > TCP_TUNNELING_OVERHEAD_MAX))
 			error = EINVAL;
 		else
 			V_tcp_udp_tunneling_overhead = new;
 	}
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
     CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
     &VNET_NAME(tcp_udp_tunneling_overhead),
     0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
     "MSS reduction when using tcp over udp");
 
 /*
  * Exports one (struct tcp_function_info) for each alias/name.
  */
 static int
 sysctl_net_inet_list_func_info(SYSCTL_HANDLER_ARGS)
 {
 	int cnt, error;
 	struct tcp_function *f;
 	struct tcp_function_info tfi;
 
 	/*
 	 * We don't allow writes.
 	 */
 	if (req->newptr != NULL)
 		return (EINVAL);
 
 	/*
 	 * Wire the old buffer so we can directly copy the functions to
 	 * user space without dropping the lock.
 	 */
 	if (req->oldptr != NULL) {
 		error = sysctl_wire_old_buffer(req, 0);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Walk the list and copy out matching entries. If INVARIANTS
 	 * is compiled in, also walk the list to verify the length of
 	 * the list matches what we have recorded.
 	 */
 	rw_rlock(&tcp_function_lock);
 
 	cnt = 0;
 #ifndef INVARIANTS
 	if (req->oldptr == NULL) {
 		cnt = tcp_fb_cnt;
 		goto skip_loop;
 	}
 #endif
 	TAILQ_FOREACH(f, &t_functions, tf_next) {
 #ifdef INVARIANTS
 		cnt++;
 #endif
 		if (req->oldptr != NULL) {
 			bzero(&tfi, sizeof(tfi));
 			tfi.tfi_refcnt = f->tf_fb->tfb_refcnt;
 			tfi.tfi_id = f->tf_fb->tfb_id;
 			(void)strlcpy(tfi.tfi_alias, f->tf_name,
 			    sizeof(tfi.tfi_alias));
 			(void)strlcpy(tfi.tfi_name,
 			    f->tf_fb->tfb_tcp_block_name, sizeof(tfi.tfi_name));
 			error = SYSCTL_OUT(req, &tfi, sizeof(tfi));
 			/*
 			 * Don't stop on error, as that is the
 			 * mechanism we use to accumulate length
 			 * information if the buffer was too short.
 			 */
 		}
 	}
 	KASSERT(cnt == tcp_fb_cnt,
 	    ("%s: cnt (%d) != tcp_fb_cnt (%d)", __func__, cnt, tcp_fb_cnt));
 #ifndef INVARIANTS
 skip_loop:
 #endif
 	rw_runlock(&tcp_function_lock);
 	if (req->oldptr == NULL)
 		error = SYSCTL_OUT(req, NULL,
 		    (cnt + 1) * sizeof(struct tcp_function_info));
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, function_info,
 	    CTLTYPE_OPAQUE | CTLFLAG_SKIP | CTLFLAG_RD | CTLFLAG_MPSAFE,
 	    NULL, 0, sysctl_net_inet_list_func_info, "S,tcp_function_info",
 	    "List TCP function block name-to-ID mappings");
 
 /*
  * tfb_tcp_handoff_ok() function for the default stack.
  * Note that we'll basically try to take all comers.
  */
 static int
 tcp_default_handoff_ok(struct tcpcb *tp)
 {
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_init() function for the default stack.
  *
  * This handles making sure we have appropriate timers set if you are
  * transitioning a socket that has some amount of setup done.
  *
  * The init() fuction from the default can *never* return non-zero i.e.
  * it is required to always succeed since it is the stack of last resort!
  */
 static int
 tcp_default_fb_init(struct tcpcb *tp)
 {
 
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	KASSERT(tp->t_state >= 0 && tp->t_state < TCPS_TIME_WAIT,
 	    ("%s: connection %p in unexpected state %d", __func__, tp,
 	    tp->t_state));
 
 	/*
 	 * Nothing to do for ESTABLISHED or LISTEN states. And, we don't
 	 * know what to do for unexpected states (which includes TIME_WAIT).
 	 */
 	if (tp->t_state <= TCPS_LISTEN || tp->t_state >= TCPS_TIME_WAIT)
 		return (0);
 
 	/*
 	 * Make sure some kind of transmission timer is set if there is
 	 * outstanding data.
 	 */
 	so = tp->t_inpcb->inp_socket;
 	if ((!TCPS_HAVEESTABLISHED(tp->t_state) || sbavail(&so->so_snd) ||
 	    tp->snd_una != tp->snd_max) && !(tcp_timer_active(tp, TT_REXMT) ||
 	    tcp_timer_active(tp, TT_PERSIST))) {
 		/*
 		 * If the session has established and it looks like it should
 		 * be in the persist state, set the persist timer. Otherwise,
 		 * set the retransmit timer.
 		 */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && tp->snd_wnd == 0 &&
 		    (int32_t)(tp->snd_nxt - tp->snd_una) <
 		    (int32_t)sbavail(&so->so_snd))
 			tcp_setpersist(tp);
 		else
 			tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur);
 	}
 
 	/* All non-embryonic sessions get a keepalive timer. */
 	if (!tcp_timer_active(tp, TT_KEEP))
 		tcp_timer_activate(tp, TT_KEEP,
 		    TCPS_HAVEESTABLISHED(tp->t_state) ? TP_KEEPIDLE(tp) :
 		    TP_KEEPINIT(tp));
 
 	/*
 	 * Make sure critical variables are initialized
 	 * if transitioning while in Recovery.
 	 */
 	if IN_FASTRECOVERY(tp->t_flags) {
 		if (tp->sackhint.recover_fs == 0)
 			tp->sackhint.recover_fs = max(1,
 			    tp->snd_nxt - tp->snd_una);
 	}
 
 	return (0);
 }
 
 /*
  * tfb_tcp_fb_fini() function for the default stack.
  *
  * This changes state as necessary (or prudent) to prepare for another stack
  * to assume responsibility for the connection.
  */
 static void
 tcp_default_fb_fini(struct tcpcb *tp, int tcb_is_purged)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	return;
 }
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
  *
  * Note that this can be overridden by the kernel environment
  * variable net.inet.tcp.tcbhashsize
  */
 #ifndef TCBHASHSIZE
 #define TCBHASHSIZE	0
 #endif
 
 /*
  * XXX
  * Callouts should be moved into struct tcp directly.  They are currently
  * separate because the tcpcb structure is exported to userland for sysctl
  * parsing purposes, which do not know about callouts.
  */
 struct tcpcb_mem {
 	struct	tcpcb		tcb;
 	struct	tcp_timer	tt;
 	struct	cc_var		ccv;
 #ifdef TCP_HHOOK
 	struct	osd		osd;
 #endif
 };
 
 VNET_DEFINE_STATIC(uma_zone_t, tcpcb_zone);
 #define	V_tcpcb_zone			VNET(tcpcb_zone)
 
 MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
 MALLOC_DEFINE(M_TCPFUNCTIONS, "tcpfunc", "TCP function set memory");
 
 static struct mtx isn_mtx;
 
 #define	ISN_LOCK_INIT()	mtx_init(&isn_mtx, "isn_mtx", NULL, MTX_DEF)
 #define	ISN_LOCK()	mtx_lock(&isn_mtx)
 #define	ISN_UNLOCK()	mtx_unlock(&isn_mtx)
 
 INPCBSTORAGE_DEFINE(tcpcbstor, "tcpinp", "tcp_inpcb", "tcp", "tcphash");
 
 /*
  * Take a value and get the next power of 2 that doesn't overflow.
  * Used to size the tcp_inpcb hash buckets.
  */
 static int
 maketcp_hashsize(int size)
 {
 	int hashsize;
 
 	/*
 	 * auto tune.
 	 * get the next power of 2 higher than maxsockets.
 	 */
 	hashsize = 1 << fls(size);
 	/* catch overflow, and just go one power of 2 smaller */
 	if (hashsize < size) {
 		hashsize = 1 << (fls(size) - 1);
 	}
 	return (hashsize);
 }
 
 static volatile int next_tcp_stack_id = 1;
 
 /*
  * Register a TCP function block with the name provided in the names
  * array.  (Note that this function does NOT automatically register
  * blk->tfb_tcp_block_name as a stack name.  Therefore, you should
  * explicitly include blk->tfb_tcp_block_name in the list of names if
  * you wish to register the stack with that name.)
  *
  * Either all name registrations will succeed or all will fail.  If
  * a name registration fails, the function will update the num_names
  * argument to point to the array index of the name that encountered
  * the failure.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions_as_names(struct tcp_function_block *blk, int wait,
     const char *names[], int *num_names)
 {
 	struct tcp_function *n;
 	struct tcp_function_set fs;
 	int error, i;
 
 	KASSERT(names != NULL && *num_names > 0,
 	    ("%s: Called with 0-length name list", __func__));
 	KASSERT(names != NULL, ("%s: Called with NULL name list", __func__));
 	KASSERT(rw_initialized(&tcp_function_lock),
 	    ("%s: called too early", __func__));
 
 	if ((blk->tfb_tcp_output == NULL) ||
 	    (blk->tfb_tcp_do_segment == NULL) ||
 	    (blk->tfb_tcp_ctloutput == NULL) ||
 	    (strlen(blk->tfb_tcp_block_name) == 0)) {
 		/*
 		 * These functions are required and you
 		 * need a name.
 		 */
 		*num_names = 0;
 		return (EINVAL);
 	}
 	if (blk->tfb_tcp_timer_stop_all ||
 	    blk->tfb_tcp_timer_activate ||
 	    blk->tfb_tcp_timer_active ||
 	    blk->tfb_tcp_timer_stop) {
 		/*
 		 * If you define one timer function you
 		 * must have them all.
 		 */
 		if ((blk->tfb_tcp_timer_stop_all == NULL) ||
 		    (blk->tfb_tcp_timer_activate == NULL) ||
 		    (blk->tfb_tcp_timer_active == NULL) ||
 		    (blk->tfb_tcp_timer_stop == NULL)) {
 			*num_names = 0;
 			return (EINVAL);
 		}
 	}
 
 	if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
 		*num_names = 0;
 		return (EINVAL);
 	}
 
 	refcount_init(&blk->tfb_refcnt, 0);
 	blk->tfb_id = atomic_fetchadd_int(&next_tcp_stack_id, 1);
 	for (i = 0; i < *num_names; i++) {
 		n = malloc(sizeof(struct tcp_function), M_TCPFUNCTIONS, wait);
 		if (n == NULL) {
 			error = ENOMEM;
 			goto cleanup;
 		}
 		n->tf_fb = blk;
 
 		(void)strlcpy(fs.function_set_name, names[i],
 		    sizeof(fs.function_set_name));
 		rw_wlock(&tcp_function_lock);
 		if (find_tcp_functions_locked(&fs) != NULL) {
 			/* Duplicate name space not allowed */
 			rw_wunlock(&tcp_function_lock);
 			free(n, M_TCPFUNCTIONS);
 			error = EALREADY;
 			goto cleanup;
 		}
 		(void)strlcpy(n->tf_name, names[i], sizeof(n->tf_name));
 		TAILQ_INSERT_TAIL(&t_functions, n, tf_next);
 		tcp_fb_cnt++;
 		rw_wunlock(&tcp_function_lock);
 	}
 	return(0);
 
 cleanup:
 	/*
 	 * Deregister the names we just added. Because registration failed
 	 * for names[i], we don't need to deregister that name.
 	 */
 	*num_names = i;
 	rw_wlock(&tcp_function_lock);
 	while (--i >= 0) {
 		TAILQ_FOREACH(n, &t_functions, tf_next) {
 			if (!strncmp(n->tf_name, names[i],
 			    TCP_FUNCTION_NAME_LEN_MAX)) {
 				TAILQ_REMOVE(&t_functions, n, tf_next);
 				tcp_fb_cnt--;
 				n->tf_fb = NULL;
 				free(n, M_TCPFUNCTIONS);
 				break;
 			}
 		}
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (error);
 }
 
 /*
  * Register a TCP function block using the name provided in the name
  * argument.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions_as_name(struct tcp_function_block *blk, const char *name,
     int wait)
 {
 	const char *name_list[1];
 	int num_names, rv;
 
 	num_names = 1;
 	if (name != NULL)
 		name_list[0] = name;
 	else
 		name_list[0] = blk->tfb_tcp_block_name;
 	rv = register_tcp_functions_as_names(blk, wait, name_list, &num_names);
 	return (rv);
 }
 
 /*
  * Register a TCP function block using the name defined in
  * blk->tfb_tcp_block_name.
  *
  * Returns 0 on success, or an error code on failure.
  */
 int
 register_tcp_functions(struct tcp_function_block *blk, int wait)
 {
 
 	return (register_tcp_functions_as_name(blk, NULL, wait));
 }
 
 /*
  * Deregister all names associated with a function block. This
  * functionally removes the function block from use within the system.
  *
  * When called with a true quiesce argument, mark the function block
  * as being removed so no more stacks will use it and determine
  * whether the removal would succeed.
  *
  * When called with a false quiesce argument, actually attempt the
  * removal.
  *
  * When called with a force argument, attempt to switch all TCBs to
  * use the default stack instead of returning EBUSY.
  *
  * Returns 0 on success (or if the removal would succeed, or an error
  * code on failure.
  */
 int
 deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force)
 {
 	struct tcp_function *f;
 
 	if (blk == &tcp_def_funcblk) {
 		/* You can't un-register the default */
 		return (EPERM);
 	}
 	rw_wlock(&tcp_function_lock);
 	if (blk == tcp_func_set_ptr) {
 		/* You can't free the current default */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	/* Mark the block so no more stacks can use it. */
 	blk->tfb_flags |= TCP_FUNC_BEING_REMOVED;
 	/*
 	 * If TCBs are still attached to the stack, attempt to switch them
 	 * to the default stack.
 	 */
 	if (force && blk->tfb_refcnt) {
 		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 		    INPLOOKUP_WLOCKPCB);
 		struct inpcb *inp;
 		struct tcpcb *tp;
 		VNET_ITERATOR_DECL(vnet_iter);
 
 		rw_wunlock(&tcp_function_lock);
 
 		VNET_LIST_RLOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			while ((inp = inp_next(&inpi)) != NULL) {
 				if (inp->inp_flags & INP_TIMEWAIT)
 					continue;
 				tp = intotcpcb(inp);
 				if (tp == NULL || tp->t_fb != blk)
 					continue;
 				tcp_switch_back_to_default(tp);
 			}
 			CURVNET_RESTORE();
 		}
 		VNET_LIST_RUNLOCK();
 
 		rw_wlock(&tcp_function_lock);
 	}
 	if (blk->tfb_refcnt) {
 		/* TCBs still attached. */
 		rw_wunlock(&tcp_function_lock);
 		return (EBUSY);
 	}
 	if (quiesce) {
 		/* Skip removal. */
 		rw_wunlock(&tcp_function_lock);
 		return (0);
 	}
 	/* Remove any function names that map to this function block. */
 	while (find_tcp_fb_locked(blk, &f) != NULL) {
 		TAILQ_REMOVE(&t_functions, f, tf_next);
 		tcp_fb_cnt--;
 		f->tf_fb = NULL;
 		free(f, M_TCPFUNCTIONS);
 	}
 	rw_wunlock(&tcp_function_lock);
 	return (0);
 }
 
 static void
 tcp_drain(void)
 {
 	struct epoch_tracker et;
 	VNET_ITERATOR_DECL(vnet_iter);
 
 	if (!do_tcpdrain)
 		return;
 
 	NET_EPOCH_ENTER(et);
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
 		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 		    INPLOOKUP_WLOCKPCB);
 		struct inpcb *inpb;
 		struct tcpcb *tcpb;
 
 	/*
 	 * Walk the tcpbs, if existing, and flush the reassembly queue,
 	 * if there is one...
 	 * XXX: The "Net/3" implementation doesn't imply that the TCP
 	 *      reassembly queue should be flushed, but in a situation
 	 *	where we're really low on mbufs, this is potentially
 	 *	useful.
 	 */
 		while ((inpb = inp_next(&inpi)) != NULL) {
 			if (inpb->inp_flags & INP_TIMEWAIT)
 				continue;
 			if ((tcpb = intotcpcb(inpb)) != NULL) {
 				tcp_reass_flush(tcpb);
 				tcp_clean_sackreport(tcpb);
 #ifdef TCP_BLACKBOX
 				tcp_log_drain(tcpb);
 #endif
 #ifdef TCPPCAP
 				if (tcp_pcap_aggressive_free) {
 					/* Free the TCP PCAP queues. */
 					tcp_pcap_drain(&(tcpb->t_inpkts));
 					tcp_pcap_drain(&(tcpb->t_outpkts));
 				}
 #endif
 			}
 		}
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();
 	NET_EPOCH_EXIT(et);
 }
 
 static void
 tcp_vnet_init(void *arg __unused)
 {
 
 #ifdef TCP_HHOOK
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN,
 	    &V_tcp_hhh[HHOOK_TCP_EST_IN], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 	if (hhook_head_register(HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT,
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 #endif
 #ifdef STATS
 	if (tcp_stats_init())
 		printf("%s: WARNING: unable to initialise TCP stats\n",
 		    __func__);
 #endif
 	in_pcbinfo_init(&V_tcbinfo, &tcpcbstor, tcp_tcbhashsize,
 	    tcp_tcbhashsize);
 
 	/*
 	 * These have to be type stable for the benefit of the timers.
 	 */
 	V_tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_tcpcb_zone, maxsockets);
 	uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	tcp_tw_init();
 	syncache_init();
 	tcp_hc_init();
 
 	TUNABLE_INT_FETCH("net.inet.tcp.sack.enable", &V_tcp_do_sack);
 	V_sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 
 	tcp_fastopen_init();
 
 	COUNTER_ARRAY_ALLOC(V_tcps_states, TCP_NSTATES, M_WAITOK);
 	VNET_PCPUSTAT_ALLOC(tcpstat, M_WAITOK);
 
 	V_tcp_msl = TCPTV_MSL;
 }
 VNET_SYSINIT(tcp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     tcp_vnet_init, NULL);
 
 static void
 tcp_init(void *arg __unused)
 {
 	const char *tcbhash_tuneable;
 	int hashsize;
 
 	tcp_reass_global_init();
 
 	/* XXX virtualize those below? */
 	tcp_delacktime = TCPTV_DELACK;
 	tcp_keepinit = TCPTV_KEEP_INIT;
 	tcp_keepidle = TCPTV_KEEP_IDLE;
 	tcp_keepintvl = TCPTV_KEEPINTVL;
 	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
 	tcp_rexmit_initial = TCPTV_RTOBASE;
 	if (tcp_rexmit_initial < 1)
 		tcp_rexmit_initial = 1;
 	tcp_rexmit_min = TCPTV_MIN;
 	if (tcp_rexmit_min < 1)
 		tcp_rexmit_min = 1;
 	tcp_persmin = TCPTV_PERSMIN;
 	tcp_persmax = TCPTV_PERSMAX;
 	tcp_rexmit_slop = TCPTV_CPU_VAR;
 	tcp_finwait2_timeout = TCPTV_FINWAIT2_TIMEOUT;
 
 	/* Setup the tcp function block list */
 	TAILQ_INIT(&t_functions);
 	rw_init(&tcp_function_lock, "tcp_func_lock");
 	register_tcp_functions(&tcp_def_funcblk, M_WAITOK);
 #ifdef TCP_BLACKBOX
 	/* Initialize the TCP logging data. */
 	tcp_log_init();
 #endif
 	arc4rand(&V_ts_offset_secret, sizeof(V_ts_offset_secret), 0);
 
 	if (tcp_soreceive_stream) {
 #ifdef INET
-		tcp_usrreqs.pru_soreceive = soreceive_stream;
+		tcp_protosw.pr_soreceive = soreceive_stream;
 #endif
 #ifdef INET6
-		tcp6_usrreqs.pru_soreceive = soreceive_stream;
+		tcp6_protosw.pr_soreceive = soreceive_stream;
 #endif /* INET6 */
 	}
 
 #ifdef INET6
 #define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
 #else /* INET6 */
 #define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
 #endif /* INET6 */
 	if (max_protohdr < TCP_MINPROTOHDR)
 		max_protohdr = TCP_MINPROTOHDR;
 	if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
 		panic("tcp_init");
 #undef TCP_MINPROTOHDR
 
 	ISN_LOCK_INIT();
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
 		SHUTDOWN_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(vm_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 	EVENTHANDLER_REGISTER(mbuf_lowmem, tcp_drain, NULL, LOWMEM_PRI_DEFAULT);
 
 	tcp_inp_lro_direct_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_wokeup_queue = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_compressed = counter_u64_alloc(M_WAITOK);
 	tcp_inp_lro_locks_taken = counter_u64_alloc(M_WAITOK);
 	tcp_extra_mbuf = counter_u64_alloc(M_WAITOK);
 	tcp_would_have_but = counter_u64_alloc(M_WAITOK);
 	tcp_comp_total = counter_u64_alloc(M_WAITOK);
 	tcp_uncomp_total = counter_u64_alloc(M_WAITOK);
 	tcp_bad_csums = counter_u64_alloc(M_WAITOK);
 #ifdef TCPPCAP
 	tcp_pcap_init();
 #endif
 
 	hashsize = TCBHASHSIZE;
 	tcbhash_tuneable = "net.inet.tcp.tcbhashsize";
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
 		/*
 		 * Auto tune the hash size based on maxsockets.
 		 * A perfect hash would have a 1:1 mapping
 		 * (hashsize = maxsockets) however it's been
 		 * suggested that O(2) average is better.
 		 */
 		hashsize = maketcp_hashsize(maxsockets / 4);
 		/*
 		 * Our historical default is 512,
 		 * do not autotune lower than this.
 		 */
 		if (hashsize < 512)
 			hashsize = 512;
 		if (bootverbose)
 			printf("%s: %s auto tuned to %d\n", __func__,
 			    tcbhash_tuneable, hashsize);
 	}
 	/*
 	 * We require a hashsize to be a power of two.
 	 * Previously if it was not a power of two we would just reset it
 	 * back to 512, which could be a nasty surprise if you did not notice
 	 * the error message.
 	 * Instead what we do is clip it to the closest power of two lower
 	 * than the specified hash value.
 	 */
 	if (!powerof2(hashsize)) {
 		int oldhashsize = hashsize;
 
 		hashsize = maketcp_hashsize(hashsize);
 		/* prevent absurdly low value */
 		if (hashsize < 16)
 			hashsize = 16;
 		printf("%s: WARNING: TCB hash size not a power of 2, "
 		    "clipped from %d to %d.\n", __func__, oldhashsize,
 		    hashsize);
 	}
 	tcp_tcbhashsize = hashsize;
 
 #ifdef INET
 	IPPROTO_REGISTER(IPPROTO_TCP, tcp_input, tcp_ctlinput);
 #endif
 #ifdef INET6
 	IP6PROTO_REGISTER(IPPROTO_TCP, tcp6_input, tcp6_ctlinput);
 #endif
 }
 SYSINIT(tcp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, tcp_init, NULL);
 
 #ifdef VIMAGE
 static void
 tcp_destroy(void *unused __unused)
 {
 	int n;
 #ifdef TCP_HHOOK
 	int error;
 #endif
 
 	/*
 	 * All our processes are gone, all our sockets should be cleaned
 	 * up, which means, we should be past the tcp_discardcb() calls.
 	 * Sleep to let all tcpcb timers really disappear and cleanup.
 	 */
 	for (;;) {
 		INP_INFO_WLOCK(&V_tcbinfo);
 		n = V_tcbinfo.ipi_count;
 		INP_INFO_WUNLOCK(&V_tcbinfo);
 		if (n == 0)
 			break;
 		pause("tcpdes", hz / 10);
 	}
 	tcp_hc_destroy();
 	syncache_destroy();
 	tcp_tw_destroy();
 	in_pcbinfo_destroy(&V_tcbinfo);
 	/* tcp_discardcb() clears the sack_holes up. */
 	uma_zdestroy(V_sack_hole_zone);
 	uma_zdestroy(V_tcpcb_zone);
 
 	/*
 	 * Cannot free the zone until all tcpcbs are released as we attach
 	 * the allocations to them.
 	 */
 	tcp_fastopen_destroy();
 
 	COUNTER_ARRAY_FREE(V_tcps_states, TCP_NSTATES);
 	VNET_PCPUSTAT_FREE(tcpstat);
 
 #ifdef TCP_HHOOK
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_IN]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_IN, error);
 	}
 	error = hhook_head_deregister(V_tcp_hhh[HHOOK_TCP_EST_OUT]);
 	if (error != 0) {
 		printf("%s: WARNING: unable to deregister helper hook "
 		    "type=%d, id=%d: error %d returned\n", __func__,
 		    HHOOK_TYPE_TCP, HHOOK_TCP_EST_OUT, error);
 	}
 #endif
 }
 VNET_SYSUNINIT(tcp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, tcp_destroy, NULL);
 #endif
 
 void
 tcp_fini(void *xtp)
 {
 
 }
 
 /*
  * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
  * tcp_template used to store this data in mbufs, but we now recopy it out
  * of the tcpcb each time to conserve mbufs.
  */
 void
 tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV6) != 0) {
 		struct ip6_hdr *ip6;
 
 		ip6 = (struct ip6_hdr *)ip_ptr;
 		ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
 		if (port == 0)
 			ip6->ip6_nxt = IPPROTO_TCP;
 		else
 			ip6->ip6_nxt = IPPROTO_UDP;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		struct ip *ip;
 
 		ip = (struct ip *)ip_ptr;
 		ip->ip_v = IPVERSION;
 		ip->ip_hl = 5;
 		ip->ip_tos = inp->inp_ip_tos;
 		ip->ip_len = 0;
 		ip->ip_id = 0;
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
 		if (port == 0)
 			ip->ip_p = IPPROTO_TCP;
 		else
 			ip->ip_p = IPPROTO_UDP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
 #endif /* INET */
 	th->th_sport = inp->inp_lport;
 	th->th_dport = inp->inp_fport;
 	th->th_seq = 0;
 	th->th_ack = 0;
 	th->th_off = 5;
 	tcp_set_flags(th, 0);
 	th->th_win = 0;
 	th->th_urp = 0;
 	th->th_sum = 0;		/* in_pseudo() is called later for ipv4 */
 }
 
 /*
  * Create template to be used to send tcp packets on a connection.
  * Allocates an mbuf and fills in a skeletal tcp/ip header.  The only
  * use for this function is in keepalives, which use tcp_respond.
  */
 struct tcptemp *
 tcpip_maketemplate(struct inpcb *inp)
 {
 	struct tcptemp *t;
 
 	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
 	if (t == NULL)
 		return (NULL);
 	tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
 	return (t);
 }
 
 /*
  * Send a single message to the TCP at address specified by
  * the given TCP/IP header.  If m == NULL, then we make a copy
  * of the tcpiphdr at th and send directly to the addressed host.
  * This is used to force keep alive messages out using the TCP
  * template for a connection.  If flags are given then we send
  * a message back to the TCP which originated the segment th,
  * and discard the mbuf containing it and any other attached mbufs.
  *
  * In any case the ack and sequence number of the transmitted
  * segment are as specified by the parameters.
  *
  * NOTE: If m != NULL, then th must point to *inside* the mbuf.
  */
 void
 tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
     tcp_seq ack, tcp_seq seq, int flags)
 {
 	struct tcpopt to;
 	struct inpcb *inp;
 	struct ip *ip;
 	struct mbuf *optm;
 	struct udphdr *uh = NULL;
 	struct tcphdr *nth;
 	struct tcp_log_buffer *lgb;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
 	int optlen, tlen, win, ulen;
 	bool incl_opts;
 	uint16_t port;
 	int output_ret;
 #ifdef INVARIANTS
 	int thflags = tcp_get_flags(th);
 #endif
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
 
 #ifdef INET6
 	isipv6 = ((struct ip *)ipgen)->ip_v == (IPV6_VERSION >> 4);
 	ip6 = ipgen;
 #endif /* INET6 */
 	ip = ipgen;
 
 	if (tp != NULL) {
 		inp = tp->t_inpcb;
 		KASSERT(inp != NULL, ("tcp control block w/o inpcb"));
 		INP_LOCK_ASSERT(inp);
 	} else
 		inp = NULL;
 
 	if (m != NULL) {
 #ifdef INET6
 		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 #endif
 		if (ip && (ip->ip_p == IPPROTO_UDP))
 			port = m->m_pkthdr.tcp_tun_port;
 		else
 			port = 0;
 	} else
 		port = tp->t_port;
 
 	incl_opts = false;
 	win = 0;
 	if (tp != NULL) {
 		if (!(flags & TH_RST)) {
 			win = sbspace(&inp->inp_socket->so_rcv);
 			if (win > TCP_MAXWIN << tp->rcv_scale)
 				win = TCP_MAXWIN << tp->rcv_scale;
 		}
 		if ((tp->t_flags & TF_NOOPT) == 0)
 			incl_opts = true;
 	}
 	if (m == NULL) {
 		m = m_gethdr(M_NOWAIT, MT_DATA);
 		if (m == NULL)
 			return;
 		m->m_data += max_linkhdr;
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(m, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(m, struct ip6_hdr *);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
 			ip = mtod(m, struct ip *);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		flags = TH_ACK;
 	} else if ((!M_WRITABLE(m)) || (port != 0)) {
 		struct mbuf *n;
 
 		/* Can't reuse 'm', allocate a new mbuf. */
 		n = m_gethdr(M_NOWAIT, MT_DATA);
 		if (n == NULL) {
 			m_freem(m);
 			return;
 		}
 
 		if (!m_dup_pkthdr(n, m, M_NOWAIT)) {
 			m_freem(m);
 			m_freem(n);
 			return;
 		}
 
 		n->m_data += max_linkhdr;
 		/* m_len is set later */
 #define xchg(a,b,type) { type t; t=a; a=b; b=t; }
 #ifdef INET6
 		if (isipv6) {
 			bcopy((caddr_t)ip6, mtod(n, caddr_t),
 			      sizeof(struct ip6_hdr));
 			ip6 = mtod(n, struct ip6_hdr *);
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		} else
 #endif /* INET6 */
 		{
 			bcopy((caddr_t)ip, mtod(n, caddr_t), sizeof(struct ip));
 			ip = mtod(n, struct ip *);
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 			if (port) {
 				/* Insert a UDP header */
 				uh = (struct udphdr *)nth;
 				uh->uh_sport = htons(V_tcp_udp_tunneling_port);
 				uh->uh_dport = port;
 				nth = (struct tcphdr *)(uh + 1);
 			}
 		}
 		bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 		th = nth;
 		m_freem(m);
 		m = n;
 	} else {
 		/*
 		 *  reuse the mbuf.
 		 * XXX MRT We inherit the FIB, which is lucky.
 		 */
 		m_freem(m->m_next);
 		m->m_next = NULL;
 		m->m_data = (caddr_t)ipgen;
 		/* m_len is set later */
 #ifdef INET6
 		if (isipv6) {
 			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
 			nth = (struct tcphdr *)(ip6 + 1);
 		} else
 #endif /* INET6 */
 		{
 			xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, uint32_t);
 			nth = (struct tcphdr *)(ip + 1);
 		}
 		if (th != nth) {
 			/*
 			 * this is usually a case when an extension header
 			 * exists between the IPv6 header and the
 			 * TCP header.
 			 */
 			nth->th_sport = th->th_sport;
 			nth->th_dport = th->th_dport;
 		}
 		xchg(nth->th_dport, nth->th_sport, uint16_t);
 #undef xchg
 	}
 	tlen = 0;
 #ifdef INET6
 	if (isipv6)
 		tlen = sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 		tlen = sizeof (struct tcpiphdr);
 #endif
 	if (port)
 		tlen += sizeof (struct udphdr);
 #ifdef INVARIANTS
 	m->m_len = 0;
 	KASSERT(M_TRAILINGSPACE(m) >= tlen,
 	    ("Not enough trailing space for message (m=%p, need=%d, have=%ld)",
 	    m, tlen, (long)M_TRAILINGSPACE(m)));
 #endif
 	m->m_len = tlen;
 	to.to_flags = 0;
 	if (incl_opts) {
 		/* Make sure we have room. */
 		if (M_TRAILINGSPACE(m) < TCP_MAXOLEN) {
 			m->m_next = m_get(M_NOWAIT, MT_DATA);
 			if (m->m_next) {
 				optp = mtod(m->m_next, u_char *);
 				optm = m->m_next;
 			} else
 				incl_opts = false;
 		} else {
 			optp = (u_char *) (nth + 1);
 			optm = m;
 		}
 	}
 	if (incl_opts) {
 		/* Timestamps. */
 		if (tp->t_flags & TF_RCVD_TSTMP) {
 			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
 			to.to_tsecr = tp->ts_recent;
 			to.to_flags |= TOF_TS;
 		}
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		/* TCP-MD5 (RFC2385). */
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
 		/* Add the options. */
 		tlen += optlen = tcp_addoptions(&to, optp);
 
 		/* Update m_len in the correct mbuf. */
 		optm->m_len += optlen;
 	} else
 		optlen = 0;
 #ifdef INET6
 	if (isipv6) {
 		if (uh) {
 			ulen = tlen - sizeof(struct ip6_hdr);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip6->ip6_flow = 0;
 		ip6->ip6_vfc = IPV6_VERSION;
 		if (port)
 			ip6->ip6_nxt = IPPROTO_UDP;
 		else
 			ip6->ip6_nxt = IPPROTO_TCP;
 		ip6->ip6_plen = htons(tlen - sizeof(*ip6));
 	}
 #endif
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		if (uh) {
 			ulen = tlen - sizeof(struct ip);
 			uh->uh_ulen = htons(ulen);
 		}
 		ip->ip_len = htons(tlen);
 		ip->ip_ttl = V_ip_defttl;
 		if (port) {
 			ip->ip_p = IPPROTO_UDP;
 		} else {
 			ip->ip_p = IPPROTO_TCP;
 		}
 		if (V_path_mtu_discovery)
 			ip->ip_off |= htons(IP_DF);
 	}
 #endif
 	m->m_pkthdr.len = tlen;
 	m->m_pkthdr.rcvif = NULL;
 #ifdef MAC
 	if (inp != NULL) {
 		/*
 		 * Packet is associated with a socket, so allow the
 		 * label of the response to reflect the socket label.
 		 */
 		INP_LOCK_ASSERT(inp);
 		mac_inpcb_create_mbuf(inp, m);
 	} else {
 		/*
 		 * Packet is not associated with a socket, so possibly
 		 * update the label in place.
 		 */
 		mac_netinet_tcp_reply(m);
 	}
 #endif
 	nth->th_seq = htonl(seq);
 	nth->th_ack = htonl(ack);
 	nth->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
 	tcp_set_flags(nth, flags);
 	if (tp != NULL)
 		nth->th_win = htons((u_short) (win >> tp->rcv_scale));
 	else
 		nth->th_win = htons((u_short)win);
 	nth->th_urp = 0;
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
 		if (!TCPMD5_ENABLED() ||
 		    TCPMD5_OUTPUT(m, nth, to.to_signature) != 0) {
 			m_freem(m);
 			return;
 		}
 	}
 #endif
 
 #ifdef INET6
 	if (isipv6) {
 		if (port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			uh->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in6_cksum_pseudo(ip6,
 			    tlen - sizeof(struct ip6_hdr), IPPROTO_TCP, 0);
 		}
 		ip6->ip6_hlim = in6_selecthlim(tp != NULL ? tp->t_inpcb :
 		    NULL, NULL);
 	}
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 	else
 #endif
 #ifdef INET
 	{
 		if (port) {
 			uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons(ulen + IPPROTO_UDP));
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 			nth->th_sum = 0;
 		} else {
 			m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
 			    htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
 		}
 	}
 #endif /* INET */
 #ifdef TCPDEBUG
 	if (tp == NULL || (inp->inp_socket->so_options & SO_DEBUG))
 		tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
 #endif
 	TCP_PROBE3(debug__output, tp, th, m);
 	if (flags & TH_RST)
 		TCP_PROBE5(accept__refused, NULL, NULL, m, tp, nth);
 	lgb = NULL;
 	if ((tp != NULL) && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
 		if (INP_WLOCKED(inp)) {
 			union tcp_log_stackspecific log;
 			struct timeval tv;
 
 			memset(&log.u_bbr, 0, sizeof(log.u_bbr));
 			log.u_bbr.inhpts = tp->t_inpcb->inp_in_hpts;
 			log.u_bbr.flex8 = 4;
 			log.u_bbr.pkts_out = tp->t_maxseg;
 			log.u_bbr.timeStamp = tcp_get_usecs(&tv);
 			log.u_bbr.delivered = 0;
 			lgb = tcp_log_event_(tp, nth, NULL, NULL, TCP_LOG_OUT,
 			    ERRNO_UNK, 0, &log, false, NULL, NULL, 0, &tv);
 		} else {
 			/*
 			 * We can not log the packet, since we only own the
 			 * read lock, but a write lock is needed. The read lock
 			 * is not upgraded to a write lock, since only getting
 			 * the read lock was done intentionally to improve the
 			 * handling of SYN flooding attacks.
 			 * This happens only for pure SYN segments received in
 			 * the initial CLOSED state, or received in a more
 			 * advanced state than listen and the UDP encapsulation
 			 * port is unexpected.
 			 * The incoming SYN segments do not really belong to
 			 * the TCP connection and the handling does not change
 			 * the state of the TCP connection. Therefore, the
 			 * sending of the RST segments is not logged. Please
 			 * note that also the incoming SYN segments are not
 			 * logged.
 			 *
 			 * The following code ensures that the above description
 			 * is and stays correct.
 			 */
 			KASSERT((thflags & (TH_ACK|TH_SYN)) == TH_SYN &&
 			    (tp->t_state == TCPS_CLOSED ||
 			    (tp->t_state > TCPS_LISTEN && tp->t_port != port)),
 			    ("%s: Logging of TCP segment with flags 0x%b and "
 			    "UDP encapsulation port %u skipped in state %s",
 			    __func__, thflags, PRINT_TH_FLAGS,
 			    ntohs(port), tcpstates[tp->t_state]));
 		}
 	}
 
 #ifdef INET6
 	if (isipv6) {
 		TCP_PROBE5(send, NULL, tp, ip6, tp, nth);
 		output_ret = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
 	}
 #endif /* INET6 */
 #if defined(INET) && defined(INET6)
 	else
 #endif
 #ifdef INET
 	{
 		TCP_PROBE5(send, NULL, tp, ip, tp, nth);
 		output_ret = ip_output(m, NULL, NULL, 0, NULL, inp);
 	}
 #endif
 	if (lgb != NULL)
 		lgb->tlb_errno = output_ret;
 }
 
 /*
  * Create a new TCP control block, making an
  * empty reassembly queue and hooking it to the argument
  * protocol control block.  The `inp' parameter must have
  * come from the zone allocator set up in tcp_init().
  */
 struct tcpcb *
 tcp_newtcpcb(struct inpcb *inp)
 {
 	struct tcpcb_mem *tm;
 	struct tcpcb *tp;
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
 
 	tm = uma_zalloc(V_tcpcb_zone, M_NOWAIT | M_ZERO);
 	if (tm == NULL)
 		return (NULL);
 	tp = &tm->tcb;
 
 	/* Initialise cc_var struct for this tcpcb. */
 	tp->ccv = &tm->ccv;
 	tp->ccv->type = IPPROTO_TCP;
 	tp->ccv->ccvc.tcp = tp;
 	rw_rlock(&tcp_function_lock);
 	tp->t_fb = tcp_func_set_ptr;
 	refcount_acquire(&tp->t_fb->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
 	/*
 	 * Use the current system default CC algorithm.
 	 */
 	cc_attach(tp, CC_DEFAULT_ALGO());
 
 	/*
 	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
 	 * is called.
 	 */
 	in_pcbref(inp);	/* Reference for tcpcb */
 	tp->t_inpcb = inp;
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
 			cc_detach(tp);
 			if (tp->t_fb->tfb_tcp_fb_fini)
 				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			in_pcbrele_wlocked(inp);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 
 #ifdef TCP_HHOOK
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		in_pcbrele_wlocked(inp);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
 	}
 #endif
 
 #ifdef VIMAGE
 	tp->t_vnet = inp->inp_vnet;
 #endif
 	tp->t_timers = &tm->tt;
 	TAILQ_INIT(&tp->t_segq);
 	tp->t_maxseg =
 #ifdef INET6
 		isipv6 ? V_tcp_v6mssdflt :
 #endif /* INET6 */
 		V_tcp_mssdflt;
 
 	/* Set up our timeouts. */
 	callout_init(&tp->t_timers->tt_rexmt, 1);
 	callout_init(&tp->t_timers->tt_persist, 1);
 	callout_init(&tp->t_timers->tt_keep, 1);
 	callout_init(&tp->t_timers->tt_2msl, 1);
 	callout_init(&tp->t_timers->tt_delack, 1);
 
 	if (V_tcp_do_rfc1323)
 		tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
 
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
 	 * reasonable initial retransmit time.
 	 */
 	tp->t_srtt = TCPTV_SRTTBASE;
 	tp->t_rttvar = ((tcp_rexmit_initial - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
 	tp->t_rttmin = tcp_rexmit_min;
 	tp->t_rxtcur = tcp_rexmit_initial;
 	tp->snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
 	tp->t_rcvtime = ticks;
 	/*
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_ppcb = tp;
 #ifdef TCPPCAP
 	/*
 	 * Init the TCP PCAP queues.
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
 #ifdef TCP_BLACKBOX
 	/* Initialize the per-TCPCB log data. */
 	tcp_log_tcpcbinit(tp);
 #endif
 	tp->t_pacing_rate = -1;
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			in_pcbrele_wlocked(inp);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
 		}
 	}
 #ifdef STATS
 	if (V_tcp_perconn_stats_enable == 1)
 		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
 #endif
 	if (V_tcp_do_lrd)
 		tp->t_flags |= TF_LRD;
 	return (tp);		/* XXX */
 }
 
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 struct tcpcb *
 tcp_drop(struct tcpcb *tp, int errno)
 {
 	struct socket *so = tp->t_inpcb->inp_socket;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
 		tcp_state_change(tp, TCPS_CLOSED);
 		/* Don't use tcp_output() here due to possible recursion. */
 		(void)tcp_output_nodrop(tp);
 		TCPSTAT_INC(tcps_drops);
 	} else
 		TCPSTAT_INC(tcps_conndrops);
 	if (errno == ETIMEDOUT && tp->t_softerror)
 		errno = tp->t_softerror;
 	so->so_error = errno;
 	return (tcp_close(tp));
 }
 
 void
 tcp_discardcb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
 	 * If stopping a timer fails, we schedule a discard function in same
 	 * callout, and the last discard function called will take care of
 	 * deleting the tcpcb.
 	 */
 	tp->t_timers->tt_draincnt = 0;
 	tcp_timer_stop(tp, TT_REXMT);
 	tcp_timer_stop(tp, TT_PERSIST);
 	tcp_timer_stop(tp, TT_KEEP);
 	tcp_timer_stop(tp, TT_2MSL);
 	tcp_timer_stop(tp, TT_DELACK);
 	if (tp->t_fb->tfb_tcp_timer_stop_all) {
 		/*
 		 * Call the stop-all function of the methods,
 		 * this function should call the tcp_timer_stop()
 		 * method with each of the function specific timeouts.
 		 * That stop will be called via the tfb_tcp_timer_stop()
 		 * which should use the async drain function of the
 		 * callout system (see tcp_var.h).
 		 */
 		tp->t_fb->tfb_tcp_timer_stop_all(tp);
 	}
 
 	/* free the reassembly queue, if any */
 	tcp_reass_flush(tp);
 
 #ifdef TCP_OFFLOAD
 	/* Disconnect offload device, if any. */
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_detach(tp);
 #endif
 
 	tcp_free_sackholes(tp);
 
 #ifdef TCPPCAP
 	/* Free the TCP PCAP queues. */
 	tcp_pcap_drain(&(tp->t_inpkts));
 	tcp_pcap_drain(&(tp->t_outpkts));
 #endif
 
 	/* Allow the CC algorithm to clean up after itself. */
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 	CC_DATA(tp) = NULL;
 	/* Detach from the CC algorithm */
 	cc_detach(tp);
 
 #ifdef TCP_HHOOK
 	khelp_destroy_osd(tp->osd);
 #endif
 #ifdef STATS
 	stats_blob_destroy(tp->t_stats);
 #endif
 
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
 	if (tp->t_timers->tt_draincnt == 0) {
 		bool released __diagused;
 
 		released = tcp_freecb(tp);
 		KASSERT(!released, ("%s: inp %p should not have been released "
 		    "here", __func__, inp));
 	}
 }
 
 bool
 tcp_freecb(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 #ifdef INET6
 	bool isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif
 
 	INP_WLOCK_ASSERT(inp);
 	MPASS(tp->t_timers->tt_draincnt == 0);
 
 	/* We own the last reference on tcpcb, let's free it. */
 #ifdef TCP_BLACKBOX
 	tcp_log_tcpcbfini(tp);
 #endif
 	TCPSTATES_DEC(tp->t_state);
 	if (tp->t_fb->tfb_tcp_fb_fini)
 		(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 
 	/*
 	 * If we got enough samples through the srtt filter,
 	 * save the rtt and rttvar in the routing entry.
 	 * 'Enough' is arbitrarily defined as 4 rtt samples.
 	 * 4 samples is enough for the srtt filter to converge
 	 * to within enough % of the correct value; fewer samples
 	 * and we could save a bogus rtt. The danger is not high
 	 * as tcp quickly recovers from everything.
 	 * XXX: Works very well but needs some more statistics!
 	 *
 	 * XXXRRS: Updating must be after the stack fini() since
 	 * that may be converting some internal representation of
 	 * say srtt etc into the general one used by other stacks.
 	 * Lets also at least protect against the so being NULL
 	 * as RW stated below.
 	 */
 	if ((tp->t_rttupdated >= 4) && (so != NULL)) {
 		struct hc_metrics_lite metrics;
 		uint32_t ssthresh;
 
 		bzero(&metrics, sizeof(metrics));
 		/*
 		 * Update the ssthresh always when the conditions below
 		 * are satisfied. This gives us better new start value
 		 * for the congestion avoidance for new connections.
 		 * ssthresh is only set if packet loss occurred on a session.
 		 *
 		 * XXXRW: 'so' may be NULL here, and/or socket buffer may be
 		 * being torn down.  Ideally this code would not use 'so'.
 		 */
 		ssthresh = tp->snd_ssthresh;
 		if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
 			/*
 			 * convert the limit from user data bytes to
 			 * packets then to packet data bytes.
 			 */
 			ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
 			if (ssthresh < 2)
 				ssthresh = 2;
 			ssthresh *= (tp->t_maxseg +
 #ifdef INET6
 			    (isipv6 ? sizeof (struct ip6_hdr) +
 			    sizeof (struct tcphdr) :
 #endif
 			    sizeof (struct tcpiphdr)
 #ifdef INET6
 			    )
 #endif
 			    );
 		} else
 			ssthresh = 0;
 		metrics.rmx_ssthresh = ssthresh;
 
 		metrics.rmx_rtt = tp->t_srtt;
 		metrics.rmx_rttvar = tp->t_rttvar;
 		metrics.rmx_cwnd = tp->snd_cwnd;
 		metrics.rmx_sendpipe = 0;
 		metrics.rmx_recvpipe = 0;
 
 		tcp_hc_update(&inp->inp_inc, &metrics);
 	}
 
 	refcount_release(&tp->t_fb->tfb_refcnt);
 	uma_zfree(V_tcpcb_zone, tp);
 
 	return (in_pcbrele_wlocked(inp));
 }
 
 /*
  * Attempt to close a TCP control block, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 struct tcpcb *
 tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
 #endif
 	/*
 	 * This releases the TFO pending counter resource for TFO listen
 	 * sockets as well as passively-created TFO sockets that transition
 	 * from SYN_RECEIVED to CLOSED.
 	 */
 	if (tp->t_tfo_pending) {
 		tcp_fastopen_decrement_counter(tp->t_tfo_pending);
 		tp->t_tfo_pending = NULL;
 	}
 #ifdef TCPHPTS
 	tcp_hpts_remove(inp);
 #endif
 	in_pcbdrop(inp);
 	TCPSTAT_INC(tcps_closed);
 	if (tp->t_state != TCPS_CLOSED)
 		tcp_state_change(tp, TCPS_CLOSED);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
 	soisdisconnected(so);
 	if (inp->inp_flags & INP_SOCKREF) {
 		inp->inp_flags &= ~INP_SOCKREF;
 		INP_WUNLOCK(inp);
 		sorele(so);
 		return (NULL);
 	}
 	return (tp);
 }
 
 /*
  * Notify a tcp user of an asynchronous error;
  * store error as soft error, but wake up user
  * (for now, won't do anything until can select for soft error).
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 static struct inpcb *
 tcp_notify(struct inpcb *inp, int error)
 {
 	struct tcpcb *tp;
 
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 * If connection hasn't completed, has retransmitted several times,
 	 * and receives a second error, give up now.  This is better
 	 * than waiting a long time to establish a connection that
 	 * can never complete.
 	 */
 	if (tp->t_state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN)) {
 		if (inp->inp_route.ro_nh) {
 			NH_FREE(inp->inp_route.ro_nh);
 			inp->inp_route.ro_nh = (struct nhop_object *)NULL;
 		}
 		return (inp);
 	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
 	    tp->t_softerror) {
 		tp = tcp_drop(tp, error);
 		if (tp != NULL)
 			return (inp);
 		else
 			return (NULL);
 	} else {
 		tp->t_softerror = error;
 		return (inp);
 	}
 #if 0
 	wakeup( &so->so_timeo);
 	sorwakeup(so);
 	sowwakeup(so);
 #endif
 }
 
 static int
 tcp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	if (req->oldptr == NULL) {
 		int n;
 
 		n = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_tcbinfo.ipi_count +
 	    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 	xig.xig_gen = V_tcbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	error = syncache_pcblist(req);
 	if (error)
 		return (error);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen) {
 			int crerr;
 
 			/*
 			 * XXX: This use of cr_cansee(), introduced with
 			 * TCP state changes, is not quite right, but for
 			 * now, better than nothing.
 			 */
 			if (inp->inp_flags & INP_TIMEWAIT) {
 				if (intotw(inp) != NULL)
 					crerr = cr_cansee(req->td->td_ucred,
 					    intotw(inp)->tw_cred);
 				else
 					crerr = EINVAL;	/* Skip this inp. */
 			} else
 				crerr = cr_canseeinpcb(req->td->td_ucred, inp);
 			if (crerr == 0) {
 				struct xtcpcb xt;
 
 				tcp_inptoxtp(inp, &xt);
 				error = SYSCTL_OUT(req, &xt, sizeof xt);
 				if (error) {
 					INP_RUNLOCK(inp);
 					break;
 				} else
 					continue;
 			}
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = V_tcbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_tcbinfo.ipi_count +
 		    counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]);
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_NEEDGIANT,
     NULL, 0, tcp_pcblist, "S,xtcpcb",
     "List of active TCP connections");
 
 #ifdef INET
 static int
 tcp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	NET_EPOCH_ENTER(et);
 	inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp_getcred, "S,xucred",
     "Get the xucred of a TCP connection");
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct epoch_tracker et;
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct inpcb *inp;
 	int error;
 #ifdef INET
 	int mapped = 0;
 #endif
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	if (IN6_IS_ADDR_V4MAPPED(&addrs[0].sin6_addr)) {
 #ifdef INET
 		if (IN6_IS_ADDR_V4MAPPED(&addrs[1].sin6_addr))
 			mapped = 1;
 		else
 #endif
 			return (EINVAL);
 	}
 
 	NET_EPOCH_ENTER(et);
 #ifdef INET
 	if (mapped == 1)
 		inp = in_pcblookup(&V_tcbinfo,
 			*(struct in_addr *)&addrs[1].sin6_addr.s6_addr[12],
 			addrs[1].sin6_port,
 			*(struct in_addr *)&addrs[0].sin6_addr.s6_addr[12],
 			addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL);
 	else
 #endif
 		inp = in6_pcblookup(&V_tcbinfo,
 			&addrs[1].sin6_addr, addrs[1].sin6_port,
 			&addrs[0].sin6_addr, addrs[0].sin6_port,
 			INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_tcp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_NEEDGIANT,
     0, 0, tcp6_getcred, "S,xucred",
     "Get the xucred of a TCP6 connection");
 #endif /* INET6 */
 
 #ifdef INET
 /* Path MTU to try next when a fragmentation-needed message is received. */
 static inline int
 tcp_next_pmtu(const struct icmp *icp, const struct ip *ip)
 {
 	int mtu = ntohs(icp->icmp_nextmtu);
 
 	/* If no alternative MTU was proposed, try the next smaller one. */
 	if (!mtu)
 		mtu = ip_next_mtu(ntohs(ip->ip_len), 1);
 	if (mtu < V_tcp_minmss + sizeof(struct tcpiphdr))
 		mtu = V_tcp_minmss + sizeof(struct tcpiphdr);
 
 	return (mtu);
 }
 
 static void
 tcp_ctlinput_with_port(int cmd, struct sockaddr *sa, void *vip, uint16_t port)
 {
 	struct ip *ip = vip;
 	struct tcphdr *th;
 	struct in_addr faddr;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct icmp *icp;
 	struct in_conninfo inc;
 	tcp_seq icmp_tcp_seq;
 	int mtu;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
 		cmd == PRC_TIMXCEED_INTRANS) && ip)
 		notify = tcp_drop_syn_sent;
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 
 	if (ip == NULL) {
 		in_pcbnotifyall(&V_tcbinfo, faddr, inetctlerrmap[cmd], notify);
 		return;
 	}
 
 	icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip));
 	th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 	inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src,
 	    th->th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
 		/* signal EHOSTDOWN, as it flushes the cached route */
 		inp = (*notify)(inp, EHOSTDOWN);
 		goto out;
 	}
 	icmp_tcp_seq = th->th_seq;
 	if (inp != NULL)  {
 		if (!(inp->inp_flags & INP_TIMEWAIT) &&
 		    !(inp->inp_flags & INP_DROPPED) &&
 		    !(inp->inp_socket == NULL)) {
 			tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) {
 				/*
 				 * MTU discovery for offloaded connections.  Let
 				 * the TOE driver verify seq# and process it.
 				 */
 				mtu = tcp_next_pmtu(icp, ip);
 				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 				goto out;
 			}
 #endif
 			if (tp->t_port != port) {
 				goto out;
 			}
 			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 				if (cmd == PRC_MSGSIZE) {
 					/*
 					 * MTU discovery: we got a needfrag and
 					 * will potentially try a lower MTU.
 					 */
 					mtu = tcp_next_pmtu(icp, ip);
 
 					/*
 					 * Only process the offered MTU if it
 					 * is smaller than the current one.
 					 */
 					if (mtu < tp->t_maxseg +
 					    sizeof(struct tcpiphdr)) {
 						bzero(&inc, sizeof(inc));
 						inc.inc_faddr = faddr;
 						inc.inc_fibnum =
 						    inp->inp_inc.inc_fibnum;
 						tcp_hc_updatemtu(&inc, mtu);
 						inp = tcp_mtudisc(inp, mtu);
 					}
 				} else
 					inp = (*notify)(inp,
 					    inetctlerrmap[cmd]);
 			}
 		}
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_fport = th->th_dport;
 		inc.inc_lport = th->th_sport;
 		inc.inc_faddr = faddr;
 		inc.inc_laddr = ip->ip_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 void
 tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 	tcp_ctlinput_with_port(cmd, sa, vip, htons(0));
 }
 
 void
 tcp_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *vip, void *unused)
 {
 	/* Its a tunneled TCP over UDP icmp */
 	struct ip *outer_ip, *inner_ip;
 	struct icmp *icmp;
 	struct udphdr *udp;
 	struct tcphdr *th, ttemp;
 	int i_hlen, o_len;
 	uint16_t port;
 
 	inner_ip = (struct ip *)vip;
 	icmp = (struct icmp *)((caddr_t)inner_ip -
 	    (sizeof(struct icmp) - sizeof(struct ip)));
 	outer_ip = (struct ip *)((caddr_t)icmp - sizeof(struct ip));
 	i_hlen = inner_ip->ip_hl << 2;
 	o_len = ntohs(outer_ip->ip_len);
 	if (o_len <
 	    (sizeof(struct ip) + 8 + i_hlen + sizeof(struct udphdr) + offsetof(struct tcphdr, th_ack))) {
 		/* Not enough data present */
 		return;
 	}
 	/* Ok lets strip out the inner udphdr header by copying up on top of it the tcp hdr */
 	udp = (struct udphdr *)(((caddr_t)inner_ip) + i_hlen);
 	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
 		return;
 	}
 	port = udp->uh_dport;
 	th = (struct tcphdr *)(udp + 1);
 	memcpy(&ttemp, th, sizeof(struct tcphdr));
 	memcpy(udp, &ttemp, sizeof(struct tcphdr));
 	/* Now adjust down the size of the outer IP header */
 	o_len -= sizeof(struct udphdr);
 	outer_ip->ip_len = htons(o_len);
 	/* Now call in to the normal handling code */
 	tcp_ctlinput_with_port(cmd, sa, vip, port);
 }
 #endif /* INET */
 
 #ifdef INET6
 static inline int
 tcp6_next_pmtu(const struct icmp6_hdr *icmp6)
 {
 	int mtu = ntohl(icmp6->icmp6_mtu);
 
 	/*
 	 * If no alternative MTU was proposed, or the proposed MTU was too
 	 * small, set to the min.
 	 */
 	if (mtu < IPV6_MMTU)
 		mtu = IPV6_MMTU - 8;	/* XXXNP: what is the adjustment for? */
 	return (mtu);
 }
 
 static void
 tcp6_ctlinput_with_port(int cmd, struct sockaddr *sa, void *d, uint16_t port)
 {
 	struct in6_addr *dst;
 	struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct icmp6_hdr *icmp6;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	struct in_conninfo inc;
 	struct tcp_ports {
 		uint16_t th_sport;
 		uint16_t th_dport;
 	} t_ports;
 	tcp_seq icmp_tcp_seq;
 	unsigned int mtu;
 	unsigned int off;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		icmp6 = ip6cp->ip6c_icmp6;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		sa6_src = ip6cp->ip6c_src;
 		dst = ip6cp->ip6c_finaldst;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		off = 0;	/* fool gcc */
 		sa6_src = &sa6_any;
 		dst = NULL;
 	}
 
 	if (cmd == PRC_MSGSIZE)
 		notify = tcp_mtudisc_notify;
 	else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
 		cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
 		cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL)
 		notify = tcp_drop_syn_sent;
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 * XXX: We never get this from ICMP, otherwise it makes an
 	 * excellent DoS attack on machines with many connections.
 	 */
 	else if (cmd == PRC_HOSTDEAD)
 		ip6 = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0)
 		return;
 
 	if (ip6 == NULL) {
 		in6_pcbnotify(&V_tcbinfo, sa, 0,
 			      (const struct sockaddr *)sa6_src,
 			      0, cmd, NULL, notify);
 		return;
 	}
 
 	/* Check if we can safely get the ports from the tcp hdr */
 	if (m == NULL ||
 	    (m->m_pkthdr.len <
 		(int32_t) (off + sizeof(struct tcp_ports)))) {
 		return;
 	}
 	bzero(&t_ports, sizeof(struct tcp_ports));
 	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
 	inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport,
 	    &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL);
 	if (inp != NULL && PRC_IS_REDIRECT(cmd)) {
 		/* signal EHOSTDOWN, as it flushes the cached route */
 		inp = (*notify)(inp, EHOSTDOWN);
 		goto out;
 	}
 	off += sizeof(struct tcp_ports);
 	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
 		goto out;
 	}
 	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
 	if (inp != NULL)  {
 		if (!(inp->inp_flags & INP_TIMEWAIT) &&
 		    !(inp->inp_flags & INP_DROPPED) &&
 		    !(inp->inp_socket == NULL)) {
 			tp = intotcpcb(inp);
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE && cmd == PRC_MSGSIZE) {
 				/* MTU discovery for offloaded connections. */
 				mtu = tcp6_next_pmtu(icmp6);
 				tcp_offload_pmtu_update(tp, icmp_tcp_seq, mtu);
 				goto out;
 			}
 #endif
 			if (tp->t_port != port) {
 				goto out;
 			}
 			if (SEQ_GEQ(ntohl(icmp_tcp_seq), tp->snd_una) &&
 			    SEQ_LT(ntohl(icmp_tcp_seq), tp->snd_max)) {
 				if (cmd == PRC_MSGSIZE) {
 					/*
 					 * MTU discovery:
 					 * If we got a needfrag set the MTU
 					 * in the route to the suggested new
 					 * value (if given) and then notify.
 					 */
 					mtu = tcp6_next_pmtu(icmp6);
 
 					bzero(&inc, sizeof(inc));
 					inc.inc_fibnum = M_GETFIB(m);
 					inc.inc_flags |= INC_ISIPV6;
 					inc.inc6_faddr = *dst;
 					if (in6_setscope(&inc.inc6_faddr,
 						m->m_pkthdr.rcvif, NULL))
 						goto out;
 					/*
 					 * Only process the offered MTU if it
 					 * is smaller than the current one.
 					 */
 					if (mtu < tp->t_maxseg +
 					    sizeof (struct tcphdr) +
 					    sizeof (struct ip6_hdr)) {
 						tcp_hc_updatemtu(&inc, mtu);
 						tcp_mtudisc(inp, mtu);
 						ICMP6STAT_INC(icp6s_pmtuchg);
 					}
 				} else
 					inp = (*notify)(inp,
 					    inet6ctlerrmap[cmd]);
 			}
 		}
 	} else {
 		bzero(&inc, sizeof(inc));
 		inc.inc_fibnum = M_GETFIB(m);
 		inc.inc_flags |= INC_ISIPV6;
 		inc.inc_fport = t_ports.th_dport;
 		inc.inc_lport = t_ports.th_sport;
 		inc.inc6_faddr = *dst;
 		inc.inc6_laddr = ip6->ip6_src;
 		syncache_unreach(&inc, icmp_tcp_seq, port);
 	}
 out:
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 }
 
 void
 tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	tcp6_ctlinput_with_port(cmd, sa, d, htons(0));
 }
 
 void
 tcp6_ctlinput_viaudp(int cmd, struct sockaddr *sa, void *d, void *unused)
 {
 	struct ip6ctlparam *ip6cp;
 	struct mbuf *m;
 	struct udphdr *udp;
 	uint16_t port;
 
 	ip6cp = (struct ip6ctlparam *)d;
 	m = m_pulldown(ip6cp->ip6c_m, ip6cp->ip6c_off, sizeof(struct udphdr), NULL);
 	if (m == NULL) {
 		return;
 	}
 	udp = mtod(m, struct udphdr *);
 	if (ntohs(udp->uh_sport) != V_tcp_udp_tunneling_port) {
 		return;
 	}
 	port = udp->uh_dport;
 	m_adj(m, sizeof(struct udphdr));
 	if ((m->m_flags & M_PKTHDR) == 0) {
 		ip6cp->ip6c_m->m_pkthdr.len -= sizeof(struct udphdr);
 	}
 	/* Now call in to the normal handling code */
 	tcp6_ctlinput_with_port(cmd, sa, d, port);
 }
 
 #endif /* INET6 */
 
 static uint32_t
 tcp_keyed_hash(struct in_conninfo *inc, u_char *key, u_int len)
 {
 	SIPHASH_CTX ctx;
 	uint32_t hash[2];
 
 	KASSERT(len >= SIPHASH_KEY_LENGTH,
 	    ("%s: keylen %u too short ", __func__, len));
 	SipHash24_Init(&ctx);
 	SipHash_SetKey(&ctx, (uint8_t *)key);
 	SipHash_Update(&ctx, &inc->inc_fport, sizeof(uint16_t));
 	SipHash_Update(&ctx, &inc->inc_lport, sizeof(uint16_t));
 	switch (inc->inc_flags & INC_ISIPV6) {
 #ifdef INET
 	case 0:
 		SipHash_Update(&ctx, &inc->inc_faddr, sizeof(struct in_addr));
 		SipHash_Update(&ctx, &inc->inc_laddr, sizeof(struct in_addr));
 		break;
 #endif
 #ifdef INET6
 	case INC_ISIPV6:
 		SipHash_Update(&ctx, &inc->inc6_faddr, sizeof(struct in6_addr));
 		SipHash_Update(&ctx, &inc->inc6_laddr, sizeof(struct in6_addr));
 		break;
 #endif
 	}
 	SipHash_Final((uint8_t *)hash, &ctx);
 
 	return (hash[0] ^ hash[1]);
 }
 
 uint32_t
 tcp_new_ts_offset(struct in_conninfo *inc)
 {
 	struct in_conninfo inc_store, *local_inc;
 
 	if (!V_tcp_ts_offset_per_conn) {
 		memcpy(&inc_store, inc, sizeof(struct in_conninfo));
 		inc_store.inc_lport = 0;
 		inc_store.inc_fport = 0;
 		local_inc = &inc_store;
 	} else {
 		local_inc = inc;
 	}
 	return (tcp_keyed_hash(local_inc, V_ts_offset_secret,
 	    sizeof(V_ts_offset_secret)));
 }
 
 /*
  * Following is where TCP initial sequence number generation occurs.
  *
  * There are two places where we must use initial sequence numbers:
  * 1.  In SYN-ACK packets.
  * 2.  In SYN packets.
  *
  * All ISNs for SYN-ACK packets are generated by the syncache.  See
  * tcp_syncache.c for details.
  *
  * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
  * depends on this property.  In addition, these ISNs should be
  * unguessable so as to prevent connection hijacking.  To satisfy
  * the requirements of this situation, the algorithm outlined in
  * RFC 1948 is used, with only small modifications.
  *
  * Implementation details:
  *
  * Time is based off the system timer, and is corrected so that it
  * increases by one megabyte per second.  This allows for proper
  * recycling on high speed LANs while still leaving over an hour
  * before rollover.
  *
  * As reading the *exact* system time is too expensive to be done
  * whenever setting up a TCP connection, we increment the time
  * offset in two ways.  First, a small random positive increment
  * is added to isn_offset for each connection that is set up.
  * Second, the function tcp_isn_tick fires once per clock tick
  * and increments isn_offset as necessary so that sequence numbers
  * are incremented at approximately ISN_BYTES_PER_SECOND.  The
  * random positive increments serve only to ensure that the same
  * exact sequence number is never sent out twice (as could otherwise
  * happen when a port is recycled in less than the system tick
  * interval.)
  *
  * net.inet.tcp.isn_reseed_interval controls the number of seconds
  * between seeding of isn_secret.  This is normally set to zero,
  * as reseeding should not be necessary.
  *
  * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
  * isn_offset_old, and isn_ctx is performed using the ISN lock.  In
  * general, this means holding an exclusive (write) lock.
  */
 
 #define ISN_BYTES_PER_SECOND 1048576
 #define ISN_STATIC_INCREMENT 4096
 #define ISN_RANDOM_INCREMENT (4096 - 1)
 #define ISN_SECRET_LENGTH    SIPHASH_KEY_LENGTH
 
 VNET_DEFINE_STATIC(u_char, isn_secret[ISN_SECRET_LENGTH]);
 VNET_DEFINE_STATIC(int, isn_last);
 VNET_DEFINE_STATIC(int, isn_last_reseed);
 VNET_DEFINE_STATIC(u_int32_t, isn_offset);
 VNET_DEFINE_STATIC(u_int32_t, isn_offset_old);
 
 #define	V_isn_secret			VNET(isn_secret)
 #define	V_isn_last			VNET(isn_last)
 #define	V_isn_last_reseed		VNET(isn_last_reseed)
 #define	V_isn_offset			VNET(isn_offset)
 #define	V_isn_offset_old		VNET(isn_offset_old)
 
 tcp_seq
 tcp_new_isn(struct in_conninfo *inc)
 {
 	tcp_seq new_isn;
 	u_int32_t projected_offset;
 
 	ISN_LOCK();
 	/* Seed if this is the first use, reseed if requested. */
 	if ((V_isn_last_reseed == 0) || ((V_tcp_isn_reseed_interval > 0) &&
 	     (((u_int)V_isn_last_reseed + (u_int)V_tcp_isn_reseed_interval*hz)
 		< (u_int)ticks))) {
 		arc4rand(&V_isn_secret, sizeof(V_isn_secret), 0);
 		V_isn_last_reseed = ticks;
 	}
 
 	/* Compute the hash and return the ISN. */
 	new_isn = (tcp_seq)tcp_keyed_hash(inc, V_isn_secret,
 	    sizeof(V_isn_secret));
 	V_isn_offset += ISN_STATIC_INCREMENT +
 		(arc4random() & ISN_RANDOM_INCREMENT);
 	if (ticks != V_isn_last) {
 		projected_offset = V_isn_offset_old +
 		    ISN_BYTES_PER_SECOND / hz * (ticks - V_isn_last);
 		if (SEQ_GT(projected_offset, V_isn_offset))
 			V_isn_offset = projected_offset;
 		V_isn_offset_old = V_isn_offset;
 		V_isn_last = ticks;
 	}
 	new_isn += V_isn_offset;
 	ISN_UNLOCK();
 	return (new_isn);
 }
 
 /*
  * When a specific ICMP unreachable message is received and the
  * connection state is SYN-SENT, drop the connection.  This behavior
  * is controlled by the icmp_may_rst sysctl.
  */
 struct inpcb *
 tcp_drop_syn_sent(struct inpcb *inp, int errno)
 {
 	struct tcpcb *tp;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	if (tp->t_state != TCPS_SYN_SENT)
 		return (inp);
 
 	if (IS_FASTOPEN(tp->t_flags))
 		tcp_fastopen_disable_path(tp);
 
 	tp = tcp_drop(tp, errno);
 	if (tp != NULL)
 		return (inp);
 	else
 		return (NULL);
 }
 
 /*
  * When `need fragmentation' ICMP is received, update our idea of the MSS
  * based on the new value. Also nudge TCP to send something, since we
  * know the packet we just sent was dropped.
  * This duplicates some code in the tcp_mss() function in tcp_input.c.
  */
 static struct inpcb *
 tcp_mtudisc_notify(struct inpcb *inp, int error)
 {
 
 	return (tcp_mtudisc(inp, -1));
 }
 
 static struct inpcb *
 tcp_mtudisc(struct inpcb *inp, int mtuoffer)
 {
 	struct tcpcb *tp;
 	struct socket *so;
 
 	INP_WLOCK_ASSERT(inp);
 	if ((inp->inp_flags & INP_TIMEWAIT) ||
 	    (inp->inp_flags & INP_DROPPED))
 		return (inp);
 
 	tp = intotcpcb(inp);
 	KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL"));
 
 	tcp_mss_update(tp, -1, mtuoffer, NULL, NULL);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_snd);
 	/* If the mss is larger than the socket buffer, decrease the mss. */
 	if (so->so_snd.sb_hiwat < tp->t_maxseg)
 		tp->t_maxseg = so->so_snd.sb_hiwat;
 	SOCKBUF_UNLOCK(&so->so_snd);
 
 	TCPSTAT_INC(tcps_mturesent);
 	tp->t_rtttime = 0;
 	tp->snd_nxt = tp->snd_una;
 	tcp_free_sackholes(tp);
 	tp->snd_recover = tp->snd_max;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		EXIT_FASTRECOVERY(tp->t_flags);
 	if (tp->t_fb->tfb_tcp_mtu_chg != NULL) {
 		/*
 		 * Conceptually the snd_nxt setting
 		 * and freeing sack holes should
 		 * be done by the default stacks
 		 * own tfb_tcp_mtu_chg().
 		 */
 		tp->t_fb->tfb_tcp_mtu_chg(tp);
 	}
 	if (tcp_output(tp) < 0)
 		return (NULL);
 	else
 		return (inp);
 }
 
 #ifdef INET
 /*
  * Look-up the routing entry to the peer of this inpcb.  If no route
  * is found and it cannot be allocated, then return 0.  This routine
  * is called by TCP routines that access the rmx structure and by
  * tcp_mss_update to get the peer/interface MTU.
  */
 uint32_t
 tcp_maxmtu(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nh;
 	struct ifnet *ifp;
 	uint32_t maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
 
 	if (inc->inc_faddr.s_addr != INADDR_ANY) {
 		nh = fib4_lookup(inc->inc_fibnum, inc->inc_faddr, 0, NHR_NONE, 0);
 		if (nh == NULL)
 			return (0);
 
 		ifp = nh->nh_ifp;
 		maxmtu = nh->nh_mtu;
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO4 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 	}
 	return (maxmtu);
 }
 #endif /* INET */
 
 #ifdef INET6
 uint32_t
 tcp_maxmtu6(struct in_conninfo *inc, struct tcp_ifcap *cap)
 {
 	struct nhop_object *nh;
 	struct in6_addr dst6;
 	uint32_t scopeid;
 	struct ifnet *ifp;
 	uint32_t maxmtu = 0;
 
 	KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
 
 	if (inc->inc_flags & INC_IPV6MINMTU)
 		return (IPV6_MMTU);
 
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
 		in6_splitscope(&inc->inc6_faddr, &dst6, &scopeid);
 		nh = fib6_lookup(inc->inc_fibnum, &dst6, scopeid, NHR_NONE, 0);
 		if (nh == NULL)
 			return (0);
 
 		ifp = nh->nh_ifp;
 		maxmtu = nh->nh_mtu;
 
 		/* Report additional interface capabilities. */
 		if (cap != NULL) {
 			if (ifp->if_capenable & IFCAP_TSO6 &&
 			    ifp->if_hwassist & CSUM_TSO) {
 				cap->ifcap |= CSUM_TSO;
 				cap->tsomax = ifp->if_hw_tsomax;
 				cap->tsomaxsegcount = ifp->if_hw_tsomaxsegcount;
 				cap->tsomaxsegsize = ifp->if_hw_tsomaxsegsize;
 			}
 		}
 	}
 
 	return (maxmtu);
 }
 
 /*
  * Handle setsockopt(IPV6_USE_MIN_MTU) by a TCP stack.
  *
  * XXXGL: we are updating inpcb here with INC_IPV6MINMTU flag.
  * The right place to do that is ip6_setpktopt() that has just been
  * executed.  By the way it just filled ip6po_minmtu for us.
  */
 void
 tcp6_use_min_mtu(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 
 	INP_WLOCK_ASSERT(inp);
 	/*
 	 * In case of the IPV6_USE_MIN_MTU socket
 	 * option, the INC_IPV6MINMTU flag to announce
 	 * a corresponding MSS during the initial
 	 * handshake.  If the TCP connection is not in
 	 * the front states, just reduce the MSS being
 	 * used.  This avoids the sending of TCP
 	 * segments which will be fragmented at the
 	 * IPv6 layer.
 	 */
 	inp->inp_inc.inc_flags |= INC_IPV6MINMTU;
 	if ((tp->t_state >= TCPS_SYN_SENT) &&
 	    (inp->inp_inc.inc_flags & INC_ISIPV6)) {
 		struct ip6_pktopts *opt;
 
 		opt = inp->in6p_outputopts;
 		if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL &&
 		    tp->t_maxseg > TCP6_MSS)
 			tp->t_maxseg = TCP6_MSS;
 	}
 }
 #endif /* INET6 */
 
 /*
  * Calculate effective SMSS per RFC5681 definition for a given TCP
  * connection at its current state, taking into account SACK and etc.
  */
 u_int
 tcp_maxseg(const struct tcpcb *tp)
 {
 	u_int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We might make mistakes with padding here in some edge cases,
 	 * but this is harmless, since result of tcp_maxseg() is used
 	 * only in cwnd and ssthresh estimations.
 	 */
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0) {
 			optlen += TCPOLEN_SACKHDR;
 			optlen += tp->rcv_numsacks * TCPOLEN_SACK;
 			optlen = PADTCPOLEN(optlen);
 		}
 	} else {
 		if (tp->t_flags & TF_REQ_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = PADTCPOLEN(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PADTCPOLEN(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PADTCPOLEN(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PADTCPOLEN(TCPOLEN_SACK_PERMITTED);
 	}
 #undef PAD
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 u_int
 tcp_fixed_maxseg(const struct tcpcb *tp)
 {
 	int optlen;
 
 	if (tp->t_flags & TF_NOOPT)
 		return (tp->t_maxseg);
 
 	/*
 	 * Here we have a simplified code from tcp_addoptions(),
 	 * without a proper loop, and having most of paddings hardcoded.
 	 * We only consider fixed options that we would send every
 	 * time I.e. SACK is not considered. This is important
 	 * for cc modules to figure out what the modulo of the
 	 * cwnd should be.
 	 */
 #define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)
 	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 		if (tp->t_flags & TF_RCVD_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = 0;
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 	} else {
 		if (tp->t_flags & TF_REQ_TSTMP)
 			optlen = TCPOLEN_TSTAMP_APPA;
 		else
 			optlen = PAD(TCPOLEN_MAXSEG);
 		if (tp->t_flags & TF_REQ_SCALE)
 			optlen += PAD(TCPOLEN_WINDOW);
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		if (tp->t_flags & TF_SIGNATURE)
 			optlen += PAD(TCPOLEN_SIGNATURE);
 #endif
 		if (tp->t_flags & TF_SACK_PERMIT)
 			optlen += PAD(TCPOLEN_SACK_PERMITTED);
 	}
 #undef PAD
 	optlen = min(optlen, TCP_MAXOLEN);
 	return (tp->t_maxseg - optlen);
 }
 
 
 
 static int
 sysctl_drop(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	struct tcptw *tw;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	NET_EPOCH_ENTER(et);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	if (inp != NULL) {
 		if (inp->inp_flags & INP_TIMEWAIT) {
 			/*
 			 * XXXRW: There currently exists a state where an
 			 * inpcb is present, but its timewait state has been
 			 * discarded.  For now, don't allow dropping of this
 			 * type of inpcb.
 			 */
 			tw = intotw(inp);
 			if (tw != NULL)
 				tcp_twclose(tw, 0);
 			else
 				INP_WUNLOCK(inp);
 		} else if ((inp->inp_flags & INP_DROPPED) == 0 &&
 		    !SOLISTENING(inp->inp_socket)) {
 			tp = intotcpcb(inp);
 			tp = tcp_drop(tp, ECONNABORTED);
 			if (tp != NULL)
 				INP_WUNLOCK(inp);
 		} else
 			INP_WUNLOCK(inp);
 	} else
 		error = ESRCH;
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_DROP, drop,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_drop, "",
     "Drop TCP connection");
 
 static int
 tcp_sysctl_setsockopt(SYSCTL_HANDLER_ARGS)
 {
 	return (sysctl_setsockopt(oidp, arg1, arg2, req, &V_tcbinfo,
 	    &tcp_ctloutput_set));
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, setsockopt,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_MPSAFE, NULL, 0, tcp_sysctl_setsockopt, "",
     "Set socket option for TCP endpoint");
 
 #ifdef KERN_TLS
 static int
 sysctl_switch_tls(SYSCTL_HANDLER_ARGS)
 {
 	/* addrs[0] is a foreign socket, addrs[1] is a local one. */
 	struct sockaddr_storage addrs[2];
 	struct inpcb *inp;
 #ifdef INET
 	struct sockaddr_in *fin = NULL, *lin = NULL;
 #endif
 	struct epoch_tracker et;
 #ifdef INET6
 	struct sockaddr_in6 *fin6, *lin6;
 #endif
 	int error;
 
 	inp = NULL;
 #ifdef INET6
 	fin6 = lin6 = NULL;
 #endif
 	error = 0;
 
 	if (req->oldptr != NULL || req->oldlen != 0)
 		return (EINVAL);
 	if (req->newptr == NULL)
 		return (EPERM);
 	if (req->newlen < sizeof(addrs))
 		return (ENOMEM);
 	error = SYSCTL_IN(req, &addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		fin6 = (struct sockaddr_in6 *)&addrs[0];
 		lin6 = (struct sockaddr_in6 *)&addrs[1];
 		if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
 		    lin6->sin6_len != sizeof(struct sockaddr_in6))
 			return (EINVAL);
 		if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
 			if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
 				return (EINVAL);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
 			in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
 #ifdef INET
 			fin = (struct sockaddr_in *)&addrs[0];
 			lin = (struct sockaddr_in *)&addrs[1];
 #endif
 			break;
 		}
 		error = sa6_embedscope(fin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		error = sa6_embedscope(lin6, V_ip6_use_defzone);
 		if (error)
 			return (error);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		fin = (struct sockaddr_in *)&addrs[0];
 		lin = (struct sockaddr_in *)&addrs[1];
 		if (fin->sin_len != sizeof(struct sockaddr_in) ||
 		    lin->sin_len != sizeof(struct sockaddr_in))
 			return (EINVAL);
 		break;
 #endif
 	default:
 		return (EINVAL);
 	}
 	NET_EPOCH_ENTER(et);
 	switch (addrs[0].ss_family) {
 #ifdef INET6
 	case AF_INET6:
 		inp = in6_pcblookup(&V_tcbinfo, &fin6->sin6_addr,
 		    fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
 		    INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 #ifdef INET
 	case AF_INET:
 		inp = in_pcblookup(&V_tcbinfo, fin->sin_addr, fin->sin_port,
 		    lin->sin_addr, lin->sin_port, INPLOOKUP_WLOCKPCB, NULL);
 		break;
 #endif
 	}
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 ||
 		    inp->inp_socket == NULL) {
 			error = ECONNRESET;
 			INP_WUNLOCK(inp);
 		} else {
 			struct socket *so;
 
 			so = inp->inp_socket;
 			soref(so);
 			error = ktls_set_tx_mode(so,
 			    arg2 == 0 ? TCP_TLS_MODE_SW : TCP_TLS_MODE_IFNET);
 			INP_WUNLOCK(inp);
 			sorele(so);
 		}
 	} else
 		error = ESRCH;
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_sw_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 0, sysctl_switch_tls, "",
     "Switch TCP connection to SW TLS");
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, switch_to_ifnet_tls,
     CTLFLAG_VNET | CTLTYPE_STRUCT | CTLFLAG_WR | CTLFLAG_SKIP |
     CTLFLAG_NEEDGIANT, NULL, 1, sysctl_switch_tls, "",
     "Switch TCP connection to ifnet TLS");
 #endif
 
 /*
  * Generate a standardized TCP log line for use throughout the
  * tcp subsystem.  Memory allocation is done with M_NOWAIT to
  * allow use in the interrupt context.
  *
  * NB: The caller MUST free(s, M_TCPLOG) the returned string.
  * NB: The function may return NULL if memory allocation failed.
  *
  * Due to header inclusion and ordering limitations the struct ip
  * and ip6_hdr pointers have to be passed as void pointers.
  */
 char *
 tcp_log_vain(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (V_tcp_log_in_vain == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 char *
 tcp_log_addrs(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 
 	/* Is logging enabled? */
 	if (tcp_log_debug == 0)
 		return (NULL);
 
 	return (tcp_log_addr(inc, th, ip4hdr, ip6hdr));
 }
 
 static char *
 tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th, const void *ip4hdr,
     const void *ip6hdr)
 {
 	char *s, *sp;
 	size_t size;
 #ifdef INET
 	const struct ip *ip = (const struct ip *)ip4hdr;
 #endif
 #ifdef INET6
 	const struct ip6_hdr *ip6 = (const struct ip6_hdr *)ip6hdr;
 #endif /* INET6 */
 
 	/*
 	 * The log line looks like this:
 	 * "TCP: [1.2.3.4]:50332 to [1.2.3.4]:80 tcpflags 0x2<SYN>"
 	 */
 	size = sizeof("TCP: []:12345 to []:12345 tcpflags 0x2<>") +
 	    sizeof(PRINT_TH_FLAGS) + 1 +
 #ifdef INET6
 	    2 * INET6_ADDRSTRLEN;
 #else
 	    2 * INET_ADDRSTRLEN;
 #endif /* INET6 */
 
 	s = malloc(size, M_TCPLOG, M_ZERO|M_NOWAIT);
 	if (s == NULL)
 		return (NULL);
 
 	strcat(s, "TCP: [");
 	sp = s + strlen(s);
 
 	if (inc && ((inc->inc_flags & INC_ISIPV6) == 0)) {
 		inet_ntoa_r(inc->inc_faddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		inet_ntoa_r(inc->inc_laddr, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 #ifdef INET6
 	} else if (inc) {
 		ip6_sprintf(sp, &inc->inc6_faddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(inc->inc_fport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &inc->inc6_laddr);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(inc->inc_lport));
 	} else if (ip6 && th) {
 		ip6_sprintf(sp, &ip6->ip6_src);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		ip6_sprintf(sp, &ip6->ip6_dst);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET6 */
 #ifdef INET
 	} else if (ip && th) {
 		inet_ntoa_r(ip->ip_src, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i to [", ntohs(th->th_sport));
 		sp = s + strlen(s);
 		inet_ntoa_r(ip->ip_dst, sp);
 		sp = s + strlen(s);
 		sprintf(sp, "]:%i", ntohs(th->th_dport));
 #endif /* INET */
 	} else {
 		free(s, M_TCPLOG);
 		return (NULL);
 	}
 	sp = s + strlen(s);
 	if (th)
 		sprintf(sp, " tcpflags 0x%b", tcp_get_flags(th), PRINT_TH_FLAGS);
 	if (*(s + size - 1) != '\0')
 		panic("%s: string too long", __func__);
 	return (s);
 }
 
 /*
  * A subroutine which makes it easy to track TCP state changes with DTrace.
  * This function shouldn't be called for t_state initializations that don't
  * correspond to actual TCP state transitions.
  */
 void
 tcp_state_change(struct tcpcb *tp, int newstate)
 {
 #if defined(KDTRACE_HOOKS)
 	int pstate = tp->t_state;
 #endif
 
 	TCPSTATES_DEC(tp->t_state);
 	TCPSTATES_INC(newstate);
 	tp->t_state = newstate;
 	TCP_PROBE6(state__change, NULL, tp, NULL, tp, NULL, pstate);
 }
 
 /*
  * Create an external-format (``xtcpcb'') structure using the information in
  * the kernel-format tcpcb structure pointed to by tp.  This is done to
  * reduce the spew of irrelevant information over this interface, to isolate
  * user code from changes in the kernel structure, and potentially to provide
  * information-hiding if we decide that some of this information should be
  * hidden from users.
  */
 void
 tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	struct tcptw *tw = intotw(inp);
 	sbintime_t now;
 
 	bzero(xt, sizeof(*xt));
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		xt->t_state = TCPS_TIME_WAIT;
 		xt->xt_encaps_port = tw->t_port;
 	} else {
 		xt->t_state = tp->t_state;
 		xt->t_logstate = tp->t_logstate;
 		xt->t_flags = tp->t_flags;
 		xt->t_sndzerowin = tp->t_sndzerowin;
 		xt->t_sndrexmitpack = tp->t_sndrexmitpack;
 		xt->t_rcvoopack = tp->t_rcvoopack;
 		xt->t_rcv_wnd = tp->rcv_wnd;
 		xt->t_snd_wnd = tp->snd_wnd;
 		xt->t_snd_cwnd = tp->snd_cwnd;
 		xt->t_snd_ssthresh = tp->snd_ssthresh;
 		xt->t_dsack_bytes = tp->t_dsack_bytes;
 		xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes;
 		xt->t_dsack_pack = tp->t_dsack_pack;
 		xt->t_maxseg = tp->t_maxseg;
 		xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 +
 			     (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0;
 
 		now = getsbinuptime();
 #define	COPYTIMER(ttt)	do {						\
 		if (callout_active(&tp->t_timers->ttt))			\
 			xt->ttt = (tp->t_timers->ttt.c_time - now) /	\
 			    SBT_1MS;					\
 		else							\
 			xt->ttt = 0;					\
 } while (0)
 		COPYTIMER(tt_delack);
 		COPYTIMER(tt_rexmt);
 		COPYTIMER(tt_persist);
 		COPYTIMER(tt_keep);
 		COPYTIMER(tt_2msl);
 #undef COPYTIMER
 		xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz;
 
 		xt->xt_encaps_port = tp->t_port;
 		bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack,
 		    TCP_FUNCTION_NAME_LEN_MAX);
 		bcopy(CC_ALGO(tp)->name, xt->xt_cc,
 		    TCP_CA_NAME_MAX);
 #ifdef TCP_BLACKBOX
 		(void)tcp_log_get_id(tp, xt->xt_logid);
 #endif
 	}
 
 	xt->xt_len = sizeof(struct xtcpcb);
 	in_pcbtoxinpcb(inp, &xt->xt_inp);
 	if (inp->inp_socket == NULL)
 		xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
 }
 
 void
 tcp_log_end_status(struct tcpcb *tp, uint8_t status)
 {
 	uint32_t bit, i;
 
 	if ((tp == NULL) ||
 	    (status > TCP_EI_STATUS_MAX_VALUE) ||
 	    (status == 0)) {
 		/* Invalid */
 		return;
 	}
 	if (status > (sizeof(uint32_t) * 8)) {
 		/* Should this be a KASSERT? */
 		return;
 	}
 	bit = 1U << (status - 1);
 	if (bit & tp->t_end_info_status) {
 		/* already logged */
 		return;
 	}
 	for (i = 0; i < TCP_END_BYTE_INFO; i++) {
 		if (tp->t_end_info_bytes[i] == TCP_EI_EMPTY_SLOT) {
 			tp->t_end_info_bytes[i] = status;
 			tp->t_end_info_status |= bit;
 			break;
 		}
 	}
 }
 
 int
 tcp_can_enable_pacing(void)
 {
 
 	if ((tcp_pacing_limit == -1) ||
 	    (tcp_pacing_limit > number_of_tcp_connections_pacing)) {
 		atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1);
 		shadow_num_connections = number_of_tcp_connections_pacing;
 		return (1);
 	} else {
 		return (0);
 	}
 }
 
 static uint8_t tcp_pacing_warning = 0;
 
 void
 tcp_decrement_paced_conn(void)
 {
 	uint32_t ret;
 
 	ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1);
 	shadow_num_connections = number_of_tcp_connections_pacing;
 	KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?"));
 	if (ret == 0) {
 		if (tcp_pacing_limit != -1) {
 			printf("Warning all pacing is now disabled, count decrements invalidly!\n");
 			tcp_pacing_limit = 0;
 		} else if (tcp_pacing_warning == 0) {
 			printf("Warning pacing count is invalid, invalid decrement\n");
 			tcp_pacing_warning = 1;
 		}
 	}
 }
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 1245995724e1..291d6a76bb20 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,3213 +1,3223 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * Copyright (c) 2006-2007 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ddb.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_kern_tls.h"
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/arb.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/qmath.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #ifdef INET6
 #include <sys/domain.h>
 #endif /* INET6 */
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/stats.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
 #endif
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/scope6_var.h>
 #endif
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 #include <netinet/tcp_seq.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_log_buf.h>
 #include <netinet/tcpip.h>
 #include <netinet/cc/cc.h>
 #include <netinet/tcp_fastopen.h>
 #include <netinet/tcp_hpts.h>
 #ifdef TCPPCAP
 #include <netinet/tcp_pcap.h>
 #endif
 #include <netinet/tcp_debug.h>
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
 #include <netipsec/ipsec_support.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_map.h>
 #include <vm/vm_page.h>
 
 /*
  * TCP protocol interface to socket abstraction.
  */
 #ifdef INET
 static int	tcp_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #endif /* INET */
 #ifdef INET6
 static int	tcp6_connect(struct tcpcb *, struct sockaddr *,
 		    struct thread *td);
 #endif /* INET6 */
 static void	tcp_disconnect(struct tcpcb *);
 static void	tcp_usrclosed(struct tcpcb *);
 static void	tcp_fill_info(struct tcpcb *, struct tcp_info *);
 
 static int	tcp_pru_options_support(struct tcpcb *tp, int flags);
 
 #ifdef TCPDEBUG
 #define	TCPDEBUG0	int ostate = 0
 #define	TCPDEBUG1()	ostate = tp ? tp->t_state : 0
 #define	TCPDEBUG2(req)	if (tp && (so->so_options & SO_DEBUG)) \
 				tcp_trace(TA_USER, ostate, tp, 0, 0, req)
 #else
 #define	TCPDEBUG0
 #define	TCPDEBUG1()
 #define	TCPDEBUG2(req)
 #endif
 
 /*
  * tcp_require_unique port requires a globally-unique source port for each
  * outgoing connection.  The default is to require the 4-tuple to be unique.
  */
 VNET_DEFINE(int, tcp_require_unique_port) = 0;
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, require_unique_port,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_require_unique_port), 0,
     "Require globally-unique ephemeral port for outgoing connections");
 #define	V_tcp_require_unique_port	VNET(tcp_require_unique_port)
 
 /*
  * TCP attaches to socket via pru_attach(), reserving space,
  * and an internet control block.
  */
 static int
 tcp_usr_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int error;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
 	TCPDEBUG1();
 
 	error = soreserve(so, V_tcp_sendspace, V_tcp_recvspace);
 	if (error)
 		goto out;
 
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	error = in_pcballoc(so, &V_tcbinfo);
 	if (error)
 		goto out;
 	inp = sotoinpcb(so);
 	tp = tcp_newtcpcb(inp);
 	if (tp == NULL) {
 		error = ENOBUFS;
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		goto out;
 	}
 	tp->t_state = TCPS_CLOSED;
 	INP_WUNLOCK(inp);
 	TCPSTATES_INC(TCPS_CLOSED);
 out:
 	TCPDEBUG2(PRU_ATTACH);
 	TCP_PROBE2(debug__user, tp, PRU_ATTACH);
 	return (error);
 }
 
 /*
  * tcp_usr_detach is called when the socket layer loses its final reference
  * to the socket, be it a file descriptor reference, a reference from TCP,
  * etc.  At this point, there is only one case in which we will keep around
  * inpcb state: time wait.
  */
 static void
 tcp_usr_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	KASSERT(so->so_pcb == inp && inp->inp_socket == so,
 		("%s: socket %p inp %p mismatch", __func__, so, inp));
 
 	tp = intotcpcb(inp);
 
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		/*
 		 * There are two cases to handle: one in which the time wait
 		 * state is being discarded (INP_DROPPED), and one in which
 		 * this connection will remain in timewait.  In the former,
 		 * it is time to discard all state (except tcptw, which has
 		 * already been discarded by the timewait close code, which
 		 * should be further up the call stack somewhere).  In the
 		 * latter case, we detach from the socket, but leave the pcb
 		 * present until timewait ends.
 		 *
 		 * XXXRW: Would it be cleaner to free the tcptw here?
 		 *
 		 * Astute question indeed, from twtcp perspective there are
 		 * four cases to consider:
 		 *
 		 * #1 tcp_usr_detach is called at tcptw creation time by
 		 *  tcp_twstart, then do not discard the newly created tcptw
 		 *  and leave inpcb present until timewait ends
 		 * #2 tcp_usr_detach is called at tcptw creation time by
 		 *  tcp_twstart, but connection is local and tw will be
 		 *  discarded immediately
 		 * #3 tcp_usr_detach is called at timewait end (or reuse) by
 		 *  tcp_twclose, then the tcptw has already been discarded
 		 *  (or reused) and inpcb is freed here
 		 * #4 tcp_usr_detach is called() after timewait ends (or reuse)
 		 *  (e.g. by soclose), then tcptw has already been discarded
 		 *  (or reused) and inpcb is freed here
 		 *
 		 *  In all three cases the tcptw should not be freed here.
 		 */
 		if (inp->inp_flags & INP_DROPPED) {
 			KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
 			    "INP_DROPPED && tp != NULL"));
 			in_pcbdetach(inp);
 			in_pcbfree(inp);
 		} else {
 			in_pcbdetach(inp);
 			INP_WUNLOCK(inp);
 		}
 	} else {
 		/*
 		 * If the connection is not in timewait, it must be either
 		 * dropped or embryonic.
 		 */
 		KASSERT(inp->inp_flags & INP_DROPPED ||
 		    tp->t_state < TCPS_SYN_SENT,
 		    ("%s: inp %p not dropped or embryonic", __func__, inp));
 		tcp_discardcb(tp);
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 	}
 }
 
 #ifdef INET
 /*
  * Give the socket an address.
  */
 static int
 tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct sockaddr_in *sinp;
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_family != AF_INET) {
 		/*
 		 * Preserve compatibility with old programs.
 		 */
 		if (nam->sa_family != AF_UNSPEC ||
 		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
 		    sinp->sin_addr.s_addr != INADDR_ANY)
 			return (EAFNOSUPPORT);
 		nam->sa_family = AF_INET;
 	}
 	if (nam->sa_len != sizeof(*sinp))
 		return (EINVAL);
 
 	/*
 	 * Must check for multicast addresses and disallow binding
 	 * to them.
 	 */
 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 	INP_HASH_WLOCK(&V_tcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	TCPDEBUG2(PRU_BIND);
 	TCP_PROBE2(debug__user, tp, PRU_BIND);
 	INP_WUNLOCK(inp);
 
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct sockaddr_in6 *sin6;
 	u_char vflagsav;
 
 	sin6 = (struct sockaddr_in6 *)nam;
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*sin6))
 		return (EINVAL);
 
 	/*
 	 * Must check for multicast addresses and disallow binding
 	 * to them.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
 	INP_WLOCK(inp);
 	vflagsav = inp->inp_vflag;
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 	INP_HASH_WLOCK(&V_tcbinfo);
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 #ifdef INET
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
 			inp->inp_vflag |= INP_IPV4;
 		else if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			struct sockaddr_in sin;
 
 			in6_sin6_2_sin(&sin, sin6);
 			if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 				error = EAFNOSUPPORT;
 				INP_HASH_WUNLOCK(&V_tcbinfo);
 				goto out;
 			}
 			inp->inp_vflag |= INP_IPV4;
 			inp->inp_vflag &= ~INP_IPV6;
 			error = in_pcbbind(inp, (struct sockaddr *)&sin,
 			    td->td_ucred);
 			INP_HASH_WUNLOCK(&V_tcbinfo);
 			goto out;
 		}
 	}
 #endif
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 out:
 	if (error != 0)
 		inp->inp_vflag = vflagsav;
 	TCPDEBUG2(PRU_BIND);
 	TCP_PROBE2(debug__user, tp, PRU_BIND);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Prepare to accept connections.
  */
 static int
 tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error != 0) {
 		SOCK_UNLOCK(so);
 		goto out;
 	}
 	if (inp->inp_lport == 0) {
 		INP_HASH_WLOCK(&V_tcbinfo);
 		error = in_pcbbind(inp, NULL, td->td_ucred);
 		INP_HASH_WUNLOCK(&V_tcbinfo);
 	}
 	if (error == 0) {
 		tcp_state_change(tp, TCPS_LISTEN);
 		solisten_proto(so, backlog);
 #ifdef TCP_OFFLOAD
 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
 			tcp_offload_listen_start(tp);
 #endif
 	} else {
 		solisten_proto_abort(so);
 	}
 	SOCK_UNLOCK(so);
 
 	if (IS_FASTOPEN(tp->t_flags))
 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	u_char vflagsav;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	vflagsav = inp->inp_vflag;
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	SOCK_LOCK(so);
 	error = solisten_proto_check(so);
 	if (error != 0) {
 		SOCK_UNLOCK(so);
 		goto out;
 	}
 	INP_HASH_WLOCK(&V_tcbinfo);
 	if (inp->inp_lport == 0) {
 		inp->inp_vflag &= ~INP_IPV4;
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
 			inp->inp_vflag |= INP_IPV4;
 		error = in6_pcbbind(inp, NULL, td->td_ucred);
 	}
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	if (error == 0) {
 		tcp_state_change(tp, TCPS_LISTEN);
 		solisten_proto(so, backlog);
 #ifdef TCP_OFFLOAD
 		if ((so->so_options & SO_NO_OFFLOAD) == 0)
 			tcp_offload_listen_start(tp);
 #endif
 	} else {
 		solisten_proto_abort(so);
 	}
 	SOCK_UNLOCK(so);
 
 	if (IS_FASTOPEN(tp->t_flags))
 		tp->t_tfo_pending = tcp_fastopen_alloc_counter();
 
 	if (error != 0)
 		inp->inp_vflag = vflagsav;
 
 out:
 	TCPDEBUG2(PRU_LISTEN);
 	TCP_PROBE2(debug__user, tp, PRU_LISTEN);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Initiate connection to peer.
  * Create a template for use in transmissions on this connection.
  * Enter SYN_SENT state, and mark socket as connecting.
  * Start keep-alive timer, and seed output sequence space.
  * Send initial segment on connection.
  */
 static int
 tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in *sinp;
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof (*sinp))
 		return (EINVAL);
 
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 	if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST)
 		return (EACCES);
 	if ((error = prison_remote_ip4(td->td_ucred, &sinp->sin_addr)) != 0)
 		return (error);
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		error = EADDRINUSE;
 		goto out;
 	}
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNREFUSED;
 		goto out;
 	}
 	if (SOLISTENING(so)) {
 		error = EOPNOTSUPP;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	NET_EPOCH_ENTER(et);
 	if ((error = tcp_connect(tp, nam, td)) != 0)
 		goto out_in_epoch;
 #ifdef TCP_OFFLOAD
 	if (registered_toedevs > 0 &&
 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 	    (error = tcp_offload_connect(so, nam)) == 0)
 		goto out_in_epoch;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	error = tcp_output(tp);
 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
 out_in_epoch:
 	NET_EPOCH_EXIT(et);
 out:
 	TCPDEBUG2(PRU_CONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct sockaddr_in6 *sin6;
 	u_int8_t incflagsav;
 	u_char vflagsav;
 
 	TCPDEBUG0;
 
 	sin6 = (struct sockaddr_in6 *)nam;
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof (*sin6))
 		return (EINVAL);
 
 	/*
 	 * Must disallow TCP ``connections'' to multicast addresses.
 	 */
 	if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr))
 		return (EAFNOSUPPORT);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
 	INP_WLOCK(inp);
 	vflagsav = inp->inp_vflag;
 	incflagsav = inp->inp_inc.inc_flags;
 	if (inp->inp_flags & INP_TIMEWAIT) {
 		error = EADDRINUSE;
 		goto out;
 	}
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNREFUSED;
 		goto out;
 	}
 	if (SOLISTENING(so)) {
 		error = EINVAL;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 #ifdef INET
 	/*
 	 * XXXRW: Some confusion: V4/V6 flags relate to binding, and
 	 * therefore probably require the hash lock, which isn't held here.
 	 * Is this a significant problem?
 	 */
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 
 		in6_sin6_2_sin(&sin, sin6);
 		if (IN_MULTICAST(ntohl(sin.sin_addr.s_addr))) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 		if (ntohl(sin.sin_addr.s_addr) == INADDR_BROADCAST) {
 			error = EACCES;
 			goto out;
 		}
 		if ((error = prison_remote_ip4(td->td_ucred,
 		    &sin.sin_addr)) != 0)
 			goto out;
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		NET_EPOCH_ENTER(et);
 		if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
 			goto out_in_epoch;
 #ifdef TCP_OFFLOAD
 		if (registered_toedevs > 0 &&
 		    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 		    (error = tcp_offload_connect(so, nam)) == 0)
 			goto out_in_epoch;
 #endif
 		error = tcp_output(tp);
 		goto out_in_epoch;
 	} else {
 		if ((inp->inp_vflag & INP_IPV6) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 	}
 #endif
 	if ((error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr)) != 0)
 		goto out;
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	inp->inp_inc.inc_flags |= INC_ISIPV6;
 	if ((error = tcp6_connect(tp, nam, td)) != 0)
 		goto out;
 #ifdef TCP_OFFLOAD
 	if (registered_toedevs > 0 &&
 	    (so->so_options & SO_NO_OFFLOAD) == 0 &&
 	    (error = tcp_offload_connect(so, nam)) == 0)
 		goto out;
 #endif
 	tcp_timer_activate(tp, TT_KEEP, TP_KEEPINIT(tp));
 	NET_EPOCH_ENTER(et);
 	error = tcp_output(tp);
 #ifdef INET
 out_in_epoch:
 #endif
 	NET_EPOCH_EXIT(et);
 out:
 	KASSERT(error >= 0, ("TCP stack %s requested tcp_drop(%p) at connect()"
 	    ", error code %d", tp->t_fb->tfb_tcp_block_name, tp, -error));
 	/*
 	 * If the implicit bind in the connect call fails, restore
 	 * the flags we modified.
 	 */
 	if (error != 0 && inp->inp_lport == 0) {
 		inp->inp_vflag = vflagsav;
 		inp->inp_inc.inc_flags = incflagsav;
 	}
 
 	TCPDEBUG2(PRU_CONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_CONNECT);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 #endif /* INET6 */
 
 /*
  * Initiate disconnect from peer.
  * If connection never passed embryonic stage, just drop;
  * else if don't need to let data drain, then can just drop anyways,
  * else have to begin TCP shutdown process: mark socket disconnecting,
  * drain unread data, state switch to reflect user close, and
  * send segment (e.g. FIN) to peer.  Socket will be really disconnected
  * when peer sends FIN and acks ours.
  *
  * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
  */
 static int
 tcp_usr_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	int error = 0;
 
 	TCPDEBUG0;
 	NET_EPOCH_ENTER(et);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & INP_TIMEWAIT)
 		goto out;
 	if (inp->inp_flags & INP_DROPPED) {
 		error = ECONNRESET;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	TCPDEBUG1();
 	tcp_disconnect(tp);
 out:
 	TCPDEBUG2(PRU_DISCONNECT);
 	TCP_PROBE2(debug__user, tp, PRU_DISCONNECT);
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 #ifdef INET
 /*
  * Accept a connection.  Essentially all the work is done at higher levels;
  * just return the address of the peer, storing through addr.
  */
 static int
 tcp_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	int error = 0;
 	struct inpcb *inp = NULL;
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct in_addr addr;
 	in_port_t port = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = ECONNABORTED;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 
 	/*
 	 * We inline in_getpeeraddr and COMMON_END here, so that we can
 	 * copy the data of interest and defer the malloc until after we
 	 * release the lock.
 	 */
 	port = inp->inp_fport;
 	addr = inp->inp_faddr;
 
 out:
 	TCPDEBUG2(PRU_ACCEPT);
 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	if (error == 0)
 		*nam = in_sockaddr(port, &addr);
 	return error;
 }
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp = NULL;
 	int error = 0;
 #ifdef KDTRACE_HOOKS
 	struct tcpcb *tp = NULL;
 #endif
 	struct in_addr addr;
 	struct in6_addr addr6;
 	struct epoch_tracker et;
 	in_port_t port = 0;
 	int v4 = 0;
 	TCPDEBUG0;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = ECONNABORTED;
 		goto out;
 	}
 #ifdef KDTRACE_HOOKS
 	tp = intotcpcb(inp);
 #endif
 	TCPDEBUG1();
 
 	/*
 	 * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
 	 * copy the data of interest and defer the malloc until after we
 	 * release the lock.
 	 */
 	if (inp->inp_vflag & INP_IPV4) {
 		v4 = 1;
 		port = inp->inp_fport;
 		addr = inp->inp_faddr;
 	} else {
 		port = inp->inp_fport;
 		addr6 = inp->in6p_faddr;
 	}
 
 out:
 	TCPDEBUG2(PRU_ACCEPT);
 	TCP_PROBE2(debug__user, tp, PRU_ACCEPT);
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	if (error == 0) {
 		if (v4)
 			*nam = in6_v4mapsin6_sockaddr(port, &addr);
 		else
 			*nam = in6_sockaddr(port, &addr6);
 	}
 	return error;
 }
 #endif /* INET6 */
 
 /*
  * Mark the connection as being incapable of further output.
  */
 static int
 tcp_usr_shutdown(struct socket *so)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	NET_EPOCH_ENTER(et);
 	TCPDEBUG1();
 	socantsendmore(so);
 	tcp_usrclosed(tp);
 	if (!(inp->inp_flags & INP_DROPPED))
 		error = tcp_output_nodrop(tp);
 	TCPDEBUG2(PRU_SHUTDOWN);
 	TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN);
 	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * After a receive, possibly send window update to peer.
  */
 static int
 tcp_usr_rcvd(struct socket *so, int flags)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	int outrv = 0, error = 0;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	NET_EPOCH_ENTER(et);
 	TCPDEBUG1();
 	/*
 	 * For passively-created TFO connections, don't attempt a window
 	 * update while still in SYN_RECEIVED as this may trigger an early
 	 * SYN|ACK.  It is preferable to have the SYN|ACK be sent along with
 	 * application response data, or failing that, when the DELACK timer
 	 * expires.
 	 */
 	if (IS_FASTOPEN(tp->t_flags) &&
 	    (tp->t_state == TCPS_SYN_RECEIVED))
 		goto out;
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE)
 		tcp_offload_rcvd(tp);
 	else
 #endif
 		outrv = tcp_output_nodrop(tp);
 out:
 	TCPDEBUG2(PRU_RCVD);
 	TCP_PROBE2(debug__user, tp, PRU_RCVD);
 	(void) tcp_unlock_or_drop(tp, outrv);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.  Unlike the other
  * pru_*() routines, the mbuf chains are our responsibility.  We
  * must either enqueue them or free them.  The other pru_* routines
  * generally are caller-frees.
  */
 static int
 tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 #ifdef INET
 #ifdef INET6
 	struct sockaddr_in sin;
 #endif
 	struct sockaddr_in *sinp;
 #endif
 #ifdef INET6
 	int isipv6;
 #endif
 	u_int8_t incflagsav;
 	u_char vflagsav;
 	bool restoreflags;
 	TCPDEBUG0;
 
 	if (control != NULL) {
 		/* TCP doesn't do control messages (rights, creds, etc) */
 		if (control->m_len) {
 			m_freem(control);
 			return (EINVAL);
 		}
 		m_freem(control);	/* empty control, just free it */
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 			m_freem(m);
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 
 	vflagsav = inp->inp_vflag;
 	incflagsav = inp->inp_inc.inc_flags;
 	restoreflags = false;
 	tp = intotcpcb(inp);
 
 	NET_EPOCH_ENTER(et);
 	if ((flags & PRUS_OOB) != 0 &&
 	    (error = tcp_pru_options_support(tp, PRUS_OOB)) != 0)
 		goto out;
 
 	TCPDEBUG1();
 	if (nam != NULL && tp->t_state < TCPS_SYN_SENT) {
 		if (tp->t_state == TCPS_LISTEN) {
 			error = EINVAL;
 			goto out;
 		}
 		switch (nam->sa_family) {
 #ifdef INET
 		case AF_INET:
 			sinp = (struct sockaddr_in *)nam;
 			if (sinp->sin_len != sizeof(struct sockaddr_in)) {
 				error = EINVAL;
 				goto out;
 			}
 			if ((inp->inp_vflag & INP_IPV6) != 0) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN_MULTICAST(ntohl(sinp->sin_addr.s_addr))) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (ntohl(sinp->sin_addr.s_addr) == INADDR_BROADCAST) {
 				error = EACCES;
 				goto out;
 			}
 			if ((error = prison_remote_ip4(td->td_ucred,
 			    &sinp->sin_addr)))
 				goto out;
 #ifdef INET6
 			isipv6 = 0;
 #endif
 			break;
 #endif /* INET */
 #ifdef INET6
 		case AF_INET6:
 		{
 			struct sockaddr_in6 *sin6;
 
 			sin6 = (struct sockaddr_in6 *)nam;
 			if (sin6->sin6_len != sizeof(*sin6)) {
 				error = EINVAL;
 				goto out;
 			}
 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
 				error = EAFNOSUPPORT;
 				goto out;
 			}
 			if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 #ifdef INET
 				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 					error = EINVAL;
 					goto out;
 				}
 				if ((inp->inp_vflag & INP_IPV4) == 0) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				restoreflags = true;
 				inp->inp_vflag &= ~INP_IPV6;
 				sinp = &sin;
 				in6_sin6_2_sin(sinp, sin6);
 				if (IN_MULTICAST(
 				    ntohl(sinp->sin_addr.s_addr))) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				if ((error = prison_remote_ip4(td->td_ucred,
 				    &sinp->sin_addr)))
 					goto out;
 				isipv6 = 0;
 #else /* !INET */
 				error = EAFNOSUPPORT;
 				goto out;
 #endif /* INET */
 			} else {
 				if ((inp->inp_vflag & INP_IPV6) == 0) {
 					error = EAFNOSUPPORT;
 					goto out;
 				}
 				restoreflags = true;
 				inp->inp_vflag &= ~INP_IPV4;
 				inp->inp_inc.inc_flags |= INC_ISIPV6;
 				if ((error = prison_remote_ip6(td->td_ucred,
 				    &sin6->sin6_addr)))
 					goto out;
 				isipv6 = 1;
 			}
 			break;
 		}
 #endif /* INET6 */
 		default:
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 	}
 	if (!(flags & PRUS_OOB)) {
 		sbappendstream(&so->so_snd, m, flags);
 		m = NULL;
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			KASSERT(tp->t_state == TCPS_CLOSED,
 			    ("%s: tp %p is listening", __func__, tp));
 
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg using peer's cached MSS.
 			 */
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 				error = tcp_connect(tp,
 				    (struct sockaddr *)sinp, td);
 #endif
 			/*
 			 * The bind operation in tcp_connect succeeded. We
 			 * no longer want to restore the flags if later
 			 * operations fail.
 			 */
 			if (error == 0 || inp->inp_lport != 0)
 				restoreflags = false;
 
 			if (error) {
 				/* m is freed if PRUS_NOTREADY is unset. */
 				sbflush(&so->so_snd);
 				goto out;
 			}
 			if (IS_FASTOPEN(tp->t_flags))
 				tcp_fastopen_connect(tp);
 			else {
 				tp->snd_wnd = TTCP_CLIENT_SND_WND;
 				tcp_mss(tp, -1);
 			}
 		}
 		if (flags & PRUS_EOF) {
 			/*
 			 * Close the send side of the connection after
 			 * the data is sent.
 			 */
 			socantsendmore(so);
 			tcp_usrclosed(tp);
 		}
 		if (TCPS_HAVEESTABLISHED(tp->t_state) &&
 		    ((tp->t_flags2 & TF2_FBYTES_COMPLETE) == 0) &&
 		    (tp->t_fbyte_out == 0) &&
 		    (so->so_snd.sb_ccc > 0)) {
 			tp->t_fbyte_out = ticks;
 			if (tp->t_fbyte_out == 0)
 				tp->t_fbyte_out = 1;
 			if (tp->t_fbyte_out && tp->t_fbyte_in)
 				tp->t_flags2 |= TF2_FBYTES_COMPLETE;
 		}
 		if (!(inp->inp_flags & INP_DROPPED) &&
 		    !(flags & PRUS_NOTREADY)) {
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags |= TF_MORETOCOME;
 			error = tcp_output_nodrop(tp);
 			if (flags & PRUS_MORETOCOME)
 				tp->t_flags &= ~TF_MORETOCOME;
 		}
 	} else {
 		/*
 		 * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbspace(&so->so_snd) < -512) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
 		sbappendstream_locked(&so->so_snd, m, flags);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		m = NULL;
 		if (nam && tp->t_state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected,
 			 * initialize window to default value, and
 			 * initialize maxseg using peer's cached MSS.
 			 */
 
 			/*
 			 * Not going to contemplate SYN|URG
 			 */
 			if (IS_FASTOPEN(tp->t_flags))
 				tp->t_flags &= ~TF_FASTOPEN;
 #ifdef INET6
 			if (isipv6)
 				error = tcp6_connect(tp, nam, td);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 			else
 #endif
 #ifdef INET
 				error = tcp_connect(tp,
 				    (struct sockaddr *)sinp, td);
 #endif
 			/*
 			 * The bind operation in tcp_connect succeeded. We
 			 * no longer want to restore the flags if later
 			 * operations fail.
 			 */
 			if (error == 0 || inp->inp_lport != 0)
 				restoreflags = false;
 
 			if (error != 0) {
 				/* m is freed if PRUS_NOTREADY is unset. */
 				sbflush(&so->so_snd);
 				goto out;
 			}
 			tp->snd_wnd = TTCP_CLIENT_SND_WND;
 			tcp_mss(tp, -1);
 		}
 		tp->snd_up = tp->snd_una + sbavail(&so->so_snd);
 		if ((flags & PRUS_NOTREADY) == 0) {
 			tp->t_flags |= TF_FORCEDATA;
 			error = tcp_output_nodrop(tp);
 			tp->t_flags &= ~TF_FORCEDATA;
 		}
 	}
 	TCP_LOG_EVENT(tp, NULL,
 	    &inp->inp_socket->so_rcv,
 	    &inp->inp_socket->so_snd,
 	    TCP_LOG_USERSEND, error,
 	    0, NULL, false);
 
 out:
 	/*
 	 * In case of PRUS_NOTREADY, the caller or tcp_usr_ready() is
 	 * responsible for freeing memory.
 	 */
 	if (m != NULL && (flags & PRUS_NOTREADY) == 0)
 		m_freem(m);
 
 	/*
 	 * If the request was unsuccessful and we changed flags,
 	 * restore the original flags.
 	 */
 	if (error != 0 && restoreflags) {
 		inp->inp_vflag = vflagsav;
 		inp->inp_inc.inc_flags = incflagsav;
 	}
 	TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
 		  ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	TCP_PROBE2(debug__user, tp, (flags & PRUS_OOB) ? PRU_SENDOOB :
 		   ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
 	error = tcp_unlock_or_drop(tp, error);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 static int
 tcp_usr_ready(struct socket *so, struct mbuf *m, int count)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct tcpcb *tp;
 	int error;
 
 	inp = sotoinpcb(so);
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		mb_free_notready(m, count);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 
 	SOCKBUF_LOCK(&so->so_snd);
 	error = sbready(&so->so_snd, m, count);
 	SOCKBUF_UNLOCK(&so->so_snd);
 	if (error) {
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 	NET_EPOCH_ENTER(et);
 	error = tcp_output_unlock(tp);
 	NET_EPOCH_EXIT(et);
 
 	return (error);
 }
 
 /*
  * Abort the TCP.  Drop the connection abruptly.
  */
 static void
 tcp_usr_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
 
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_abort: inp_socket == NULL"));
 
 	/*
 	 * If we still have full TCP state, and we're not dropped, drop.
 	 */
 	if (!(inp->inp_flags & INP_TIMEWAIT) &&
 	    !(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		TCPDEBUG1();
 		tp = tcp_drop(tp, ECONNABORTED);
 		if (tp == NULL)
 			goto dropped;
 		TCPDEBUG2(PRU_ABORT);
 		TCP_PROBE2(debug__user, tp, PRU_ABORT);
 	}
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 dropped:
 	NET_EPOCH_EXIT(et);
 }
 
 /*
  * TCP socket is closed.  Start friendly disconnect.
  */
 static void
 tcp_usr_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 	struct epoch_tracker et;
 	TCPDEBUG0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
 
 	NET_EPOCH_ENTER(et);
 	INP_WLOCK(inp);
 	KASSERT(inp->inp_socket != NULL,
 	    ("tcp_usr_close: inp_socket == NULL"));
 
 	/*
 	 * If we still have full TCP state, and we're not dropped, initiate
 	 * a disconnect.
 	 */
 	if (!(inp->inp_flags & INP_TIMEWAIT) &&
 	    !(inp->inp_flags & INP_DROPPED)) {
 		tp = intotcpcb(inp);
 		tp->t_flags |= TF_CLOSED;
 		TCPDEBUG1();
 		tcp_disconnect(tp);
 		TCPDEBUG2(PRU_CLOSE);
 		TCP_PROBE2(debug__user, tp, PRU_CLOSE);
 	}
 	if (!(inp->inp_flags & INP_DROPPED)) {
 		soref(so);
 		inp->inp_flags |= INP_SOCKREF;
 	}
 	INP_WUNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 }
 
 static int
 tcp_pru_options_support(struct tcpcb *tp, int flags)
 {
 	/*
 	 * If the specific TCP stack has a pru_options
 	 * specified then it does not always support
 	 * all the PRU_XX options and we must ask it.
 	 * If the function is not specified then all
 	 * of the PRU_XX options are supported.
 	 */
 	int ret = 0;
 
 	if (tp->t_fb->tfb_pru_options) {
 		ret = (*tp->t_fb->tfb_pru_options)(tp, flags);
 	}
 	return (ret);
 }
 
 /*
  * Receive out-of-band data.
  */
 static int
 tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
 {
 	int error = 0;
 	struct inpcb *inp;
 	struct tcpcb *tp = NULL;
 
 	TCPDEBUG0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
 	tp = intotcpcb(inp);
 	error = tcp_pru_options_support(tp, PRUS_OOB);
 	if (error) {
 		goto out;
 	}
 	TCPDEBUG1();
 	if ((so->so_oobmark == 0 &&
 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
 	    so->so_options & SO_OOBINLINE ||
 	    tp->t_oobflags & TCPOOB_HADDATA) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
 		error = EWOULDBLOCK;
 		goto out;
 	}
 	m->m_len = 1;
 	*mtod(m, caddr_t) = tp->t_iobc;
 	if ((flags & MSG_PEEK) == 0)
 		tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
 
 out:
 	TCPDEBUG2(PRU_RCVOOB);
 	TCP_PROBE2(debug__user, tp, PRU_RCVOOB);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 #ifdef INET
-struct pr_usrreqs tcp_usrreqs = {
-	.pru_abort =		tcp_usr_abort,
-	.pru_accept =		tcp_usr_accept,
-	.pru_attach =		tcp_usr_attach,
-	.pru_bind =		tcp_usr_bind,
-	.pru_connect =		tcp_usr_connect,
-	.pru_control =		in_control,
-	.pru_detach =		tcp_usr_detach,
-	.pru_disconnect =	tcp_usr_disconnect,
-	.pru_listen =		tcp_usr_listen,
-	.pru_peeraddr =		in_getpeeraddr,
-	.pru_rcvd =		tcp_usr_rcvd,
-	.pru_rcvoob =		tcp_usr_rcvoob,
-	.pru_send =		tcp_usr_send,
-	.pru_ready =		tcp_usr_ready,
-	.pru_shutdown =		tcp_usr_shutdown,
-	.pru_sockaddr =		in_getsockaddr,
-	.pru_sosetlabel =	in_pcbsosetlabel,
-	.pru_close =		tcp_usr_close,
+struct protosw tcp_protosw = {
+	.pr_type =		SOCK_STREAM,
+	.pr_protocol =		IPPROTO_TCP,
+	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD |
+				    PR_CAPATTACH,
+	.pr_ctloutput =		tcp_ctloutput,
+	.pr_abort =		tcp_usr_abort,
+	.pr_accept =		tcp_usr_accept,
+	.pr_attach =		tcp_usr_attach,
+	.pr_bind =		tcp_usr_bind,
+	.pr_connect =		tcp_usr_connect,
+	.pr_control =		in_control,
+	.pr_detach =		tcp_usr_detach,
+	.pr_disconnect =	tcp_usr_disconnect,
+	.pr_listen =		tcp_usr_listen,
+	.pr_peeraddr =		in_getpeeraddr,
+	.pr_rcvd =		tcp_usr_rcvd,
+	.pr_rcvoob =		tcp_usr_rcvoob,
+	.pr_send =		tcp_usr_send,
+	.pr_ready =		tcp_usr_ready,
+	.pr_shutdown =		tcp_usr_shutdown,
+	.pr_sockaddr =		in_getsockaddr,
+	.pr_sosetlabel =	in_pcbsosetlabel,
+	.pr_close =		tcp_usr_close,
 };
 #endif /* INET */
 
 #ifdef INET6
-struct pr_usrreqs tcp6_usrreqs = {
-	.pru_abort =		tcp_usr_abort,
-	.pru_accept =		tcp6_usr_accept,
-	.pru_attach =		tcp_usr_attach,
-	.pru_bind =		tcp6_usr_bind,
-	.pru_connect =		tcp6_usr_connect,
-	.pru_control =		in6_control,
-	.pru_detach =		tcp_usr_detach,
-	.pru_disconnect =	tcp_usr_disconnect,
-	.pru_listen =		tcp6_usr_listen,
-	.pru_peeraddr =		in6_mapped_peeraddr,
-	.pru_rcvd =		tcp_usr_rcvd,
-	.pru_rcvoob =		tcp_usr_rcvoob,
-	.pru_send =		tcp_usr_send,
-	.pru_ready =		tcp_usr_ready,
-	.pru_shutdown =		tcp_usr_shutdown,
-	.pru_sockaddr =		in6_mapped_sockaddr,
-	.pru_sosetlabel =	in_pcbsosetlabel,
-	.pru_close =		tcp_usr_close,
+struct protosw tcp6_protosw = {
+	.pr_type =		SOCK_STREAM,
+	.pr_protocol =		IPPROTO_TCP,
+	.pr_flags =		PR_CONNREQUIRED | PR_IMPLOPCL |PR_WANTRCVD |
+				    PR_CAPATTACH,
+	.pr_ctloutput =		tcp_ctloutput,
+	.pr_abort =		tcp_usr_abort,
+	.pr_accept =		tcp6_usr_accept,
+	.pr_attach =		tcp_usr_attach,
+	.pr_bind =		tcp6_usr_bind,
+	.pr_connect =		tcp6_usr_connect,
+	.pr_control =		in6_control,
+	.pr_detach =		tcp_usr_detach,
+	.pr_disconnect =	tcp_usr_disconnect,
+	.pr_listen =		tcp6_usr_listen,
+	.pr_peeraddr =		in6_mapped_peeraddr,
+	.pr_rcvd =		tcp_usr_rcvd,
+	.pr_rcvoob =		tcp_usr_rcvoob,
+	.pr_send =		tcp_usr_send,
+	.pr_ready =		tcp_usr_ready,
+	.pr_shutdown =		tcp_usr_shutdown,
+	.pr_sockaddr =		in6_mapped_sockaddr,
+	.pr_sosetlabel =	in_pcbsosetlabel,
+	.pr_close =		tcp_usr_close,
 };
 #endif /* INET6 */
 
 #ifdef INET
 /*
  * Common subroutine to open a TCP connection to remote host specified
  * by struct sockaddr_in in mbuf *nam.  Call in_pcbbind to assign a local
  * port number if needed.  Call in_pcbconnect_setup to do the routing and
  * to choose a local host address (interface).  If there is an existing
  * incarnation of the same connection in TIME-WAIT state and if the remote
  * host was sending CC options and if the connection duration was < MSL, then
  * truncate the previous TIME-WAIT state and proceed.
  * Initialize connection parameters and enter SYN-SENT state.
  */
 static int
 tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb, *oinp;
 	struct socket *so = inp->inp_socket;
 	struct in_addr laddr;
 	u_short lport;
 	int error;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(&V_tcbinfo);
 
 	if (V_tcp_require_unique_port && inp->inp_lport == 0) {
 		error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			goto out;
 	}
 
 	/*
 	 * Cannot simply call in_pcbconnect, because there might be an
 	 * earlier incarnation of this same connection still in
 	 * TIME_WAIT state, creating an ADDRINUSE error.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
 	    &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
 	if (error && oinp == NULL)
 		goto out;
 	if (oinp) {
 		error = EADDRINUSE;
 		goto out;
 	}
 	/* Handle initial bind if it hadn't been done in advance. */
 	if (inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		if (in_pcbinshash(inp) != 0) {
 			inp->inp_lport = 0;
 			error = EAGAIN;
 			goto out;
 		}
 	}
 	inp->inp_laddr = laddr;
 	in_pcbrehash(inp);
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 
 	/*
 	 * Compute window scaling to request:
 	 * Scale to fit into sweet spot.  See tcp_syncache.c.
 	 * XXX: This should move to tcp_output().
 	 */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
 		tp->request_r_scale++;
 
 	soisconnecting(so);
 	TCPSTAT_INC(tcps_connattempt);
 	tcp_state_change(tp, TCPS_SYN_SENT);
 	tp->iss = tcp_new_isn(&inp->inp_inc);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
 	tcp_sendseqinit(tp);
 
 	return 0;
 
 out:
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	return (error);
 }
 #endif /* INET */
 
 #ifdef INET6
 static int
 tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK(&V_tcbinfo);
 
 	if (V_tcp_require_unique_port && inp->inp_lport == 0) {
 		error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			goto out;
 	}
 	error = in6_pcbconnect(inp, nam, td->td_ucred);
 	if (error != 0)
 		goto out;
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 
 	/* Compute window scaling to request.  */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
 	    (TCP_MAXWIN << tp->request_r_scale) < sb_max)
 		tp->request_r_scale++;
 
 	soisconnecting(inp->inp_socket);
 	TCPSTAT_INC(tcps_connattempt);
 	tcp_state_change(tp, TCPS_SYN_SENT);
 	tp->iss = tcp_new_isn(&inp->inp_inc);
 	if (tp->t_flags & TF_REQ_TSTMP)
 		tp->ts_offset = tcp_new_ts_offset(&inp->inp_inc);
 	tcp_sendseqinit(tp);
 
 	return 0;
 
 out:
 	INP_HASH_WUNLOCK(&V_tcbinfo);
 	return error;
 }
 #endif /* INET6 */
 
 /*
  * Export TCP internal state information via a struct tcp_info, based on the
  * Linux 2.6 API.  Not ABI compatible as our constants are mapped differently
  * (TCP state machine, etc).  We export all information using FreeBSD-native
  * constants -- for example, the numeric values for tcpi_state will differ
  * from Linux.
  */
 static void
 tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	bzero(ti, sizeof(*ti));
 
 	ti->tcpi_state = tp->t_state;
 	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
 		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
 	if (tp->t_flags & TF_SACK_PERMIT)
 		ti->tcpi_options |= TCPI_OPT_SACK;
 	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
 		ti->tcpi_options |= TCPI_OPT_WSCALE;
 		ti->tcpi_snd_wscale = tp->snd_scale;
 		ti->tcpi_rcv_wscale = tp->rcv_scale;
 	}
 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
 		ti->tcpi_options |= TCPI_OPT_ECN;
 
 	ti->tcpi_rto = tp->t_rxtcur * tick;
 	ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick;
 	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
 	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
 
 	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
 	ti->tcpi_snd_cwnd = tp->snd_cwnd;
 
 	/*
 	 * FreeBSD-specific extension fields for tcp_info.
 	 */
 	ti->tcpi_rcv_space = tp->rcv_wnd;
 	ti->tcpi_rcv_nxt = tp->rcv_nxt;
 	ti->tcpi_snd_wnd = tp->snd_wnd;
 	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
 	ti->tcpi_snd_nxt = tp->snd_nxt;
 	ti->tcpi_snd_mss = tp->t_maxseg;
 	ti->tcpi_rcv_mss = tp->t_maxseg;
 	ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
 	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
 	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		ti->tcpi_options |= TCPI_OPT_TOE;
 		tcp_offload_tcp_info(tp, ti);
 	}
 #endif
 }
 
 /*
  * tcp_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  */
 #define INP_WLOCK_RECHECK_CLEANUP(inp, cleanup) do {			\
 	INP_WLOCK(inp);							\
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {		\
 		INP_WUNLOCK(inp);					\
 		cleanup;						\
 		return (ECONNRESET);					\
 	}								\
 	tp = intotcpcb(inp);						\
 } while(0)
 #define INP_WLOCK_RECHECK(inp) INP_WLOCK_RECHECK_CLEANUP((inp), /* noop */)
 
 int
 tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	int error = 0;
 
 	MPASS(sopt->sopt_dir == SOPT_SET);
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(so != NULL, ("inp_socket == NULL"));
 
 	if (sopt->sopt_level != IPPROTO_TCP) {
 		INP_WUNLOCK(inp);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			error = ip6_ctloutput(so, sopt);
 #endif
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
 			error = ip_ctloutput(so, sopt);
 #endif
 		/*
 		 * When an IP-level socket option affects TCP, pass control
 		 * down to stack tfb_tcp_ctloutput, otherwise return what
 		 * IP level returned.
 		 */
 		switch (sopt->sopt_level) {
 #ifdef INET6
 		case IPPROTO_IPV6:
 			if ((inp->inp_vflag & INP_IPV6PROTO) == 0)
 				return (error);
 			switch (sopt->sopt_name) {
 			case IPV6_TCLASS:
 				/* Notify tcp stacks that care (e.g. RACK). */
 				break;
 			case IPV6_USE_MIN_MTU:
 				/* Update t_maxseg accordingly. */
 				break;
 			default:
 				return (error);
 			}
 			break;
 #endif
 #ifdef INET
 		case IPPROTO_IP:
 			switch (sopt->sopt_name) {
 			case IP_TOS:
 				inp->inp_ip_tos &= ~IPTOS_ECN_MASK;
 				break;
 			case IP_TTL:
 				/* Notify tcp stacks that care (e.g. RACK). */
 				break;
 			default:
 				return (error);
 			}
 			break;
 #endif
 		default:
 			return (error);
 		}
 		INP_WLOCK(inp);
 		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 			INP_WUNLOCK(inp);
 			return (ECONNRESET);
 		}
 	} else if (sopt->sopt_name == TCP_FUNCTION_BLK) {
 		/*
 		 * Protect the TCP option TCP_FUNCTION_BLK so
 		 * that a sub-function can *never* overwrite this.
 		 */
 		struct tcp_function_set fsn;
 		struct tcp_function_block *blk;
 
 		INP_WUNLOCK(inp);
 		error = sooptcopyin(sopt, &fsn, sizeof fsn, sizeof fsn);
 		if (error)
 			return (error);
 
 		INP_WLOCK(inp);
 		if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 			INP_WUNLOCK(inp);
 			return (ECONNRESET);
 		}
 		tp = intotcpcb(inp);
 
 		blk = find_and_ref_tcp_functions(&fsn);
 		if (blk == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
 		if (tp->t_fb == blk) {
 			/* You already have this */
 			refcount_release(&blk->tfb_refcnt);
 			INP_WUNLOCK(inp);
 			return (0);
 		}
 		if (tp->t_state != TCPS_CLOSED) {
 			/*
 			 * The user has advanced the state
 			 * past the initial point, we may not
 			 * be able to switch.
 			 */
 			if (blk->tfb_tcp_handoff_ok != NULL) {
 				/*
 				 * Does the stack provide a
 				 * query mechanism, if so it may
 				 * still be possible?
 				 */
 				error = (*blk->tfb_tcp_handoff_ok)(tp);
 			} else
 				error = EINVAL;
 			if (error) {
 				refcount_release(&blk->tfb_refcnt);
 				INP_WUNLOCK(inp);
 				return(error);
 			}
 		}
 		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
 			refcount_release(&blk->tfb_refcnt);
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
 		/*
 		 * Release the old refcnt, the
 		 * lookup acquired a ref on the
 		 * new one already.
 		 */
 		if (tp->t_fb->tfb_tcp_fb_fini) {
 			struct epoch_tracker et;
 			/*
 			 * Tell the stack to cleanup with 0 i.e.
 			 * the tcb is not going away.
 			 */
 			NET_EPOCH_ENTER(et);
 			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 			NET_EPOCH_EXIT(et);
 		}
 #ifdef TCPHPTS
 		/* Assure that we are not on any hpts */
 		tcp_hpts_remove(tp->t_inpcb);
 #endif
 		if (blk->tfb_tcp_fb_init) {
 			error = (*blk->tfb_tcp_fb_init)(tp);
 			if (error) {
 				refcount_release(&blk->tfb_refcnt);
 				if (tp->t_fb->tfb_tcp_fb_init) {
 					if((*tp->t_fb->tfb_tcp_fb_init)(tp) != 0)  {
 						/* Fall back failed, drop the connection */
 						INP_WUNLOCK(inp);
 						soabort(so);
 						return (error);
 					}
 				}
 				goto err_out;
 			}
 		}
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_fb = blk;
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE) {
 			tcp_offload_ctloutput(tp, sopt->sopt_dir,
 			     sopt->sopt_name);
 		}
 #endif
 err_out:
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 
 	/* Pass in the INP locked, callee must unlock it. */
 	return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
 }
 
 static int
 tcp_ctloutput_get(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct socket *so = inp->inp_socket;
 	struct tcpcb *tp = intotcpcb(inp);
 	int error = 0;
 
 	MPASS(sopt->sopt_dir == SOPT_GET);
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(so != NULL, ("inp_socket == NULL"));
 
 	if (sopt->sopt_level != IPPROTO_TCP) {
 		INP_WUNLOCK(inp);
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6PROTO)
 			error = ip6_ctloutput(so, sopt);
 #endif /* INET6 */
 #if defined(INET6) && defined(INET)
 		else
 #endif
 #ifdef INET
 			error = ip_ctloutput(so, sopt);
 #endif
 		return (error);
 	}
 	if (((sopt->sopt_name == TCP_FUNCTION_BLK) ||
 	     (sopt->sopt_name == TCP_FUNCTION_ALIAS))) {
 		struct tcp_function_set fsn;
 
 		if (sopt->sopt_name == TCP_FUNCTION_ALIAS) {
 			memset(&fsn, 0, sizeof(fsn));
 			find_tcp_function_alias(tp->t_fb, &fsn);
 		} else {
 			strncpy(fsn.function_set_name,
 			    tp->t_fb->tfb_tcp_block_name,
 			    TCP_FUNCTION_NAME_LEN_MAX);
 			fsn.function_set_name[TCP_FUNCTION_NAME_LEN_MAX - 1] = '\0';
 		}
 		fsn.pcbcnt = tp->t_fb->tfb_refcnt;
 		INP_WUNLOCK(inp);
 		error = sooptcopyout(sopt, &fsn, sizeof fsn);
 		return (error);
 	}
 
 	/* Pass in the INP locked, callee must unlock it. */
 	return (tp->t_fb->tfb_tcp_ctloutput(inp, sopt));
 }
 
 int
 tcp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct	inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("tcp_ctloutput: inp == NULL"));
 
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		return (ECONNRESET);
 	}
 	if (sopt->sopt_dir == SOPT_SET)
 		return (tcp_ctloutput_set(inp, sopt));
 	else if (sopt->sopt_dir == SOPT_GET)
 		return (tcp_ctloutput_get(inp, sopt));
 	else
 		panic("%s: sopt_dir $%d", __func__, sopt->sopt_dir);
 }
 
 /*
  * If this assert becomes untrue, we need to change the size of the buf
  * variable in tcp_default_ctloutput().
  */
 #ifdef CTASSERT
 CTASSERT(TCP_CA_NAME_MAX <= TCP_LOG_ID_LEN);
 CTASSERT(TCP_LOG_REASON_LEN <= TCP_LOG_ID_LEN);
 #endif
 
 #ifdef KERN_TLS
 static int
 copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
 {
 	struct tls_enable_v0 tls_v0;
 	int error;
 
 	if (sopt->sopt_valsize == sizeof(tls_v0)) {
 		error = sooptcopyin(sopt, &tls_v0, sizeof(tls_v0),
 		    sizeof(tls_v0));
 		if (error)
 			return (error);
 		memset(tls, 0, sizeof(*tls));
 		tls->cipher_key = tls_v0.cipher_key;
 		tls->iv = tls_v0.iv;
 		tls->auth_key = tls_v0.auth_key;
 		tls->cipher_algorithm = tls_v0.cipher_algorithm;
 		tls->cipher_key_len = tls_v0.cipher_key_len;
 		tls->iv_len = tls_v0.iv_len;
 		tls->auth_algorithm = tls_v0.auth_algorithm;
 		tls->auth_key_len = tls_v0.auth_key_len;
 		tls->flags = tls_v0.flags;
 		tls->tls_vmajor = tls_v0.tls_vmajor;
 		tls->tls_vminor = tls_v0.tls_vminor;
 		return (0);
 	}
 
 	return (sooptcopyin(sopt, tls, sizeof(*tls), sizeof(*tls)));
 }
 #endif
 
 extern struct cc_algo newreno_cc_algo;
 
 static int
 tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct cc_algo *algo;
 	void *ptr = NULL;
 	struct tcpcb *tp;
 	struct cc_var cc_mem;
 	char	buf[TCP_CA_NAME_MAX];
 	size_t mem_sz;
 	int error;
 
 	INP_WUNLOCK(inp);
 	error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
 	if (error)
 		return(error);
 	buf[sopt->sopt_valsize] = '\0';
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		if (strncmp(buf, algo->name,
 			    TCP_CA_NAME_MAX) == 0) {
 			if (algo->flags & CC_MODULE_BEING_REMOVED) {
 				/* We can't "see" modules being unloaded */
 				continue;
 			}
 			break;
 		}
 	}
 	if (algo == NULL) {
 		CC_LIST_RUNLOCK();
 		return(ESRCH);
 	}
 	/* 
 	 * With a reference the algorithm cannot be removed
 	 * so we hold a reference through the change process.
 	 */
 	cc_refer(algo);
 	CC_LIST_RUNLOCK();
 	if (algo->cb_init != NULL) {
 		/* We can now pre-get the memory for the CC */
 		mem_sz = (*algo->cc_data_sz)();
 		if (mem_sz == 0) {
 			goto no_mem_needed;
 		}
 		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
 	} else {
 no_mem_needed:
 		mem_sz = 0;
 		ptr = NULL;
 	}
 	/*
 	 * Make sure its all clean and zero and also get
 	 * back the inplock.
 	 */
 	memset(&cc_mem, 0, sizeof(cc_mem));
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
 		if (ptr)
 			free(ptr, M_CC_MEM);
 		/* Release our temp reference */
 		CC_LIST_RLOCK();
 		cc_release(algo);
 		CC_LIST_RUNLOCK();
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (ptr != NULL)
 		memset(ptr, 0, mem_sz);
 	cc_mem.ccvc.tcp = tp;
 	/*
 	 * We once again hold a write lock over the tcb so it's
 	 * safe to do these things without ordering concerns.
 	 * Note here we init into stack memory.
 	 */
 	if (algo->cb_init != NULL)
 		error = algo->cb_init(&cc_mem, ptr);
 	else
 		error = 0;
 	/*
 	 * The CC algorithms, when given their memory
 	 * should not fail we could in theory have a
 	 * KASSERT here.
 	 */
 	if (error == 0) {
 		/*
 		 * Touchdown, lets go ahead and move the
 		 * connection to the new CC module by
 		 * copying in the cc_mem after we call
 		 * the old ones cleanup (if any).
 		 */
 		if (CC_ALGO(tp)->cb_destroy != NULL)
 			CC_ALGO(tp)->cb_destroy(tp->ccv);
 		/* Detach the old CC from the tcpcb  */
 		cc_detach(tp);
 		/* Copy in our temp memory that was inited */
 		memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
 		/* Now attach the new, which takes a reference */
 		cc_attach(tp, algo);
 		/* Ok now are we where we have gotten past any conn_init? */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
 			/* Yep run the connection init for the new CC */
 			CC_ALGO(tp)->conn_init(tp->ccv);
 		}
 	} else if (ptr)
 		free(ptr, M_CC_MEM);
 	INP_WUNLOCK(inp);
 	/* Now lets release our temp reference */
 	CC_LIST_RLOCK();
 	cc_release(algo);
 	CC_LIST_RUNLOCK();
 	return (error);
 }
 
 int
 tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct tcpcb *tp = intotcpcb(inp);
 	int	error, opt, optval;
 	u_int	ui;
 	struct	tcp_info ti;
 #ifdef KERN_TLS
 	struct tls_enable tls;
 	struct socket *so = inp->inp_socket;
 #endif
 	char	*pbuf, buf[TCP_LOG_ID_LEN];
 #ifdef STATS
 	struct statsblob *sbp;
 #endif
 	size_t	len;
 
 	INP_WLOCK_ASSERT(inp);
 	KASSERT((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0,
 	    ("inp_flags == %x", inp->inp_flags));
 	KASSERT(inp->inp_socket != NULL, ("inp_socket == NULL"));
 
 	switch (sopt->sopt_level) {
 #ifdef INET6
 	case IPPROTO_IPV6:
 		MPASS(inp->inp_vflag & INP_IPV6PROTO);
 		switch (sopt->sopt_name) {
 		case IPV6_USE_MIN_MTU:
 			tcp6_use_min_mtu(tp);
 			/* FALLTHROUGH */
 		}
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 #ifdef INET
 	case IPPROTO_IP:
 		INP_WUNLOCK(inp);
 		return (0);
 #endif
 	}
 
 	/*
 	 * For TCP_CCALGOOPT forward the control to CC module, for both
 	 * SOPT_SET and SOPT_GET.
 	 */
 	switch (sopt->sopt_name) {
 	case TCP_CCALGOOPT:
 		INP_WUNLOCK(inp);
 		if (sopt->sopt_valsize > CC_ALGOOPT_LIMIT)
 			return (EINVAL);
 		pbuf = malloc(sopt->sopt_valsize, M_TEMP, M_WAITOK | M_ZERO);
 		error = sooptcopyin(sopt, pbuf, sopt->sopt_valsize,
 		    sopt->sopt_valsize);
 		if (error) {
 			free(pbuf, M_TEMP);
 			return (error);
 		}
 		INP_WLOCK_RECHECK_CLEANUP(inp, free(pbuf, M_TEMP));
 		if (CC_ALGO(tp)->ctl_output != NULL)
 			error = CC_ALGO(tp)->ctl_output(tp->ccv, sopt, pbuf);
 		else
 			error = ENOENT;
 		INP_WUNLOCK(inp);
 		if (error == 0 && sopt->sopt_dir == SOPT_GET)
 			error = sooptcopyout(sopt, pbuf, sopt->sopt_valsize);
 		free(pbuf, M_TEMP);
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		case TCP_MD5SIG:
 			INP_WUNLOCK(inp);
 			if (!TCPMD5_ENABLED())
 				return (ENOPROTOOPT);
 			error = TCPMD5_PCBCTL(inp, sopt);
 			if (error)
 				return (error);
 			INP_WLOCK_RECHECK(inp);
 			goto unlock_and_done;
 #endif /* IPSEC */
 
 		case TCP_NODELAY:
 		case TCP_NOOPT:
 		case TCP_LRD:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			switch (sopt->sopt_name) {
 			case TCP_NODELAY:
 				opt = TF_NODELAY;
 				break;
 			case TCP_NOOPT:
 				opt = TF_NOOPT;
 				break;
 			case TCP_LRD:
 				opt = TF_LRD;
 				break;
 			default:
 				opt = 0; /* dead code to fool gcc */
 				break;
 			}
 
 			if (optval)
 				tp->t_flags |= opt;
 			else
 				tp->t_flags &= ~opt;
 unlock_and_done:
 #ifdef TCP_OFFLOAD
 			if (tp->t_flags & TF_TOE) {
 				tcp_offload_ctloutput(tp, sopt->sopt_dir,
 				    sopt->sopt_name);
 			}
 #endif
 			INP_WUNLOCK(inp);
 			break;
 
 		case TCP_NOPUSH:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval)
 				tp->t_flags |= TF_NOPUSH;
 			else if (tp->t_flags & TF_NOPUSH) {
 				tp->t_flags &= ~TF_NOPUSH;
 				if (TCPS_HAVEESTABLISHED(tp->t_state)) {
 					struct epoch_tracker et;
 
 					NET_EPOCH_ENTER(et);
 					error = tcp_output_nodrop(tp);
 					NET_EPOCH_EXIT(et);
 				}
 			}
 			goto unlock_and_done;
 
 		case TCP_REMOTE_UDP_ENCAPS_PORT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 			if ((optval < TCP_TUNNELING_PORT_MIN) ||
 			    (optval > TCP_TUNNELING_PORT_MAX)) {
 				/* Its got to be in range */
 				return (EINVAL);
 			}
 			if ((V_tcp_udp_tunneling_port == 0) && (optval != 0)) {
 				/* You have to have enabled a UDP tunneling port first */
 				return (EINVAL);
 			}
 			INP_WLOCK_RECHECK(inp);
 			if (tp->t_state != TCPS_CLOSED) {
 				/* You can't change after you are connected */
 				error = EINVAL;
 			} else {
 				/* Ok we are all good set the port */
 				tp->t_port = htons(optval);
 			}
 			goto unlock_and_done;
 
 		case TCP_MAXSEG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval > 0 && optval <= tp->t_maxseg &&
 			    optval + 40 >= V_tcp_minmss)
 				tp->t_maxseg = optval;
 			else
 				error = EINVAL;
 			goto unlock_and_done;
 
 		case TCP_INFO:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 
 		case TCP_STATS:
 			INP_WUNLOCK(inp);
 #ifdef STATS
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			if (optval > 0)
 				sbp = stats_blob_alloc(
 				    V_tcp_perconn_stats_dflt_tpl, 0);
 			else
 				sbp = NULL;
 
 			INP_WLOCK_RECHECK(inp);
 			if ((tp->t_stats != NULL && sbp == NULL) ||
 			    (tp->t_stats == NULL && sbp != NULL)) {
 				struct statsblob *t = tp->t_stats;
 				tp->t_stats = sbp;
 				sbp = t;
 			}
 			INP_WUNLOCK(inp);
 
 			stats_blob_destroy(sbp);
 #else
 			return (EOPNOTSUPP);
 #endif /* !STATS */
 			break;
 
 		case TCP_CONGESTION:
 			error = tcp_set_cc_mod(inp, sopt);
 			break;
 
 		case TCP_REUSPORT_LB_NUMA:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 			    sizeof(optval));
 			INP_WLOCK_RECHECK(inp);
 			if (!error)
 				error = in_pcblbgroup_numa(inp, optval);
 			INP_WUNLOCK(inp);
 			break;
 
 #ifdef KERN_TLS
 		case TCP_TXTLS_ENABLE:
 			INP_WUNLOCK(inp);
 			error = copyin_tls_enable(sopt, &tls);
 			if (error)
 				break;
 			error = ktls_enable_tx(so, &tls);
 			break;
 		case TCP_TXTLS_MODE:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			error = ktls_set_tx_mode(so, ui);
 			INP_WUNLOCK(inp);
 			break;
 		case TCP_RXTLS_ENABLE:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &tls, sizeof(tls),
 			    sizeof(tls));
 			if (error)
 				break;
 			error = ktls_enable_rx(so, &tls);
 			break;
 #endif
 
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			if (ui > (UINT_MAX / hz)) {
 				error = EINVAL;
 				break;
 			}
 			ui *= hz;
 
 			INP_WLOCK_RECHECK(inp);
 			switch (sopt->sopt_name) {
 			case TCP_KEEPIDLE:
 				tp->t_keepidle = ui;
 				/*
 				 * XXX: better check current remaining
 				 * timeout and "merge" it with new value.
 				 */
 				if ((tp->t_state > TCPS_LISTEN) &&
 				    (tp->t_state <= TCPS_CLOSING))
 					tcp_timer_activate(tp, TT_KEEP,
 					    TP_KEEPIDLE(tp));
 				break;
 			case TCP_KEEPINTVL:
 				tp->t_keepintvl = ui;
 				if ((tp->t_state == TCPS_FIN_WAIT_2) &&
 				    (TP_MAXIDLE(tp) > 0))
 					tcp_timer_activate(tp, TT_2MSL,
 					    TP_MAXIDLE(tp));
 				break;
 			case TCP_KEEPINIT:
 				tp->t_keepinit = ui;
 				if (tp->t_state == TCPS_SYN_RECEIVED ||
 				    tp->t_state == TCPS_SYN_SENT)
 					tcp_timer_activate(tp, TT_KEEP,
 					    TP_KEEPINIT(tp));
 				break;
 			}
 			goto unlock_and_done;
 
 		case TCP_KEEPCNT:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &ui, sizeof(ui), sizeof(ui));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			tp->t_keepcnt = ui;
 			if ((tp->t_state == TCPS_FIN_WAIT_2) &&
 			    (TP_MAXIDLE(tp) > 0))
 				tcp_timer_activate(tp, TT_2MSL,
 				    TP_MAXIDLE(tp));
 			goto unlock_and_done;
 
 #ifdef TCPPCAP
 		case TCP_PCAP_OUT:
 		case TCP_PCAP_IN:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if (optval >= 0)
 				tcp_pcap_set_sock_max(TCP_PCAP_OUT ?
 					&(tp->t_outpkts) : &(tp->t_inpkts),
 					optval);
 			else
 				error = EINVAL;
 			goto unlock_and_done;
 #endif
 
 		case TCP_FASTOPEN: {
 			struct tcp_fastopen tfo_optval;
 
 			INP_WUNLOCK(inp);
 			if (!V_tcp_fastopen_client_enable &&
 			    !V_tcp_fastopen_server_enable)
 				return (EPERM);
 
 			error = sooptcopyin(sopt, &tfo_optval,
 				    sizeof(tfo_optval), sizeof(int));
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			if ((tp->t_state != TCPS_CLOSED) &&
 			    (tp->t_state != TCPS_LISTEN)) {
 				error = EINVAL;
 				goto unlock_and_done;
 			}
 			if (tfo_optval.enable) {
 				if (tp->t_state == TCPS_LISTEN) {
 					if (!V_tcp_fastopen_server_enable) {
 						error = EPERM;
 						goto unlock_and_done;
 					}
 
 					if (tp->t_tfo_pending == NULL)
 						tp->t_tfo_pending =
 						    tcp_fastopen_alloc_counter();
 				} else {
 					/*
 					 * If a pre-shared key was provided,
 					 * stash it in the client cookie
 					 * field of the tcpcb for use during
 					 * connect.
 					 */
 					if (sopt->sopt_valsize ==
 					    sizeof(tfo_optval)) {
 						memcpy(tp->t_tfo_cookie.client,
 						       tfo_optval.psk,
 						       TCP_FASTOPEN_PSK_LEN);
 						tp->t_tfo_client_cookie_len =
 						    TCP_FASTOPEN_PSK_LEN;
 					}
 				}
 				tp->t_flags |= TF_FASTOPEN;
 			} else
 				tp->t_flags &= ~TF_FASTOPEN;
 			goto unlock_and_done;
 		}
 
 #ifdef TCP_BLACKBOX
 		case TCP_LOG:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			INP_WLOCK_RECHECK(inp);
 			error = tcp_log_state_change(tp, optval);
 			goto unlock_and_done;
 
 		case TCP_LOGBUF:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 
 		case TCP_LOGID:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, buf, TCP_LOG_ID_LEN - 1, 0);
 			if (error)
 				break;
 			buf[sopt->sopt_valsize] = '\0';
 			INP_WLOCK_RECHECK(inp);
 			error = tcp_log_set_id(tp, buf);
 			/* tcp_log_set_id() unlocks the INP. */
 			break;
 
 		case TCP_LOGDUMP:
 		case TCP_LOGDUMPID:
 			INP_WUNLOCK(inp);
 			error =
 			    sooptcopyin(sopt, buf, TCP_LOG_REASON_LEN - 1, 0);
 			if (error)
 				break;
 			buf[sopt->sopt_valsize] = '\0';
 			INP_WLOCK_RECHECK(inp);
 			if (sopt->sopt_name == TCP_LOGDUMP) {
 				error = tcp_log_dump_tp_logbuf(tp, buf,
 				    M_WAITOK, true);
 				INP_WUNLOCK(inp);
 			} else {
 				tcp_log_dump_tp_bucket_logbufs(tp, buf);
 				/*
 				 * tcp_log_dump_tp_bucket_logbufs() drops the
 				 * INP lock.
 				 */
 			}
 			break;
 #endif
 
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		tp = intotcpcb(inp);
 		switch (sopt->sopt_name) {
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 		case TCP_MD5SIG:
 			INP_WUNLOCK(inp);
 			if (!TCPMD5_ENABLED())
 				return (ENOPROTOOPT);
 			error = TCPMD5_PCBCTL(inp, sopt);
 			break;
 #endif
 
 		case TCP_NODELAY:
 			optval = tp->t_flags & TF_NODELAY;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_MAXSEG:
 			optval = tp->t_maxseg;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_REMOTE_UDP_ENCAPS_PORT:
 			optval = ntohs(tp->t_port);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOOPT:
 			optval = tp->t_flags & TF_NOOPT;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_NOPUSH:
 			optval = tp->t_flags & TF_NOPUSH;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		case TCP_INFO:
 			tcp_fill_info(tp, &ti);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ti, sizeof ti);
 			break;
 		case TCP_STATS:
 			{
 #ifdef STATS
 			int nheld;
 			TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
 
 			error = 0;
 			socklen_t outsbsz = sopt->sopt_valsize;
 			if (tp->t_stats == NULL)
 				error = ENOENT;
 			else if (outsbsz >= tp->t_stats->cursz)
 				outsbsz = tp->t_stats->cursz;
 			else if (outsbsz >= sizeof(struct statsblob))
 				outsbsz = sizeof(struct statsblob);
 			else
 				error = EINVAL;
 			INP_WUNLOCK(inp);
 			if (error)
 				break;
 
 			sbp = sopt->sopt_val;
 			nheld = atop(round_page(((vm_offset_t)sbp) +
 			    (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
 			vm_page_t ma[nheld];
 			if (vm_fault_quick_hold_pages(
 			    &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
 			    outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
 			    nheld) < 0) {
 				error = EFAULT;
 				break;
 			}
 
 			if ((error = copyin_nofault(&(sbp->flags), &sbflags,
 			    SIZEOF_MEMBER(struct statsblob, flags))))
 				goto unhold;
 
 			INP_WLOCK_RECHECK(inp);
 			error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
 			    sbflags | SB_CLONE_USRDSTNOFAULT);
 			INP_WUNLOCK(inp);
 			sopt->sopt_valsize = outsbsz;
 unhold:
 			vm_page_unhold_pages(ma, nheld);
 #else
 			INP_WUNLOCK(inp);
 			error = EOPNOTSUPP;
 #endif /* !STATS */
 			break;
 			}
 		case TCP_CONGESTION:
 			len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, buf, len + 1);
 			break;
 		case TCP_KEEPIDLE:
 		case TCP_KEEPINTVL:
 		case TCP_KEEPINIT:
 		case TCP_KEEPCNT:
 			switch (sopt->sopt_name) {
 			case TCP_KEEPIDLE:
 				ui = TP_KEEPIDLE(tp) / hz;
 				break;
 			case TCP_KEEPINTVL:
 				ui = TP_KEEPINTVL(tp) / hz;
 				break;
 			case TCP_KEEPINIT:
 				ui = TP_KEEPINIT(tp) / hz;
 				break;
 			case TCP_KEEPCNT:
 				ui = TP_KEEPCNT(tp);
 				break;
 			}
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ui, sizeof(ui));
 			break;
 #ifdef TCPPCAP
 		case TCP_PCAP_OUT:
 		case TCP_PCAP_IN:
 			optval = tcp_pcap_get_sock_max(TCP_PCAP_OUT ?
 					&(tp->t_outpkts) : &(tp->t_inpkts));
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #endif
 		case TCP_FASTOPEN:
 			optval = tp->t_flags & TF_FASTOPEN;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 #ifdef TCP_BLACKBOX
 		case TCP_LOG:
 			optval = tp->t_logstate;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		case TCP_LOGBUF:
 			/* tcp_log_getlogbuf() does INP_WUNLOCK(inp) */
 			error = tcp_log_getlogbuf(sopt, tp);
 			break;
 		case TCP_LOGID:
 			len = tcp_log_get_id(tp, buf);
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, buf, len + 1);
 			break;
 		case TCP_LOGDUMP:
 		case TCP_LOGDUMPID:
 			INP_WUNLOCK(inp);
 			error = EINVAL;
 			break;
 #endif
 #ifdef KERN_TLS
 		case TCP_TXTLS_MODE:
 			error = ktls_get_tx_mode(so, &optval);
 			INP_WUNLOCK(inp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof(optval));
 			break;
 		case TCP_RXTLS_MODE:
 			error = ktls_get_rx_mode(so, &optval);
 			INP_WUNLOCK(inp);
 			if (error == 0)
 				error = sooptcopyout(sopt, &optval,
 				    sizeof(optval));
 			break;
 #endif
 		case TCP_LRD:
 			optval = tp->t_flags & TF_LRD;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 #undef INP_WLOCK_RECHECK
 #undef INP_WLOCK_RECHECK_CLEANUP
 
 /*
  * Initiate (or continue) disconnect.
  * If embryonic state, just send reset (once).
  * If in ``let data drain'' option and linger null, just drop.
  * Otherwise (hard), mark socket disconnecting and drop
  * current input data; switch states based on user close, and
  * send segment to peer (with FIN).
  */
 static void
 tcp_disconnect(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so = inp->inp_socket;
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(inp);
 
 	/*
 	 * Neither tcp_close() nor tcp_drop() should return NULL, as the
 	 * socket is still open.
 	 */
 	if (tp->t_state < TCPS_ESTABLISHED &&
 	    !(tp->t_state > TCPS_LISTEN && IS_FASTOPEN(tp->t_flags))) {
 		tp = tcp_close(tp);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_close() returned NULL"));
 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
 		tp = tcp_drop(tp, 0);
 		KASSERT(tp != NULL,
 		    ("tcp_disconnect: tcp_drop() returned NULL"));
 	} else {
 		soisdisconnecting(so);
 		sbflush(&so->so_rcv);
 		tcp_usrclosed(tp);
 		if (!(inp->inp_flags & INP_DROPPED))
 			/* Ignore stack's drop request, we already at it. */
 			(void)tcp_output_nodrop(tp);
 	}
 }
 
 /*
  * User issued close, and wish to trail through shutdown states:
  * if never received SYN, just forget it.  If got a SYN from peer,
  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
  * If already got a FIN from peer, then almost done; go to LAST_ACK
  * state.  In all other cases, have already sent FIN to peer (e.g.
  * after PRU_SHUTDOWN), and just have to play tedious game waiting
  * for peer to send FIN or not respond to keep-alives, etc.
  * We can let the user exit from the close as soon as the FIN is acked.
  */
 static void
 tcp_usrclosed(struct tcpcb *tp)
 {
 
 	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	switch (tp->t_state) {
 	case TCPS_LISTEN:
 #ifdef TCP_OFFLOAD
 		tcp_offload_listen_stop(tp);
 #endif
 		tcp_state_change(tp, TCPS_CLOSED);
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		tp = tcp_close(tp);
 		/*
 		 * tcp_close() should never return NULL here as the socket is
 		 * still open.
 		 */
 		KASSERT(tp != NULL,
 		    ("tcp_usrclosed: tcp_close() returned NULL"));
 		break;
 
 	case TCPS_SYN_SENT:
 	case TCPS_SYN_RECEIVED:
 		tp->t_flags |= TF_NEEDFIN;
 		break;
 
 	case TCPS_ESTABLISHED:
 		tcp_state_change(tp, TCPS_FIN_WAIT_1);
 		break;
 
 	case TCPS_CLOSE_WAIT:
 		tcp_state_change(tp, TCPS_LAST_ACK);
 		break;
 	}
 	if (tp->t_state >= TCPS_FIN_WAIT_2) {
 		soisdisconnected(tp->t_inpcb->inp_socket);
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
 		if (tp->t_state == TCPS_FIN_WAIT_2) {
 			int timeout;
 
 			timeout = (tcp_fast_finwait2_recycle) ?
 			    tcp_finwait2_timeout : TP_MAXIDLE(tp);
 			tcp_timer_activate(tp, TT_2MSL, timeout);
 		}
 	}
 }
 
 #ifdef DDB
 static void
 db_print_indent(int indent)
 {
 	int i;
 
 	for (i = 0; i < indent; i++)
 		db_printf(" ");
 }
 
 static void
 db_print_tstate(int t_state)
 {
 
 	switch (t_state) {
 	case TCPS_CLOSED:
 		db_printf("TCPS_CLOSED");
 		return;
 
 	case TCPS_LISTEN:
 		db_printf("TCPS_LISTEN");
 		return;
 
 	case TCPS_SYN_SENT:
 		db_printf("TCPS_SYN_SENT");
 		return;
 
 	case TCPS_SYN_RECEIVED:
 		db_printf("TCPS_SYN_RECEIVED");
 		return;
 
 	case TCPS_ESTABLISHED:
 		db_printf("TCPS_ESTABLISHED");
 		return;
 
 	case TCPS_CLOSE_WAIT:
 		db_printf("TCPS_CLOSE_WAIT");
 		return;
 
 	case TCPS_FIN_WAIT_1:
 		db_printf("TCPS_FIN_WAIT_1");
 		return;
 
 	case TCPS_CLOSING:
 		db_printf("TCPS_CLOSING");
 		return;
 
 	case TCPS_LAST_ACK:
 		db_printf("TCPS_LAST_ACK");
 		return;
 
 	case TCPS_FIN_WAIT_2:
 		db_printf("TCPS_FIN_WAIT_2");
 		return;
 
 	case TCPS_TIME_WAIT:
 		db_printf("TCPS_TIME_WAIT");
 		return;
 
 	default:
 		db_printf("unknown");
 		return;
 	}
 }
 
 static void
 db_print_tflags(u_int t_flags)
 {
 	int comma;
 
 	comma = 0;
 	if (t_flags & TF_ACKNOW) {
 		db_printf("%sTF_ACKNOW", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_DELACK) {
 		db_printf("%sTF_DELACK", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_NODELAY) {
 		db_printf("%sTF_NODELAY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_NOOPT) {
 		db_printf("%sTF_NOOPT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_SENTFIN) {
 		db_printf("%sTF_SENTFIN", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_REQ_SCALE) {
 		db_printf("%sTF_REQ_SCALE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_RCVD_SCALE) {
 		db_printf("%sTF_RECVD_SCALE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_REQ_TSTMP) {
 		db_printf("%sTF_REQ_TSTMP", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_RCVD_TSTMP) {
 		db_printf("%sTF_RCVD_TSTMP", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_SACK_PERMIT) {
 		db_printf("%sTF_SACK_PERMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_NEEDSYN) {
 		db_printf("%sTF_NEEDSYN", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_NEEDFIN) {
 		db_printf("%sTF_NEEDFIN", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_NOPUSH) {
 		db_printf("%sTF_NOPUSH", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_PREVVALID) {
 		db_printf("%sTF_PREVVALID", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_MORETOCOME) {
 		db_printf("%sTF_MORETOCOME", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_LQ_OVERFLOW) {
 		db_printf("%sTF_LQ_OVERFLOW", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_LASTIDLE) {
 		db_printf("%sTF_LASTIDLE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_RXWIN0SENT) {
 		db_printf("%sTF_RXWIN0SENT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_FASTRECOVERY) {
 		db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_CONGRECOVERY) {
 		db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_WASFRECOVERY) {
 		db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_WASCRECOVERY) {
 		db_printf("%sTF_WASCRECOVERY", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_SIGNATURE) {
 		db_printf("%sTF_SIGNATURE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_FORCEDATA) {
 		db_printf("%sTF_FORCEDATA", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_TSO) {
 		db_printf("%sTF_TSO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags & TF_FASTOPEN) {
 		db_printf("%sTF_FASTOPEN", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_tflags2(u_int t_flags2)
 {
 	int comma;
 
 	comma = 0;
 	if (t_flags2 & TF2_PLPMTU_BLACKHOLE) {
 		db_printf("%sTF2_PLPMTU_BLACKHOLE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_PLPMTU_PMTUD) {
 		db_printf("%sTF2_PLPMTU_PMTUD", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_PLPMTU_MAXSEGSNT) {
 		db_printf("%sTF2_PLPMTU_MAXSEGSNT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_LOG_AUTO) {
 		db_printf("%sTF2_LOG_AUTO", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_DROP_AF_DATA) {
 		db_printf("%sTF2_DROP_AF_DATA", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_ECN_PERMIT) {
 		db_printf("%sTF2_ECN_PERMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_ECN_SND_CWR) {
 		db_printf("%sTF2_ECN_SND_CWR", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_ECN_SND_ECE) {
 		db_printf("%sTF2_ECN_SND_ECE", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_ACE_PERMIT) {
 		db_printf("%sTF2_ACE_PERMIT", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_flags2 & TF2_FBYTES_COMPLETE) {
 		db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_toobflags(char t_oobflags)
 {
 	int comma;
 
 	comma = 0;
 	if (t_oobflags & TCPOOB_HAVEDATA) {
 		db_printf("%sTCPOOB_HAVEDATA", comma ? ", " : "");
 		comma = 1;
 	}
 	if (t_oobflags & TCPOOB_HADDATA) {
 		db_printf("%sTCPOOB_HADDATA", comma ? ", " : "");
 		comma = 1;
 	}
 }
 
 static void
 db_print_tcpcb(struct tcpcb *tp, const char *name, int indent)
 {
 
 	db_print_indent(indent);
 	db_printf("%s at %p\n", name, tp);
 
 	indent += 2;
 
 	db_print_indent(indent);
 	db_printf("t_segq first: %p   t_segqlen: %d   t_dupacks: %d\n",
 	   TAILQ_FIRST(&tp->t_segq), tp->t_segqlen, tp->t_dupacks);
 
 	db_print_indent(indent);
 	db_printf("tt_rexmt: %p   tt_persist: %p   tt_keep: %p\n",
 	    &tp->t_timers->tt_rexmt, &tp->t_timers->tt_persist, &tp->t_timers->tt_keep);
 
 	db_print_indent(indent);
 	db_printf("tt_2msl: %p   tt_delack: %p   t_inpcb: %p\n", &tp->t_timers->tt_2msl,
 	    &tp->t_timers->tt_delack, tp->t_inpcb);
 
 	db_print_indent(indent);
 	db_printf("t_state: %d (", tp->t_state);
 	db_print_tstate(tp->t_state);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("t_flags: 0x%x (", tp->t_flags);
 	db_print_tflags(tp->t_flags);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("t_flags2: 0x%x (", tp->t_flags2);
 	db_print_tflags2(tp->t_flags2);
 	db_printf(")\n");
 
 	db_print_indent(indent);
 	db_printf("snd_una: 0x%08x   snd_max: 0x%08x   snd_nxt: x0%08x\n",
 	    tp->snd_una, tp->snd_max, tp->snd_nxt);
 
 	db_print_indent(indent);
 	db_printf("snd_up: 0x%08x   snd_wl1: 0x%08x   snd_wl2: 0x%08x\n",
 	   tp->snd_up, tp->snd_wl1, tp->snd_wl2);
 
 	db_print_indent(indent);
 	db_printf("iss: 0x%08x   irs: 0x%08x   rcv_nxt: 0x%08x\n",
 	    tp->iss, tp->irs, tp->rcv_nxt);
 
 	db_print_indent(indent);
 	db_printf("rcv_adv: 0x%08x   rcv_wnd: %u   rcv_up: 0x%08x\n",
 	    tp->rcv_adv, tp->rcv_wnd, tp->rcv_up);
 
 	db_print_indent(indent);
 	db_printf("snd_wnd: %u   snd_cwnd: %u\n",
 	   tp->snd_wnd, tp->snd_cwnd);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh: %u   snd_recover: "
 	    "0x%08x\n", tp->snd_ssthresh, tp->snd_recover);
 
 	db_print_indent(indent);
 	db_printf("t_rcvtime: %u   t_startime: %u\n",
 	    tp->t_rcvtime, tp->t_starttime);
 
 	db_print_indent(indent);
 	db_printf("t_rttime: %u   t_rtsq: 0x%08x\n",
 	    tp->t_rtttime, tp->t_rtseq);
 
 	db_print_indent(indent);
 	db_printf("t_rxtcur: %d   t_maxseg: %u   t_srtt: %d\n",
 	    tp->t_rxtcur, tp->t_maxseg, tp->t_srtt);
 
 	db_print_indent(indent);
 	db_printf("t_rttvar: %d   t_rxtshift: %d   t_rttmin: %u   "
 	    "t_rttbest: %u\n", tp->t_rttvar, tp->t_rxtshift, tp->t_rttmin,
 	    tp->t_rttbest);
 
 	db_print_indent(indent);
 	db_printf("t_rttupdated: %lu   max_sndwnd: %u   t_softerror: %d\n",
 	    tp->t_rttupdated, tp->max_sndwnd, tp->t_softerror);
 
 	db_print_indent(indent);
 	db_printf("t_oobflags: 0x%x (", tp->t_oobflags);
 	db_print_toobflags(tp->t_oobflags);
 	db_printf(")   t_iobc: 0x%02x\n", tp->t_iobc);
 
 	db_print_indent(indent);
 	db_printf("snd_scale: %u   rcv_scale: %u   request_r_scale: %u\n",
 	    tp->snd_scale, tp->rcv_scale, tp->request_r_scale);
 
 	db_print_indent(indent);
 	db_printf("ts_recent: %u   ts_recent_age: %u\n",
 	    tp->ts_recent, tp->ts_recent_age);
 
 	db_print_indent(indent);
 	db_printf("ts_offset: %u   last_ack_sent: 0x%08x   snd_cwnd_prev: "
 	    "%u\n", tp->ts_offset, tp->last_ack_sent, tp->snd_cwnd_prev);
 
 	db_print_indent(indent);
 	db_printf("snd_ssthresh_prev: %u   snd_recover_prev: 0x%08x   "
 	    "t_badrxtwin: %u\n", tp->snd_ssthresh_prev,
 	    tp->snd_recover_prev, tp->t_badrxtwin);
 
 	db_print_indent(indent);
 	db_printf("snd_numholes: %d  snd_holes first: %p\n",
 	    tp->snd_numholes, TAILQ_FIRST(&tp->snd_holes));
 
 	db_print_indent(indent);
 	db_printf("snd_fack: 0x%08x   rcv_numsacks: %d\n",
 	    tp->snd_fack, tp->rcv_numsacks);
 
 	/* Skip sackblks, sackhint. */
 
 	db_print_indent(indent);
 	db_printf("t_rttlow: %d   rfbuf_ts: %u   rfbuf_cnt: %d\n",
 	    tp->t_rttlow, tp->rfbuf_ts, tp->rfbuf_cnt);
 }
 
 DB_SHOW_COMMAND(tcpcb, db_show_tcpcb)
 {
 	struct tcpcb *tp;
 
 	if (!have_addr) {
 		db_printf("usage: show tcpcb <addr>\n");
 		return;
 	}
 	tp = (struct tcpcb *)addr;
 
 	db_print_tcpcb(tp, "tcpcb", 0);
 }
 #endif
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 62d64f2dbdb2..d80bbef1c51d 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,1304 +1,1305 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993, 1994, 1995
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)tcp_var.h	8.4 (Berkeley) 5/24/95
  * $FreeBSD$
  */
 
 #ifndef _NETINET_TCP_VAR_H_
 #define _NETINET_TCP_VAR_H_
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_fsm.h>
 
 #ifdef _KERNEL
 #include "opt_kern_tls.h"
 #include <net/vnet.h>
 #include <sys/mbuf.h>
 #include <sys/ktls.h>
 #endif
 
 #define TCP_END_BYTE_INFO 8	/* Bytes that makeup the "end information array" */
 /* Types of ending byte info */
 #define TCP_EI_EMPTY_SLOT	0
 #define TCP_EI_STATUS_CLIENT_FIN	0x1
 #define TCP_EI_STATUS_CLIENT_RST	0x2
 #define TCP_EI_STATUS_SERVER_FIN	0x3
 #define TCP_EI_STATUS_SERVER_RST	0x4
 #define TCP_EI_STATUS_RETRAN		0x5
 #define TCP_EI_STATUS_PROGRESS		0x6
 #define TCP_EI_STATUS_PERSIST_MAX	0x7
 #define TCP_EI_STATUS_KEEP_MAX		0x8
 #define TCP_EI_STATUS_DATA_A_CLOSE	0x9
 #define TCP_EI_STATUS_RST_IN_FRONT	0xa
 #define TCP_EI_STATUS_2MSL		0xb
 #define TCP_EI_STATUS_MAX_VALUE		0xb
 
 /************************************************/
 /* Status bits we track to assure no duplicates,
  * the bits here are not used by the code but
  * for human representation. To check a bit we
  * take and shift over by 1 minus the value (1-8).
  */
 /************************************************/
 #define TCP_EI_BITS_CLIENT_FIN	0x001
 #define TCP_EI_BITS_CLIENT_RST	0x002
 #define TCP_EI_BITS_SERVER_FIN	0x004
 #define TCP_EI_BITS_SERVER_RST	0x008
 #define TCP_EI_BITS_RETRAN	0x010
 #define TCP_EI_BITS_PROGRESS	0x020
 #define TCP_EI_BITS_PRESIST_MAX	0x040
 #define TCP_EI_BITS_KEEP_MAX	0x080
 #define TCP_EI_BITS_DATA_A_CLO  0x100
 #define TCP_EI_BITS_RST_IN_FR	0x200	/* a front state reset */
 #define TCP_EI_BITS_2MS_TIMER	0x400	/* 2 MSL timer expired */
 
 #if defined(_KERNEL) || defined(_WANT_TCPCB)
 /* TCP segment queue entry */
 struct tseg_qent {
 	TAILQ_ENTRY(tseg_qent) tqe_q;
 	struct	mbuf   *tqe_m;		/* mbuf contains packet */
 	struct  mbuf   *tqe_last;	/* last mbuf in chain */
 	tcp_seq tqe_start;		/* TCP Sequence number start */
 	int	tqe_len;		/* TCP segment data length */
 	uint32_t tqe_flags;		/* The flags from tcp_get_flags() */
 	uint32_t tqe_mbuf_cnt;		/* Count of mbuf overhead */
 };
 TAILQ_HEAD(tsegqe_head, tseg_qent);
 
 struct sackblk {
 	tcp_seq start;		/* start seq no. of sack block */
 	tcp_seq end;		/* end seq no. */
 };
 
 struct sackhole {
 	tcp_seq start;		/* start seq no. of hole */
 	tcp_seq end;		/* end seq no. */
 	tcp_seq rxmit;		/* next seq. no in hole to be retransmitted */
 	TAILQ_ENTRY(sackhole) scblink;	/* scoreboard linkage */
 };
 
 struct sackhint {
 	struct sackhole	*nexthole;
 	int32_t		sack_bytes_rexmit;
 	tcp_seq		last_sack_ack;	/* Most recent/largest sacked ack */
 
 	int32_t		delivered_data; /* Newly acked data from last SACK */
 
 	int32_t		sacked_bytes;	/* Total sacked bytes reported by the
 					 * receiver via sack option
 					 */
 	uint32_t	recover_fs;	/* Flight Size at the start of Loss recovery */
 	uint32_t	prr_delivered;	/* Total bytes delivered using PRR */
 	uint32_t	prr_out;	/* Bytes sent during IN_RECOVERY */
 };
 
 #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
 
 STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
 
 /*
  * Tcp control block, one per tcp; fields:
  * Organized for 64 byte cacheline efficiency based
  * on common tcp_input/tcp_output processing.
  */
 struct tcpcb {
 	/* Cache line 1 */
 	struct	inpcb *t_inpcb;		/* back pointer to internet pcb */
 	struct tcp_function_block *t_fb;/* TCP function call block */
 	void	*t_fb_ptr;		/* Pointer to t_fb specific data */
 	uint32_t t_maxseg:24,		/* maximum segment size */
 		t_logstate:8;		/* State of "black box" logging */
 	uint32_t t_port:16,		/* Tunneling (over udp) port */
 		t_state:4,		/* state of this connection */
 		t_idle_reduce : 1,
 		t_delayed_ack: 7,	/* Delayed ack variable */
 		t_fin_is_rst: 1,	/* Are fin's treated as resets */
 		t_log_state_set: 1,
 		bits_spare : 2;
 	u_int	t_flags;
 	tcp_seq	snd_una;		/* sent but unacknowledged */
 	tcp_seq	snd_max;		/* highest sequence number sent;
 					 * used to recognize retransmits
 					 */
 	tcp_seq	snd_nxt;		/* send next */
 	tcp_seq	snd_up;			/* send urgent pointer */
 	uint32_t  snd_wnd;		/* send window */
 	uint32_t  snd_cwnd;		/* congestion-controlled window */
 	uint32_t t_peakrate_thr; 	/* pre-calculated peak rate threshold */
 	/* Cache line 2 */
 	u_int32_t  ts_offset;		/* our timestamp offset */
 	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
 	int	rcv_numsacks;		/* # distinct sack blks present */
 	u_int	t_tsomax;		/* TSO total burst length limit in bytes */
 	u_int	t_tsomaxsegcount;	/* TSO maximum segment count */
 	u_int	t_tsomaxsegsize;	/* TSO maximum segment size in bytes */
 	tcp_seq	rcv_nxt;		/* receive next */
 	tcp_seq	rcv_adv;		/* advertised window */
 	uint32_t  rcv_wnd;		/* receive window */
 	u_int	t_flags2;		/* More tcpcb flags storage */
 	int	t_srtt;			/* smoothed round-trip time */
 	int	t_rttvar;		/* variance in round-trip time */
 	u_int32_t  ts_recent;		/* timestamp echo data */
 	u_char	snd_scale;		/* window scaling for send window */
 	u_char	rcv_scale;		/* window scaling for recv window */
 	u_char	snd_limited;		/* segments limited transmitted */
 	u_char	request_r_scale;	/* pending window scaling */
 	tcp_seq	last_ack_sent;
 	u_int	t_rcvtime;		/* inactivity time */
 	/* Cache line 3 */
 	tcp_seq	rcv_up;			/* receive urgent pointer */
 	int	t_segqlen;		/* segment reassembly queue length */
 	uint32_t t_segqmbuflen;		/* Count of bytes mbufs on all entries */
 	struct	tsegqe_head t_segq;	/* segment reassembly queue */
 	struct mbuf      *t_in_pkt;
 	struct mbuf	 *t_tail_pkt;
 	struct tcp_timer *t_timers;	/* All the TCP timers in one struct */
 	struct	vnet *t_vnet;		/* back pointer to parent vnet */
 	uint32_t  snd_ssthresh;		/* snd_cwnd size threshold for
 					 * for slow start exponential to
 					 * linear switch
 					 */
 	tcp_seq	snd_wl1;		/* window update seg seq number */
 	/* Cache line 4 */
 	tcp_seq	snd_wl2;		/* window update seg ack number */
 
 	tcp_seq	irs;			/* initial receive sequence number */
 	tcp_seq	iss;			/* initial send sequence number */
 	u_int	t_acktime;		/* RACK and BBR incoming new data was acked */
 	u_int	t_sndtime;		/* time last data was sent */
 	u_int	ts_recent_age;		/* when last updated */
 	tcp_seq	snd_recover;		/* for use in NewReno Fast Recovery */
 	uint16_t cl4_spare;		/* Spare to adjust CL 4 */
 	char	t_oobflags;		/* have some */
 	char	t_iobc;			/* input character */
 	int	t_rxtcur;		/* current retransmit value (ticks) */
 
 	int	t_rxtshift;		/* log(2) of rexmt exp. backoff */
 	u_int	t_rtttime;		/* RTT measurement start time */
 
 	tcp_seq	t_rtseq;		/* sequence number being timed */
 	u_int	t_starttime;		/* time connection was established */
 	u_int	t_fbyte_in;		/* ticks time when first byte queued in */
 	u_int	t_fbyte_out;		/* ticks time when first byte queued out */
 
 	u_int	t_pmtud_saved_maxseg;	/* pre-blackhole MSS */
 	int	t_blackhole_enter;	/* when to enter blackhole detection */
 	int	t_blackhole_exit;	/* when to exit blackhole detection */
 	u_int	t_rttmin;		/* minimum rtt allowed */
 
 	u_int	t_rttbest;		/* best rtt we've seen */
 
 	int	t_softerror;		/* possible error not yet reported */
 	uint32_t  max_sndwnd;		/* largest window peer has offered */
 	/* Cache line 5 */
 	uint32_t  snd_cwnd_prev;	/* cwnd prior to retransmit */
 	uint32_t  snd_ssthresh_prev;	/* ssthresh prior to retransmit */
 	tcp_seq	snd_recover_prev;	/* snd_recover prior to retransmit */
 	int	t_sndzerowin;		/* zero-window updates sent */
 	u_long	t_rttupdated;		/* number of times rtt sampled */
 	int	snd_numholes;		/* number of holes seen by sender */
 	u_int	t_badrxtwin;		/* window for retransmit recovery */
 	TAILQ_HEAD(sackhole_head, sackhole) snd_holes;
 					/* SACK scoreboard (sorted) */
 	tcp_seq	snd_fack;		/* last seq number(+1) sack'd by rcv'r*/
 	struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
 	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 	struct toedev	*tod;		/* toedev handling this connection */
 	int	t_sndrexmitpack;	/* retransmit packets sent */
 	int	t_rcvoopack;		/* out-of-order packets received */
 	void	*t_toe;			/* TOE pcb pointer */
 	struct cc_algo	*cc_algo;	/* congestion control algorithm */
 	struct cc_var	*ccv;		/* congestion control specific vars */
 	struct osd	*osd;		/* storage for Khelp module data */
 	int	t_bytes_acked;		/* # bytes acked during current RTT */
 	u_int   t_maxunacktime;
 	u_int	t_keepinit;		/* time to establish connection */
 	u_int	t_keepidle;		/* time before keepalive probes begin */
 	u_int	t_keepintvl;		/* interval between keepalives */
 	u_int	t_keepcnt;		/* number of keepalives before close */
 	int	t_dupacks;		/* consecutive dup acks recd */
 	int	t_lognum;		/* Number of log entries */
 	int	t_loglimit;		/* Maximum number of log entries */
 	uint32_t r_cep;			/* Number of received CE marked packets */
 	uint32_t s_cep;			/* Synced number of delivered CE packets */
 	int64_t	t_pacing_rate;		/* bytes / sec, -1 => unlimited */
 	struct tcp_log_stailq t_logs;	/* Log buffer */
 	struct tcp_log_id_node *t_lin;
 	struct tcp_log_id_bucket *t_lib;
 	const char *t_output_caller;	/* Function that called tcp_output */
 	struct statsblob *t_stats;	/* Per-connection stats */
 	uint32_t t_logsn;		/* Log "serial number" */
 	uint32_t gput_ts;		/* Time goodput measurement started */
 	tcp_seq gput_seq;		/* Outbound measurement seq */
 	tcp_seq gput_ack;		/* Inbound measurement ack */
 	int32_t t_stats_gput_prev;	/* XXXLAS: Prev gput measurement */
 	uint32_t t_maxpeakrate;		/* max peak rate set by user, in bytes/s */
 	uint32_t t_sndtlppack;		/* tail loss probe packets sent */
 	uint64_t t_sndtlpbyte;		/* total tail loss probe bytes sent */
 	uint64_t t_sndbytes;		/* total bytes sent */
 	uint64_t t_snd_rxt_bytes;	/* total bytes retransmitted */
 	uint32_t t_dsack_bytes;		/* Total number of dsack bytes we have received */
 	uint32_t t_dsack_tlp_bytes;	/* Total number of dsack bytes we have received for TLPs sent */
 	uint32_t t_dsack_pack;		/* Total dsack packets we have recieved */
 
 	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
 	uint32_t t_end_info_status;	/* Status flag of end info */
 	unsigned int *t_tfo_pending;	/* TCP Fast Open server pending counter */
 	union {
 		uint8_t client[TCP_FASTOPEN_MAX_COOKIE_LEN];
 		uint64_t server;
 	} t_tfo_cookie;			/* TCP Fast Open cookie to send */
 	union {
 		uint8_t t_end_info_bytes[TCP_END_BYTE_INFO];
 		uint64_t t_end_info;
 	};
 #ifdef TCPPCAP
 	struct mbufq t_inpkts;		/* List of saved input packets. */
 	struct mbufq t_outpkts;		/* List of saved output packets. */
 #endif
 };
 #endif	/* _KERNEL || _WANT_TCPCB */
 
 #ifdef _KERNEL
 struct tcptemp {
 	u_char	tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
 	struct	tcphdr tt_t;
 };
 
 /* Enable TCP/UDP tunneling port */
 #define TCP_TUNNELING_PORT_MIN		0
 #define TCP_TUNNELING_PORT_MAX		65535
 #define TCP_TUNNELING_PORT_DEFAULT	0
 
 /* Enable TCP/UDP tunneling port */
 #define TCP_TUNNELING_OVERHEAD_MIN	sizeof(struct udphdr)
 #define TCP_TUNNELING_OVERHEAD_MAX	1024
 #define TCP_TUNNELING_OVERHEAD_DEFAULT	TCP_TUNNELING_OVERHEAD_MIN
 
 /* Minimum map entries limit value, if set */
 #define TCP_MIN_MAP_ENTRIES_LIMIT	128
 
 /*
  * TODO: We yet need to brave plowing in
  * to tcp_input() and the pru_usrreq() block.
  * Right now these go to the old standards which
  * are somewhat ok, but in the long term may
  * need to be changed. If we do tackle tcp_input()
  * then we need to get rid of the tcp_do_segment()
  * function below.
  */
 /* Flags for tcp functions */
 #define	TCP_FUNC_BEING_REMOVED	0x01   	/* Can no longer be referenced */
 #define	TCP_FUNC_OUTPUT_CANDROP	0x02   	/* tfb_tcp_output may ask tcp_drop */
 
 /*
  * If defining the optional tcp_timers, in the
  * tfb_tcp_timer_stop call you must use the
  * callout_async_drain() function with the
  * tcp_timer_discard callback. You should check
  * the return of callout_async_drain() and if 0
  * increment tt_draincnt. Since the timer sub-system
  * does not know your callbacks you must provide a
  * stop_all function that loops through and calls
  * tcp_timer_stop() with each of your defined timers.
  * Adding a tfb_tcp_handoff_ok function allows the socket
  * option to change stacks to query you even if the
  * connection is in a later stage. You return 0 to
  * say you can take over and run your stack, you return
  * non-zero (an error number) to say no you can't.
  * If the function is undefined you can only change
  * in the early states (before connect or listen).
  * tfb_tcp_fb_fini is changed to add a flag to tell
  * the old stack if the tcb is being destroyed or
  * not. A one in the flag means the TCB is being
  * destroyed, a zero indicates its transitioning to
  * another stack (via socket option).
  */
 struct tcp_function_block {
 	char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
 	int	(*tfb_tcp_output)(struct tcpcb *);
 	int	(*tfb_tcp_output_wtime)(struct tcpcb *, const struct timeval *);
 	void	(*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
 			    struct socket *, struct tcpcb *,
 		        int, int, uint8_t);
 	int     (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
 	int      (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
 			    struct socket *, struct tcpcb *,
 			    int, int, uint8_t,
 			    int, struct timeval *);
 	void	(*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
 			    struct socket *, struct tcpcb *,
 			    int, int, uint8_t,
 			    int, struct timeval *);
 	int     (*tfb_tcp_ctloutput)(struct inpcb *inp, struct sockopt *sopt);
 	/* Optional memory allocation/free routine */
 	int	(*tfb_tcp_fb_init)(struct tcpcb *);
 	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
 			    uint32_t, u_int);
 	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
 	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	void	(*tfb_tcp_mtu_chg)(struct tcpcb *);
 	int	(*tfb_pru_options)(struct tcpcb *, int);
 	void	(*tfb_hwtls_change)(struct tcpcb *, int);
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 	uint8_t	tfb_id;
 };
 
 struct tcp_function {
 	TAILQ_ENTRY(tcp_function)	tf_next;
 	char				tf_name[TCP_FUNCTION_NAME_LEN_MAX];
 	struct tcp_function_block	*tf_fb;
 };
 
 TAILQ_HEAD(tcp_funchead, tcp_function);
 
 struct tcpcb * tcp_drop(struct tcpcb *, int);
 
 #ifdef _NETINET_IN_PCB_H_
 /*
  * tcp_output()
  * Handles tcp_drop request from advanced stacks and reports that inpcb is
  * gone with negative return code.
  * Drop in replacement for the default stack.
  */
 static inline int
 tcp_output(struct tcpcb *tp)
 {
 	int rv;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	if (rv < 0) {
 		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 		    ("TCP stack %s requested tcp_drop(%p)",
 		    tp->t_fb->tfb_tcp_block_name, tp));
 		tp = tcp_drop(tp, -rv);
 		if (tp)
 			INP_WUNLOCK(tp->t_inpcb);
 	}
 
 	return (rv);
 }
 
 /*
  * tcp_output_unlock()
  * Always returns unlocked, handles drop request from advanced stacks.
  * Always returns positive error code.
  */
 static inline int
 tcp_output_unlock(struct tcpcb *tp)
 {
 	int rv;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	if (rv < 0) {
 		KASSERT(tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 		    ("TCP stack %s requested tcp_drop(%p)",
 		    tp->t_fb->tfb_tcp_block_name, tp));
 		rv = -rv;
 		tp = tcp_drop(tp, rv);
 		if (tp)
 			INP_WUNLOCK(tp->t_inpcb);
 	} else
 		INP_WUNLOCK(tp->t_inpcb);
 
 	return (rv);
 }
 
 /*
  * tcp_output_nodrop()
  * Always returns locked.  It is caller's responsibility to run tcp_drop()!
  * Useful in syscall implementations, when we want to perform some logging
  * and/or tracing with tcpcb before calling tcp_drop().  To be used with
  * tcp_unlock_or_drop() later.
  *
  * XXXGL: maybe don't allow stacks to return a drop request at certain
  * TCP states? Why would it do in connect(2)? In recv(2)?
  */
 static inline int
 tcp_output_nodrop(struct tcpcb *tp)
 {
 	int rv;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	rv = tp->t_fb->tfb_tcp_output(tp);
 	KASSERT(rv >= 0 || tp->t_fb->tfb_flags & TCP_FUNC_OUTPUT_CANDROP,
 	    ("TCP stack %s requested tcp_drop(%p)",
 	    tp->t_fb->tfb_tcp_block_name, tp));
 	return (rv);
 }
 
 /*
  * tcp_unlock_or_drop()
  * Handle return code from tfb_tcp_output() after we have logged/traced,
  * to be used with tcp_output_nodrop().
  */
 static inline int
 tcp_unlock_or_drop(struct tcpcb *tp, int tcp_output_retval)
 {
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
         if (tcp_output_retval < 0) {
                 tcp_output_retval = -tcp_output_retval;
                 if (tcp_drop(tp, tcp_output_retval) != NULL)
                         INP_WUNLOCK(tp->t_inpcb);
         } else
 		INP_WUNLOCK(tp->t_inpcb);
 
 	return (tcp_output_retval);
 }
 #endif	/* _NETINET_IN_PCB_H_ */
 #endif	/* _KERNEL */
 
 /*
  * Flags and utility macros for the t_flags field.
  */
 #define	TF_ACKNOW	0x00000001	/* ack peer immediately */
 #define	TF_DELACK	0x00000002	/* ack, but try to delay it */
 #define	TF_NODELAY	0x00000004	/* don't delay packets to coalesce */
 #define	TF_NOOPT	0x00000008	/* don't use tcp options */
 #define	TF_SENTFIN	0x00000010	/* have sent FIN */
 #define	TF_REQ_SCALE	0x00000020	/* have/will request window scaling */
 #define	TF_RCVD_SCALE	0x00000040	/* other side has requested scaling */
 #define	TF_REQ_TSTMP	0x00000080	/* have/will request timestamps */
 #define	TF_RCVD_TSTMP	0x00000100	/* a timestamp was received in SYN */
 #define	TF_SACK_PERMIT	0x00000200	/* other side said I could SACK */
 #define	TF_NEEDSYN	0x00000400	/* send SYN (implicit state) */
 #define	TF_NEEDFIN	0x00000800	/* send FIN (implicit state) */
 #define	TF_NOPUSH	0x00001000	/* don't push */
 #define	TF_PREVVALID	0x00002000	/* saved values for bad rxmit valid
 					 * Note: accessing and restoring from
 					 * these may only be done in the 1st
 					 * RTO recovery round (t_rxtshift == 1)
 					 */
 #define	TF_WAKESOR	0x00004000	/* wake up receive socket */
 #define	TF_GPUTINPROG	0x00008000	/* Goodput measurement in progress */
 #define	TF_MORETOCOME	0x00010000	/* More data to be appended to sock */
 #define	TF_LQ_OVERFLOW	0x00020000	/* listen queue overflow */
 #define	TF_LASTIDLE	0x00040000	/* connection was previously idle */
 #define	TF_RXWIN0SENT	0x00080000	/* sent a receiver win 0 in response */
 #define	TF_FASTRECOVERY	0x00100000	/* in NewReno Fast Recovery */
 #define	TF_WASFRECOVERY	0x00200000	/* was in NewReno Fast Recovery */
 #define	TF_SIGNATURE	0x00400000	/* require MD5 digests (RFC2385) */
 #define	TF_FORCEDATA	0x00800000	/* force out a byte */
 #define	TF_TSO		0x01000000	/* TSO enabled on this connection */
 #define	TF_TOE		0x02000000	/* this connection is offloaded */
 #define	TF_CLOSED	0x04000000	/* close(2) called on socket */
 #define	TF_UNUSED1	0x08000000	/* unused */
 #define	TF_LRD		0x10000000	/* Lost Retransmission Detection */
 #define	TF_CONGRECOVERY	0x20000000	/* congestion recovery mode */
 #define	TF_WASCRECOVERY	0x40000000	/* was in congestion recovery */
 #define	TF_FASTOPEN	0x80000000	/* TCP Fast Open indication */
 
 #define	IN_FASTRECOVERY(t_flags)	(t_flags & TF_FASTRECOVERY)
 #define	ENTER_FASTRECOVERY(t_flags)	t_flags |= TF_FASTRECOVERY
 #define	EXIT_FASTRECOVERY(t_flags)	t_flags &= ~TF_FASTRECOVERY
 
 #define	IN_CONGRECOVERY(t_flags)	(t_flags & TF_CONGRECOVERY)
 #define	ENTER_CONGRECOVERY(t_flags)	t_flags |= TF_CONGRECOVERY
 #define	EXIT_CONGRECOVERY(t_flags)	t_flags &= ~TF_CONGRECOVERY
 
 #define	IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
 #define	ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)
 #define	EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY)
 
 #if defined(_KERNEL) && !defined(TCP_RFC7413)
 #define	IS_FASTOPEN(t_flags)		(false)
 #else
 #define	IS_FASTOPEN(t_flags)		(t_flags & TF_FASTOPEN)
 #endif
 
 #define	BYTES_THIS_ACK(tp, th)	(th->th_ack - tp->snd_una)
 
 /*
  * Flags for the t_oobflags field.
  */
 #define	TCPOOB_HAVEDATA	0x01
 #define	TCPOOB_HADDATA	0x02
 
 /*
  * Flags for the extended TCP flags field, t_flags2
  */
 #define	TF2_PLPMTU_BLACKHOLE	0x00000001 /* Possible PLPMTUD Black Hole. */
 #define	TF2_PLPMTU_PMTUD	0x00000002 /* Allowed to attempt PLPMTUD. */
 #define	TF2_PLPMTU_MAXSEGSNT	0x00000004 /* Last seg sent was full seg. */
 #define	TF2_LOG_AUTO		0x00000008 /* Session is auto-logging. */
 #define	TF2_DROP_AF_DATA	0x00000010 /* Drop after all data ack'd */
 #define	TF2_ECN_PERMIT		0x00000020 /* connection ECN-ready */
 #define	TF2_ECN_SND_CWR		0x00000040 /* ECN CWR in queue */
 #define	TF2_ECN_SND_ECE		0x00000080 /* ECN ECE in queue */
 #define	TF2_ACE_PERMIT		0x00000100 /* Accurate ECN mode */
 #define TF2_FBYTES_COMPLETE	0x00000400 /* We have first bytes in and out */
 /*
  * Structure to hold TCP options that are only used during segment
  * processing (in tcp_input), but not held in the tcpcb.
  * It's basically used to reduce the number of parameters
  * to tcp_dooptions and tcp_addoptions.
  * The binary order of the to_flags is relevant for packing of the
  * options in tcp_addoptions.
  */
 struct tcpopt {
 	u_int32_t	to_flags;	/* which options are present */
 #define	TOF_MSS		0x0001		/* maximum segment size */
 #define	TOF_SCALE	0x0002		/* window scaling */
 #define	TOF_SACKPERM	0x0004		/* SACK permitted */
 #define	TOF_TS		0x0010		/* timestamp */
 #define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
 #define	TOF_SACK	0x0080		/* Peer sent SACK option */
 #define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
 #define	TOF_MAXOPT	0x0200
 	u_int32_t	to_tsval;	/* new timestamp */
 	u_int32_t	to_tsecr;	/* reflected timestamp */
 	u_char		*to_sacks;	/* pointer to the first SACK blocks */
 	u_char		*to_signature;	/* pointer to the TCP-MD5 signature */
 	u_int8_t	*to_tfo_cookie; /* pointer to the TFO cookie */
 	u_int16_t	to_mss;		/* maximum segment size */
 	u_int8_t	to_wscale;	/* window scaling */
 	u_int8_t	to_nsacks;	/* number of SACK blocks */
 	u_int8_t	to_tfo_len;	/* TFO cookie length */
 	u_int32_t	to_spare;	/* UTO */
 };
 
 /*
  * Flags for tcp_dooptions.
  */
 #define	TO_SYN		0x01		/* parse SYN-only options */
 
 struct hc_metrics_lite {	/* must stay in sync with hc_metrics */
 	uint32_t	rmx_mtu;	/* MTU for this path */
 	uint32_t	rmx_ssthresh;	/* outbound gateway buffer limit */
 	uint32_t	rmx_rtt;	/* estimated round trip time */
 	uint32_t	rmx_rttvar;	/* estimated rtt variance */
 	uint32_t	rmx_cwnd;	/* congestion window */
 	uint32_t	rmx_sendpipe;   /* outbound delay-bandwidth product */
 	uint32_t	rmx_recvpipe;   /* inbound delay-bandwidth product */
 };
 
 /*
  * Used by tcp_maxmtu() to communicate interface specific features
  * and limits at the time of connection setup.
  */
 struct tcp_ifcap {
 	int	ifcap;
 	u_int	tsomax;
 	u_int	tsomaxsegcount;
 	u_int	tsomaxsegsize;
 };
 
 #ifndef _NETINET_IN_PCB_H_
 struct in_conninfo;
 #endif /* _NETINET_IN_PCB_H_ */
 
 struct tcptw {
 	struct inpcb	*tw_inpcb;	/* XXX back pointer to internet pcb */
 	uint32_t  t_port:16,		/* UDP port number if TCPoUDP */
 		t_unused:16;
 	tcp_seq		snd_nxt;
 	tcp_seq		rcv_nxt;
 	u_short		last_win;	/* cached window value */
 	short		tw_so_options;	/* copy of so_options */
 	struct ucred	*tw_cred;	/* user credentials */
 	u_int32_t	t_recent;
 	u_int32_t	ts_offset;	/* our timestamp offset */
 	int		tw_time;
 	TAILQ_ENTRY(tcptw) tw_2msl;
 	u_int		tw_flags;	/* tcpcb t_flags */
 };
 
 #define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
 #define	intotw(ip)	((struct tcptw *)(ip)->inp_ppcb)
 #define	sototcpcb(so)	(intotcpcb(sotoinpcb(so)))
 
 /*
  * The smoothed round-trip time and estimated variance
  * are stored as fixed point numbers scaled by the values below.
  * For convenience, these scales are also used in smoothing the average
  * (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
  * With these scales, srtt has 3 bits to the right of the binary point,
  * and thus an "ALPHA" of 0.875.  rttvar has 2 bits to the right of the
  * binary point, and is smoothed with an ALPHA of 0.75.
  */
 #define	TCP_RTT_SCALE		32	/* multiplier for srtt; 3 bits frac. */
 #define	TCP_RTT_SHIFT		5	/* shift for srtt; 3 bits frac. */
 #define	TCP_RTTVAR_SCALE	16	/* multiplier for rttvar; 2 bits */
 #define	TCP_RTTVAR_SHIFT	4	/* shift for rttvar; 2 bits */
 #define	TCP_DELTA_SHIFT		2	/* see tcp_input.c */
 
 /*
  * The initial retransmission should happen at rtt + 4 * rttvar.
  * Because of the way we do the smoothing, srtt and rttvar
  * will each average +1/2 tick of bias.  When we compute
  * the retransmit timer, we want 1/2 tick of rounding and
  * 1 extra tick because of +-1/2 tick uncertainty in the
  * firing of the timer.  The bias will give us exactly the
  * 1.5 tick we need.  But, because the bias is
  * statistical, we have to test that we don't drop below
  * the minimum feasible timer (which is 2 ticks).
  * This version of the macro adapted from a paper by Lawrence
  * Brakmo and Larry Peterson which outlines a problem caused
  * by insufficient precision in the original implementation,
  * which results in inappropriately large RTO values for very
  * fast networks.
  */
 #define	TCP_REXMTVAL(tp) \
 	max((tp)->t_rttmin, (((tp)->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT))  \
 	  + (tp)->t_rttvar) >> TCP_DELTA_SHIFT)
 
 /*
  * TCP statistics.
  * Many of these should be kept per connection,
  * but that's inconvenient at the moment.
  */
 struct	tcpstat {
 	uint64_t tcps_connattempt;	/* connections initiated */
 	uint64_t tcps_accepts;		/* connections accepted */
 	uint64_t tcps_connects;		/* connections established */
 	uint64_t tcps_drops;		/* connections dropped */
 	uint64_t tcps_conndrops;	/* embryonic connections dropped */
 	uint64_t tcps_minmssdrops;	/* average minmss too low drops */
 	uint64_t tcps_closed;		/* conn. closed (includes drops) */
 	uint64_t tcps_segstimed;	/* segs where we tried to get rtt */
 	uint64_t tcps_rttupdated;	/* times we succeeded */
 	uint64_t tcps_delack;		/* delayed acks sent */
 	uint64_t tcps_timeoutdrop;	/* conn. dropped in rxmt timeout */
 	uint64_t tcps_rexmttimeo;	/* retransmit timeouts */
 	uint64_t tcps_persisttimeo;	/* persist timeouts */
 	uint64_t tcps_keeptimeo;	/* keepalive timeouts */
 	uint64_t tcps_keepprobe;	/* keepalive probes sent */
 	uint64_t tcps_keepdrops;	/* connections dropped in keepalive */
 
 	uint64_t tcps_sndtotal;		/* total packets sent */
 	uint64_t tcps_sndpack;		/* data packets sent */
 	uint64_t tcps_sndbyte;		/* data bytes sent */
 	uint64_t tcps_sndrexmitpack;	/* data packets retransmitted */
 	uint64_t tcps_sndrexmitbyte;	/* data bytes retransmitted */
 	uint64_t tcps_sndrexmitbad;	/* unnecessary packet retransmissions */
 	uint64_t tcps_sndacks;		/* ack-only packets sent */
 	uint64_t tcps_sndprobe;		/* window probes sent */
 	uint64_t tcps_sndurg;		/* packets sent with URG only */
 	uint64_t tcps_sndwinup;		/* window update-only packets sent */
 	uint64_t tcps_sndctrl;		/* control (SYN|FIN|RST) packets sent */
 
 	uint64_t tcps_rcvtotal;		/* total packets received */
 	uint64_t tcps_rcvpack;		/* packets received in sequence */
 	uint64_t tcps_rcvbyte;		/* bytes received in sequence */
 	uint64_t tcps_rcvbadsum;	/* packets received with ccksum errs */
 	uint64_t tcps_rcvbadoff;	/* packets received with bad offset */
 	uint64_t tcps_rcvreassfull;	/* packets dropped for no reass space */
 	uint64_t tcps_rcvshort;		/* packets received too short */
 	uint64_t tcps_rcvduppack;	/* duplicate-only packets received */
 	uint64_t tcps_rcvdupbyte;	/* duplicate-only bytes received */
 	uint64_t tcps_rcvpartduppack;	/* packets with some duplicate data */
 	uint64_t tcps_rcvpartdupbyte;	/* dup. bytes in part-dup. packets */
 	uint64_t tcps_rcvoopack;	/* out-of-order packets received */
 	uint64_t tcps_rcvoobyte;	/* out-of-order bytes received */
 	uint64_t tcps_rcvpackafterwin;	/* packets with data after window */
 	uint64_t tcps_rcvbyteafterwin;	/* bytes rcvd after window */
 	uint64_t tcps_rcvafterclose;	/* packets rcvd after "close" */
 	uint64_t tcps_rcvwinprobe;	/* rcvd window probe packets */
 	uint64_t tcps_rcvdupack;	/* rcvd duplicate acks */
 	uint64_t tcps_rcvacktoomuch;	/* rcvd acks for unsent data */
 	uint64_t tcps_rcvackpack;	/* rcvd ack packets */
 	uint64_t tcps_rcvackbyte;	/* bytes acked by rcvd acks */
 	uint64_t tcps_rcvwinupd;	/* rcvd window update packets */
 	uint64_t tcps_pawsdrop;		/* segments dropped due to PAWS */
 	uint64_t tcps_predack;		/* times hdr predict ok for acks */
 	uint64_t tcps_preddat;		/* times hdr predict ok for data pkts */
 	uint64_t tcps_pcbcachemiss;
 	uint64_t tcps_cachedrtt;	/* times cached RTT in route updated */
 	uint64_t tcps_cachedrttvar;	/* times cached rttvar updated */
 	uint64_t tcps_cachedssthresh;	/* times cached ssthresh updated */
 	uint64_t tcps_usedrtt;		/* times RTT initialized from route */
 	uint64_t tcps_usedrttvar;	/* times RTTVAR initialized from rt */
 	uint64_t tcps_usedssthresh;	/* times ssthresh initialized from rt*/
 	uint64_t tcps_persistdrop;	/* timeout in persist state */
 	uint64_t tcps_badsyn;		/* bogus SYN, e.g. premature ACK */
 	uint64_t tcps_mturesent;	/* resends due to MTU discovery */
 	uint64_t tcps_listendrop;	/* listen queue overflows */
 	uint64_t tcps_badrst;		/* ignored RSTs in the window */
 
 	uint64_t tcps_sc_added;		/* entry added to syncache */
 	uint64_t tcps_sc_retransmitted;	/* syncache entry was retransmitted */
 	uint64_t tcps_sc_dupsyn;	/* duplicate SYN packet */
 	uint64_t tcps_sc_dropped;	/* could not reply to packet */
 	uint64_t tcps_sc_completed;	/* successful extraction of entry */
 	uint64_t tcps_sc_bucketoverflow;/* syncache per-bucket limit hit */
 	uint64_t tcps_sc_cacheoverflow;	/* syncache cache limit hit */
 	uint64_t tcps_sc_reset;		/* RST removed entry from syncache */
 	uint64_t tcps_sc_stale;		/* timed out or listen socket gone */
 	uint64_t tcps_sc_aborted;	/* syncache entry aborted */
 	uint64_t tcps_sc_badack;	/* removed due to bad ACK */
 	uint64_t tcps_sc_unreach;	/* ICMP unreachable received */
 	uint64_t tcps_sc_zonefail;	/* zalloc() failed */
 	uint64_t tcps_sc_sendcookie;	/* SYN cookie sent */
 	uint64_t tcps_sc_recvcookie;	/* SYN cookie received */
 
 	uint64_t tcps_hc_added;		/* entry added to hostcache */
 	uint64_t tcps_hc_bucketoverflow;/* hostcache per bucket limit hit */
 
 	uint64_t tcps_finwait2_drops;    /* Drop FIN_WAIT_2 connection after time limit */
 
 	/* SACK related stats */
 	uint64_t tcps_sack_recovery_episode; /* SACK recovery episodes */
 	uint64_t tcps_sack_rexmits;	    /* SACK rexmit segments   */
 	uint64_t tcps_sack_rexmit_bytes;    /* SACK rexmit bytes      */
 	uint64_t tcps_sack_rcv_blocks;	    /* SACK blocks (options) received */
 	uint64_t tcps_sack_send_blocks;	    /* SACK blocks (options) sent     */
 	uint64_t tcps_sack_lostrexmt;	    /* SACK lost retransmission recovered */
 	uint64_t tcps_sack_sboverflow;	    /* times scoreboard overflowed */
 
 	/* ECN related stats */
 	uint64_t tcps_ecn_ce;		/* ECN Congestion Experienced */
 	uint64_t tcps_ecn_ect0;		/* ECN Capable Transport */
 	uint64_t tcps_ecn_ect1;		/* ECN Capable Transport */
 	uint64_t tcps_ecn_shs;		/* ECN successful handshakes */
 	uint64_t tcps_ecn_rcwnd;	/* # times ECN reduced the cwnd */
 
 	/* TCP_SIGNATURE related stats */
 	uint64_t tcps_sig_rcvgoodsig;	/* Total matching signature received */
 	uint64_t tcps_sig_rcvbadsig;	/* Total bad signature received */
 	uint64_t tcps_sig_err_buildsig;	/* Failed to make signature */
 	uint64_t tcps_sig_err_sigopt;	/* No signature expected by socket */
 	uint64_t tcps_sig_err_nosigopt;	/* No signature provided by segment */
 
 	/* Path MTU Discovery Black Hole Detection related stats */
 	uint64_t tcps_pmtud_blackhole_activated;	 /* Black Hole Count */
 	uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */
 	uint64_t tcps_pmtud_blackhole_failed;		 /* Black Hole Failure Count */
 
 	uint64_t tcps_tunneled_pkts;	/* Packets encap's in UDP received */
 	uint64_t tcps_tunneled_errs;	/* Packets that had errors that were UDP encaped */
 
 	/* Dsack related stats */
 	uint64_t tcps_dsack_count;	/* Number of ACKs arriving with DSACKs */
 	uint64_t tcps_dsack_bytes;	/* Number of bytes DSACK'ed no TLP */
 	uint64_t tcps_dsack_tlp_bytes;	/* Number of bytes DSACK'ed due to TLPs */
 
 	/* TCPS_TIME_WAIT usage stats */
 	uint64_t tcps_tw_recycles;	/* Times time-wait was recycled. */
 	uint64_t tcps_tw_resets;	/* Times time-wait sent a reset. */
 	uint64_t tcps_tw_responds;	/* Times time-wait sent a valid ack. */
 
 	/* Accurate ECN Handshake stats */
 	uint64_t tcps_ace_nect;		/* ACE SYN packet with Non-ECT */
 	uint64_t tcps_ace_ect1;		/* ACE SYN packet with ECT1 */
 	uint64_t tcps_ace_ect0;		/* ACE SYN packet with ECT0 */
 	uint64_t tcps_ace_ce;		/* ACE SYN packet with CE */
 
 	uint64_t _pad[6];		/* 6 TBD placeholder for STABLE */
 };
 
 #define	tcps_rcvmemdrop	tcps_rcvreassfull	/* compat */
 
 #ifdef _KERNEL
 #define	TI_UNLOCKED	1
 #define	TI_RLOCKED	2
 #include <sys/counter.h>
 
 VNET_PCPUSTAT_DECLARE(struct tcpstat, tcpstat);	/* tcp statistics */
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	TCPSTAT_ADD(name, val)	\
     VNET_PCPUSTAT_ADD(struct tcpstat, tcpstat, name, (val))
 #define	TCPSTAT_INC(name)	TCPSTAT_ADD(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_tcpstat_add(int statnum, int val);
 #define	KMOD_TCPSTAT_ADD(name, val)					\
     kmod_tcpstat_add(offsetof(struct tcpstat, name) / sizeof(uint64_t), val)
 #define	KMOD_TCPSTAT_INC(name)	KMOD_TCPSTAT_ADD(name, 1)
 
 /*
  * Running TCP connection count by state.
  */
 VNET_DECLARE(counter_u64_t, tcps_states[TCP_NSTATES]);
 #define	V_tcps_states	VNET(tcps_states)
 #define	TCPSTATES_INC(state)	counter_u64_add(V_tcps_states[state], 1)
 #define	TCPSTATES_DEC(state)	counter_u64_add(V_tcps_states[state], -1)
 
 /*
  * TCP specific helper hook point identifiers.
  */
 #define	HHOOK_TCP_EST_IN		0
 #define	HHOOK_TCP_EST_OUT		1
 #define	HHOOK_TCP_LAST			HHOOK_TCP_EST_OUT
 
 struct tcp_hhook_data {
 	struct tcpcb	*tp;
 	struct tcphdr	*th;
 	struct tcpopt	*to;
 	uint32_t	len;
 	int		tso;
 	tcp_seq		curack;
 };
 #ifdef TCP_HHOOK
 void hhook_run_tcp_est_out(struct tcpcb *tp,
 	struct tcphdr *th, struct tcpopt *to,
 	uint32_t len, int tso);
 #endif
 #endif
 
 /*
  * TCB structure exported to user-land via sysctl(3).
  *
  * Fields prefixed with "xt_" are unique to the export structure, and fields
  * with "t_" or other prefixes match corresponding fields of 'struct tcpcb'.
  *
  * Legend:
  * (s) - used by userland utilities in src
  * (p) - used by utilities in ports
  * (3) - is known to be used by third party software not in ports
  * (n) - no known usage
  *
  * Evil hack: declare only if in_pcb.h and sys/socketvar.h have been
  * included.  Not all of our clients do.
  */
 #if defined(_NETINET_IN_PCB_H_) && defined(_SYS_SOCKETVAR_H_)
 struct xtcpcb {
 	ksize_t	xt_len;		/* length of this structure */
 	struct xinpcb	xt_inp;
 	char		xt_stack[TCP_FUNCTION_NAME_LEN_MAX];	/* (s) */
 	char		xt_logid[TCP_LOG_ID_LEN];	/* (s) */
 	char		xt_cc[TCP_CA_NAME_MAX];	/* (s) */
 	int64_t		spare64[6];
 	int32_t		t_state;		/* (s,p) */
 	uint32_t	t_flags;		/* (s,p) */
 	int32_t		t_sndzerowin;		/* (s) */
 	int32_t		t_sndrexmitpack;	/* (s) */
 	int32_t		t_rcvoopack;		/* (s) */
 	int32_t		t_rcvtime;		/* (s) */
 	int32_t		tt_rexmt;		/* (s) */
 	int32_t		tt_persist;		/* (s) */
 	int32_t		tt_keep;		/* (s) */
 	int32_t		tt_2msl;		/* (s) */
 	int32_t		tt_delack;		/* (s) */
 	int32_t		t_logstate;		/* (3) */
 	uint32_t	t_snd_cwnd;		/* (s) */
 	uint32_t	t_snd_ssthresh;		/* (s) */
 	uint32_t	t_maxseg;		/* (s) */
 	uint32_t	t_rcv_wnd;		/* (s) */
 	uint32_t	t_snd_wnd;		/* (s) */
 	uint32_t	xt_ecn;			/* (s) */
 	uint32_t	t_dsack_bytes;		/* (n) */
 	uint32_t	t_dsack_tlp_bytes;	/* (n) */
 	uint32_t	t_dsack_pack;		/* (n) */
 	uint16_t	xt_encaps_port;		/* (s) */
 	int16_t		spare16;
 	int32_t		spare32[22];
 } __aligned(8);
 
 #ifdef _KERNEL
 void	tcp_inptoxtp(const struct inpcb *, struct xtcpcb *);
 #endif
 #endif
 
 /*
  * TCP function information (name-to-id mapping, aliases, and refcnt)
  * exported to user-land via sysctl(3).
  */
 struct tcp_function_info {
 	uint32_t	tfi_refcnt;
 	uint8_t		tfi_id;
 	char		tfi_name[TCP_FUNCTION_NAME_LEN_MAX];
 	char		tfi_alias[TCP_FUNCTION_NAME_LEN_MAX];
 };
 
 /*
  * Identifiers for TCP sysctl nodes
  */
 #define	TCPCTL_DO_RFC1323	1	/* use RFC-1323 extensions */
 #define	TCPCTL_MSSDFLT		3	/* MSS default */
 #define TCPCTL_STATS		4	/* statistics */
 #define	TCPCTL_RTTDFLT		5	/* default RTT estimate */
 #define	TCPCTL_KEEPIDLE		6	/* keepalive idle timer */
 #define	TCPCTL_KEEPINTVL	7	/* interval to send keepalives */
 #define	TCPCTL_SENDSPACE	8	/* send buffer space */
 #define	TCPCTL_RECVSPACE	9	/* receive buffer space */
 #define	TCPCTL_KEEPINIT		10	/* timeout for establishing syn */
 #define	TCPCTL_PCBLIST		11	/* list of all outstanding PCBs */
 #define	TCPCTL_DELACKTIME	12	/* time before sending delayed ACK */
 #define	TCPCTL_V6MSSDFLT	13	/* MSS default for IPv6 */
 #define	TCPCTL_SACK		14	/* Selective Acknowledgement,rfc 2018 */
 #define	TCPCTL_DROP		15	/* drop tcp connection */
 #define	TCPCTL_STATES		16	/* connection counts by TCP state */
 
 #ifdef _KERNEL
 #ifdef SYSCTL_DECL
 SYSCTL_DECL(_net_inet_tcp);
 SYSCTL_DECL(_net_inet_tcp_sack);
 MALLOC_DECLARE(M_TCPLOG);
 #endif
 
 VNET_DECLARE(int, tcp_log_in_vain);
 #define	V_tcp_log_in_vain		VNET(tcp_log_in_vain)
 
 /*
  * Global TCP tunables shared between different stacks.
  * Please keep the list sorted.
  */
 VNET_DECLARE(int, drop_synfin);
 VNET_DECLARE(int, path_mtu_discovery);
 VNET_DECLARE(int, tcp_abc_l_var);
 VNET_DECLARE(int, tcp_autorcvbuf_max);
 VNET_DECLARE(int, tcp_autosndbuf_inc);
 VNET_DECLARE(int, tcp_autosndbuf_max);
 VNET_DECLARE(int, tcp_delack_enabled);
 VNET_DECLARE(int, tcp_do_autorcvbuf);
 VNET_DECLARE(int, tcp_do_autosndbuf);
 VNET_DECLARE(int, tcp_do_ecn);
 VNET_DECLARE(int, tcp_do_lrd);
 VNET_DECLARE(int, tcp_do_prr);
 VNET_DECLARE(int, tcp_do_prr_conservative);
 VNET_DECLARE(int, tcp_do_newcwv);
 VNET_DECLARE(int, tcp_do_rfc1323);
 VNET_DECLARE(int, tcp_tolerate_missing_ts);
 VNET_DECLARE(int, tcp_do_rfc3042);
 VNET_DECLARE(int, tcp_do_rfc3390);
 VNET_DECLARE(int, tcp_do_rfc3465);
 VNET_DECLARE(int, tcp_do_newsack);
 VNET_DECLARE(int, tcp_do_sack);
 VNET_DECLARE(int, tcp_do_tso);
 VNET_DECLARE(int, tcp_ecn_maxretries);
 VNET_DECLARE(int, tcp_initcwnd_segments);
 VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
 VNET_DECLARE(uint32_t, tcp_map_entries_limit);
 VNET_DECLARE(uint32_t, tcp_map_split_limit);
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_mssdflt);
 #ifdef STATS
 VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
 VNET_DECLARE(int, tcp_perconn_stats_enable);
 #endif /* STATS */
 VNET_DECLARE(int, tcp_recvspace);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
 VNET_DECLARE(int, tcp_sack_maxholes);
 VNET_DECLARE(int, tcp_sc_rst_sock_fail);
 VNET_DECLARE(int, tcp_sendspace);
 VNET_DECLARE(int, tcp_udp_tunneling_overhead);
 VNET_DECLARE(int, tcp_udp_tunneling_port);
 VNET_DECLARE(struct inpcbinfo, tcbinfo);
 
 #define	V_tcp_do_lrd			VNET(tcp_do_lrd)
 #define	V_tcp_do_prr			VNET(tcp_do_prr)
 #define	V_tcp_do_prr_conservative	VNET(tcp_do_prr_conservative)
 #define	V_tcp_do_newcwv			VNET(tcp_do_newcwv)
 #define	V_drop_synfin			VNET(drop_synfin)
 #define	V_path_mtu_discovery		VNET(path_mtu_discovery)
 #define	V_tcbinfo			VNET(tcbinfo)
 #define	V_tcp_abc_l_var			VNET(tcp_abc_l_var)
 #define	V_tcp_autorcvbuf_max		VNET(tcp_autorcvbuf_max)
 #define	V_tcp_autosndbuf_inc		VNET(tcp_autosndbuf_inc)
 #define	V_tcp_autosndbuf_max		VNET(tcp_autosndbuf_max)
 #define	V_tcp_delack_enabled		VNET(tcp_delack_enabled)
 #define	V_tcp_do_autorcvbuf		VNET(tcp_do_autorcvbuf)
 #define	V_tcp_do_autosndbuf		VNET(tcp_do_autosndbuf)
 #define	V_tcp_do_ecn			VNET(tcp_do_ecn)
 #define	V_tcp_do_rfc1323		VNET(tcp_do_rfc1323)
 #define	V_tcp_tolerate_missing_ts	VNET(tcp_tolerate_missing_ts)
 #define V_tcp_ts_offset_per_conn	VNET(tcp_ts_offset_per_conn)
 #define	V_tcp_do_rfc3042		VNET(tcp_do_rfc3042)
 #define	V_tcp_do_rfc3390		VNET(tcp_do_rfc3390)
 #define	V_tcp_do_rfc3465		VNET(tcp_do_rfc3465)
 #define	V_tcp_do_newsack		VNET(tcp_do_newsack)
 #define	V_tcp_do_sack			VNET(tcp_do_sack)
 #define	V_tcp_do_tso			VNET(tcp_do_tso)
 #define	V_tcp_ecn_maxretries		VNET(tcp_ecn_maxretries)
 #define	V_tcp_initcwnd_segments		VNET(tcp_initcwnd_segments)
 #define	V_tcp_insecure_rst		VNET(tcp_insecure_rst)
 #define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)
 #define	V_tcp_map_entries_limit		VNET(tcp_map_entries_limit)
 #define	V_tcp_map_split_limit		VNET(tcp_map_split_limit)
 #define	V_tcp_minmss			VNET(tcp_minmss)
 #define	V_tcp_mssdflt			VNET(tcp_mssdflt)
 #ifdef STATS
 #define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
 #define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
 #endif /* STATS */
 #define	V_tcp_recvspace			VNET(tcp_recvspace)
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
 #define	V_tcp_sack_maxholes		VNET(tcp_sack_maxholes)
 #define	V_tcp_sc_rst_sock_fail		VNET(tcp_sc_rst_sock_fail)
 #define	V_tcp_sendspace			VNET(tcp_sendspace)
 #define	V_tcp_udp_tunneling_overhead	VNET(tcp_udp_tunneling_overhead)
 #define	V_tcp_udp_tunneling_port	VNET(tcp_udp_tunneling_port)
 
 #ifdef TCP_HHOOK
 VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
 #define	V_tcp_hhh		VNET(tcp_hhh)
 #endif
 
 int	 tcp_addoptions(struct tcpopt *, u_char *);
 struct tcpcb *
 	 tcp_close(struct tcpcb *);
 void	 tcp_discardcb(struct tcpcb *);
 bool	 tcp_freecb(struct tcpcb *);
 void	 tcp_twstart(struct tcpcb *);
 void	 tcp_twclose(struct tcptw *, int);
 void	 tcp_ctlinput(int, struct sockaddr *, void *);
 int	 tcp_ctloutput(struct socket *, struct sockopt *);
 void 	 tcp_ctlinput_viaudp(int, struct sockaddr *, void *, void *);
 void	 tcp_fini(void *);
 char	*tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *,
 	    const void *);
 char	*tcp_log_vain(struct in_conninfo *, struct tcphdr *, const void *,
 	    const void *);
 int	 tcp_reass(struct tcpcb *, struct tcphdr *, tcp_seq *, int *,
 	    struct mbuf *);
 void	 tcp_reass_global_init(void);
 void	 tcp_reass_flush(struct tcpcb *);
 void	 tcp_dooptions(struct tcpopt *, u_char *, int, int);
 void	tcp_dropwithreset(struct mbuf *, struct tcphdr *,
 		     struct tcpcb *, int, int);
 void	tcp_pulloutofband(struct socket *,
 		     struct tcphdr *, struct mbuf *, int);
 void	tcp_xmit_timer(struct tcpcb *, int);
 void	tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
 void	cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
 			    uint16_t nsegs, uint16_t type);
 void 	cc_conn_init(struct tcpcb *tp);
 void 	cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
 void    cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos);
 void	cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos);
 void	cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
 #ifdef TCP_HHOOK
 void	hhook_run_tcp_est_in(struct tcpcb *tp,
 			    struct tcphdr *th, struct tcpopt *to);
 #endif
 
 int	 tcp_input(struct mbuf **, int *, int);
 int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
 	    struct tcpcb *, int);
 int	 tcp_input_with_port(struct mbuf **, int *, int, uint16_t);
 void	 tcp_handle_wakeup(struct tcpcb *, struct socket *);
 void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
 			struct socket *, struct tcpcb *, int, int, uint8_t);
 
 int register_tcp_functions(struct tcp_function_block *blk, int wait);
 int register_tcp_functions_as_names(struct tcp_function_block *blk,
     int wait, const char *names[], int *num_names);
 int register_tcp_functions_as_name(struct tcp_function_block *blk,
     const char *name, int wait);
 int deregister_tcp_functions(struct tcp_function_block *blk, bool quiesce,
     bool force);
 struct tcp_function_block *find_and_ref_tcp_functions(struct tcp_function_set *fs);
 int find_tcp_function_alias(struct tcp_function_block *blk, struct tcp_function_set *fs);
 void tcp_switch_back_to_default(struct tcpcb *tp);
 struct tcp_function_block *
 find_and_ref_tcp_fb(struct tcp_function_block *fs);
 int tcp_default_ctloutput(struct inpcb *inp, struct sockopt *sopt);
 int tcp_ctloutput_set(struct inpcb *inp, struct sockopt *sopt);
 
 extern counter_u64_t tcp_inp_lro_direct_queue;
 extern counter_u64_t tcp_inp_lro_wokeup_queue;
 extern counter_u64_t tcp_inp_lro_compressed;
 extern counter_u64_t tcp_inp_lro_locks_taken;
 extern counter_u64_t tcp_extra_mbuf;
 extern counter_u64_t tcp_would_have_but;
 extern counter_u64_t tcp_comp_total;
 extern counter_u64_t tcp_uncomp_total;
 extern counter_u64_t tcp_bad_csums;
 
 #ifdef NETFLIX_EXP_DETECTION
 /* Various SACK attack thresholds */
 extern int32_t tcp_force_detection;
 extern int32_t tcp_sack_to_ack_thresh;
 extern int32_t tcp_sack_to_move_thresh;
 extern int32_t tcp_restoral_thresh;
 extern int32_t tcp_sad_decay_val;
 extern int32_t tcp_sad_pacing_interval;
 extern int32_t tcp_sad_low_pps;
 extern int32_t tcp_map_minimum;
 extern int32_t tcp_attack_on_turns_on_logging;
 #endif
 extern uint32_t tcp_ack_war_time_window;
 extern uint32_t tcp_ack_war_cnt;
 
 uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *);
 uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *);
 void	 tcp6_use_min_mtu(struct tcpcb *);
 u_int	 tcp_maxseg(const struct tcpcb *);
 u_int	 tcp_fixed_maxseg(const struct tcpcb *);
 void	 tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *,
 	    struct tcp_ifcap *);
 void	 tcp_mss(struct tcpcb *, int);
 int	 tcp_mssopt(struct in_conninfo *);
 struct inpcb *
 	 tcp_drop_syn_sent(struct inpcb *, int);
 struct tcpcb *
 	 tcp_newtcpcb(struct inpcb *);
 int	 tcp_default_output(struct tcpcb *);
 void	 tcp_state_change(struct tcpcb *, int);
 void	 tcp_respond(struct tcpcb *, void *,
 	    struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
 void	 tcp_tw_init(void);
 #ifdef VIMAGE
 void	 tcp_tw_destroy(void);
 #endif
 void	 tcp_tw_zone_change(void);
 int	 tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *,
 	    struct mbuf *, int);
 void	 tcp_setpersist(struct tcpcb *);
 void	 tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp);
 struct tcptemp *
 	 tcpip_maketemplate(struct inpcb *);
 void	 tcpip_fillheaders(struct inpcb *, uint16_t, void *, void *);
 void	 tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
 int	 tcp_timer_suspend(struct tcpcb *, uint32_t);
 void	 tcp_timers_unsuspend(struct tcpcb *, uint32_t);
 int	 tcp_timer_active(struct tcpcb *, uint32_t);
 void	 tcp_timer_stop(struct tcpcb *, uint32_t);
 void	 tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
 int	 inp_to_cpuid(struct inpcb *inp);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
  */
 void	 tcp_hc_init(void);
 #ifdef VIMAGE
 void	 tcp_hc_destroy(void);
 #endif
 void	 tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
 uint32_t tcp_hc_getmtu(struct in_conninfo *);
 void	 tcp_hc_updatemtu(struct in_conninfo *, uint32_t);
 void	 tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
 
-extern	struct pr_usrreqs tcp_usrreqs;
+extern	struct protosw tcp_protosw;		/* shared for TOE */
+extern	struct protosw tcp6_protosw;		/* shared for TOE */
 
 uint32_t tcp_new_ts_offset(struct in_conninfo *);
 tcp_seq	 tcp_new_isn(struct in_conninfo *);
 
 int	 tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq);
 int	 tcp_dsack_block_exists(struct tcpcb *);
 void	 tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq);
 void	 tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend);
 void	 tcp_clean_dsack_blocks(struct tcpcb *tp);
 void	 tcp_clean_sackreport(struct tcpcb *tp);
 void	 tcp_sack_adjust(struct tcpcb *tp);
 struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
 void	 tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *);
 void	 tcp_lost_retransmission(struct tcpcb *, struct tcphdr *);
 void	 tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
 void	 tcp_free_sackholes(struct tcpcb *tp);
 void	 tcp_sack_lost_retransmission(struct tcpcb *, struct tcphdr *);
 int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 int	 tcp_compute_pipe(struct tcpcb *);
 uint32_t tcp_compute_initwnd(uint32_t);
 void	 tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
 int	 tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
     size_t seed_len);
 int tcp_can_enable_pacing(void);
 void tcp_decrement_paced_conn(void);
 
 struct mbuf *
 	 tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
 	   int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
 
 int	tcp_stats_init(void);
 void tcp_log_end_status(struct tcpcb *tp, uint8_t status);
 
 static inline void
 tcp_fields_to_host(struct tcphdr *th)
 {
 
 	th->th_seq = ntohl(th->th_seq);
 	th->th_ack = ntohl(th->th_ack);
 	th->th_win = ntohs(th->th_win);
 	th->th_urp = ntohs(th->th_urp);
 }
 
 static inline void
 tcp_fields_to_net(struct tcphdr *th)
 {
 
 	th->th_seq = htonl(th->th_seq);
 	th->th_ack = htonl(th->th_ack);
 	th->th_win = htons(th->th_win);
 	th->th_urp = htons(th->th_urp);
 }
 
 static inline uint16_t
 tcp_get_flags(const struct tcphdr *th)
 {
         return (((uint16_t)th->th_x2 << 8) | th->th_flags);
 }
 
 static inline void
 tcp_set_flags(struct tcphdr *th, uint16_t flags)
 {
         th->th_x2    = (flags >> 8) & 0x0f;
         th->th_flags = flags & 0xff;
 }
 
 static inline void
 tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
     uint8_t is_tlp, int hw_tls)
 {
 	if (is_tlp) {
 		tp->t_sndtlppack++;
 		tp->t_sndtlpbyte += len;
 	}
 	/* To get total bytes sent you must add t_snd_rxt_bytes to t_sndbytes */
 	if (is_rxt)
 		tp->t_snd_rxt_bytes += len;
 	else
 		tp->t_sndbytes += len;
 
 #ifdef KERN_TLS
 	if (hw_tls && is_rxt && len != 0) {
 		uint64_t rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes));
 		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
 			ktls_disable_ifnet(tp);
 	}
 #endif
 
 }
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_VAR_H_ */
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index 13fe863ecb75..3cbeb4a5e50f 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -1,1782 +1,1794 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * Copyright (c) 2008 Robert N. M. Watson
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * Copyright (c) 2014 Kevin Lo
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/eventhandler.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <vm/uma.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/route.h>
 #include <net/route/nhop.h>
 #include <net/rss_config.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_fib.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #ifdef INET6
 #include <netinet/ip6.h>
 #endif
 #include <netinet/ip_icmp.h>
 #include <netinet/icmp_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_options.h>
 #ifdef INET6
 #include <netinet6/ip6_var.h>
 #endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/udplite.h>
 #include <netinet/in_rss.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
 
 /*
  * UDP and UDP-Lite protocols implementation.
  * Per RFC 768, August, 1980.
  * Per RFC 3828, July, 2004.
  */
 
 /*
  * BSD 4.2 defaulted the udp checksum to be off.  Turning off udp checksums
  * removes the only data integrity mechanism for packets and malformed
  * packets that would otherwise be discarded due to bad checksums, and may
  * cause problems (especially for NFS data blocks).
  */
 VNET_DEFINE(int, udp_cksum) = 1;
 SYSCTL_INT(_net_inet_udp, UDPCTL_CHECKSUM, checksum, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_cksum), 0, "compute udp checksum");
 
 VNET_DEFINE(int, udp_log_in_vain) = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, log_in_vain, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_log_in_vain), 0, "Log all incoming UDP packets");
 
 VNET_DEFINE(int, udp_blackhole) = 0;
 SYSCTL_INT(_net_inet_udp, OID_AUTO, blackhole, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(udp_blackhole), 0,
     "Do not send port unreachables for refused connects");
 VNET_DEFINE(bool, udp_blackhole_local) = false;
 SYSCTL_BOOL(_net_inet_udp, OID_AUTO, blackhole_local, CTLFLAG_VNET |
     CTLFLAG_RW, &VNET_NAME(udp_blackhole_local), false,
     "Enforce net.inet.udp.blackhole for locally originated packets");
 
 u_long	udp_sendspace = 9216;		/* really max datagram size */
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_MAXDGRAM, maxdgram, CTLFLAG_RW,
     &udp_sendspace, 0, "Maximum outgoing UDP datagram size");
 
 u_long	udp_recvspace = 40 * (1024 +
 #ifdef INET6
 				      sizeof(struct sockaddr_in6)
 #else
 				      sizeof(struct sockaddr_in)
 #endif
 				      );	/* 40 1K datagrams */
 
 SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
     &udp_recvspace, 0, "Maximum space for incoming UDP datagrams");
 
 VNET_DEFINE(struct inpcbinfo, udbinfo);
 VNET_DEFINE(struct inpcbinfo, ulitecbinfo);
 VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone);
 #define	V_udpcb_zone			VNET(udpcb_zone)
 
 #ifndef UDBHASHSIZE
 #define	UDBHASHSIZE	128
 #endif
 
 VNET_PCPUSTAT_DEFINE(struct udpstat, udpstat);		/* from udp_var.h */
 VNET_PCPUSTAT_SYSINIT(udpstat);
 SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat,
     udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)");
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(udpstat);
 #endif /* VIMAGE */
 #ifdef INET
 static void	udp_detach(struct socket *so);
 static int	udp_output(struct inpcb *, struct mbuf *, struct sockaddr *,
 		    struct mbuf *, struct thread *, int);
 #endif
 
 INPCBSTORAGE_DEFINE(udpcbstor, "udpinp", "udp_inpcb", "udp", "udphash");
 INPCBSTORAGE_DEFINE(udplitecbstor, "udpliteinp", "udplite_inpcb", "udplite",
     "udplitehash");
 
 static void
 udp_vnet_init(void *arg __unused)
 {
 
 	/*
 	 * For now default to 2-tuple UDP hashing - until the fragment
 	 * reassembly code can also update the flowid.
 	 *
 	 * Once we can calculate the flowid that way and re-establish
 	 * a 4-tuple, flip this to 4-tuple.
 	 */
 	in_pcbinfo_init(&V_udbinfo, &udpcbstor, UDBHASHSIZE, UDBHASHSIZE);
 	V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	uma_zone_set_max(V_udpcb_zone, maxsockets);
 	uma_zone_set_warning(V_udpcb_zone, "kern.ipc.maxsockets limit reached");
 
 	/* Additional pcbinfo for UDP-Lite */
 	in_pcbinfo_init(&V_ulitecbinfo, &udplitecbstor, UDBHASHSIZE,
 	    UDBHASHSIZE);
 }
 VNET_SYSINIT(udp_vnet_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH,
     udp_vnet_init, NULL);
 
 /*
  * Kernel module interface for updating udpstat.  The argument is an index
  * into udpstat treated as an array of u_long.  While this encodes the
  * general layout of udpstat into the caller, it doesn't encode its location,
  * so that future changes to add, for example, per-CPU stats support won't
  * cause binary compatibility problems for kernel modules.
  */
 void
 kmod_udpstat_inc(int statnum)
 {
 
 	counter_u64_add(VNET(udpstat)[statnum], 1);
 }
 
 int
 udp_newudpcb(struct inpcb *inp)
 {
 	struct udpcb *up;
 
 	up = uma_zalloc(V_udpcb_zone, M_NOWAIT | M_ZERO);
 	if (up == NULL)
 		return (ENOBUFS);
 	inp->inp_ppcb = up;
 	return (0);
 }
 
 void
 udp_discardcb(struct udpcb *up)
 {
 
 	uma_zfree(V_udpcb_zone, up);
 }
 
 #ifdef VIMAGE
 static void
 udp_destroy(void *unused __unused)
 {
 
 	in_pcbinfo_destroy(&V_udbinfo);
 	uma_zdestroy(V_udpcb_zone);
 }
 VNET_SYSUNINIT(udp, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udp_destroy, NULL);
 
 static void
 udplite_destroy(void *unused __unused)
 {
 
 	in_pcbinfo_destroy(&V_ulitecbinfo);
 }
 VNET_SYSUNINIT(udplite, SI_SUB_PROTO_DOMAIN, SI_ORDER_FOURTH, udplite_destroy,
     NULL);
 #endif
 
 #ifdef INET
 /*
  * Subroutine of udp_input(), which appends the provided mbuf chain to the
  * passed pcb/socket.  The caller must provide a sockaddr_in via udp_in that
  * contains the source address.  If the socket ends up being an IPv6 socket,
  * udp_append() will convert to a sockaddr_in6 before passing the address
  * into the socket code.
  *
  * In the normal case udp_append() will return 0, indicating that you
  * must unlock the inp. However if a tunneling protocol is in place we increment
  * the inpcb refcnt and unlock the inp, on return from the tunneling protocol we
  * then decrement the reference count. If the inp_rele returns 1, indicating the
  * inp is gone, we return that to the caller to tell them *not* to unlock
  * the inp. In the case of multi-cast this will cause the distribution
  * to stop (though most tunneling protocols known currently do *not* use
  * multicast).
  */
 static int
 udp_append(struct inpcb *inp, struct ip *ip, struct mbuf *n, int off,
     struct sockaddr_in *udp_in)
 {
 	struct sockaddr *append_sa;
 	struct socket *so;
 	struct mbuf *tmpopts, *opts = NULL;
 #ifdef INET6
 	struct sockaddr_in6 udp_in6;
 #endif
 	struct udpcb *up;
 	bool filtered;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Engage the tunneling protocol.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		in_pcbref(inp);
 		INP_RUNLOCK(inp);
 		filtered = (*up->u_tun_func)(n, off, inp, (struct sockaddr *)&udp_in[0],
 		    up->u_tun_ctx);
 		INP_RLOCK(inp);
 		if (filtered)
 			return (in_pcbrele_rlocked(inp));
 	}
 
 	off += sizeof(struct udphdr);
 
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* Check AH/ESP integrity. */
 	if (IPSEC_ENABLED(ipv4) &&
 	    IPSEC_CHECK_POLICY(ipv4, n, inp) != 0) {
 		m_freem(n);
 		return (0);
 	}
 	if (up->u_flags & UF_ESPINUDP) {/* IPSec UDP encaps. */
 		if (IPSEC_ENABLED(ipv4) &&
 		    UDPENCAP_INPUT(n, off, AF_INET) != 0)
 			return (0);	/* Consumed. */
 	}
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return (0);
 	}
 #endif /* MAC */
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & (SO_TIMESTAMP | SO_BINTIME)) {
 #ifdef INET6
 		if (inp->inp_vflag & INP_IPV6)
 			(void)ip6_savecontrol_v4(inp, n, &opts, NULL);
 		else
 #endif /* INET6 */
 			ip_savecontrol(inp, &opts, ip, n);
 	}
 	if ((inp->inp_vflag & INP_IPV4) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
 		tmpopts = sbcreatecontrol(&udp_in[1],
 		    sizeof(struct sockaddr_in), IP_ORIGDSTADDR, IPPROTO_IP,
 		    M_NOWAIT);
 		if (tmpopts) {
 			if (opts) {
 				tmpopts->m_next = opts;
 				opts = tmpopts;
 			} else
 				opts = tmpopts;
 		}
 	}
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6) {
 		bzero(&udp_in6, sizeof(udp_in6));
 		udp_in6.sin6_len = sizeof(udp_in6);
 		udp_in6.sin6_family = AF_INET6;
 		in6_sin_2_v4mapsin6(&udp_in[0], &udp_in6);
 		append_sa = (struct sockaddr *)&udp_in6;
 	} else
 #endif /* INET6 */
 		append_sa = (struct sockaddr *)&udp_in[0];
 	m_adj(n, off);
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, append_sa, n, opts) == 0) {
 		soroverflow_locked(so);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 	return (0);
 }
 
 static bool
 udp_multi_match(const struct inpcb *inp, void *v)
 {
 	struct ip *ip = v;
 	struct udphdr *uh = (struct udphdr *)(ip + 1);
 
 	if (inp->inp_lport != uh->uh_dport)
 		return (false);
 #ifdef INET6
 	if ((inp->inp_vflag & INP_IPV4) == 0)
 		return (false);
 #endif
 	if (inp->inp_laddr.s_addr != INADDR_ANY &&
 	    inp->inp_laddr.s_addr != ip->ip_dst.s_addr)
 		return (false);
 	if (inp->inp_faddr.s_addr != INADDR_ANY &&
 	    inp->inp_faddr.s_addr != ip->ip_src.s_addr)
 		return (false);
 	if (inp->inp_fport != 0 &&
 	    inp->inp_fport != uh->uh_sport)
 		return (false);
 
 	return (true);
 }
 
 static int
 udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in)
 {
 	struct ip *ip = mtod(m, struct ip *);
 	struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
 	    INPLOOKUP_RLOCKPCB, udp_multi_match, ip);
 #ifdef KDTRACE_HOOKS
 	struct udphdr *uh = (struct udphdr *)(ip + 1);
 #endif
 	struct inpcb *inp;
 	struct mbuf *n;
 	int appends = 0;
 
 	MPASS(ip->ip_hl == sizeof(struct ip) >> 2);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		/*
 		 * XXXRW: Because we weren't holding either the inpcb
 		 * or the hash lock when we checked for a match
 		 * before, we should probably recheck now that the
 		 * inpcb lock is held.
 		 */
 		/*
 		 * Handle socket delivery policy for any-source
 		 * and source-specific multicast. [RFC3678]
 		 */
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
 			struct ip_moptions	*imo;
 			struct sockaddr_in	 group;
 			int			 blocked;
 
 			imo = inp->inp_moptions;
 			if (imo == NULL)
 				continue;
 			bzero(&group, sizeof(struct sockaddr_in));
 			group.sin_len = sizeof(struct sockaddr_in);
 			group.sin_family = AF_INET;
 			group.sin_addr = ip->ip_dst;
 
 			blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif,
 				(struct sockaddr *)&group,
 				(struct sockaddr *)&udp_in[0]);
 			if (blocked != MCAST_PASS) {
 				if (blocked == MCAST_NOTGMEMBER)
 					IPSTAT_INC(ips_notmember);
 				if (blocked == MCAST_NOTSMEMBER ||
 				    blocked == MCAST_MUTED)
 					UDPSTAT_INC(udps_filtermcast);
 				continue;
 			}
 		}
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
 			if (proto == IPPROTO_UDPLITE)
 				UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
 			else
 				UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 			if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) {
 				INP_RUNLOCK(inp);
 				break;
 			} else
 				appends++;
 		}
 		/*
 		 * Don't look for additional matches if this one does
 		 * not have either the SO_REUSEPORT or SO_REUSEADDR
 		 * socket options set.  This heuristic avoids
 		 * searching through all pcbs in the common case of a
 		 * non-shared port.  It assumes that an application
 		 * will never clear these options after setting them.
 		 */
 		if ((inp->inp_socket->so_options &
 		    (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
 			INP_RUNLOCK(inp);
 			break;
 		}
 	}
 
 	if (appends == 0) {
 		/*
 		 * No matching pcb found; discard datagram.  (No need
 		 * to send an ICMP Port Unreachable for a broadcast
 		 * or multicast datgram.)
 		 */
 		UDPSTAT_INC(udps_noport);
 		if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)))
 			UDPSTAT_INC(udps_noportmcast);
 		else
 			UDPSTAT_INC(udps_noportbcast);
 	}
 	m_freem(m);
 
 	return (IPPROTO_DONE);
 }
 
 static int
 udp_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ip *ip;
 	struct udphdr *uh;
 	struct ifnet *ifp;
 	struct inpcb *inp;
 	uint16_t len, ip_len;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in udp_in[2];
 	struct mbuf *m;
 	struct m_tag *fwd_tag;
 	int cscov_partial, iphlen;
 
 	m = *mp;
 	iphlen = *offp;
 	ifp = m->m_pkthdr.rcvif;
 	*mp = NULL;
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Strip IP options, if any; should skip this, make available to
 	 * user, and use on returned packets, but we don't yet have a way to
 	 * check the checksum with options still present.
 	 */
 	if (iphlen > sizeof (struct ip)) {
 		ip_stripoptions(m);
 		iphlen = sizeof(struct ip);
 	}
 
 	/*
 	 * Get IP and UDP header together in first mbuf.
 	 */
 	if (m->m_len < iphlen + sizeof(struct udphdr)) {
 		if ((m = m_pullup(m, iphlen + sizeof(struct udphdr))) == NULL) {
 			UDPSTAT_INC(udps_hdrops);
 			return (IPPROTO_DONE);
 		}
 	}
 	ip = mtod(m, struct ip *);
 	uh = (struct udphdr *)((caddr_t)ip + iphlen);
 	cscov_partial = (proto == IPPROTO_UDPLITE) ? 1 : 0;
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	/*
 	 * Construct sockaddr format source address.  Stuff source address
 	 * and datagram in user buffer.
 	 */
 	bzero(&udp_in[0], sizeof(struct sockaddr_in) * 2);
 	udp_in[0].sin_len = sizeof(struct sockaddr_in);
 	udp_in[0].sin_family = AF_INET;
 	udp_in[0].sin_port = uh->uh_sport;
 	udp_in[0].sin_addr = ip->ip_src;
 	udp_in[1].sin_len = sizeof(struct sockaddr_in);
 	udp_in[1].sin_family = AF_INET;
 	udp_in[1].sin_port = uh->uh_dport;
 	udp_in[1].sin_addr = ip->ip_dst;
 
 	/*
 	 * Make mbuf data length reflect UDP length.  If not enough data to
 	 * reflect UDP length, drop.
 	 */
 	len = ntohs((u_short)uh->uh_ulen);
 	ip_len = ntohs(ip->ip_len) - iphlen;
 	if (proto == IPPROTO_UDPLITE && (len == 0 || len == ip_len)) {
 		/* Zero means checksum over the complete packet. */
 		if (len == 0)
 			len = ip_len;
 		cscov_partial = 0;
 	}
 	if (ip_len != len) {
 		if (len > ip_len || len < sizeof(struct udphdr)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		if (proto == IPPROTO_UDP)
 			m_adj(m, len - ip_len);
 	}
 
 	/*
 	 * Checksum extended UDP header and data.
 	 */
 	if (uh->uh_sum) {
 		u_short uh_sum;
 
 		if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
 		    !cscov_partial) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				uh_sum = m->m_pkthdr.csum_data;
 			else
 				uh_sum = in_pseudo(ip->ip_src.s_addr,
 				    ip->ip_dst.s_addr, htonl((u_short)len +
 				    m->m_pkthdr.csum_data + proto));
 			uh_sum ^= 0xffff;
 		} else {
 			char b[offsetof(struct ipovly, ih_src)];
 			struct ipovly *ipov = (struct ipovly *)ip;
 
 			bcopy(ipov, b, sizeof(b));
 			bzero(ipov, sizeof(ipov->ih_x1));
 			ipov->ih_len = (proto == IPPROTO_UDP) ?
 			    uh->uh_ulen : htons(ip_len);
 			uh_sum = in_cksum(m, len + sizeof (struct ip));
 			bcopy(b, ipov, sizeof(b));
 		}
 		if (uh_sum) {
 			UDPSTAT_INC(udps_badsum);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	} else {
 		if (proto == IPPROTO_UDP) {
 			UDPSTAT_INC(udps_nosum);
 		} else {
 			/* UDPLite requires a checksum */
 			/* XXX: What is the right UDPLite MIB counter here? */
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
 	    in_broadcast(ip->ip_dst, ifp))
 		return (udp_multi_input(m, proto, udp_in));
 
 	pcbinfo = udp_get_inpcbinfo(proto);
 
 	/*
 	 * Locate pcb for datagram.
 	 *
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in *next_hop;
 
 		next_hop = (struct sockaddr_in *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_RLOCKPCB, ifp, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in_pcblookup(pcbinfo, ip->ip_src,
 			    uh->uh_sport, next_hop->sin_addr,
 			    next_hop->sin_port ? htons(next_hop->sin_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, ifp);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP_NEXTHOP;
 	} else
 		inp = in_pcblookup_mbuf(pcbinfo, ip->ip_src, uh->uh_sport,
 		    ip->ip_dst, uh->uh_dport, INPLOOKUP_WILDCARD |
 		    INPLOOKUP_RLOCKPCB, ifp, m);
 	if (inp == NULL) {
 		if (V_udp_log_in_vain) {
 			char src[INET_ADDRSTRLEN];
 			char dst[INET_ADDRSTRLEN];
 
 			log(LOG_INFO,
 			    "Connection attempt to UDP %s:%d from %s:%d\n",
 			    inet_ntoa_r(ip->ip_dst, dst), ntohs(uh->uh_dport),
 			    inet_ntoa_r(ip->ip_src, src), ntohs(uh->uh_sport));
 		}
 		if (proto == IPPROTO_UDPLITE)
 			UDPLITE_PROBE(receive, NULL, NULL, ip, NULL, uh);
 		else
 			UDP_PROBE(receive, NULL, NULL, ip, NULL, uh);
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & (M_BCAST | M_MCAST)) {
 			UDPSTAT_INC(udps_noportbcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole && (V_udp_blackhole_local ||
 		    !in_localip(ip->ip_src)))
 			goto badunlocked;
 		if (badport_bandlim(BANDLIM_ICMP_UNREACH) < 0)
 			goto badunlocked;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PORT, 0, 0);
 		return (IPPROTO_DONE);
 	}
 
 	/*
 	 * Check the minimum TTL for socket.
 	 */
 	INP_RLOCK_ASSERT(inp);
 	if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) {
 		if (proto == IPPROTO_UDPLITE)
 			UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
 		else
 			UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 		INP_RUNLOCK(inp);
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 	if (cscov_partial) {
 		struct udpcb *up;
 
 		up = intoudpcb(inp);
 		if (up->u_rxcslen == 0 || up->u_rxcslen > len) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			return (IPPROTO_DONE);
 		}
 	}
 
 	if (proto == IPPROTO_UDPLITE)
 		UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh);
 	else
 		UDP_PROBE(receive, NULL, inp, ip, inp, uh);
 	if (udp_append(inp, ip, m, iphlen, udp_in) == 0)
 		INP_RUNLOCK(inp);
 	return (IPPROTO_DONE);
 
 badunlocked:
 	m_freem(m);
 	return (IPPROTO_DONE);
 }
 #endif /* INET */
 
 /*
  * Notify a udp user of an asynchronous error; just wake up so that they can
  * collect error status.
  */
 struct inpcb *
 udp_notify(struct inpcb *inp, int errno)
 {
 
 	INP_WLOCK_ASSERT(inp);
 	if ((errno == EHOSTUNREACH || errno == ENETUNREACH ||
 	     errno == EHOSTDOWN) && inp->inp_route.ro_nh) {
 		NH_FREE(inp->inp_route.ro_nh);
 		inp->inp_route.ro_nh = (struct nhop_object *)NULL;
 	}
 
 	inp->inp_socket->so_error = errno;
 	sorwakeup(inp->inp_socket);
 	sowwakeup(inp->inp_socket);
 	return (inp);
 }
 
 #ifdef INET
 static void
 udp_common_ctlinput(int cmd, struct sockaddr *sa, void *vip,
     struct inpcbinfo *pcbinfo)
 {
 	struct ip *ip = vip;
 	struct udphdr *uh;
 	struct in_addr faddr;
 	struct inpcb *inp;
 
 	faddr = ((struct sockaddr_in *)sa)->sin_addr;
 	if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
 		return;
 
 	if (PRC_IS_REDIRECT(cmd)) {
 		/* signal EHOSTDOWN, as it flushes the cached route */
 		in_pcbnotifyall(pcbinfo, faddr, EHOSTDOWN, udp_notify);
 		return;
 	}
 
 	/*
 	 * Hostdead is ugly because it goes linearly through all PCBs.
 	 *
 	 * XXX: We never get this from ICMP, otherwise it makes an excellent
 	 * DoS attack on machines with many connections.
 	 */
 	if (cmd == PRC_HOSTDEAD)
 		ip = NULL;
 	else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
 		return;
 	if (ip != NULL) {
 		uh = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2));
 		inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
 		    ip->ip_src, uh->uh_sport, INPLOOKUP_WLOCKPCB, NULL);
 		if (inp != NULL) {
 			INP_WLOCK_ASSERT(inp);
 			if (inp->inp_socket != NULL) {
 				udp_notify(inp, inetctlerrmap[cmd]);
 			}
 			INP_WUNLOCK(inp);
 		} else {
 			inp = in_pcblookup(pcbinfo, faddr, uh->uh_dport,
 					   ip->ip_src, uh->uh_sport,
 					   INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 			if (inp != NULL) {
 				struct udpcb *up;
 				void *ctx;
 				udp_tun_icmp_t func;
 
 				up = intoudpcb(inp);
 				ctx = up->u_tun_ctx;
 				func = up->u_icmp_func;
 				INP_RUNLOCK(inp);
 				if (func != NULL)
 					(*func)(cmd, sa, vip, ctx);
 			}
 		}
 	} else
 		in_pcbnotifyall(pcbinfo, faddr, inetctlerrmap[cmd],
 		    udp_notify);
 }
 
 static void
 udp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_udbinfo));
 }
 
 static void
 udplite_ctlinput(int cmd, struct sockaddr *sa, void *vip)
 {
 
 	return (udp_common_ctlinput(cmd, sa, vip, &V_ulitecbinfo));
 }
 #endif /* INET */
 
 static int
 udp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo,
 	    INPLOOKUP_RLOCKPCB);
 	struct xinpgen xig;
 	struct inpcb *inp;
 	int error;
 
 	if (req->newptr != 0)
 		return (EPERM);
 
 	if (req->oldptr == 0) {
 		int n;
 
 		n = V_udbinfo.ipi_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb);
 		return (0);
 	}
 
 	if ((error = sysctl_wire_old_buffer(req, 0)) != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = V_udbinfo.ipi_count;
 	xig.xig_gen = V_udbinfo.ipi_gencnt;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		if (inp->inp_gencnt <= xig.xig_gen &&
 		    cr_canseeinpcb(req->td->td_ucred, inp) == 0) {
 			struct xinpcb xi;
 
 			in_pcbtoxinpcb(inp, &xi);
 			error = SYSCTL_OUT(req, &xi, sizeof xi);
 			if (error) {
 				INP_RUNLOCK(inp);
 				break;
 			}
 		}
 	}
 
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.  If the
 		 * generation differs from what we told her before, she knows
 		 * that something happened while we were processing this
 		 * request, and it might be necessary to retry.
 		 */
 		xig.xig_gen = V_udbinfo.ipi_gencnt;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = V_udbinfo.ipi_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, UDPCTL_PCBLIST, pcblist,
     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
     udp_pcblist, "S,xinpcb",
     "List of active UDP sockets");
 
 #ifdef INET
 static int
 udp_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in addrs[2];
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	NET_EPOCH_ENTER(et);
 	inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port,
 	    addrs[0].sin_addr, addrs[0].sin_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseeinpcb(req->td->td_ucred, inp);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet_udp, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
     0, 0, udp_getcred, "S,xucred",
     "Get the xucred of a UDP connection");
 #endif /* INET */
 
 int
 udp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 	int isudplite, error, optval;
 
 	error = 0;
 	isudplite = (so->so_proto->pr_protocol == IPPROTO_UDPLITE) ? 1 : 0;
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	INP_WLOCK(inp);
 	if (sopt->sopt_level != so->so_proto->pr_protocol) {
 #ifdef INET6
 		if (INP_CHECK_SOCKAF(so, AF_INET6)) {
 			INP_WUNLOCK(inp);
 			error = ip6_ctloutput(so, sopt);
 		}
 #endif
 #if defined(INET) && defined(INET6)
 		else
 #endif
 #ifdef INET
 		{
 			INP_WUNLOCK(inp);
 			error = ip_ctloutput(so, sopt);
 		}
 #endif
 		return (error);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #ifdef INET
 		case UDP_ENCAP:
 			if (!IPSEC_ENABLED(ipv4)) {
 				INP_WUNLOCK(inp);
 				return (ENOPROTOOPT);
 			}
 			error = UDPENCAP_PCBCTL(inp, sopt);
 			break;
 #endif /* INET */
 #endif /* IPSEC */
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, &optval, sizeof(optval),
 			    sizeof(optval));
 			if (error != 0)
 				break;
 			inp = sotoinpcb(so);
 			KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 			INP_WLOCK(inp);
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if ((optval != 0 && optval < 8) || (optval > 65535)) {
 				INP_WUNLOCK(inp);
 				error = EINVAL;
 				break;
 			}
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				up->u_txcslen = optval;
 			else
 				up->u_rxcslen = optval;
 			INP_WUNLOCK(inp);
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 #ifdef INET
 		case UDP_ENCAP:
 			if (!IPSEC_ENABLED(ipv4)) {
 				INP_WUNLOCK(inp);
 				return (ENOPROTOOPT);
 			}
 			error = UDPENCAP_PCBCTL(inp, sopt);
 			break;
 #endif /* INET */
 #endif /* IPSEC */
 		case UDPLITE_SEND_CSCOV:
 		case UDPLITE_RECV_CSCOV:
 			if (!isudplite) {
 				INP_WUNLOCK(inp);
 				error = ENOPROTOOPT;
 				break;
 			}
 			up = intoudpcb(inp);
 			KASSERT(up != NULL, ("%s: up == NULL", __func__));
 			if (sopt->sopt_name == UDPLITE_SEND_CSCOV)
 				optval = up->u_txcslen;
 			else
 				optval = up->u_rxcslen;
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &optval, sizeof(optval));
 			break;
 		default:
 			INP_WUNLOCK(inp);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 
 #ifdef INET
 #ifdef INET6
 /* The logic here is derived from ip6_setpktopt(). See comments there. */
 static int
 udp_v4mapped_pktinfo(struct cmsghdr *cm, struct sockaddr_in * src,
     struct inpcb *inp, int flags)
 {
 	struct ifnet *ifp;
 	struct in6_pktinfo *pktinfo;
 	struct in_addr ia;
 
 	if ((flags & PRUS_IPV6) == 0)
 		return (0);
 
 	if (cm->cmsg_level != IPPROTO_IPV6)
 		return (0);
 
 	if  (cm->cmsg_type != IPV6_2292PKTINFO &&
 	    cm->cmsg_type != IPV6_PKTINFO)
 		return (0);
 
 	if (cm->cmsg_len !=
 	    CMSG_LEN(sizeof(struct in6_pktinfo)))
 		return (EINVAL);
 
 	pktinfo = (struct in6_pktinfo *)CMSG_DATA(cm);
 	if (!IN6_IS_ADDR_V4MAPPED(&pktinfo->ipi6_addr) &&
 	    !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr))
 		return (EINVAL);
 
 	/* Validate the interface index if specified. */
 	if (pktinfo->ipi6_ifindex) {
 		struct epoch_tracker et;
 
 		NET_EPOCH_ENTER(et);
 		ifp = ifnet_byindex(pktinfo->ipi6_ifindex);
 		NET_EPOCH_EXIT(et);	/* XXXGL: unsafe ifp */
 		if (ifp == NULL)
 			return (ENXIO);
 	} else
 		ifp = NULL;
 	if (ifp != NULL && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
 		ia.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
 		if (in_ifhasaddr(ifp, ia) == 0)
 			return (EADDRNOTAVAIL);
 	}
 
 	bzero(src, sizeof(*src));
 	src->sin_family = AF_INET;
 	src->sin_len = sizeof(*src);
 	src->sin_port = inp->inp_lport;
 	src->sin_addr.s_addr = pktinfo->ipi6_addr.s6_addr32[3];
 
 	return (0);
 }
 #endif
 
 static int
 udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td, int flags)
 {
 	struct udpiphdr *ui;
 	int len = m->m_pkthdr.len;
 	struct in_addr faddr, laddr;
 	struct cmsghdr *cm;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin, src;
 	struct epoch_tracker et;
 	int cscov_partial = 0;
 	int error = 0;
 	int ipflags = 0;
 	u_short fport, lport;
 	u_char tos;
 	uint8_t pr;
 	uint16_t cscov = 0;
 	uint32_t flowid = 0;
 	uint8_t flowtype = M_HASHTYPE_NONE;
 
 	if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) {
 		if (control)
 			m_freem(control);
 		m_freem(m);
 		return (EMSGSIZE);
 	}
 
 	src.sin_family = 0;
 	sin = (struct sockaddr_in *)addr;
 
 	/*
 	 * udp_output() may need to temporarily bind or connect the current
 	 * inpcb.  As such, we don't know up front whether we will need the
 	 * pcbinfo lock or not.  Do any work to decide what is needed up
 	 * front before acquiring any locks.
 	 *
 	 * We will need network epoch in either case, to safely lookup into
 	 * pcb hash.
 	 */
 	if (sin == NULL ||
 	    (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0))
 		INP_WLOCK(inp);
 	else
 		INP_RLOCK(inp);
 	NET_EPOCH_ENTER(et);
 	tos = inp->inp_ip_tos;
 	if (control != NULL) {
 		/*
 		 * XXX: Currently, we assume all the optional information is
 		 * stored in a single mbuf.
 		 */
 		if (control->m_next) {
 			m_freem(control);
 			error = EINVAL;
 			goto release;
 		}
 		for (; control->m_len > 0;
 		    control->m_data += CMSG_ALIGN(cm->cmsg_len),
 		    control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
 			cm = mtod(control, struct cmsghdr *);
 			if (control->m_len < sizeof(*cm) || cm->cmsg_len == 0
 			    || cm->cmsg_len > control->m_len) {
 				error = EINVAL;
 				break;
 			}
 #ifdef INET6
 			error = udp_v4mapped_pktinfo(cm, &src, inp, flags);
 			if (error != 0)
 				break;
 #endif
 			if (cm->cmsg_level != IPPROTO_IP)
 				continue;
 
 			switch (cm->cmsg_type) {
 			case IP_SENDSRCADDR:
 				if (cm->cmsg_len !=
 				    CMSG_LEN(sizeof(struct in_addr))) {
 					error = EINVAL;
 					break;
 				}
 				bzero(&src, sizeof(src));
 				src.sin_family = AF_INET;
 				src.sin_len = sizeof(src);
 				src.sin_port = inp->inp_lport;
 				src.sin_addr =
 				    *(struct in_addr *)CMSG_DATA(cm);
 				break;
 
 			case IP_TOS:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(u_char))) {
 					error = EINVAL;
 					break;
 				}
 				tos = *(u_char *)CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowid = *(uint32_t *) CMSG_DATA(cm);
 				break;
 
 			case IP_FLOWTYPE:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				flowtype = *(uint32_t *) CMSG_DATA(cm);
 				break;
 
 #ifdef	RSS
 			case IP_RSSBUCKETID:
 				if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) {
 					error = EINVAL;
 					break;
 				}
 				/* This is just a placeholder for now */
 				break;
 #endif	/* RSS */
 			default:
 				error = ENOPROTOOPT;
 				break;
 			}
 			if (error)
 				break;
 		}
 		m_freem(control);
 		control = NULL;
 	}
 	if (error)
 		goto release;
 
 	pr = inp->inp_socket->so_proto->pr_protocol;
 	pcbinfo = udp_get_inpcbinfo(pr);
 
 	/*
 	 * If the IP_SENDSRCADDR control message was specified, override the
 	 * source address for this datagram.  Its use is invalidated if the
 	 * address thus specified is incomplete or clobbers other inpcbs.
 	 */
 	laddr = inp->inp_laddr;
 	lport = inp->inp_lport;
 	if (src.sin_family == AF_INET) {
 		if ((lport == 0) ||
 		    (laddr.s_addr == INADDR_ANY &&
 		     src.sin_addr.s_addr == INADDR_ANY)) {
 			error = EINVAL;
 			goto release;
 		}
 		INP_HASH_WLOCK(pcbinfo);
 		error = in_pcbbind_setup(inp, (struct sockaddr *)&src,
 		    &laddr.s_addr, &lport, td->td_ucred);
 		INP_HASH_WUNLOCK(pcbinfo);
 		if (error)
 			goto release;
 	}
 
 	/*
 	 * If a UDP socket has been connected, then a local address/port will
 	 * have been selected and bound.
 	 *
 	 * If a UDP socket has not been connected to, then an explicit
 	 * destination address must be used, in which case a local
 	 * address/port may not have been selected and bound.
 	 */
 	if (sin != NULL) {
 		INP_LOCK_ASSERT(inp);
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto release;
 		}
 
 		/*
 		 * Jail may rewrite the destination address, so let it do
 		 * that before we use it.
 		 */
 		error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 		if (error)
 			goto release;
 
 		/*
 		 * If a local address or port hasn't yet been selected, or if
 		 * the destination address needs to be rewritten due to using
 		 * a special INADDR_ constant, invoke in_pcbconnect_setup()
 		 * to do the heavy lifting.  Once a port is selected, we
 		 * commit the binding back to the socket; we also commit the
 		 * binding of the address if in jail.
 		 *
 		 * If we already have a valid binding and we're not
 		 * requesting a destination address rewrite, use a fast path.
 		 */
 		if (inp->inp_laddr.s_addr == INADDR_ANY ||
 		    inp->inp_lport == 0 ||
 		    sin->sin_addr.s_addr == INADDR_ANY ||
 		    sin->sin_addr.s_addr == INADDR_BROADCAST) {
 			INP_HASH_WLOCK(pcbinfo);
 			error = in_pcbconnect_setup(inp, addr, &laddr.s_addr,
 			    &lport, &faddr.s_addr, &fport, NULL,
 			    td->td_ucred);
 			if (error) {
 				INP_HASH_WUNLOCK(pcbinfo);
 				goto release;
 			}
 
 			/*
 			 * XXXRW: Why not commit the port if the address is
 			 * !INADDR_ANY?
 			 */
 			/* Commit the local port if newly assigned. */
 			if (inp->inp_laddr.s_addr == INADDR_ANY &&
 			    inp->inp_lport == 0) {
 				INP_WLOCK_ASSERT(inp);
 				/*
 				 * Remember addr if jailed, to prevent
 				 * rebinding.
 				 */
 				if (prison_flag(td->td_ucred, PR_IP4))
 					inp->inp_laddr = laddr;
 				inp->inp_lport = lport;
 				error = in_pcbinshash(inp);
 				INP_HASH_WUNLOCK(pcbinfo);
 				if (error != 0) {
 					inp->inp_lport = 0;
 					error = EAGAIN;
 					goto release;
 				}
 				inp->inp_flags |= INP_ANONPORT;
 			} else
 				INP_HASH_WUNLOCK(pcbinfo);
 		} else {
 			faddr = sin->sin_addr;
 			fport = sin->sin_port;
 		}
 	} else {
 		INP_LOCK_ASSERT(inp);
 		faddr = inp->inp_faddr;
 		fport = inp->inp_fport;
 		if (faddr.s_addr == INADDR_ANY) {
 			error = ENOTCONN;
 			goto release;
 		}
 	}
 
 	/*
 	 * Calculate data length and get a mbuf for UDP, IP, and possible
 	 * link-layer headers.  Immediate slide the data pointer back forward
 	 * since we won't use that space at this layer.
 	 */
 	M_PREPEND(m, sizeof(struct udpiphdr) + max_linkhdr, M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 	m->m_data += max_linkhdr;
 	m->m_len -= max_linkhdr;
 	m->m_pkthdr.len -= max_linkhdr;
 
 	/*
 	 * Fill in mbuf with extended UDP header and addresses and length put
 	 * into network format.
 	 */
 	ui = mtod(m, struct udpiphdr *);
 	bzero(ui->ui_x1, sizeof(ui->ui_x1));	/* XXX still needed? */
 	ui->ui_v = IPVERSION << 4;
 	ui->ui_pr = pr;
 	ui->ui_src = laddr;
 	ui->ui_dst = faddr;
 	ui->ui_sport = lport;
 	ui->ui_dport = fport;
 	ui->ui_ulen = htons((u_short)len + sizeof(struct udphdr));
 	if (pr == IPPROTO_UDPLITE) {
 		struct udpcb *up;
 		uint16_t plen;
 
 		up = intoudpcb(inp);
 		cscov = up->u_txcslen;
 		plen = (u_short)len + sizeof(struct udphdr);
 		if (cscov >= plen)
 			cscov = 0;
 		ui->ui_len = htons(plen);
 		ui->ui_ulen = htons(cscov);
 		/*
 		 * For UDP-Lite, checksum coverage length of zero means
 		 * the entire UDPLite packet is covered by the checksum.
 		 */
 		cscov_partial = (cscov == 0) ? 0 : 1;
 	}
 
 	/*
 	 * Set the Don't Fragment bit in the IP header.
 	 */
 	if (inp->inp_flags & INP_DONTFRAG) {
 		struct ip *ip;
 
 		ip = (struct ip *)&ui->ui_i;
 		ip->ip_off |= htons(IP_DF);
 	}
 
 	if (inp->inp_socket->so_options & SO_DONTROUTE)
 		ipflags |= IP_ROUTETOIF;
 	if (inp->inp_socket->so_options & SO_BROADCAST)
 		ipflags |= IP_ALLOWBROADCAST;
 	if (inp->inp_flags & INP_ONESBCAST)
 		ipflags |= IP_SENDONES;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	/*
 	 * Set up checksum and output datagram.
 	 */
 	ui->ui_sum = 0;
 	if (pr == IPPROTO_UDPLITE) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		if (cscov_partial) {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct ip) + cscov)) == 0)
 				ui->ui_sum = 0xffff;
 		} else {
 			if ((ui->ui_sum = in_cksum(m, sizeof(struct udpiphdr) + len)) == 0)
 				ui->ui_sum = 0xffff;
 		}
 	} else if (V_udp_cksum) {
 		if (inp->inp_flags & INP_ONESBCAST)
 			faddr.s_addr = INADDR_BROADCAST;
 		ui->ui_sum = in_pseudo(ui->ui_src.s_addr, faddr.s_addr,
 		    htons((u_short)len + sizeof(struct udphdr) + pr));
 		m->m_pkthdr.csum_flags = CSUM_UDP;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 	((struct ip *)ui)->ip_len = htons(sizeof(struct udpiphdr) + len);
 	((struct ip *)ui)->ip_ttl = inp->inp_ip_ttl;	/* XXX */
 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
 	UDPSTAT_INC(udps_opackets);
 
 	/*
 	 * Setup flowid / RSS information for outbound socket.
 	 *
 	 * Once the UDP code decides to set a flowid some other way,
 	 * this allows the flowid to be overridden by userland.
 	 */
 	if (flowtype != M_HASHTYPE_NONE) {
 		m->m_pkthdr.flowid = flowid;
 		M_HASHTYPE_SET(m, flowtype);
 	}
 #if defined(ROUTE_MPATH) || defined(RSS)
 	else if (CALC_FLOWID_OUTBOUND_SENDTO) {
 		uint32_t hash_val, hash_type;
 
 		hash_val = fib4_calc_packet_hash(laddr, faddr,
 		    lport, fport, pr, &hash_type);
 		m->m_pkthdr.flowid = hash_val;
 		M_HASHTYPE_SET(m, hash_type);
 	}
 
 	/*
 	 * Don't override with the inp cached flowid value.
 	 *
 	 * Depending upon the kind of send being done, the inp
 	 * flowid/flowtype values may actually not be appropriate
 	 * for this particular socket send.
 	 *
 	 * We should either leave the flowid at zero (which is what is
 	 * currently done) or set it to some software generated
 	 * hash value based on the packet contents.
 	 */
 	ipflags |= IP_NODEFAULTFLOWID;
 #endif	/* RSS */
 
 	if (pr == IPPROTO_UDPLITE)
 		UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
 	else
 		UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u);
 	error = ip_output(m, inp->inp_options,
 	    INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags,
 	    inp->inp_moptions, inp);
 	INP_UNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	return (error);
 
 release:
 	INP_UNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	m_freem(m);
 	return (error);
 }
 
 pr_abort_t udp_abort;			/* shared with udp6_usrreq.c */
 void
 udp_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_abort: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_attach(struct socket *so, int proto, struct thread *td)
 {
 	static uint32_t udp_flowid;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("udp_attach: inp != NULL"));
 	error = soreserve(so, udp_sendspace, udp_recvspace);
 	if (error)
 		return (error);
 	error = in_pcballoc(so, pcbinfo);
 	if (error)
 		return (error);
 
 	inp = sotoinpcb(so);
 	inp->inp_ip_ttl = V_ip_defttl;
 	inp->inp_flowid = atomic_fetchadd_int(&udp_flowid, 1);
 	inp->inp_flowtype = M_HASHTYPE_OPAQUE;
 
 	error = udp_newudpcb(inp);
 	if (error) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		return (error);
 	}
 	INP_WUNLOCK(inp);
 
 	return (0);
 }
 #endif /* INET */
 
 int
 udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f, udp_tun_icmp_t i, void *ctx)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	KASSERT(so->so_type == SOCK_DGRAM,
 	    ("udp_set_kernel_tunneling: !dgram"));
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_set_kernel_tunneling: inp == NULL"));
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	if ((f != NULL || i != NULL) && ((up->u_tun_func != NULL) ||
 	    (up->u_icmp_func != NULL))) {
 		INP_WUNLOCK(inp);
 		return (EBUSY);
 	}
 	up->u_tun_func = f;
 	up->u_icmp_func = i;
 	up->u_tun_ctx = ctx;
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
 static int
 udp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sinp;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_bind: inp == NULL"));
 
 	sinp = (struct sockaddr_in *)nam;
 	if (nam->sa_family != AF_INET) {
 		/*
 		 * Preserve compatibility with old programs.
 		 */
 		if (nam->sa_family != AF_UNSPEC ||
 		    nam->sa_len < offsetof(struct sockaddr_in, sin_zero) ||
 		    sinp->sin_addr.s_addr != INADDR_ANY)
 			return (EAFNOSUPPORT);
 		nam->sa_family = AF_INET;
 	}
 	if (nam->sa_len != sizeof(struct sockaddr_in))
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbbind(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_close: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_HASH_WLOCK(pcbinfo);
 		in_pcbdisconnect(inp);
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in *sin;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_connect: inp == NULL"));
 
 	sin = (struct sockaddr_in *)nam;
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (sin->sin_len != sizeof(*sin))
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr != INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (EISCONN);
 	}
 	error = prison_remote_ip4(td->td_ucred, &sin->sin_addr);
 	if (error != 0) {
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 	NET_EPOCH_ENTER(et);
 	INP_HASH_WLOCK(pcbinfo);
 	error = in_pcbconnect(inp, nam, td->td_ucred, true);
 	INP_HASH_WUNLOCK(pcbinfo);
 	NET_EPOCH_EXIT(et);
 	if (error == 0)
 		soisconnected(so);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_detach: inp == NULL"));
 	KASSERT(inp->inp_faddr.s_addr == INADDR_ANY,
 	    ("udp_detach: not disconnected"));
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	inp->inp_ppcb = NULL;
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	udp_discardcb(up);
 }
 
 pr_disconnect_t udp_disconnect;		/* shared with udp6_usrreq.c */
 int
 udp_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_disconnect: inp == NULL"));
 	INP_WLOCK(inp);
 	if (inp->inp_faddr.s_addr == INADDR_ANY) {
 		INP_WUNLOCK(inp);
 		return (ENOTCONN);
 	}
 	INP_HASH_WLOCK(pcbinfo);
 	in_pcbdisconnect(inp);
 	inp->inp_laddr.s_addr = INADDR_ANY;
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 pr_send_t udp_send;			/* shared with udp6_usrreq.c */
 int
 udp_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_send: inp == NULL"));
 
 	if (addr != NULL) {
 		error = 0;
 		if (addr->sa_family != AF_INET)
 			error = EAFNOSUPPORT;
 		else if (addr->sa_len != sizeof(struct sockaddr_in))
 			error = EINVAL;
 		if (__predict_false(error != 0)) {
 			m_freem(control);
 			m_freem(m);
 			return (error);
 		}
 	}
 	return (udp_output(inp, m, addr, control, td, flags));
 }
 #endif /* INET */
 
 int
 udp_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp_shutdown: inp == NULL"));
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 #ifdef INET
-struct pr_usrreqs udp_usrreqs = {
-	.pru_abort =		udp_abort,
-	.pru_attach =		udp_attach,
-	.pru_bind =		udp_bind,
-	.pru_connect =		udp_connect,
-	.pru_control =		in_control,
-	.pru_detach =		udp_detach,
-	.pru_disconnect =	udp_disconnect,
-	.pru_peeraddr =		in_getpeeraddr,
-	.pru_send =		udp_send,
-	.pru_soreceive =	soreceive_dgram,
-	.pru_sosend =		sosend_dgram,
-	.pru_shutdown =		udp_shutdown,
-	.pru_sockaddr =		in_getsockaddr,
-	.pru_sosetlabel =	in_pcbsosetlabel,
-	.pru_close =		udp_close,
+#define	UDP_PROTOSW							\
+	.pr_type =		SOCK_DGRAM,				\
+	.pr_flags =		PR_ATOMIC | PR_ADDR | PR_CAPATTACH,	\
+	.pr_ctloutput =		udp_ctloutput,				\
+	.pr_abort =		udp_abort,				\
+	.pr_attach =		udp_attach,				\
+	.pr_bind =		udp_bind,				\
+	.pr_connect =		udp_connect,				\
+	.pr_control =		in_control,				\
+	.pr_detach =		udp_detach,				\
+	.pr_disconnect =	udp_disconnect,				\
+	.pr_peeraddr =		in_getpeeraddr,				\
+	.pr_send =		udp_send,				\
+	.pr_soreceive =		soreceive_dgram,			\
+	.pr_sosend =		sosend_dgram,				\
+	.pr_shutdown =		udp_shutdown,				\
+	.pr_sockaddr =		in_getsockaddr,				\
+	.pr_sosetlabel =	in_pcbsosetlabel,			\
+	.pr_close =		udp_close
+
+struct protosw udp_protosw = {
+	.pr_protocol =		IPPROTO_UDP,
+	UDP_PROTOSW
+};
+
+struct protosw udplite_protosw = {
+	.pr_protocol =		IPPROTO_UDPLITE,
+	UDP_PROTOSW
 };
 
 static void
 udp_init(void *arg __unused)
 {
 
 	IPPROTO_REGISTER(IPPROTO_UDP, udp_input, udp_ctlinput);
 	IPPROTO_REGISTER(IPPROTO_UDPLITE, udp_input, udplite_ctlinput);
 }
 SYSINIT(udp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp_init, NULL);
 #endif /* INET */
diff --git a/sys/netinet/udp_var.h b/sys/netinet/udp_var.h
index 8da2592054af..5fb86aa656b2 100644
--- a/sys/netinet/udp_var.h
+++ b/sys/netinet/udp_var.h
@@ -1,183 +1,182 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_var.h	8.1 (Berkeley) 6/10/93
  * $FreeBSD$
  */
 
 #ifndef _NETINET_UDP_VAR_H_
 #define	_NETINET_UDP_VAR_H_
 
 #include <sys/types.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 
 /*
  * UDP kernel structures and variables.
  */
 struct udpiphdr {
 	struct ipovly	ui_i;		/* overlaid ip structure */
 	struct udphdr	ui_u;		/* udp header */
 };
 #define	ui_x1		ui_i.ih_x1
 #define	ui_v		ui_i.ih_x1[0]
 #define	ui_pr		ui_i.ih_pr
 #define	ui_len		ui_i.ih_len
 #define	ui_src		ui_i.ih_src
 #define	ui_dst		ui_i.ih_dst
 #define	ui_sport	ui_u.uh_sport
 #define	ui_dport	ui_u.uh_dport
 #define	ui_ulen		ui_u.uh_ulen
 #define	ui_sum		ui_u.uh_sum
 
 struct inpcb;
 struct mbuf;
 
 #ifdef _KERNEL
 typedef bool(*udp_tun_func_t)(struct mbuf *, int, struct inpcb *,
 			      const struct sockaddr *, void *);
 typedef void(*udp_tun_icmp_t)(int, struct sockaddr *, void *, void *);
 
 /*
  * UDP control block; one per udp.
  */
 struct udpcb {
 	udp_tun_func_t	u_tun_func;	/* UDP kernel tunneling callback. */
 	udp_tun_icmp_t  u_icmp_func;	/* UDP kernel tunneling icmp callback */
 	u_int		u_flags;	/* Generic UDP flags. */
 	uint16_t	u_rxcslen;	/* Coverage for incoming datagrams. */
 	uint16_t	u_txcslen;	/* Coverage for outgoing datagrams. */
 	void 		*u_tun_ctx;	/* Tunneling callback context. */
 };
 
 #define	intoudpcb(ip)	((struct udpcb *)(ip)->inp_ppcb)
 #define	sotoudpcb(so)	(intoudpcb(sotoinpcb(so)))
 #endif
 
 				/* IPsec: ESP in UDP tunneling: */
 #define	UF_ESPINUDP_NON_IKE	0x00000001	/* w/ non-IKE marker .. */
 	/* .. per draft-ietf-ipsec-nat-t-ike-0[01],
 	 * and draft-ietf-ipsec-udp-encaps-(00/)01.txt */
 #define	UF_ESPINUDP		0x00000002	/* w/ non-ESP marker. */
 
 struct udpstat {
 				/* input statistics: */
 	uint64_t udps_ipackets;		/* total input packets */
 	uint64_t udps_hdrops;		/* packet shorter than header */
 	uint64_t udps_badsum;		/* checksum error */
 	uint64_t udps_nosum;		/* no checksum */
 	uint64_t udps_badlen;		/* data length larger than packet */
 	uint64_t udps_noport;		/* no socket on port */
 	uint64_t udps_noportbcast;	/* of above, arrived as broadcast */
 	uint64_t udps_fullsock;		/* not delivered, input socket full */
 	uint64_t udpps_pcbcachemiss;	/* input packets missing pcb cache */
 	uint64_t udpps_pcbhashmiss;	/* input packets not for hashed pcb */
 				/* output statistics: */
 	uint64_t udps_opackets;		/* total output packets */
 	uint64_t udps_fastout;		/* output packets on fast path */
 	/* of no socket on port, arrived as multicast */
 	uint64_t udps_noportmcast;
 	uint64_t udps_filtermcast;	/* blocked by multicast filter */
 };
 
 #ifdef _KERNEL
 #include <sys/counter.h>
 
 VNET_PCPUSTAT_DECLARE(struct udpstat, udpstat);
 /*
  * In-kernel consumers can use these accessor macros directly to update
  * stats.
  */
 #define	UDPSTAT_ADD(name, val)  \
     VNET_PCPUSTAT_ADD(struct udpstat, udpstat, name, (val))
 #define	UDPSTAT_INC(name)	UDPSTAT_ADD(name, 1)
 
 /*
  * Kernel module consumers must use this accessor macro.
  */
 void	kmod_udpstat_inc(int statnum);
 #define	KMOD_UDPSTAT_INC(name)	\
     kmod_udpstat_inc(offsetof(struct udpstat, name) / sizeof(uint64_t))
 #endif
 
 /*
  * Identifiers for UDP sysctl nodes.
  */
 #define	UDPCTL_CHECKSUM		1	/* checksum UDP packets */
 #define	UDPCTL_STATS		2	/* statistics (read-only) */
 #define	UDPCTL_MAXDGRAM		3	/* max datagram size */
 #define	UDPCTL_RECVSPACE	4	/* default receive buffer space */
 #define	UDPCTL_PCBLIST		5	/* list of PCBs for UDP sockets */
 
 #ifdef _KERNEL
 #include <netinet/in_pcb.h>
 SYSCTL_DECL(_net_inet_udp);
 
-extern struct pr_usrreqs	udp_usrreqs;
 VNET_DECLARE(struct inpcbinfo, udbinfo);
 VNET_DECLARE(struct inpcbinfo, ulitecbinfo);
 #define	V_udbinfo		VNET(udbinfo)
 #define	V_ulitecbinfo		VNET(ulitecbinfo)
 
 extern u_long			udp_sendspace;
 extern u_long			udp_recvspace;
 VNET_DECLARE(int, udp_cksum);
 VNET_DECLARE(int, udp_blackhole);
 VNET_DECLARE(bool, udp_blackhole_local);
 VNET_DECLARE(int, udp_log_in_vain);
 #define	V_udp_cksum		VNET(udp_cksum)
 #define	V_udp_blackhole		VNET(udp_blackhole)
 #define	V_udp_blackhole_local	VNET(udp_blackhole_local)
 #define	V_udp_log_in_vain	VNET(udp_log_in_vain)
 
 VNET_DECLARE(int, zero_checksum_port);
 #define	V_zero_checksum_port	VNET(zero_checksum_port)
 
 static __inline struct inpcbinfo *
 udp_get_inpcbinfo(int protocol)
 {
 	return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo;
 }
 
 int		udp_newudpcb(struct inpcb *);
 void		udp_discardcb(struct udpcb *);
 
 int		udp_ctloutput(struct socket *, struct sockopt *);
 void		udplite_input(struct mbuf *, int);
 struct inpcb	*udp_notify(struct inpcb *inp, int errno);
 int		udp_shutdown(struct socket *so);
 
 int		udp_set_kernel_tunneling(struct socket *so, udp_tun_func_t f,
 		    udp_tun_icmp_t i, void *ctx);
 
 #endif /* _KERNEL */
 
 #endif /* _NETINET_UDP_VAR_H_ */
diff --git a/sys/netinet6/in6_proto.c b/sys/netinet6/in6_proto.c
index 963b6a8d9aed..72b84c915641 100644
--- a/sys/netinet6/in6_proto.c
+++ b/sys/netinet6/in6_proto.c
@@ -1,558 +1,442 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: in6_proto.c,v 1.91 2001/05/27 13:28:35 itojun Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)in_proto.c	8.1 (Berkeley) 6/10/93
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_ipstealth.h"
 #include "opt_sctp.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/domain.h>
 #include <sys/mbuf.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/radix.h>
 #include <net/route.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_encap.h>
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #include <netinet/icmp6.h>
 
 #include <netinet/tcp.h>
 #include <netinet/tcp_timer.h>
 #include <netinet/tcp_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet6/tcp6_var.h>
 #include <netinet6/raw_ip6.h>
 #include <netinet6/udp6_var.h>
 #include <netinet6/pim6_var.h>
 #include <netinet6/nd6.h>
 
 #ifdef SCTP
 #include <netinet/in_pcb.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp.h>
 #include <netinet/sctp_var.h>
 #include <netinet6/sctp6_var.h>
 #endif /* SCTP */
 
 #include <netinet6/ip6protosw.h>
 
+/* netinet6/raw_ip6.c */
+extern struct protosw rip6_protosw, icmp6_protosw, dstopts6_protosw,
+    routing6_protosw, frag6_protosw, rawipv4in6_protosw, rawipv6in6_protosw,
+    etherip6_protosw, gre6_protosw, pim6_protosw, rip6wild_protosw;
+/* netinet6/udp6_usrreq.c */
+extern struct protosw udp6_protosw, udplite6_protosw;
+
 /*
  * TCP/IP protocol family: IP6, ICMP6, UDP, TCP.
  */
 FEATURE(inet6, "Internet Protocol version 6");
 
-extern	struct domain inet6domain;
-static	struct pr_usrreqs nousrreqs;
-
-#define PR_LISTEN	0
-#define PR_ABRTACPTDIS	0
-
-/* Spacer for loadable protocols. */
-#define IP6PROTOSPACER   			\
-{						\
-	.pr_domain =		&inet6domain,	\
-	.pr_protocol =		PROTO_SPACER,	\
-	.pr_usrreqs =		&nousrreqs	\
-}
-
-struct protosw inet6sw[] = {
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_UDP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_CAPATTACH,
-	.pr_ctloutput =		ip6_ctloutput,
-	.pr_usrreqs =		&udp6_usrreqs,
-},
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_TCP,
-	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD|
-				    PR_LISTEN|PR_CAPATTACH,
-	.pr_ctloutput =		tcp_ctloutput,
-	.pr_usrreqs =		&tcp6_usrreqs,
-},
-#ifdef SCTP
-{
-	.pr_type =		SOCK_SEQPACKET,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_WANTRCVD,
-	.pr_ctloutput =	sctp_ctloutput,
-	.pr_usrreqs =		&sctp6_usrreqs
-},
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_SCTP,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD,
-	.pr_ctloutput =		sctp_ctloutput,
-	.pr_usrreqs =		&sctp6_usrreqs
-},
-#endif /* SCTP */
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_UDPLITE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_CAPATTACH,
-	.pr_ctloutput =		udp_ctloutput,
-	.pr_usrreqs =		&udp6_usrreqs,
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_RAW,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_ICMPV6,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_DSTOPTS,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&nousrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_ROUTING,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&nousrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_FRAGMENT,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&nousrreqs
-},
-#ifdef INET
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_IPV4,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-#endif /* INET */
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_IPV6,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_ETHERIP,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_GRE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_protocol =		IPPROTO_PIM,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-/* Spacer n-times for loadable protocols. */
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-IP6PROTOSPACER,
-/* raw wildcard */
-{
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inet6domain,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_ctloutput =		rip6_ctloutput,
-	.pr_usrreqs =		&rip6_usrreqs
-},
-};
-
 struct domain inet6domain = {
 	.dom_family =		AF_INET6,
 	.dom_name =		"internet6",
-	.dom_protosw =		(struct protosw *)inet6sw,
-	.dom_protoswNPROTOSW =	(struct protosw *)&inet6sw[nitems(inet6sw)],
 	.dom_rtattach =		in6_inithead,
 #ifdef VIMAGE
 	.dom_rtdetach =		in6_detachhead,
 #endif
 	.dom_ifattach =		in6_domifattach,
 	.dom_ifdetach =		in6_domifdetach,
-	.dom_ifmtu    =		in6_domifmtu
+	.dom_ifmtu    =		in6_domifmtu,
+	.dom_nprotosw =		24,
+	.dom_protosw = {
+		&tcp6_protosw,
+		&udp6_protosw,
+#ifdef SCTP
+		&sctp6_seqpacket_protosw,
+		&sctp6_stream_protosw,
+#else
+		NULL, NULL,
+#endif
+		&udplite6_protosw,
+		&rip6_protosw,
+		/*
+		 * XXXGL: it is entirely possible that all below raw-based
+		 * protosw definitions are not needed.  They could have existed
+		 * just to define pr_input, pr_drain, pr_*timo or PR_LASTHDR
+		 * flag, and were never supposed to create a special socket.
+		 */
+		&icmp6_protosw,
+		&dstopts6_protosw,
+		&routing6_protosw,
+		&frag6_protosw,
+#ifdef INET
+		&rawipv4in6_protosw,
+#else
+		NULL,
+#endif
+		&rawipv6in6_protosw,
+		&etherip6_protosw,
+		&gre6_protosw,
+		&pim6_protosw,
+		/* Spacer 8 times for loadable protocols. XXXGL: why 8? */
+		NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+		&rip6wild_protosw,
+	},
 };
 
 DOMAIN_SET(inet6);
 
 /*
  * Internet configuration info
  */
 #ifndef	IPV6FORWARDING
 #ifdef GATEWAY6
 #define	IPV6FORWARDING	1	/* forward IP6 packets not for us */
 #else
 #define	IPV6FORWARDING	0	/* don't forward IP6 packets not for us */
 #endif /* GATEWAY6 */
 #endif /* !IPV6FORWARDING */
 
 #ifndef	IPV6_SENDREDIRECTS
 #define	IPV6_SENDREDIRECTS	1
 #endif
 
 VNET_DEFINE(int, ip6_forwarding) = IPV6FORWARDING;	/* act as router? */
 VNET_DEFINE(int, ip6_sendredirects) = IPV6_SENDREDIRECTS;
 VNET_DEFINE(int, ip6_defhlim) = IPV6_DEFHLIM;
 VNET_DEFINE(int, ip6_defmcasthlim) = IPV6_DEFAULT_MULTICAST_HOPS;
 VNET_DEFINE(int, ip6_accept_rtadv) = 0;
 VNET_DEFINE(int, ip6_no_radr) = 0;
 VNET_DEFINE(int, ip6_norbit_raif) = 0;
 VNET_DEFINE(int, ip6_rfc6204w3) = 0;
 VNET_DEFINE(int, ip6_log_interval) = 5;
 VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
 					 * process? */
 VNET_DEFINE(int, ip6_dad_count) = 1;	/* DupAddrDetectionTransmits */
 VNET_DEFINE(int, ip6_auto_flowlabel) = 1;
 VNET_DEFINE(int, ip6_use_deprecated) = 1;/* allow deprecated addr
 					 * (RFC2462 5.5.4) */
 VNET_DEFINE(int, ip6_rr_prune) = 5;	/* router renumbering prefix
 					 * walk list every 5 sec. */
 VNET_DEFINE(int, ip6_mcast_pmtu) = 0;	/* enable pMTU discovery for multicast? */
 VNET_DEFINE(int, ip6_v6only) = 1;
 
 VNET_DEFINE(time_t, ip6_log_time) = (time_t)0L;
 #ifdef IPSTEALTH
 VNET_DEFINE(int, ip6stealth) = 0;
 #endif
 VNET_DEFINE(int, nd6_onlink_ns_rfc4861) = 0;/* allow 'on-link' nd6 NS
 					     * (RFC 4861) */
 
 /* icmp6 */
 /*
  * BSDI4 defines these variables in in_proto.c...
  * XXX: what if we don't define INET? Should we define pmtu6_expire
  * or so? (jinmei@kame.net 19990310)
  */
 VNET_DEFINE(int, pmtu_expire) = 60*10;
 VNET_DEFINE(int, pmtu_probe) = 60*2;
 
 /* ICMPV6 parameters */
 VNET_DEFINE(int, icmp6_rediraccept) = 1;/* accept and process redirects */
 VNET_DEFINE(int, icmp6_redirtimeout) = 10 * 60;	/* 10 minutes */
 VNET_DEFINE(int, icmp6errppslim) = 100;		/* 100pps */
 /* control how to respond to NI queries */
 VNET_DEFINE(int, icmp6_nodeinfo) =
     (ICMP6_NODEINFO_FQDNOK|ICMP6_NODEINFO_NODEADDROK);
 VNET_DEFINE(int, icmp6_nodeinfo_oldmcprefix) = 1;
 
 /*
  * sysctl related items.
  */
 SYSCTL_NODE(_net, PF_INET6, inet6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Internet6 Family");
 
 /* net.inet6 */
 SYSCTL_NODE(_net_inet6,	IPPROTO_IPV6, ip6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_ICMPV6, icmp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "ICMP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_UDP, udp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "UDP6");
 SYSCTL_NODE(_net_inet6,	IPPROTO_TCP, tcp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "TCP6");
 #if defined(SCTP) || defined(SCTP_SUPPORT)
 SYSCTL_NODE(_net_inet6,	IPPROTO_SCTP, sctp6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SCTP6");
 #endif
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 SYSCTL_NODE(_net_inet6,	IPPROTO_ESP, ipsec6, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "IPSEC6");
 #endif /* IPSEC */
 
 /* net.inet6.ip6 */
 static int
 sysctl_ip6_temppltime(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = V_ip6_temp_preferred_lifetime;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || !req->newptr)
 		return (error);
 	if (val < V_ip6_desync_factor + V_ip6_temp_regen_advance)
 		return (EINVAL);
 	V_ip6_temp_preferred_lifetime = val;
 	return (0);
 }
 
 static int
 sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
 {
 	int error, val;
 
 	val = V_ip6_temp_valid_lifetime;
 	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error != 0 || !req->newptr)
 		return (error);
 	if (val < V_ip6_temp_preferred_lifetime)
 		return (EINVAL);
 	V_ip6_temp_valid_lifetime = val;
 	return (0);
 }
 
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
 	"Enable forwarding of IPv6 packets between interfaces");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_SENDREDIRECTS, redirect,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_sendredirects), 0,
 	"Send ICMPv6 redirects for unforwardable IPv6 packets");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defhlim), 0,
 	"Default hop limit to use for outgoing IPv6 packets");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
 	ip6stat,
 	"IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0,
 	"Default value of per-interface flag for accepting ICMPv6 RA messages");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NO_RADR, no_radr,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_no_radr), 0,
 	"Default value of per-interface flag to control whether routers "
 	"sending ICMPv6 RA messages on that interface are added into the "
 	"default router list");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_NORBIT_RAIF, norbit_raif,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_norbit_raif), 0,
 	"Always set clear the R flag in ICMPv6 NA messages when accepting RA "
 	"on the interface");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RFC6204W3, rfc6204w3,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rfc6204w3), 0,
 	"Accept the default router list from ICMPv6 RA messages even "
 	"when packet forwarding is enabled");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_LOG_INTERVAL, log_interval,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_log_interval), 0,
 	"Frequency in seconds at which to log IPv6 forwarding errors");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_HDRNESTLIMIT, hdrnestlimit,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_hdrnestlimit), 0,
 	"Default maximum number of IPv6 extension headers permitted on "
 	"incoming IPv6 packets, 0 for no artificial limit");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DAD_COUNT, dad_count,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_dad_count), 0,
 	"Number of ICMPv6 NS messages sent during duplicate address detection");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_FLOWLABEL, auto_flowlabel,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_flowlabel), 0,
 	"Provide an IPv6 flowlabel in outbound packets");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFMCASTHLIM, defmcasthlim,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_defmcasthlim), 0,
 	"Default hop limit for IPv6 multicast packets originating from this "
 	"node");
 SYSCTL_STRING(_net_inet6_ip6, IPV6CTL_KAME_VERSION, kame_version,
 	CTLFLAG_RD, __KAME_VERSION, 0,
 	"KAME version string");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEPRECATED, use_deprecated,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_deprecated), 0,
 	"Allow the use of addresses whose preferred lifetimes have expired");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_RR_PRUNE, rr_prune,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_rr_prune), 0,
 	""); /* XXX unused */
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USETEMPADDR, use_tempaddr,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_tempaddr), 0,
 	"Create RFC3041 temporary addresses for autoconfigured addresses");
 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPPLTIME, temppltime,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	NULL, 0, sysctl_ip6_temppltime, "I",
 	"Maximum preferred lifetime for temporary addresses");
 SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_TEMPVLTIME, tempvltime,
 	CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
 	NULL, 0, sysctl_ip6_tempvltime, "I",
 	"Maximum valid lifetime for temporary addresses");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_V6ONLY, v6only,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_v6only), 0,
 	"Restrict AF_INET6 sockets to IPv6 addresses only");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_AUTO_LINKLOCAL, auto_linklocal,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_auto_linklocal), 0,
 	"Default value of per-interface flag for automatically adding an IPv6 "
 	"link-local address to interfaces when attached");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_RIP6STATS, rip6stats,
 	struct rip6stat, rip6stat,
 	"Raw IP6 statistics (struct rip6stat, netinet6/raw_ip6.h)");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_PREFER_TEMPADDR, prefer_tempaddr,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_prefer_tempaddr), 0,
 	"Prefer RFC3041 temporary addresses in source address selection");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,
 	"Use the default scope zone when none is specified");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0,
 	"Enable path MTU discovery for multicast packets");
 #ifdef IPSTEALTH
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_STEALTH, stealth, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(ip6stealth), 0,
 	"Forward IPv6 packets without decrementing their TTL");
 #endif
 
 /* net.inet6.icmp6 */
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT, rediraccept,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_rediraccept), 0,
 	"Accept ICMPv6 redirect messages");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRTIMEOUT, redirtimeout,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_redirtimeout), 0,
 	"Delay in seconds before expiring redirect route");
 SYSCTL_VNET_PCPUSTAT(_net_inet6_icmp6, ICMPV6CTL_STATS, stats,
 	struct icmp6stat, icmp6stat,
 	"ICMPv6 statistics (struct icmp6stat, netinet/icmp6.h)");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_PRUNE, nd6_prune,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_prune), 0,
 	"Frequency in seconds of checks for expired prefixes and routers");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DELAY, nd6_delay,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_delay), 0,
 	"Delay in seconds before probing for reachability");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_UMAXTRIES, nd6_umaxtries,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_umaxtries), 0,
 	"Number of ICMPv6 NS messages sent during reachability detection");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MMAXTRIES, nd6_mmaxtries,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_mmaxtries), 0,
 	"Number of ICMPv6 NS messages sent during address resolution");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_USELOOPBACK, nd6_useloopback,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_useloopback), 0,
 	"Create a loopback route when configuring an IPv6 address");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO, nodeinfo,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6_nodeinfo), 0,
 	"Mask of enabled RFC4620 node information query types");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_NODEINFO_OLDMCPREFIX,
 	nodeinfo_oldmcprefix, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(icmp6_nodeinfo_oldmcprefix), 0,
 	"Join old IPv6 NI group address in draft-ietf-ipngwg-icmp-name-lookup "
 	"for compatibility with KAME implementation");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ERRPPSLIMIT, errppslimit,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(icmp6errppslim), 0,
 	"Maximum number of ICMPv6 error messages per second");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_MAXNUDHINT, nd6_maxnudhint,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_maxnudhint), 0,
 	""); /* XXX unused */
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_DEBUG, nd6_debug,
 	CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nd6_debug), 0,
 	"Log NDP debug messages");
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_ND6_ONLINKNSRFC4861,
 	nd6_onlink_ns_rfc4861, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(nd6_onlink_ns_rfc4861), 0,
 	"Accept 'on-link' ICMPv6 NS messages in compliance with RFC 4861");
 #ifdef EXPERIMENTAL
 SYSCTL_INT(_net_inet6_icmp6, OID_AUTO,
 	nd6_ignore_ipv6_only_ra, CTLFLAG_VNET | CTLFLAG_RW,
 	&VNET_NAME(nd6_ignore_ipv6_only_ra), 0,
 	"Ignore the 'IPv6-Only flag' in RA messages in compliance with "
 	"draft-ietf-6man-ipv6only-flag");
 #endif
diff --git a/sys/netinet6/raw_ip6.c b/sys/netinet6/raw_ip6.c
index 678121b0ca32..091e90b0eaed 100644
--- a/sys/netinet6/raw_ip6.c
+++ b/sys/netinet6/raw_ip6.c
@@ -1,885 +1,934 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1993
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)raw_ip.c	8.2 (Berkeley) 1/4/94
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_ipsec.h"
 #include "opt_inet6.h"
 #include "opt_route.h"
 
 #include <sys/param.h>
 #include <sys/errno.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/syslog.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_pcb.h>
 
 #include <netinet/icmp6.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_mroute.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/raw_ip6.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/send.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <machine/stdarg.h>
 
 #define	satosin6(sa)	((struct sockaddr_in6 *)(sa))
 #define	ifatoia6(ifa)	((struct in6_ifaddr *)(ifa))
 
 /*
  * Raw interface to IP6 protocol.
  */
 
 VNET_DECLARE(struct inpcbinfo, ripcbinfo);
 #define	V_ripcbinfo			VNET(ripcbinfo)
 
 extern u_long	rip_sendspace;
 extern u_long	rip_recvspace;
 
 VNET_PCPUSTAT_DEFINE(struct rip6stat, rip6stat);
 VNET_PCPUSTAT_SYSINIT(rip6stat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(rip6stat);
 #endif /* VIMAGE */
 
 /*
  * Hooks for multicast routing. They all default to NULL, so leave them not
  * initialized and rely on BSS being set to 0.
  */
 
 /*
  * The socket used to communicate with the multicast routing daemon.
  */
 VNET_DEFINE(struct socket *, ip6_mrouter);
 
 /*
  * The various mrouter functions.
  */
 int (*ip6_mrouter_set)(struct socket *, struct sockopt *);
 int (*ip6_mrouter_get)(struct socket *, struct sockopt *);
 int (*ip6_mrouter_done)(void);
 int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *);
 int (*mrt6_ioctl)(u_long, caddr_t);
 
 struct rip6_inp_match_ctx {
 	struct ip6_hdr *ip6;
 	int proto;
 };
 
 static bool
 rip6_inp_match(const struct inpcb *inp, void *v)
 {
 	struct rip6_inp_match_ctx *c = v;
 	struct ip6_hdr *ip6 = c->ip6;
 	int proto = c->proto;
 
 	/* XXX inp locking */
 	if ((inp->inp_vflag & INP_IPV6) == 0)
 		return (false);
 	if (inp->inp_ip_p && inp->inp_ip_p != proto)
 		return (false);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst))
 		return (false);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src))
 		return (false);
 
 	return (true);
 }
 
 /*
  * Setup generic address and protocol structures for raw_input routine, then
  * pass them along with mbuf chain.
  */
 int
 rip6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct ifnet *ifp;
 	struct mbuf *n, *m = *mp;
 	struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 	struct inpcb *inp;
 	struct mbuf *opts = NULL;
 	struct sockaddr_in6 fromsa;
 	struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto };
 	struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo,
 	    INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx);
 	int delivered = 0;
 
 	NET_EPOCH_ASSERT();
 
 	RIP6STAT_INC(rip6s_ipackets);
 
 	init_sin6(&fromsa, m, 0); /* general init */
 
 	ifp = m->m_pkthdr.rcvif;
 
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 		/*
 		 * Check AH/ESP integrity.
 		 */
 		if (IPSEC_ENABLED(ipv6) &&
 		    IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) {
 			/* Do not inject data into pcb. */
 			continue;
 		}
 #endif /* IPSEC */
 		if (jailed_without_vnet(inp->inp_cred) &&
 		    !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
 		    prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0)
 			/*
 			 * Allow raw socket in jail to receive multicast;
 			 * assume process had PRIV_NETINET_RAW at attach,
 			 * and fall through into normal filter path if so.
 			 */
 			continue;
 		if (inp->in6p_cksum != -1) {
 			RIP6STAT_INC(rip6s_isum);
 			if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 ||
 			    in6_cksum(m, proto, *offp,
 			    m->m_pkthdr.len - *offp)) {
 				RIP6STAT_INC(rip6s_badsum);
 				/*
 				 * Drop the received message, don't send an
 				 * ICMP6 message. Set proto to IPPROTO_NONE
 				 * to achieve that.
 				 */
 				INP_RUNLOCK(inp);
 				proto = IPPROTO_NONE;
 				break;
 			}
 		}
 		/*
 		 * If this raw socket has multicast state, and we
 		 * have received a multicast, check if this socket
 		 * should receive it, as multicast filtering is now
 		 * the responsibility of the transport layer.
 		 */
 		if (inp->in6p_moptions &&
 		    IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 			/*
 			 * If the incoming datagram is for MLD, allow it
 			 * through unconditionally to the raw socket.
 			 *
 			 * Use the M_RTALERT_MLD flag to check for MLD
 			 * traffic without having to inspect the mbuf chain
 			 * more deeply, as all MLDv1/v2 host messages MUST
 			 * contain the Router Alert option.
 			 *
 			 * In the case of MLDv1, we may not have explicitly
 			 * joined the group, and may have set IFF_ALLMULTI
 			 * on the interface. im6o_mc_filter() may discard
 			 * control traffic we actually need to see.
 			 *
 			 * Userland multicast routing daemons should continue
 			 * filter the control traffic appropriately.
 			 */
 			int blocked;
 
 			blocked = MCAST_PASS;
 			if ((m->m_flags & M_RTALERT_MLD) == 0) {
 				struct sockaddr_in6 mcaddr;
 
 				bzero(&mcaddr, sizeof(struct sockaddr_in6));
 				mcaddr.sin6_len = sizeof(struct sockaddr_in6);
 				mcaddr.sin6_family = AF_INET6;
 				mcaddr.sin6_addr = ip6->ip6_dst;
 
 				blocked = im6o_mc_filter(inp->in6p_moptions,
 				    ifp,
 				    (struct sockaddr *)&mcaddr,
 				    (struct sockaddr *)&fromsa);
 			}
 			if (blocked != MCAST_PASS) {
 				IP6STAT_INC(ip6s_notmember);
 				continue;
 			}
 		}
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL)
 			continue;
 		if (inp->inp_flags & INP_CONTROLOPTS ||
 		    inp->inp_socket->so_options & SO_TIMESTAMP)
 			ip6_savecontrol(inp, n, &opts);
 		/* strip intermediate headers */
 		m_adj(n, *offp);
 		if (sbappendaddr(&inp->inp_socket->so_rcv,
 		    (struct sockaddr *)&fromsa, n, opts) == 0) {
 			soroverflow(inp->inp_socket);
 			m_freem(n);
 			if (opts)
 				m_freem(opts);
 			RIP6STAT_INC(rip6s_fullsock);
 		} else {
 			sorwakeup(inp->inp_socket);
 			delivered++;
 		}
 		opts = NULL;
 	}
 	if (delivered == 0) {
 		RIP6STAT_INC(rip6s_nosock);
 		if (m->m_flags & M_MCAST)
 			RIP6STAT_INC(rip6s_nosockmcast);
 		if (proto == IPPROTO_NONE)
 			m_freem(m);
 		else
 			icmp6_error(m, ICMP6_PARAM_PROB,
 			    ICMP6_PARAMPROB_NEXTHEADER,
 			    ip6_get_prevhdr(m, *offp));
 		IP6STAT_DEC(ip6s_delivered);
 	} else
 		m_freem(m);
 	return (IPPROTO_DONE);
 }
 
 void
 rip6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	void *cmdarg;
 	struct inpcb *(*notify)(struct inpcb *, int) = in6_rtchange;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if ((unsigned)cmd >= PRC_NCMDS)
 		return;
 	if (PRC_IS_REDIRECT(cmd))
 		notify = in6_rtchange, d = NULL;
 	else if (cmd == PRC_HOSTDEAD)
 		d = NULL;
 	else if (inet6ctlerrmap[cmd] == 0)
 		return;
 
 	/*
 	 * If the parameter is from icmp6, decode it.
 	 */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		cmdarg = ip6cp->ip6c_cmdarg;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		cmdarg = NULL;
 		sa6_src = &sa6_any;
 	}
 
 	(void) in6_pcbnotify(&V_ripcbinfo, sa, 0,
 	    (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
 }
 
 /*
  * Generate IPv6 header and pass packet to ip6_output.  Tack on options user
  * may have setup with control call.
  */
 static int
 rip6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct sockaddr_in6 tmp, *dstsock;
 	struct m_tag *mtag;
 	struct ip6_hdr *ip6;
 	u_int	plen = m->m_pkthdr.len;
 	struct ip6_pktopts opt, *optp;
 	struct ifnet *oifp = NULL;
 	int error;
 	int type = 0, code = 0;		/* for ICMPv6 output statistics only */
 	int scope_ambiguous = 0;
 	int use_defzone = 0;
 	int hlim = 0;
 	struct in6_addr in6a;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_send: inp == NULL"));
 
 	/* Always copy sockaddr to avoid overwrites. */
 	/* Unlocked read. */
 	if (so->so_state & SS_ISCONNECTED) {
 		if (nam) {
 			error = EISCONN;
 			goto release;
 		}
 		tmp = (struct sockaddr_in6 ){
 			.sin6_family = AF_INET6,
 			.sin6_len = sizeof(struct sockaddr_in6),
 		};
 		INP_RLOCK(inp);
 		bcopy(&inp->in6p_faddr, &tmp.sin6_addr,
 		    sizeof(struct in6_addr));
 		INP_RUNLOCK(inp);
 		dstsock = &tmp;
 	} else {
 		if (nam == NULL)
 			error = ENOTCONN;
 		else if (nam->sa_family != AF_INET6)
 			error = EAFNOSUPPORT;
 		else if (nam->sa_len != sizeof(struct sockaddr_in6))
 			error = EINVAL;
 		else
 			error = 0;
 		if (error != 0)
 			goto release;
 		dstsock = (struct sockaddr_in6 *)nam;
 		if (dstsock->sin6_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			goto release;
 		}
 	}
 
 	INP_WLOCK(inp);
 
 	if (control != NULL) {
 		NET_EPOCH_ENTER(et);
 		error = ip6_setpktopts(control, &opt,
 		    inp->in6p_outputopts, so->so_cred,
 		    so->so_proto->pr_protocol);
 		NET_EPOCH_EXIT(et);
 
 		if (error != 0) {
 			goto bad;
 		}
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
 	/*
 	 * Check and convert scope zone ID into internal form.
 	 *
 	 * XXX: we may still need to determine the zone later.
 	 */
 	if (!(so->so_state & SS_ISCONNECTED)) {
 		if (!optp || !optp->ip6po_pktinfo ||
 		    !optp->ip6po_pktinfo->ipi6_ifindex)
 			use_defzone = V_ip6_use_defzone;
 		if (dstsock->sin6_scope_id == 0 && !use_defzone)
 			scope_ambiguous = 1;
 		if ((error = sa6_embedscope(dstsock, use_defzone)) != 0)
 			goto bad;
 	}
 
 	/*
 	 * For an ICMPv6 packet, we should know its type and code to update
 	 * statistics.
 	 */
 	if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
 		struct icmp6_hdr *icmp6;
 		if (m->m_len < sizeof(struct icmp6_hdr) &&
 		    (m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
 			error = ENOBUFS;
 			goto bad;
 		}
 		icmp6 = mtod(m, struct icmp6_hdr *);
 		type = icmp6->icmp6_type;
 		code = icmp6->icmp6_code;
 	}
 
 	M_PREPEND(m, sizeof(*ip6), M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto bad;
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 
 #ifdef ROUTE_MPATH
 	if (CALC_FLOWID_OUTBOUND) {
 		uint32_t hash_type, hash_val;
 
 		hash_val = fib6_calc_software_hash(&inp->in6p_laddr,
 		    &dstsock->sin6_addr, 0, 0, so->so_proto->pr_protocol,
 		    &hash_type);
 		inp->inp_flowid = hash_val;
 		inp->inp_flowtype = hash_type;
 	}
 #endif
 	/*
 	 * Source address selection.
 	 */
 	NET_EPOCH_ENTER(et);
 	error = in6_selectsrc_socket(dstsock, optp, inp, so->so_cred,
 	    scope_ambiguous, &in6a, &hlim);
 	NET_EPOCH_EXIT(et);
 
 	if (error)
 		goto bad;
 	error = prison_check_ip6(inp->inp_cred, &in6a);
 	if (error != 0)
 		goto bad;
 	ip6->ip6_src = in6a;
 
 	ip6->ip6_dst = dstsock->sin6_addr;
 
 	/*
 	 * Fill in the rest of the IPv6 header fields.
 	 */
 	ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
 	    (inp->inp_flow & IPV6_FLOWINFO_MASK);
 	ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 	    (IPV6_VERSION & IPV6_VERSION_MASK);
 
 	/*
 	 * ip6_plen will be filled in ip6_output, so not fill it here.
 	 */
 	ip6->ip6_nxt = inp->inp_ip_p;
 	ip6->ip6_hlim = hlim;
 
 	if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
 	    inp->in6p_cksum != -1) {
 		struct mbuf *n;
 		int off;
 		u_int16_t *p;
 
 		/* Compute checksum. */
 		if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
 			off = offsetof(struct icmp6_hdr, icmp6_cksum);
 		else
 			off = inp->in6p_cksum;
 		if (plen < off + 2) {
 			error = EINVAL;
 			goto bad;
 		}
 		off += sizeof(struct ip6_hdr);
 
 		n = m;
 		while (n && n->m_len <= off) {
 			off -= n->m_len;
 			n = n->m_next;
 		}
 		if (!n)
 			goto bad;
 		p = (u_int16_t *)(mtod(n, caddr_t) + off);
 		*p = 0;
 		*p = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
 	}
 
 	/*
 	 * Send RA/RS messages to user land for protection, before sending
 	 * them to rtadvd/rtsol.
 	 */
 	if ((send_sendso_input_hook != NULL) &&
 	    so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
 		switch (type) {
 		case ND_ROUTER_ADVERT:
 		case ND_ROUTER_SOLICIT:
 			mtag = m_tag_get(PACKET_TAG_ND_OUTGOING,
 				sizeof(unsigned short), M_NOWAIT);
 			if (mtag == NULL)
 				goto bad;
 			m_tag_prepend(m, mtag);
 		}
 	}
 
 	NET_EPOCH_ENTER(et);
 	error = ip6_output(m, optp, NULL, 0, inp->in6p_moptions, &oifp, inp);
 	NET_EPOCH_EXIT(et);
 	if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
 		if (oifp)
 			icmp6_ifoutstat_inc(oifp, type, code);
 		ICMP6STAT_INC(icp6s_outhist[type]);
 	} else
 		RIP6STAT_INC(rip6s_opackets);
 
 	goto freectl;
 
  bad:
 	if (m)
 		m_freem(m);
 
  freectl:
 	if (control != NULL) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	INP_WUNLOCK(inp);
 	return (error);
 
 release:
 	if (control != NULL)
 		m_freem(control);
 	m_freem(m);
 	return (error);
 }
 
 /*
  * Raw IPv6 socket option processing.
  */
 int
 rip6_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	struct inpcb *inp;
 	int error;
 
 	if (sopt->sopt_level == IPPROTO_ICMPV6)
 		/*
 		 * XXX: is it better to call icmp6_ctloutput() directly
 		 * from protosw?
 		 */
 		return (icmp6_ctloutput(so, sopt));
 	else if (sopt->sopt_level != IPPROTO_IPV6) {
 		if (sopt->sopt_level == SOL_SOCKET &&
 		    sopt->sopt_name == SO_SETFIB) {
 			inp = sotoinpcb(so);
 			INP_WLOCK(inp);
 			inp->inp_inc.inc_fibnum = so->so_fibnum;
 			INP_WUNLOCK(inp);
 			return (0);
 		}
 		return (EINVAL);
 	}
 
 	error = 0;
 
 	switch (sopt->sopt_dir) {
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case MRT6_INIT:
 		case MRT6_DONE:
 		case MRT6_ADD_MIF:
 		case MRT6_DEL_MIF:
 		case MRT6_ADD_MFC:
 		case MRT6_DEL_MFC:
 		case MRT6_PIM:
 			error = ip6_mrouter_get ?  ip6_mrouter_get(so, sopt) :
 			    EOPNOTSUPP;
 			break;
 		case IPV6_CHECKSUM:
 			error = ip6_raw_ctloutput(so, sopt);
 			break;
 		default:
 			error = ip6_ctloutput(so, sopt);
 			break;
 		}
 		break;
 
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case MRT6_INIT:
 		case MRT6_DONE:
 		case MRT6_ADD_MIF:
 		case MRT6_DEL_MIF:
 		case MRT6_ADD_MFC:
 		case MRT6_DEL_MFC:
 		case MRT6_PIM:
 			error = ip6_mrouter_set ?  ip6_mrouter_set(so, sopt) :
 			    EOPNOTSUPP;
 			break;
 		case IPV6_CHECKSUM:
 			error = ip6_raw_ctloutput(so, sopt);
 			break;
 		default:
 			error = ip6_ctloutput(so, sopt);
 			break;
 		}
 		break;
 	}
 
 	return (error);
 }
 
 static int
 rip6_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct icmp6_filter *filter;
 	int error;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("rip6_attach: inp != NULL"));
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error)
 		return (error);
 	error = soreserve(so, rip_sendspace, rip_recvspace);
 	if (error)
 		return (error);
 	filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT);
 	if (filter == NULL)
 		return (ENOMEM);
 	error = in_pcballoc(so, &V_ripcbinfo);
 	if (error) {
 		free(filter, M_PCB);
 		return (error);
 	}
 	inp = (struct inpcb *)so->so_pcb;
 	inp->inp_ip_p = (long)proto;
 	inp->in6p_cksum = -1;
 	inp->in6p_icmp6filt = filter;
 	ICMP6_FILTER_SETPASSALL(inp->in6p_icmp6filt);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static void
 rip6_detach(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_detach: inp == NULL"));
 
 	if (so == V_ip6_mrouter && ip6_mrouter_done)
 		ip6_mrouter_done();
 	/* xxx: RSVP */
 	INP_WLOCK(inp);
 	free(inp->in6p_icmp6filt, M_PCB);
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 }
 
 /* XXXRW: This can't ever be called. */
 static void
 rip6_abort(struct socket *so)
 {
 	struct inpcb *inp __diagused;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_abort: inp == NULL"));
 
 	soisdisconnected(so);
 }
 
 static void
 rip6_close(struct socket *so)
 {
 	struct inpcb *inp __diagused;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_close: inp == NULL"));
 
 	soisdisconnected(so);
 }
 
 static int
 rip6_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_disconnect: inp == NULL"));
 
 	if ((so->so_state & SS_ISCONNECTED) == 0)
 		return (ENOTCONN);
 	inp->in6p_faddr = in6addr_any;
 	rip6_abort(so);
 	return (0);
 }
 
 static int
 rip6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
 	struct ifaddr *ifa = NULL;
 	int error = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_bind: inp == NULL"));
 
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if ((error = prison_check_ip6(td->td_ucred, &addr->sin6_addr)) != 0)
 		return (error);
 	if (CK_STAILQ_EMPTY(&V_ifnet) || addr->sin6_family != AF_INET6)
 		return (EADDRNOTAVAIL);
 	if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
 		return (error);
 
 	NET_EPOCH_ENTER(et);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
 	    (ifa = ifa_ifwithaddr((struct sockaddr *)addr)) == NULL) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
 	if (ifa != NULL &&
 	    ((struct in6_ifaddr *)ifa)->ia6_flags &
 	    (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
 	     IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
 		NET_EPOCH_EXIT(et);
 		return (EADDRNOTAVAIL);
 	}
 	NET_EPOCH_EXIT(et);
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	inp->in6p_laddr = addr->sin6_addr;
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
 	struct in6_addr in6a;
 	struct epoch_tracker et;
 	int error = 0, scope_ambiguous = 0;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_connect: inp == NULL"));
 
 	if (nam->sa_len != sizeof(*addr))
 		return (EINVAL);
 	if (CK_STAILQ_EMPTY(&V_ifnet))
 		return (EADDRNOTAVAIL);
 	if (addr->sin6_family != AF_INET6)
 		return (EAFNOSUPPORT);
 
 	/*
 	 * Application should provide a proper zone ID or the use of default
 	 * zone IDs should be enabled.  Unfortunately, some applications do
 	 * not behave as it should, so we need a workaround.  Even if an
 	 * appropriate ID is not determined, we'll see if we can determine
 	 * the outgoing interface.  If we can, determine the zone ID based on
 	 * the interface below.
 	 */
 	if (addr->sin6_scope_id == 0 && !V_ip6_use_defzone)
 		scope_ambiguous = 1;
 	if ((error = sa6_embedscope(addr, V_ip6_use_defzone)) != 0)
 		return (error);
 
 	INP_WLOCK(inp);
 	INP_INFO_WLOCK(&V_ripcbinfo);
 	/* Source address selection. XXX: need pcblookup? */
 	NET_EPOCH_ENTER(et);
 	error = in6_selectsrc_socket(addr, inp->in6p_outputopts,
 	    inp, so->so_cred, scope_ambiguous, &in6a, NULL);
 	NET_EPOCH_EXIT(et);
 	if (error) {
 		INP_INFO_WUNLOCK(&V_ripcbinfo);
 		INP_WUNLOCK(inp);
 		return (error);
 	}
 
 	inp->in6p_faddr = addr->sin6_addr;
 	inp->in6p_laddr = in6a;
 	soisconnected(so);
 	INP_INFO_WUNLOCK(&V_ripcbinfo);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 rip6_shutdown(struct socket *so)
 {
 	struct inpcb *inp;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("rip6_shutdown: inp == NULL"));
 
 	INP_WLOCK(inp);
 	socantsendmore(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
-struct pr_usrreqs rip6_usrreqs = {
-	.pru_abort =		rip6_abort,
-	.pru_attach =		rip6_attach,
-	.pru_bind =		rip6_bind,
-	.pru_connect =		rip6_connect,
-	.pru_control =		in6_control,
-	.pru_detach =		rip6_detach,
-	.pru_disconnect =	rip6_disconnect,
-	.pru_peeraddr =		in6_getpeeraddr,
-	.pru_send =		rip6_send,
-	.pru_shutdown =		rip6_shutdown,
-	.pru_sockaddr =		in6_getsockaddr,
-	.pru_close =		rip6_close,
+/*
+ * See comment in in6_proto.c containing "protosw definitions are not needed".
+ */
+#define	RAW6_PROTOSW						\
+	.pr_type =		SOCK_RAW,			\
+	.pr_flags =		PR_ATOMIC|PR_ADDR,		\
+	.pr_ctloutput =		rip6_ctloutput,			\
+	.pr_abort =		rip6_abort,			\
+	.pr_attach =		rip6_attach,			\
+	.pr_bind =		rip6_bind,			\
+	.pr_connect =		rip6_connect,			\
+	.pr_control =		in6_control,			\
+	.pr_detach =		rip6_detach,			\
+	.pr_disconnect =	rip6_disconnect,		\
+	.pr_peeraddr =		in6_getpeeraddr,		\
+	.pr_send =		rip6_send,			\
+	.pr_shutdown =		rip6_shutdown,			\
+	.pr_sockaddr =		in6_getsockaddr,		\
+	.pr_close =		rip6_close
+
+struct protosw rip6_protosw = {
+	.pr_protocol =	IPPROTO_RAW,
+	RAW6_PROTOSW
+};
+struct protosw icmp6_protosw = {
+	.pr_protocol =	IPPROTO_ICMPV6,
+	RAW6_PROTOSW
+};
+struct protosw dstopts6_protosw = {
+	.pr_protocol =	IPPROTO_DSTOPTS,
+	RAW6_PROTOSW
+};
+struct protosw routing6_protosw = {
+	.pr_protocol =	IPPROTO_ROUTING,
+	RAW6_PROTOSW
+};
+struct protosw frag6_protosw = {
+	.pr_protocol =	IPPROTO_FRAGMENT,
+	RAW6_PROTOSW
+};
+struct protosw rawipv4in6_protosw = {
+	.pr_protocol =	IPPROTO_IPV4,
+	RAW6_PROTOSW
+};
+struct protosw rawipv6in6_protosw = {
+	.pr_protocol =	IPPROTO_IPV6,
+	RAW6_PROTOSW
+};
+struct protosw etherip6_protosw = {
+	.pr_protocol =	IPPROTO_ETHERIP,
+	RAW6_PROTOSW
+};
+struct protosw gre6_protosw = {
+	.pr_protocol =	IPPROTO_GRE,
+	RAW6_PROTOSW
+};
+struct protosw pim6_protosw = {
+	.pr_protocol =	IPPROTO_PIM,
+	RAW6_PROTOSW
+};
+struct protosw rip6wild_protosw = {
+	RAW6_PROTOSW
 };
diff --git a/sys/netinet6/sctp6_usrreq.c b/sys/netinet6/sctp6_usrreq.c
index 361bc4a18a2f..b14d7d10451a 100644
--- a/sys/netinet6/sctp6_usrreq.c
+++ b/sys/netinet6/sctp6_usrreq.c
@@ -1,1223 +1,1235 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <netinet/sctp_os.h>
 #ifdef INET6
 #include <sys/proc.h>
 #include <netinet/sctp_pcb.h>
 #include <netinet/sctp_header.h>
 #include <netinet/sctp_var.h>
 #include <netinet6/sctp6_var.h>
 #include <netinet/sctp_sysctl.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_uio.h>
 #include <netinet/sctp_asconf.h>
 #include <netinet/sctputil.h>
 #include <netinet/sctp_indata.h>
 #include <netinet/sctp_timer.h>
 #include <netinet/sctp_auth.h>
 #include <netinet/sctp_input.h>
 #include <netinet/sctp_output.h>
 #include <netinet/sctp_bsd_addr.h>
 #include <netinet/sctp_crc32.h>
 #include <netinet/icmp6.h>
 #include <netinet/udp.h>
 
 int
 sctp6_input_with_port(struct mbuf **i_pak, int *offp, uint16_t port)
 {
 	struct mbuf *m;
 	int iphlen;
 	uint32_t vrf_id;
 	uint8_t ecn_bits;
 	struct sockaddr_in6 src, dst;
 	struct ip6_hdr *ip6;
 	struct sctphdr *sh;
 	struct sctp_chunkhdr *ch;
 	int length, offset;
 	uint8_t compute_crc;
 	uint32_t mflowid;
 	uint8_t mflowtype;
 	uint16_t fibnum;
 
 	iphlen = *offp;
 	if (SCTP_GET_PKT_VRFID(*i_pak, vrf_id)) {
 		SCTP_RELEASE_PKT(*i_pak);
 		return (IPPROTO_DONE);
 	}
 	m = SCTP_HEADER_TO_CHAIN(*i_pak);
 #ifdef SCTP_MBUF_LOGGING
 	/* Log in any input mbufs */
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_MBUF_LOGGING_ENABLE) {
 		sctp_log_mbc(m, SCTP_MBUF_INPUT);
 	}
 #endif
 #ifdef SCTP_PACKET_LOGGING
 	if (SCTP_BASE_SYSCTL(sctp_logging_level) & SCTP_LAST_PACKET_TRACING) {
 		sctp_packet_log(m);
 	}
 #endif
 	SCTPDBG(SCTP_DEBUG_CRCOFFLOAD,
 	    "sctp6_input(): Packet of length %d received on %s with csum_flags 0x%b.\n",
 	    m->m_pkthdr.len,
 	    if_name(m->m_pkthdr.rcvif),
 	    (int)m->m_pkthdr.csum_flags, CSUM_BITS);
 	mflowid = m->m_pkthdr.flowid;
 	mflowtype = M_HASHTYPE_GET(m);
 	fibnum = M_GETFIB(m);
 	SCTP_STAT_INCR(sctps_recvpackets);
 	SCTP_STAT_INCR_COUNTER64(sctps_inpackets);
 	/* Get IP, SCTP, and first chunk header together in the first mbuf. */
 	offset = iphlen + sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr);
 	if (m->m_len < offset) {
 		m = m_pullup(m, offset);
 		if (m == NULL) {
 			SCTP_STAT_INCR(sctps_hdrops);
 			return (IPPROTO_DONE);
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 	sh = (struct sctphdr *)(mtod(m, caddr_t)+iphlen);
 	ch = (struct sctp_chunkhdr *)((caddr_t)sh + sizeof(struct sctphdr));
 	offset -= sizeof(struct sctp_chunkhdr);
 	memset(&src, 0, sizeof(struct sockaddr_in6));
 	src.sin6_family = AF_INET6;
 	src.sin6_len = sizeof(struct sockaddr_in6);
 	src.sin6_port = sh->src_port;
 	src.sin6_addr = ip6->ip6_src;
 	if (in6_setscope(&src.sin6_addr, m->m_pkthdr.rcvif, NULL) != 0) {
 		goto out;
 	}
 	memset(&dst, 0, sizeof(struct sockaddr_in6));
 	dst.sin6_family = AF_INET6;
 	dst.sin6_len = sizeof(struct sockaddr_in6);
 	dst.sin6_port = sh->dest_port;
 	dst.sin6_addr = ip6->ip6_dst;
 	if (in6_setscope(&dst.sin6_addr, m->m_pkthdr.rcvif, NULL) != 0) {
 		goto out;
 	}
 	length = ntohs(ip6->ip6_plen) + iphlen;
 	/* Validate mbuf chain length with IP payload length. */
 	if (SCTP_HEADER_LEN(m) != length) {
 		SCTPDBG(SCTP_DEBUG_INPUT1,
 		    "sctp6_input() length:%d reported length:%d\n", length, SCTP_HEADER_LEN(m));
 		SCTP_STAT_INCR(sctps_hdrops);
 		goto out;
 	}
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		goto out;
 	}
 	ecn_bits = IPV6_TRAFFIC_CLASS(ip6);
 	if (m->m_pkthdr.csum_flags & CSUM_SCTP_VALID) {
 		SCTP_STAT_INCR(sctps_recvhwcrc);
 		compute_crc = 0;
 	} else {
 		SCTP_STAT_INCR(sctps_recvswcrc);
 		compute_crc = 1;
 	}
 	sctp_common_input_processing(&m, iphlen, offset, length,
 	    (struct sockaddr *)&src,
 	    (struct sockaddr *)&dst,
 	    sh, ch,
 	    compute_crc,
 	    ecn_bits,
 	    mflowtype, mflowid, fibnum,
 	    vrf_id, port);
 out:
 	if (m) {
 		sctp_m_freem(m);
 	}
 	return (IPPROTO_DONE);
 }
 
 int
 sctp6_input(struct mbuf **i_pak, int *offp, int proto SCTP_UNUSED)
 {
 	return (sctp6_input_with_port(i_pak, offp, 0));
 }
 
 void
 sctp6_notify(struct sctp_inpcb *inp,
     struct sctp_tcb *stcb,
     struct sctp_nets *net,
     uint8_t icmp6_type,
     uint8_t icmp6_code,
     uint32_t next_mtu)
 {
 	int timer_stopped;
 
 	switch (icmp6_type) {
 	case ICMP6_DST_UNREACH:
 		if ((icmp6_code == ICMP6_DST_UNREACH_NOROUTE) ||
 		    (icmp6_code == ICMP6_DST_UNREACH_ADMIN) ||
 		    (icmp6_code == ICMP6_DST_UNREACH_BEYONDSCOPE) ||
 		    (icmp6_code == ICMP6_DST_UNREACH_ADDR)) {
 			/* Mark the net unreachable. */
 			if (net->dest_state & SCTP_ADDR_REACHABLE) {
 				/* Ok that destination is not reachable */
 				net->dest_state &= ~SCTP_ADDR_REACHABLE;
 				net->dest_state &= ~SCTP_ADDR_PF;
 				sctp_ulp_notify(SCTP_NOTIFY_INTERFACE_DOWN,
 				    stcb, 0, (void *)net, SCTP_SO_NOT_LOCKED);
 			}
 		}
 		SCTP_TCB_UNLOCK(stcb);
 		break;
 	case ICMP6_PARAM_PROB:
 		/* Treat it like an ABORT. */
 		if (icmp6_code == ICMP6_PARAMPROB_NEXTHEADER) {
 			sctp_abort_notification(stcb, true, false, 0, NULL, SCTP_SO_NOT_LOCKED);
 			(void)sctp_free_assoc(inp, stcb, SCTP_NORMAL_PROC,
 			    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_2);
 		} else {
 			SCTP_TCB_UNLOCK(stcb);
 		}
 		break;
 	case ICMP6_PACKET_TOO_BIG:
 		if (net->dest_state & SCTP_ADDR_NO_PMTUD) {
 			SCTP_TCB_UNLOCK(stcb);
 			break;
 		}
 		if (SCTP_OS_TIMER_PENDING(&net->pmtu_timer.timer)) {
 			timer_stopped = 1;
 			sctp_timer_stop(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net,
 			    SCTP_FROM_SCTP_USRREQ + SCTP_LOC_1);
 		} else {
 			timer_stopped = 0;
 		}
 		/* Update the path MTU. */
 		if (net->port) {
 			next_mtu -= sizeof(struct udphdr);
 		}
 		if (net->mtu > next_mtu) {
 			net->mtu = next_mtu;
 			if (net->port) {
 				sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu + sizeof(struct udphdr));
 			} else {
 				sctp_hc_set_mtu(&net->ro._l_addr, inp->fibnum, next_mtu);
 			}
 		}
 		/* Update the association MTU */
 		if (stcb->asoc.smallest_mtu > next_mtu) {
 			sctp_pathmtu_adjustment(stcb, next_mtu, true);
 		}
 		/* Finally, start the PMTU timer if it was running before. */
 		if (timer_stopped) {
 			sctp_timer_start(SCTP_TIMER_TYPE_PATHMTURAISE, inp, stcb, net);
 		}
 		SCTP_TCB_UNLOCK(stcb);
 		break;
 	default:
 		SCTP_TCB_UNLOCK(stcb);
 		break;
 	}
 }
 
 void
 sctp6_ctlinput(int cmd, struct sockaddr *pktdst, void *d)
 {
 	struct ip6ctlparam *ip6cp;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	struct sctphdr sh;
 	struct sockaddr_in6 src, dst;
 
 	if (pktdst->sa_family != AF_INET6 ||
 	    pktdst->sa_len != sizeof(struct sockaddr_in6)) {
 		return;
 	}
 
 	if ((unsigned)cmd >= PRC_NCMDS) {
 		return;
 	}
 	if (PRC_IS_REDIRECT(cmd)) {
 		d = NULL;
 	} else if (inet6ctlerrmap[cmd] == 0) {
 		return;
 	}
 	/* If the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 	} else {
 		ip6cp = (struct ip6ctlparam *)NULL;
 	}
 
 	if (ip6cp != NULL) {
 		/*
 		 * XXX: We assume that when IPV6 is non NULL, M and OFF are
 		 * valid.
 		 */
 		if (ip6cp->ip6c_m == NULL) {
 			return;
 		}
 
 		/*
 		 * Check if we can safely examine the ports and the
 		 * verification tag of the SCTP common header.
 		 */
 		if (ip6cp->ip6c_m->m_pkthdr.len <
 		    (int32_t)(ip6cp->ip6c_off + offsetof(struct sctphdr, checksum))) {
 			return;
 		}
 
 		/* Copy out the port numbers and the verification tag. */
 		memset(&sh, 0, sizeof(sh));
 		m_copydata(ip6cp->ip6c_m,
 		    ip6cp->ip6c_off,
 		    sizeof(uint16_t) + sizeof(uint16_t) + sizeof(uint32_t),
 		    (caddr_t)&sh);
 		memset(&src, 0, sizeof(struct sockaddr_in6));
 		src.sin6_family = AF_INET6;
 		src.sin6_len = sizeof(struct sockaddr_in6);
 		src.sin6_port = sh.src_port;
 		src.sin6_addr = ip6cp->ip6c_ip6->ip6_src;
 		if (in6_setscope(&src.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
 			return;
 		}
 		memset(&dst, 0, sizeof(struct sockaddr_in6));
 		dst.sin6_family = AF_INET6;
 		dst.sin6_len = sizeof(struct sockaddr_in6);
 		dst.sin6_port = sh.dest_port;
 		dst.sin6_addr = ip6cp->ip6c_ip6->ip6_dst;
 		if (in6_setscope(&dst.sin6_addr, ip6cp->ip6c_m->m_pkthdr.rcvif, NULL) != 0) {
 			return;
 		}
 		inp = NULL;
 		net = NULL;
 		stcb = sctp_findassociation_addr_sa((struct sockaddr *)&dst,
 		    (struct sockaddr *)&src,
 		    &inp, &net, 1, SCTP_DEFAULT_VRFID);
 		if ((stcb != NULL) &&
 		    (net != NULL) &&
 		    (inp != NULL)) {
 			/* Check the verification tag */
 			if (ntohl(sh.v_tag) != 0) {
 				/*
 				 * This must be the verification tag used
 				 * for sending out packets. We don't
 				 * consider packets reflecting the
 				 * verification tag.
 				 */
 				if (ntohl(sh.v_tag) != stcb->asoc.peer_vtag) {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			} else {
 				if (ip6cp->ip6c_m->m_pkthdr.len >=
 				    ip6cp->ip6c_off + sizeof(struct sctphdr) +
 				    sizeof(struct sctp_chunkhdr) +
 				    offsetof(struct sctp_init, a_rwnd)) {
 					/*
 					 * In this case we can check if we
 					 * got an INIT chunk and if the
 					 * initiate tag matches.
 					 */
 					uint32_t initiate_tag;
 					uint8_t chunk_type;
 
 					m_copydata(ip6cp->ip6c_m,
 					    ip6cp->ip6c_off +
 					    sizeof(struct sctphdr),
 					    sizeof(uint8_t),
 					    (caddr_t)&chunk_type);
 					m_copydata(ip6cp->ip6c_m,
 					    ip6cp->ip6c_off +
 					    sizeof(struct sctphdr) +
 					    sizeof(struct sctp_chunkhdr),
 					    sizeof(uint32_t),
 					    (caddr_t)&initiate_tag);
 					if ((chunk_type != SCTP_INITIATION) ||
 					    (ntohl(initiate_tag) != stcb->asoc.my_vtag)) {
 						SCTP_TCB_UNLOCK(stcb);
 						return;
 					}
 				} else {
 					SCTP_TCB_UNLOCK(stcb);
 					return;
 				}
 			}
 			sctp6_notify(inp, stcb, net,
 			    ip6cp->ip6c_icmp6->icmp6_type,
 			    ip6cp->ip6c_icmp6->icmp6_code,
 			    ntohl(ip6cp->ip6c_icmp6->icmp6_mtu));
 		} else {
 			if ((stcb == NULL) && (inp != NULL)) {
 				/* reduce inp's ref-count */
 				SCTP_INP_WLOCK(inp);
 				SCTP_INP_DECR_REF(inp);
 				SCTP_INP_WUNLOCK(inp);
 			}
 			if (stcb) {
 				SCTP_TCB_UNLOCK(stcb);
 			}
 		}
 	}
 }
 
 /*
  * this routine can probably be collasped into the one in sctp_userreq.c
  * since they do the same thing and now we lookup with a sockaddr
  */
 static int
 sctp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct sctp_inpcb *inp;
 	struct sctp_nets *net;
 	struct sctp_tcb *stcb;
 	int error;
 	uint32_t vrf_id;
 
 	vrf_id = SCTP_DEFAULT_VRFID;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 
 	if (req->newlen != sizeof(addrs)) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	if (req->oldlen != sizeof(struct ucred)) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 
 	stcb = sctp_findassociation_addr_sa(sin6tosa(&addrs[1]),
 	    sin6tosa(&addrs[0]),
 	    &inp, &net, 1, vrf_id);
 	if (stcb == NULL || inp == NULL || inp->sctp_socket == NULL) {
 		if ((inp != NULL) && (stcb == NULL)) {
 			/* reduce ref-count */
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			goto cred_can_cont;
 		}
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT);
 		error = ENOENT;
 		goto out;
 	}
 	SCTP_TCB_UNLOCK(stcb);
 	/*
 	 * We use the write lock here, only since in the error leg we need
 	 * it. If we used RLOCK, then we would have to
 	 * wlock/decr/unlock/rlock. Which in theory could create a hole.
 	 * Better to use higher wlock.
 	 */
 	SCTP_INP_WLOCK(inp);
 cred_can_cont:
 	error = cr_canseesocket(req->td->td_ucred, inp->sctp_socket);
 	if (error) {
 		SCTP_INP_WUNLOCK(inp);
 		goto out;
 	}
 	cru2x(inp->sctp_socket->so_cred, &xuc);
 	SCTP_INP_WUNLOCK(inp);
 	error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 out:
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_sctp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
     0, 0, sctp6_getcred, "S,ucred",
     "Get the ucred of a SCTP6 connection");
 
 /* This is the same as the sctp_abort() could be made common */
 static void
 sctp6_abort(struct socket *so)
 {
 	struct epoch_tracker et;
 	struct sctp_inpcb *inp;
 	uint32_t flags;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return;
 	}
 	NET_EPOCH_ENTER(et);
 sctp_must_try_again:
 	flags = inp->sctp_flags;
 #ifdef SCTP_LOG_CLOSING
 	sctp_log_closing(inp, NULL, 17);
 #endif
 	if (((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) &&
 	    (atomic_cmpset_int(&inp->sctp_flags, flags, (flags | SCTP_PCB_FLAGS_SOCKET_GONE | SCTP_PCB_FLAGS_CLOSE_IP)))) {
 #ifdef SCTP_LOG_CLOSING
 		sctp_log_closing(inp, NULL, 16);
 #endif
 		sctp_inpcb_free(inp, SCTP_FREE_SHOULD_USE_ABORT,
 		    SCTP_CALLED_AFTER_CMPSET_OFCLOSE);
 		SOCK_LOCK(so);
 		SCTP_SB_CLEAR(so->so_snd);
 		/*
 		 * same for the rcv ones, they are only here for the
 		 * accounting/select.
 		 */
 		SCTP_SB_CLEAR(so->so_rcv);
 		/* Now null out the reference, we are completely detached. */
 		so->so_pcb = NULL;
 		SOCK_UNLOCK(so);
 	} else {
 		flags = inp->sctp_flags;
 		if ((flags & SCTP_PCB_FLAGS_SOCKET_GONE) == 0) {
 			goto sctp_must_try_again;
 		}
 	}
 	NET_EPOCH_EXIT(et);
 	return;
 }
 
 static int
 sctp6_attach(struct socket *so, int proto SCTP_UNUSED, struct thread *p SCTP_UNUSED)
 {
 	int error;
 	struct sctp_inpcb *inp;
 	uint32_t vrf_id = SCTP_DEFAULT_VRFID;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp != NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = SCTP_SORESERVE(so, SCTP_BASE_SYSCTL(sctp_sendspace), SCTP_BASE_SYSCTL(sctp_recvspace));
 		if (error)
 			return (error);
 	}
 	error = sctp_inpcb_alloc(so, vrf_id);
 	if (error)
 		return (error);
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	SCTP_INP_WLOCK(inp);
 	inp->sctp_flags |= SCTP_PCB_FLAGS_BOUND_V6;	/* I'm v6! */
 
 	inp->ip_inp.inp.inp_vflag |= INP_IPV6;
 	inp->ip_inp.inp.in6p_hops = -1;	/* use kernel default */
 	inp->ip_inp.inp.in6p_cksum = -1;	/* just to be sure */
 #ifdef INET
 	/*
 	 * XXX: ugly!! IPv4 TTL initialization is necessary for an IPv6
 	 * socket as well, because the socket may be bound to an IPv6
 	 * wildcard address, which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->ip_inp.inp.inp_ip_ttl = MODULE_GLOBAL(ip_defttl);
 #endif
 	SCTP_INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 sctp6_bind(struct socket *so, struct sockaddr *addr, struct thread *p)
 {
 	struct sctp_inpcb *inp;
 	int error;
 	u_char vflagsav;
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	if (addr) {
 		switch (addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			if (addr->sa_len != sizeof(struct sockaddr_in)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 				return (EINVAL);
 			}
 			break;
 #endif
 		default:
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 	}
 	vflagsav = inp->ip_inp.inp.inp_vflag;
 	inp->ip_inp.inp.inp_vflag &= ~INP_IPV4;
 	inp->ip_inp.inp.inp_vflag |= INP_IPV6;
 	if ((addr != NULL) && (SCTP_IPV6_V6ONLY(inp) == 0)) {
 		switch (addr->sa_family) {
 #ifdef INET
 		case AF_INET:
 			/* binding v4 addr to v6 socket, so reset flags */
 			inp->ip_inp.inp.inp_vflag |= INP_IPV4;
 			inp->ip_inp.inp.inp_vflag &= ~INP_IPV6;
 			break;
 #endif
 #ifdef INET6
 		case AF_INET6:
 			{
 				struct sockaddr_in6 *sin6_p;
 
 				sin6_p = (struct sockaddr_in6 *)addr;
 
 				if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr)) {
 					inp->ip_inp.inp.inp_vflag |= INP_IPV4;
 				}
 #ifdef INET
 				if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
 					struct sockaddr_in sin;
 
 					in6_sin6_2_sin(&sin, sin6_p);
 					inp->ip_inp.inp.inp_vflag |= INP_IPV4;
 					inp->ip_inp.inp.inp_vflag &= ~INP_IPV6;
 					error = sctp_inpcb_bind(so, (struct sockaddr *)&sin, NULL, p);
 					goto out;
 				}
 #endif
 				break;
 			}
 #endif
 		default:
 			break;
 		}
 	} else if (addr != NULL) {
 		struct sockaddr_in6 *sin6_p;
 
 		/* IPV6_V6ONLY socket */
 #ifdef INET
 		if (addr->sa_family == AF_INET) {
 			/* can't bind v4 addr to v6 only socket! */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			error = EINVAL;
 			goto out;
 		}
 #endif
 		sin6_p = (struct sockaddr_in6 *)addr;
 
 		if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
 			/* can't bind v4-mapped addrs either! */
 			/* NOTE: we don't support SIIT */
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			error = EINVAL;
 			goto out;
 		}
 	}
 	error = sctp_inpcb_bind(so, addr, NULL, p);
 out:
 	if (error != 0)
 		inp->ip_inp.inp.inp_vflag = vflagsav;
 	return (error);
 }
 
 static void
 sctp6_close(struct socket *so)
 {
 	sctp_close(so);
 }
 
 /* This could be made common with sctp_detach() since they are identical */
 
 static
 int
 sctp6_disconnect(struct socket *so)
 {
 	return (sctp_disconnect(so));
 }
 
 int
 sctp_sendm(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *p);
 
 static int
 sctp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
     struct mbuf *control, struct thread *p)
 {
 	struct sctp_inpcb *inp;
 
 #ifdef INET
 	struct sockaddr_in6 *sin6;
 #endif				/* INET */
 	/* No SPL needed since sctp_output does this */
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		if (control) {
 			SCTP_RELEASE_PKT(control);
 			control = NULL;
 		}
 		SCTP_RELEASE_PKT(m);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	/*
 	 * For the TCP model we may get a NULL addr, if we are a connected
 	 * socket thats ok.
 	 */
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) &&
 	    (addr == NULL)) {
 		goto connected_type;
 	}
 	if (addr == NULL) {
 		SCTP_RELEASE_PKT(m);
 		if (control) {
 			SCTP_RELEASE_PKT(control);
 			control = NULL;
 		}
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EDESTADDRREQ);
 		return (EDESTADDRREQ);
 	}
 	switch (addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (addr->sa_len != sizeof(struct sockaddr_in)) {
 			if (control) {
 				SCTP_RELEASE_PKT(control);
 				control = NULL;
 			}
 			SCTP_RELEASE_PKT(m);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			if (control) {
 				SCTP_RELEASE_PKT(control);
 				control = NULL;
 			}
 			SCTP_RELEASE_PKT(m);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		break;
 #endif
 	default:
 		if (control) {
 			SCTP_RELEASE_PKT(control);
 			control = NULL;
 		}
 		SCTP_RELEASE_PKT(m);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 #ifdef INET
 	sin6 = (struct sockaddr_in6 *)addr;
 	if (SCTP_IPV6_V6ONLY(inp)) {
 		/*
 		 * if IPV6_V6ONLY flag, we discard datagrams destined to a
 		 * v4 addr or v4-mapped addr
 		 */
 		if (addr->sa_family == AF_INET) {
 			if (control) {
 				SCTP_RELEASE_PKT(control);
 				control = NULL;
 			}
 			SCTP_RELEASE_PKT(m);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			if (control) {
 				SCTP_RELEASE_PKT(control);
 				control = NULL;
 			}
 			SCTP_RELEASE_PKT(m);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 	}
 	if ((addr->sa_family == AF_INET6) &&
 	    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		/* convert v4-mapped into v4 addr and send */
 		in6_sin6_2_sin(&sin, sin6);
 		return (sctp_sendm(so, flags, m, (struct sockaddr *)&sin, control, p));
 	}
 #endif				/* INET */
 connected_type:
 	/* now what about control */
 	if (control) {
 		if (inp->control) {
 			SCTP_PRINTF("huh? control set?\n");
 			SCTP_RELEASE_PKT(inp->control);
 			inp->control = NULL;
 		}
 		inp->control = control;
 	}
 	/* Place the data */
 	if (inp->pkt) {
 		SCTP_BUF_NEXT(inp->pkt_last) = m;
 		inp->pkt_last = m;
 	} else {
 		inp->pkt_last = inp->pkt = m;
 	}
 	if (
 	/* FreeBSD and MacOSX uses a flag passed */
 	    ((flags & PRUS_MORETOCOME) == 0)
 	    ) {
 		/*
 		 * note with the current version this code will only be used
 		 * by OpenBSD, NetBSD and FreeBSD have methods for
 		 * re-defining sosend() to use sctp_sosend().  One can
 		 * optionaly switch back to this code (by changing back the
 		 * defininitions but this is not advisable.
 		 */
 		struct epoch_tracker et;
 		int ret;
 
 		NET_EPOCH_ENTER(et);
 		ret = sctp_output(inp, inp->pkt, addr, inp->control, p, flags);
 		NET_EPOCH_EXIT(et);
 		inp->pkt = NULL;
 		inp->control = NULL;
 		return (ret);
 	} else {
 		return (0);
 	}
 }
 
 static int
 sctp6_connect(struct socket *so, struct sockaddr *addr, struct thread *p)
 {
 	struct epoch_tracker et;
 	uint32_t vrf_id;
 	int error = 0;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 #ifdef INET
 	struct sockaddr_in6 *sin6;
 	union sctp_sockstore store;
 #endif
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET);
 		return (ECONNRESET);	/* I made the same as TCP since we are
 					 * not setup? */
 	}
 	if (addr == NULL) {
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 	switch (addr->sa_family) {
 #ifdef INET
 	case AF_INET:
 		if (addr->sa_len != sizeof(struct sockaddr_in)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		break;
 #endif
 	default:
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	vrf_id = inp->def_vrf_id;
 	SCTP_ASOC_CREATE_LOCK(inp);
 	SCTP_INP_RLOCK(inp);
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_UNBOUND) ==
 	    SCTP_PCB_FLAGS_UNBOUND) {
 		/* Bind a ephemeral port */
 		SCTP_INP_RUNLOCK(inp);
 		error = sctp6_bind(so, NULL, p);
 		if (error) {
 			SCTP_ASOC_CREATE_UNLOCK(inp);
 
 			return (error);
 		}
 		SCTP_INP_RLOCK(inp);
 	}
 	if ((inp->sctp_flags & SCTP_PCB_FLAGS_TCPTYPE) &&
 	    (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED)) {
 		/* We are already connected AND the TCP model */
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EADDRINUSE);
 		return (EADDRINUSE);
 	}
 #ifdef INET
 	sin6 = (struct sockaddr_in6 *)addr;
 	if (SCTP_IPV6_V6ONLY(inp)) {
 		/*
 		 * if IPV6_V6ONLY flag, ignore connections destined to a v4
 		 * addr or v4-mapped addr
 		 */
 		if (addr->sa_family == AF_INET) {
 			SCTP_INP_RUNLOCK(inp);
 			SCTP_ASOC_CREATE_UNLOCK(inp);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 		if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 			SCTP_INP_RUNLOCK(inp);
 			SCTP_ASOC_CREATE_UNLOCK(inp);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 			return (EINVAL);
 		}
 	}
 	if ((addr->sa_family == AF_INET6) &&
 	    IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		/* convert v4-mapped into v4 addr */
 		in6_sin6_2_sin(&store.sin, sin6);
 		addr = &store.sa;
 	}
 #endif				/* INET */
 	/* Now do we connect? */
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 		stcb = LIST_FIRST(&inp->sctp_asoc_list);
 		if (stcb) {
 			SCTP_TCB_LOCK(stcb);
 		}
 		SCTP_INP_RUNLOCK(inp);
 	} else {
 		SCTP_INP_RUNLOCK(inp);
 		SCTP_INP_WLOCK(inp);
 		SCTP_INP_INCR_REF(inp);
 		SCTP_INP_WUNLOCK(inp);
 		stcb = sctp_findassociation_ep_addr(&inp, addr, NULL, NULL, NULL);
 		if (stcb == NULL) {
 			SCTP_INP_WLOCK(inp);
 			SCTP_INP_DECR_REF(inp);
 			SCTP_INP_WUNLOCK(inp);
 		}
 	}
 
 	if (stcb != NULL) {
 		/* Already have or am bring up an association */
 		SCTP_ASOC_CREATE_UNLOCK(inp);
 		SCTP_TCB_UNLOCK(stcb);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EALREADY);
 		return (EALREADY);
 	}
 	/* We are GOOD to go */
 	stcb = sctp_aloc_assoc_connected(inp, addr, &error, 0, 0, vrf_id,
 	    inp->sctp_ep.pre_open_stream_count,
 	    inp->sctp_ep.port, p,
 	    SCTP_INITIALIZE_AUTH_PARAMS);
 	SCTP_ASOC_CREATE_UNLOCK(inp);
 	if (stcb == NULL) {
 		/* Gak! no memory */
 		return (error);
 	}
 	SCTP_SET_STATE(stcb, SCTP_STATE_COOKIE_WAIT);
 	(void)SCTP_GETTIME_TIMEVAL(&stcb->asoc.time_entered);
 	NET_EPOCH_ENTER(et);
 	sctp_send_initiate(inp, stcb, SCTP_SO_LOCKED);
 	SCTP_TCB_UNLOCK(stcb);
 	NET_EPOCH_EXIT(et);
 	return (error);
 }
 
 static int
 sctp6_getaddr(struct socket *so, struct sockaddr **addr)
 {
 	struct sockaddr_in6 *sin6;
 	struct sctp_inpcb *inp;
 	uint32_t vrf_id;
 	struct sctp_ifa *sctp_ifa;
 
 	int error;
 
 	/*
 	 * Do the malloc first in case it blocks.
 	 */
 	SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof(*sin6));
 	if (sin6 == NULL)
 		return (ENOMEM);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if (inp == NULL) {
 		SCTP_FREE_SONAME(sin6);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET);
 		return (ECONNRESET);
 	}
 	SCTP_INP_RLOCK(inp);
 	sin6->sin6_port = inp->sctp_lport;
 	if (inp->sctp_flags & SCTP_PCB_FLAGS_BOUNDALL) {
 		/* For the bound all case you get back 0 */
 		if (inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) {
 			struct sctp_tcb *stcb;
 			struct sockaddr_in6 *sin_a6;
 			struct sctp_nets *net;
 			int fnd;
 
 			stcb = LIST_FIRST(&inp->sctp_asoc_list);
 			if (stcb == NULL) {
 				SCTP_INP_RUNLOCK(inp);
 				SCTP_FREE_SONAME(sin6);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT);
 				return (ENOENT);
 			}
 			fnd = 0;
 			sin_a6 = NULL;
 			TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 				sin_a6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 				if (sin_a6 == NULL)
 					/* this will make coverity happy */
 					continue;
 
 				if (sin_a6->sin6_family == AF_INET6) {
 					fnd = 1;
 					break;
 				}
 			}
 			if ((!fnd) || (sin_a6 == NULL)) {
 				/* punt */
 				SCTP_INP_RUNLOCK(inp);
 				SCTP_FREE_SONAME(sin6);
 				SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT);
 				return (ENOENT);
 			}
 			vrf_id = inp->def_vrf_id;
 			sctp_ifa = sctp_source_address_selection(inp, stcb, (sctp_route_t *)&net->ro, net, 0, vrf_id);
 			if (sctp_ifa) {
 				sin6->sin6_addr = sctp_ifa->address.sin6.sin6_addr;
 			}
 		} else {
 			/* For the bound all case you get back 0 */
 			memset(&sin6->sin6_addr, 0, sizeof(sin6->sin6_addr));
 		}
 	} else {
 		/* Take the first IPv6 address in the list */
 		struct sctp_laddr *laddr;
 		int fnd = 0;
 
 		LIST_FOREACH(laddr, &inp->sctp_addr_list, sctp_nxt_addr) {
 			if (laddr->ifa->address.sa.sa_family == AF_INET6) {
 				struct sockaddr_in6 *sin_a;
 
 				sin_a = &laddr->ifa->address.sin6;
 				sin6->sin6_addr = sin_a->sin6_addr;
 				fnd = 1;
 				break;
 			}
 		}
 		if (!fnd) {
 			SCTP_FREE_SONAME(sin6);
 			SCTP_INP_RUNLOCK(inp);
 			SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT);
 			return (ENOENT);
 		}
 	}
 	SCTP_INP_RUNLOCK(inp);
 	/* Scoping things for v6 */
 	if ((error = sa6_recoverscope(sin6)) != 0) {
 		SCTP_FREE_SONAME(sin6);
 		return (error);
 	}
 	(*addr) = (struct sockaddr *)sin6;
 	return (0);
 }
 
 static int
 sctp6_peeraddr(struct socket *so, struct sockaddr **addr)
 {
 	struct sockaddr_in6 *sin6;
 	int fnd;
 	struct sockaddr_in6 *sin_a6;
 	struct sctp_inpcb *inp;
 	struct sctp_tcb *stcb;
 	struct sctp_nets *net;
 	int error;
 
 	/* Do the malloc first in case it blocks. */
 	SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
 	if (sin6 == NULL)
 		return (ENOMEM);
 	sin6->sin6_family = AF_INET6;
 	sin6->sin6_len = sizeof(*sin6);
 
 	inp = (struct sctp_inpcb *)so->so_pcb;
 	if ((inp == NULL) ||
 	    ((inp->sctp_flags & SCTP_PCB_FLAGS_CONNECTED) == 0)) {
 		/* UDP type and listeners will drop out here */
 		SCTP_FREE_SONAME(sin6);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOTCONN);
 		return (ENOTCONN);
 	}
 	SCTP_INP_RLOCK(inp);
 	stcb = LIST_FIRST(&inp->sctp_asoc_list);
 	if (stcb) {
 		SCTP_TCB_LOCK(stcb);
 	}
 	SCTP_INP_RUNLOCK(inp);
 	if (stcb == NULL) {
 		SCTP_FREE_SONAME(sin6);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ECONNRESET);
 		return (ECONNRESET);
 	}
 	fnd = 0;
 	TAILQ_FOREACH(net, &stcb->asoc.nets, sctp_next) {
 		sin_a6 = (struct sockaddr_in6 *)&net->ro._l_addr;
 		if (sin_a6->sin6_family == AF_INET6) {
 			fnd = 1;
 			sin6->sin6_port = stcb->rport;
 			sin6->sin6_addr = sin_a6->sin6_addr;
 			break;
 		}
 	}
 	SCTP_TCB_UNLOCK(stcb);
 	if (!fnd) {
 		/* No IPv4 address */
 		SCTP_FREE_SONAME(sin6);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, ENOENT);
 		return (ENOENT);
 	}
 	if ((error = sa6_recoverscope(sin6)) != 0) {
 		SCTP_FREE_SONAME(sin6);
 		SCTP_LTRACE_ERR_RET(inp, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, error);
 		return (error);
 	}
 	*addr = (struct sockaddr *)sin6;
 	return (0);
 }
 
 static int
 sctp6_in6getaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int error;
 
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	/* allow v6 addresses precedence */
 	error = sctp6_getaddr(so, nam);
 #ifdef INET
 	if (error) {
 		struct sockaddr_in6 *sin6;
 
 		/* try v4 next if v6 failed */
 		error = sctp_ingetaddr(so, nam);
 		if (error) {
 			return (error);
 		}
 		SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
 		if (sin6 == NULL) {
 			SCTP_FREE_SONAME(*nam);
 			return (ENOMEM);
 		}
 		in6_sin_2_v4mapsin6((struct sockaddr_in *)*nam, sin6);
 		SCTP_FREE_SONAME(*nam);
 		*nam = (struct sockaddr *)sin6;
 	}
 #endif
 	return (error);
 }
 
 static int
 sctp6_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct inpcb *inp = sotoinpcb(so);
 	int error;
 
 	if (inp == NULL) {
 		SCTP_LTRACE_ERR_RET(NULL, NULL, NULL, SCTP_FROM_SCTP6_USRREQ, EINVAL);
 		return (EINVAL);
 	}
 
 	/* allow v6 addresses precedence */
 	error = sctp6_peeraddr(so, nam);
 #ifdef INET
 	if (error) {
 		struct sockaddr_in6 *sin6;
 
 		/* try v4 next if v6 failed */
 		error = sctp_peeraddr(so, nam);
 		if (error) {
 			return (error);
 		}
 		SCTP_MALLOC_SONAME(sin6, struct sockaddr_in6 *, sizeof *sin6);
 		if (sin6 == NULL) {
 			SCTP_FREE_SONAME(*nam);
 			return (ENOMEM);
 		}
 		in6_sin_2_v4mapsin6((struct sockaddr_in *)*nam, sin6);
 		SCTP_FREE_SONAME(*nam);
 		*nam = (struct sockaddr *)sin6;
 	}
 #endif
 	return (error);
 }
 
-struct pr_usrreqs sctp6_usrreqs = {
-	.pru_abort = sctp6_abort,
-	.pru_accept = sctp_accept,
-	.pru_attach = sctp6_attach,
-	.pru_bind = sctp6_bind,
-	.pru_connect = sctp6_connect,
-	.pru_control = in6_control,
-	.pru_close = sctp6_close,
-	.pru_detach = sctp6_close,
-	.pru_sopoll = sopoll_generic,
-	.pru_flush = sctp_flush,
-	.pru_disconnect = sctp6_disconnect,
-	.pru_listen = sctp_listen,
-	.pru_peeraddr = sctp6_getpeeraddr,
-	.pru_send = sctp6_send,
-	.pru_shutdown = sctp_shutdown,
-	.pru_sockaddr = sctp6_in6getaddr,
-	.pru_sosend = sctp_sosend,
-	.pru_soreceive = sctp_soreceive
+#define	SCTP6_PROTOSW							\
+	.pr_protocol =	IPPROTO_SCTP,					\
+	.pr_ctloutput =	sctp_ctloutput,					\
+	.pr_abort =	sctp6_abort,					\
+	.pr_accept =	sctp_accept,					\
+	.pr_attach =	sctp6_attach,					\
+	.pr_bind =	sctp6_bind,					\
+	.pr_connect =	sctp6_connect,					\
+	.pr_control =	in6_control,					\
+	.pr_close =	sctp6_close,					\
+	.pr_detach =	sctp6_close,					\
+	.pr_sopoll =	sopoll_generic,					\
+	.pr_flush =	sctp_flush,					\
+	.pr_disconnect = sctp6_disconnect,				\
+	.pr_listen =	sctp_listen,					\
+	.pr_peeraddr =	sctp6_getpeeraddr,				\
+	.pr_send =	sctp6_send,					\
+	.pr_shutdown =	sctp_shutdown,					\
+	.pr_sockaddr =	sctp6_in6getaddr,				\
+	.pr_sosend =	sctp_sosend,					\
+	.pr_soreceive =	sctp_soreceive
+
+struct protosw sctp6_seqpacket_protosw = {
+	.pr_type =	SOCK_SEQPACKET,
+	.pr_flags =	PR_WANTRCVD,
+	SCTP6_PROTOSW
 };
 
+struct protosw sctp6_stream_protosw = {
+	.pr_type =	SOCK_STREAM,
+	.pr_flags =	PR_CONNREQUIRED | PR_WANTRCVD,
+	SCTP6_PROTOSW
+};
 #endif
diff --git a/sys/netinet6/sctp6_var.h b/sys/netinet6/sctp6_var.h
index 4ad0ca2811c1..74a24f425cea 100644
--- a/sys/netinet6/sctp6_var.h
+++ b/sys/netinet6/sctp6_var.h
@@ -1,56 +1,56 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2001-2007, by Cisco Systems, Inc. All rights reserved.
  * Copyright (c) 2008-2012, by Randall Stewart. All rights reserved.
  * Copyright (c) 2008-2012, by Michael Tuexen. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *
  * a) Redistributions of source code must retain the above copyright notice,
  *    this list of conditions and the following disclaimer.
  *
  * b) Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in
  *    the documentation and/or other materials provided with the distribution.
  *
  * c) Neither the name of Cisco Systems, Inc. nor the names of its
  *    contributors may be used to endorse or promote products derived
  *    from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  * THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #ifndef _NETINET6_SCTP6_VAR_H_
 #define _NETINET6_SCTP6_VAR_H_
 
 #if defined(_KERNEL)
 
 SYSCTL_DECL(_net_inet6_sctp6);
-extern struct pr_usrreqs sctp6_usrreqs;
+extern struct protosw sctp6_seqpacket_protosw, sctp6_stream_protosw;
 
 int sctp6_input(struct mbuf **, int *, int);
 int sctp6_input_with_port(struct mbuf **, int *, uint16_t);
 int
 sctp6_output(struct sctp_inpcb *, struct mbuf *, struct sockaddr *,
     struct mbuf *, struct proc *);
 void sctp6_ctlinput(int, struct sockaddr *, void *);
 void
 sctp6_notify(struct sctp_inpcb *, struct sctp_tcb *, struct sctp_nets *,
     uint8_t, uint8_t, uint32_t);
 #endif
 #endif
diff --git a/sys/netinet6/send.c b/sys/netinet6/send.c
index 736039c695af..0684e2eea22c 100644
--- a/sys/netinet6/send.c
+++ b/sys/netinet6/send.c
@@ -1,392 +1,389 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009-2010 Ana Kukec <anchie@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/sockbuf.h>
 #include <sys/socketvar.h>
 #include <sys/types.h>
 
 #include <net/route.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/scope6_var.h>
 #include <netinet6/send.h>
 
 static MALLOC_DEFINE(M_SEND, "send", "Secure Neighbour Discovery");
 
 /*
  * The socket used to communicate with the SeND daemon.
  */
 VNET_DEFINE_STATIC(struct socket *, send_so);
 #define	V_send_so	VNET(send_so)
 
 u_long	send_sendspace	= 8 * (1024 + sizeof(struct sockaddr_send));
 u_long	send_recvspace	= 9216;
 
 struct mtx	send_mtx;
 #define SEND_LOCK_INIT()	mtx_init(&send_mtx, "send_mtx", NULL, MTX_DEF)
 #define SEND_LOCK()		mtx_lock(&send_mtx)
 #define SEND_UNLOCK()		mtx_unlock(&send_mtx)
 #define SEND_LOCK_DESTROY()     mtx_destroy(&send_mtx)
 
 static int
 send_attach(struct socket *so, int proto, struct thread *td)
 {
 	int error;
 
 	SEND_LOCK();
 	if (V_send_so != NULL) {
 		SEND_UNLOCK();
 		return (EEXIST);
 	}
 
 	error = priv_check(td, PRIV_NETINET_RAW);
 	if (error) {
 		SEND_UNLOCK();
 		return(error);
 	}
 
 	if (proto != IPPROTO_SEND) {
 		SEND_UNLOCK();
 		return (EPROTONOSUPPORT);
 	}
 	error = soreserve(so, send_sendspace, send_recvspace);
 	if (error) {
 		SEND_UNLOCK();
 		return(error);
 	}
 
 	V_send_so = so;
 	SEND_UNLOCK();
 
 	return (0);
 }
 
 static int
 send_output(struct mbuf *m, struct ifnet *ifp, int direction)
 {
 	struct ip6_hdr *ip6;
 	struct sockaddr_in6 dst;
 	struct icmp6_hdr *icmp6;
 	struct epoch_tracker et;
 	int icmp6len;
 	int error;
 
 	/*
 	 * Receive incoming (SeND-protected) or outgoing traffic
 	 * (SeND-validated) from the SeND user space application.
 	 */
 
 	switch (direction) {
 	case SND_IN:
 		if (m->m_len < (sizeof(struct ip6_hdr) +
 		    sizeof(struct icmp6_hdr))) {
 			m = m_pullup(m, sizeof(struct ip6_hdr) +
 			    sizeof(struct icmp6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 
 		/* Before passing off the mbuf record the proper interface. */
 		m->m_pkthdr.rcvif = ifp;
 
 		if (m->m_flags & M_PKTHDR)
 			icmp6len = m->m_pkthdr.len - sizeof(struct ip6_hdr);
 		else
 			panic("Doh! not the first mbuf.");
 
 		ip6 = mtod(m, struct ip6_hdr *);
 		icmp6 = (struct icmp6_hdr *)(ip6 + 1);
 		error = 0;
 
 		/*
 		 * Output the packet as icmp6.c:icpm6_input() would do.
 		 * The mbuf is always consumed, so we do not have to
 		 * care about that.
 		 */
 		NET_EPOCH_ENTER(et);
 		switch (icmp6->icmp6_type) {
 		case ND_NEIGHBOR_SOLICIT:
 			nd6_ns_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_NEIGHBOR_ADVERT:
 			nd6_na_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_REDIRECT:
 			icmp6_redirect_input(m, sizeof(struct ip6_hdr));
 			break;
 		case ND_ROUTER_SOLICIT:
 			nd6_rs_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		case ND_ROUTER_ADVERT:
 			nd6_ra_input(m, sizeof(struct ip6_hdr), icmp6len);
 			break;
 		default:
 			m_freem(m);
 			error = ENOSYS;
 		}
 		NET_EPOCH_EXIT(et);
 
 		return (error);
 
 	case SND_OUT:
 		if (m->m_len < sizeof(struct ip6_hdr)) {
 			m = m_pullup(m, sizeof(struct ip6_hdr));
 			if (!m)
 				return (ENOBUFS);
 		}
 		ip6 = mtod(m, struct ip6_hdr *);
 		if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
 			m->m_flags |= M_MCAST;
 
 		bzero(&dst, sizeof(dst));
 		dst.sin6_family = AF_INET6;
 		dst.sin6_len = sizeof(dst);
 		dst.sin6_addr = ip6->ip6_dst;
 
 		m_clrprotoflags(m);	/* Avoid confusing lower layers. */
 
 		IP_PROBE(send, NULL, NULL, ip6, ifp, NULL, ip6);
 
 		/*
 		 * Output the packet as nd6.c:nd6_output_lle() would do.
 		 * The mbuf is always consumed, so we do not have to care
 		 * about that.
 		 * XXX-BZ as we added data, what about fragmenting,
 		 * if now needed?
 		 */
 		error = ((*ifp->if_output)(ifp, m, (struct sockaddr *)&dst,
 		    NULL));
 		if (error)
 			error = ENOENT;
 		return (error);
 
 	default:
 		panic("%s: direction %d neither SND_IN nor SND_OUT.",
 		     __func__, direction);
 	}
 }
 
 /*
  * Receive a SeND message from user space to be either send out by the kernel
  * or, with SeND ICMPv6 options removed, to be further processed by the icmp6
  * input path.
  */
 static int
 send_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
 	struct sockaddr_send *sendsrc;
 	struct ifnet *ifp;
 	int error;
 
 	KASSERT(V_send_so == so, ("%s: socket %p not send socket %p",
 		__func__, so, V_send_so));
 
 	sendsrc = (struct sockaddr_send *)nam;
 	if (sendsrc->send_family != AF_INET6) {
 		error = EAFNOSUPPORT;
 		goto err;
 	}
 	if (sendsrc->send_len != sizeof(*sendsrc)) {
 		error = EINVAL;
 		goto err;
 	}
 	ifp = ifnet_byindex_ref(sendsrc->send_ifidx);
 	if (ifp == NULL) {
 		error = ENETUNREACH;
 		goto err;
 	}
 
 	error = send_output(m, ifp, sendsrc->send_direction);
 	if_rele(ifp);
 	m = NULL;
 
 err:
 	if (control != NULL)
 		m_freem(control);
 	if (m != NULL)
 		m_freem(m);
 
 	return (error);
 }
 
 static void
 send_close(struct socket *so)
 {
 
 	SEND_LOCK();
 	if (V_send_so)
 		V_send_so = NULL;
 	SEND_UNLOCK();
 }
 
 /*
  * Send a SeND message to user space, that was either received and has to be
  * validated or was about to be send out and has to be handled by the SEND
  * daemon adding SeND ICMPv6 options.
  */
 static int
 send_input(struct mbuf *m, struct ifnet *ifp, int direction, int msglen __unused)
 {
 	struct ip6_hdr *ip6;
 	struct sockaddr_send sendsrc;
 
 	SEND_LOCK();
 	if (V_send_so == NULL) {
 		SEND_UNLOCK();
 		return (-1);
 	}
 
 	/*
 	 * Make sure to clear any possible internally embedded scope before
 	 * passing the packet to user space for SeND cryptographic signature
 	 * validation to succeed.
 	 */
 	ip6 = mtod(m, struct ip6_hdr *);
 	in6_clearscope(&ip6->ip6_src);
 	in6_clearscope(&ip6->ip6_dst);
 
 	bzero(&sendsrc, sizeof(sendsrc));
 	sendsrc.send_len = sizeof(sendsrc);
 	sendsrc.send_family = AF_INET6;
 	sendsrc.send_direction = direction;
 	sendsrc.send_ifidx = ifp->if_index;
 
 	/*
 	 * Send incoming or outgoing traffic to user space either to be
 	 * protected (outgoing) or validated (incoming) according to rfc3971.
 	 */
 	SOCKBUF_LOCK(&V_send_so->so_rcv);
 	if (sbappendaddr_locked(&V_send_so->so_rcv,
 	    (struct sockaddr *)&sendsrc, m, NULL) == 0) {
 		soroverflow_locked(V_send_so);
 		/* XXX stats. */
 		m_freem(m);
 	} else {
 		sorwakeup_locked(V_send_so);
 	}
 
 	SEND_UNLOCK();
 	return (0);
 }
 
-struct pr_usrreqs send_usrreqs = {
-	.pru_attach =		send_attach,
-	.pru_send =		send_send,
-	.pru_detach =		send_close
-};
-struct protosw send_protosw = {
+static struct protosw send_protosw = {
 	.pr_type =		SOCK_RAW,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
 	.pr_protocol =		IPPROTO_SEND,
-	.pr_usrreqs =		&send_usrreqs
+	.pr_attach =		send_attach,
+	.pr_send =		send_send,
+	.pr_detach =		send_close
 };
 
 static int
 send_modevent(module_t mod, int type, void *unused)
 {
 #ifdef __notyet__
 	VNET_ITERATOR_DECL(vnet_iter);
 #endif
 	int error;
 
 	switch (type) {
 	case MOD_LOAD:
 		SEND_LOCK_INIT();
 
-		error = pf_proto_register(PF_INET6, &send_protosw);
+		error = protosw_register(&inet6domain, &send_protosw);
 		if (error != 0) {
 			printf("%s:%d: MOD_LOAD pf_proto_register(): %d\n",
 			   __func__, __LINE__, error);
 			SEND_LOCK_DESTROY();
 			break;
 		}
 		send_sendso_input_hook = send_input;
 		break;
 	case MOD_UNLOAD:
 		/* Do not allow unloading w/o locking. */
 		return (EBUSY);
 #ifdef __notyet__
 		VNET_LIST_RLOCK_NOSLEEP();
 		SEND_LOCK();
 		VNET_FOREACH(vnet_iter) {
 			CURVNET_SET(vnet_iter);
 			if (V_send_so != NULL) {
 				CURVNET_RESTORE();
 				SEND_UNLOCK();
 				VNET_LIST_RUNLOCK_NOSLEEP();
 				return (EBUSY);
 			}
 			CURVNET_RESTORE();
 		}
 		SEND_UNLOCK();
 		VNET_LIST_RUNLOCK_NOSLEEP();
-		error = pf_proto_unregister(PF_INET6, IPPROTO_SEND, SOCK_RAW);
+		error = protosw_unregister(&send_protosw);
 		if (error == 0)
 			SEND_LOCK_DESTROY();
 		send_sendso_input_hook = NULL;
 		break;
 #endif
 	default:
 		error = 0;
 		break;
 	}
 
 	return (error);
 }
 
 static moduledata_t sendmod = {
 	"send",
 	send_modevent,
 	0
 };
 
 DECLARE_MODULE(send, sendmod, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
diff --git a/sys/netinet6/udp6_usrreq.c b/sys/netinet6/udp6_usrreq.c
index 13a44530ce67..81cea4d8036f 100644
--- a/sys/netinet6/udp6_usrreq.c
+++ b/sys/netinet6/udp6_usrreq.c
@@ -1,1350 +1,1362 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * Copyright (c) 2014 Kevin Lo
  * All rights reserved.
  *
  * Portions of this software were developed by Robert N. M. Watson under
  * contract to Juniper Networks, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	$KAME: udp6_usrreq.c,v 1.27 2001/05/21 05:45:10 jinmei Exp $
  *	$KAME: udp6_output.c,v 1.31 2001/05/21 16:39:15 jinmei Exp $
  */
 
 /*-
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *	The Regents of the University of California.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)udp_usrreq.c	8.6 (Berkeley) 5/23/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
 #include "opt_route.h"
 #include "opt_rss.h"
 
 #include <sys/param.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #include <net/route.h>
 #include <net/rss_config.h>
 
 #include <netinet/in.h>
 #include <netinet/in_kdtrace.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_systm.h>
 #include <netinet/in_var.h>
 #include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/icmp6.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/udplite.h>
 
 #include <netinet6/ip6protosw.h>
 #include <netinet6/ip6_var.h>
 #include <netinet6/in6_fib.h>
 #include <netinet6/in6_pcb.h>
 #include <netinet6/in6_rss.h>
 #include <netinet6/udp6_var.h>
 #include <netinet6/scope6_var.h>
 
 #include <netipsec/ipsec_support.h>
 
 #include <security/mac/mac_framework.h>
 
 VNET_DEFINE(int, zero_checksum_port) = 0;
 #define	V_zero_checksum_port	VNET(zero_checksum_port)
 SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(zero_checksum_port), 0,
     "Zero UDP checksum allowed for traffic to/from this port.");
 
 
 /* netinet/udp_usrreqs.c */
 pr_abort_t	udp_abort;
 pr_disconnect_t	udp_disconnect;
 pr_send_t	udp_send;
 
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
  */
 
 static void		udp6_detach(struct socket *so);
 
 static int
 udp6_append(struct inpcb *inp, struct mbuf *n, int off,
     struct sockaddr_in6 *fromsa)
 {
 	struct socket *so;
 	struct mbuf *opts = NULL, *tmp_opts;
 	struct udpcb *up;
 	bool filtered;
 
 	INP_LOCK_ASSERT(inp);
 
 	/*
 	 * Engage the tunneling protocol.
 	 */
 	up = intoudpcb(inp);
 	if (up->u_tun_func != NULL) {
 		in_pcbref(inp);
 		INP_RUNLOCK(inp);
 		filtered = (*up->u_tun_func)(n, off, inp,
 		    (struct sockaddr *)&fromsa[0], up->u_tun_ctx);
 		INP_RLOCK(inp);
 		if (filtered)
 			return (in_pcbrele_rlocked(inp));
 	}
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	/* Check AH/ESP integrity. */
 	if (IPSEC_ENABLED(ipv6)) {
 		if (IPSEC_CHECK_POLICY(ipv6, n, inp) != 0) {
 			m_freem(n);
 			return (0);
 		}
 	}
 #endif /* IPSEC */
 #ifdef MAC
 	if (mac_inpcb_check_deliver(inp, n) != 0) {
 		m_freem(n);
 		return (0);
 	}
 #endif
 	opts = NULL;
 	if (inp->inp_flags & INP_CONTROLOPTS ||
 	    inp->inp_socket->so_options & SO_TIMESTAMP)
 		ip6_savecontrol(inp, n, &opts);
 	if ((inp->inp_vflag & INP_IPV6) && (inp->inp_flags2 & INP_ORIGDSTADDR)) {
 		tmp_opts = sbcreatecontrol(&fromsa[1],
 		    sizeof(struct sockaddr_in6), IPV6_ORIGDSTADDR,
 		    IPPROTO_IPV6, M_NOWAIT);
                 if (tmp_opts) {
                         if (opts) {
                                 tmp_opts->m_next = opts;
                                 opts = tmp_opts;
                         } else
                                 opts = tmp_opts;
                 }
 	}
 	m_adj(n, off + sizeof(struct udphdr));
 
 	so = inp->inp_socket;
 	SOCKBUF_LOCK(&so->so_rcv);
 	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa[0], n,
 	    opts) == 0) {
 		soroverflow_locked(so);
 		m_freem(n);
 		if (opts)
 			m_freem(opts);
 		UDPSTAT_INC(udps_fullsock);
 	} else
 		sorwakeup_locked(so);
 	return (0);
 }
 
 struct udp6_multi_match_ctx {
 	struct ip6_hdr *ip6;
 	struct udphdr *uh;
 };
 
 static bool
 udp6_multi_match(const struct inpcb *inp, void *v)
 {
 	struct udp6_multi_match_ctx *ctx = v;
 
 	if ((inp->inp_vflag & INP_IPV6) == 0)
 		return(false);
 	if (inp->inp_lport != ctx->uh->uh_dport)
 		return(false);
 	if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport)
 		return(false);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 	    !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst))
 		return (false);
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) &&
 	    (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) ||
 	    inp->inp_fport != ctx->uh->uh_sport))
 		return (false);
 
 	return (true);
 }
 
 static int
 udp6_multi_input(struct mbuf *m, int off, int proto,
     struct sockaddr_in6 *fromsa)
 {
 	struct udp6_multi_match_ctx ctx;
 	struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto),
 	    INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx);
 	struct inpcb *inp;
 	struct ip6_moptions *imo;
 	struct mbuf *n;
 	int appends = 0;
 
 	/*
 	 * In the event that laddr should be set to the link-local
 	 * address (this happens in RIPng), the multicast address
 	 * specified in the received packet will not match laddr.  To
 	 * handle this situation, matching is relaxed if the
 	 * receiving interface is the same as one specified in the
 	 * socket and if the destination multicast address matches
 	 * one of the multicast groups specified in the socket.
 	 */
 
 	/*
 	 * KAME note: traditionally we dropped udpiphdr from mbuf
 	 * here.  We need udphdr for IPsec processing so we do that
 	 * later.
 	 */
 	ctx.ip6 = mtod(m, struct ip6_hdr *);
 	ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off);
 	while ((inp = inp_next(&inpi)) != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		/*
 		 * XXXRW: Because we weren't holding either the inpcb
 		 * or the hash lock when we checked for a match
 		 * before, we should probably recheck now that the
 		 * inpcb lock is (supposed to be) held.
 		 */
 		/*
 		 * Handle socket delivery policy for any-source
 		 * and source-specific multicast. [RFC3678]
 		 */
 		if ((imo = inp->in6p_moptions) != NULL) {
 			struct sockaddr_in6	 mcaddr;
 			int			 blocked;
 
 			bzero(&mcaddr, sizeof(struct sockaddr_in6));
 			mcaddr.sin6_len = sizeof(struct sockaddr_in6);
 			mcaddr.sin6_family = AF_INET6;
 			mcaddr.sin6_addr = ctx.ip6->ip6_dst;
 
 			blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif,
 				(struct sockaddr *)&mcaddr,
 				(struct sockaddr *)&fromsa[0]);
 			if (blocked != MCAST_PASS) {
 				if (blocked == MCAST_NOTGMEMBER)
 					IP6STAT_INC(ip6s_notmember);
 				if (blocked == MCAST_NOTSMEMBER ||
 				    blocked == MCAST_MUTED)
 					UDPSTAT_INC(udps_filtermcast);
 				continue;
 			}
 		}
 		if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
 			if (proto == IPPROTO_UDPLITE)
 				UDPLITE_PROBE(receive, NULL, inp, ctx.ip6,
 				    inp, ctx.uh);
 			else
 				UDP_PROBE(receive, NULL, inp, ctx.ip6, inp,
 				    ctx.uh);
 			if (udp6_append(inp, n, off, fromsa)) {
 				INP_RUNLOCK(inp);
 				break;
 			} else
 				appends++;
 		}
 		/*
 		 * Don't look for additional matches if this one does
 		 * not have either the SO_REUSEPORT or SO_REUSEADDR
 		 * socket options set.  This heuristic avoids
 		 * searching through all pcbs in the common case of a
 		 * non-shared port.  It assumes that an application
 		 * will never clear these options after setting them.
 		 */
 		if ((inp->inp_socket->so_options &
 		     (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) {
 			INP_RUNLOCK(inp);
 			break;
 		}
 	}
 	m_freem(m);
 
 	if (appends == 0) {
 		/*
 		 * No matching pcb found; discard datagram.  (No need
 		 * to send an ICMP Port Unreachable for a broadcast
 		 * or multicast datgram.)
 		 */
 		UDPSTAT_INC(udps_noport);
 		UDPSTAT_INC(udps_noportmcast);
 	}
 
 	return (IPPROTO_DONE);
 }
 
 int
 udp6_input(struct mbuf **mp, int *offp, int proto)
 {
 	struct mbuf *m = *mp;
 	struct ip6_hdr *ip6;
 	struct udphdr *uh;
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct udpcb *up;
 	int off = *offp;
 	int cscov_partial;
 	int plen, ulen;
 	struct sockaddr_in6 fromsa[2];
 	struct m_tag *fwd_tag;
 	uint16_t uh_sum;
 	uint8_t nxt;
 
 	NET_EPOCH_ASSERT();
 
 	if (m->m_len < off + sizeof(struct udphdr)) {
 		m = m_pullup(m, off + sizeof(struct udphdr));
 		if (m == NULL) {
 			IP6STAT_INC(ip6s_exthdrtoolong);
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 	}
 	ip6 = mtod(m, struct ip6_hdr *);
 	uh = (struct udphdr *)((caddr_t)ip6 + off);
 
 	UDPSTAT_INC(udps_ipackets);
 
 	/*
 	 * Destination port of 0 is illegal, based on RFC768.
 	 */
 	if (uh->uh_dport == 0)
 		goto badunlocked;
 
 	plen = ntohs(ip6->ip6_plen) - off + sizeof(*ip6);
 	ulen = ntohs((u_short)uh->uh_ulen);
 
 	nxt = proto;
 	cscov_partial = (nxt == IPPROTO_UDPLITE) ? 1 : 0;
 	if (nxt == IPPROTO_UDPLITE) {
 		/* Zero means checksum over the complete packet. */
 		if (ulen == 0)
 			ulen = plen;
 		if (ulen == plen)
 			cscov_partial = 0;
 		if ((ulen < sizeof(struct udphdr)) || (ulen > plen)) {
 			/* XXX: What is the right UDPLite MIB counter? */
 			goto badunlocked;
 		}
 		if (uh->uh_sum == 0) {
 			/* XXX: What is the right UDPLite MIB counter? */
 			goto badunlocked;
 		}
 	} else {
 		if ((ulen < sizeof(struct udphdr)) || (plen != ulen)) {
 			UDPSTAT_INC(udps_badlen);
 			goto badunlocked;
 		}
 		if (uh->uh_sum == 0) {
 			UDPSTAT_INC(udps_nosum);
 			/*
 			 * dport 0 was rejected earlier so this is OK even if
 			 * zero_checksum_port is 0 (which is its default value).
 			 */
 			if (ntohs(uh->uh_dport) == V_zero_checksum_port)
 				goto skip_checksum;
 			else
 				goto badunlocked;
 		}
 	}
 
 	if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) &&
 	    !cscov_partial) {
 		if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 			uh_sum = m->m_pkthdr.csum_data;
 		else
 			uh_sum = in6_cksum_pseudo(ip6, ulen, nxt,
 			    m->m_pkthdr.csum_data);
 		uh_sum ^= 0xffff;
 	} else
 		uh_sum = in6_cksum_partial(m, nxt, off, plen, ulen);
 
 	if (uh_sum != 0) {
 		UDPSTAT_INC(udps_badsum);
 		goto badunlocked;
 	}
 
 skip_checksum:
 	/*
 	 * Construct sockaddr format source address.
 	 */
 	init_sin6(&fromsa[0], m, 0);
 	fromsa[0].sin6_port = uh->uh_sport;
 	init_sin6(&fromsa[1], m, 1);
 	fromsa[1].sin6_port = uh->uh_dport;
 
 	pcbinfo = udp_get_inpcbinfo(nxt);
 	if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))  {
 		*mp = NULL;
 		return (udp6_multi_input(m, off, proto, fromsa));
 	}
 
 	/*
 	 * Locate pcb for datagram.
 	 */
 
 	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
 	if ((m->m_flags & M_IP6_NEXTHOP) &&
 	    (fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL) {
 		struct sockaddr_in6 *next_hop6;
 
 		next_hop6 = (struct sockaddr_in6 *)(fwd_tag + 1);
 
 		/*
 		 * Transparently forwarded. Pretend to be the destination.
 		 * Already got one like this?
 		 */
 		inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
 		    uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif, m);
 		if (!inp) {
 			/*
 			 * It's new.  Try to find the ambushing socket.
 			 * Because we've rewritten the destination address,
 			 * any hardware-generated hash is ignored.
 			 */
 			inp = in6_pcblookup(pcbinfo, &ip6->ip6_src,
 			    uh->uh_sport, &next_hop6->sin6_addr,
 			    next_hop6->sin6_port ? htons(next_hop6->sin6_port) :
 			    uh->uh_dport, INPLOOKUP_WILDCARD |
 			    INPLOOKUP_RLOCKPCB, m->m_pkthdr.rcvif);
 		}
 		/* Remove the tag from the packet. We don't need it anymore. */
 		m_tag_delete(m, fwd_tag);
 		m->m_flags &= ~M_IP6_NEXTHOP;
 	} else
 		inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_src,
 		    uh->uh_sport, &ip6->ip6_dst, uh->uh_dport,
 		    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
 		    m->m_pkthdr.rcvif, m);
 	if (inp == NULL) {
 		if (V_udp_log_in_vain) {
 			char ip6bufs[INET6_ADDRSTRLEN];
 			char ip6bufd[INET6_ADDRSTRLEN];
 
 			log(LOG_INFO,
 			    "Connection attempt to UDP [%s]:%d from [%s]:%d\n",
 			    ip6_sprintf(ip6bufd, &ip6->ip6_dst),
 			    ntohs(uh->uh_dport),
 			    ip6_sprintf(ip6bufs, &ip6->ip6_src),
 			    ntohs(uh->uh_sport));
 		}
 		if (nxt == IPPROTO_UDPLITE)
 			UDPLITE_PROBE(receive, NULL, NULL, ip6, NULL, uh);
 		else
 			UDP_PROBE(receive, NULL, NULL, ip6, NULL, uh);
 		UDPSTAT_INC(udps_noport);
 		if (m->m_flags & M_MCAST) {
 			printf("UDP6: M_MCAST is set in a unicast packet.\n");
 			UDPSTAT_INC(udps_noportmcast);
 			goto badunlocked;
 		}
 		if (V_udp_blackhole && (V_udp_blackhole_local ||
 		    !in6_localaddr(&ip6->ip6_src)))
 			goto badunlocked;
 		icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
 		*mp = NULL;
 		return (IPPROTO_DONE);
 	}
 	INP_RLOCK_ASSERT(inp);
 	up = intoudpcb(inp);
 	if (cscov_partial) {
 		if (up->u_rxcslen == 0 || up->u_rxcslen > ulen) {
 			INP_RUNLOCK(inp);
 			m_freem(m);
 			*mp = NULL;
 			return (IPPROTO_DONE);
 		}
 	}
 	if (nxt == IPPROTO_UDPLITE)
 		UDPLITE_PROBE(receive, NULL, inp, ip6, inp, uh);
 	else
 		UDP_PROBE(receive, NULL, inp, ip6, inp, uh);
 	if (udp6_append(inp, m, off, fromsa) == 0)
 		INP_RUNLOCK(inp);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 
 badunlocked:
 	m_freem(m);
 	*mp = NULL;
 	return (IPPROTO_DONE);
 }
 
 static void
 udp6_common_ctlinput(int cmd, struct sockaddr *sa, void *d,
     struct inpcbinfo *pcbinfo)
 {
 	struct udphdr uh;
 	struct ip6_hdr *ip6;
 	struct mbuf *m;
 	int off = 0;
 	struct ip6ctlparam *ip6cp = NULL;
 	const struct sockaddr_in6 *sa6_src = NULL;
 	void *cmdarg;
 	struct inpcb *(*notify)(struct inpcb *, int) = udp_notify;
 	struct udp_portonly {
 		u_int16_t uh_sport;
 		u_int16_t uh_dport;
 	} *uhp;
 
 	if (sa->sa_family != AF_INET6 ||
 	    sa->sa_len != sizeof(struct sockaddr_in6))
 		return;
 
 	if ((unsigned)cmd >= PRC_NCMDS)
 		return;
 	if (PRC_IS_REDIRECT(cmd))
 		notify = in6_rtchange, d = NULL;
 	else if (cmd == PRC_HOSTDEAD)
 		d = NULL;
 	else if (inet6ctlerrmap[cmd] == 0)
 		return;
 
 	/* if the parameter is from icmp6, decode it. */
 	if (d != NULL) {
 		ip6cp = (struct ip6ctlparam *)d;
 		m = ip6cp->ip6c_m;
 		ip6 = ip6cp->ip6c_ip6;
 		off = ip6cp->ip6c_off;
 		cmdarg = ip6cp->ip6c_cmdarg;
 		sa6_src = ip6cp->ip6c_src;
 	} else {
 		m = NULL;
 		ip6 = NULL;
 		cmdarg = NULL;
 		sa6_src = &sa6_any;
 	}
 
 	if (ip6) {
 		/*
 		 * XXX: We assume that when IPV6 is non NULL,
 		 * M and OFF are valid.
 		 */
 
 		/* Check if we can safely examine src and dst ports. */
 		if (m->m_pkthdr.len < off + sizeof(*uhp))
 			return;
 
 		bzero(&uh, sizeof(uh));
 		m_copydata(m, off, sizeof(*uhp), (caddr_t)&uh);
 
 		if (!PRC_IS_REDIRECT(cmd)) {
 			/* Check to see if its tunneled */
 			struct inpcb *inp;
 			inp = in6_pcblookup_mbuf(pcbinfo, &ip6->ip6_dst,
 			    uh.uh_dport, &ip6->ip6_src, uh.uh_sport,
 			    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB,
 			    m->m_pkthdr.rcvif, m);
 			if (inp != NULL) {
 				struct udpcb *up;
 				
 				up = intoudpcb(inp);
 				if (up->u_icmp_func) {
 					/* Yes it is. */
 					INP_RUNLOCK(inp);
 					(*up->u_icmp_func)(cmd, (struct sockaddr *)ip6cp->ip6c_src,
 					      d, up->u_tun_ctx);
 					return;
 				} else {
 					/* Can't find it. */
 					INP_RUNLOCK(inp);
 				}
 			}
 		}
 		(void)in6_pcbnotify(pcbinfo, sa, uh.uh_dport,
 		    (struct sockaddr *)ip6cp->ip6c_src, uh.uh_sport, cmd,
 		    cmdarg, notify);
 	} else
 		(void)in6_pcbnotify(pcbinfo, sa, 0,
 		    (const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
 }
 
 void
 udp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 
 	return (udp6_common_ctlinput(cmd, sa, d, &V_udbinfo));
 }
 
 void
 udplite6_ctlinput(int cmd, struct sockaddr *sa, void *d)
 {
 
 	return (udp6_common_ctlinput(cmd, sa, d, &V_ulitecbinfo));
 }
 
 static int
 udp6_getcred(SYSCTL_HANDLER_ARGS)
 {
 	struct xucred xuc;
 	struct sockaddr_in6 addrs[2];
 	struct epoch_tracker et;
 	struct inpcb *inp;
 	int error;
 
 	error = priv_check(req->td, PRIV_NETINET_GETCRED);
 	if (error)
 		return (error);
 
 	if (req->newlen != sizeof(addrs))
 		return (EINVAL);
 	if (req->oldlen != sizeof(struct xucred))
 		return (EINVAL);
 	error = SYSCTL_IN(req, addrs, sizeof(addrs));
 	if (error)
 		return (error);
 	if ((error = sa6_embedscope(&addrs[0], V_ip6_use_defzone)) != 0 ||
 	    (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) {
 		return (error);
 	}
 	NET_EPOCH_ENTER(et);
 	inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr,
 	    addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port,
 	    INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL);
 	NET_EPOCH_EXIT(et);
 	if (inp != NULL) {
 		INP_RLOCK_ASSERT(inp);
 		if (inp->inp_socket == NULL)
 			error = ENOENT;
 		if (error == 0)
 			error = cr_canseesocket(req->td->td_ucred,
 			    inp->inp_socket);
 		if (error == 0)
 			cru2x(inp->inp_cred, &xuc);
 		INP_RUNLOCK(inp);
 	} else
 		error = ENOENT;
 	if (error == 0)
 		error = SYSCTL_OUT(req, &xuc, sizeof(struct xucred));
 	return (error);
 }
 
 SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred,
     CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE,
     0, 0, udp6_getcred, "S,xucred",
     "Get the xucred of a UDP6 connection");
 
 static int
 udp6_output(struct socket *so, int flags_arg, struct mbuf *m,
     struct sockaddr *addr6, struct mbuf *control, struct thread *td)
 {
 	struct inpcb *inp;
 	struct ip6_hdr *ip6;
 	struct udphdr *udp6;
 	struct in6_addr *laddr, *faddr, in6a;
 	struct ip6_pktopts *optp, opt;
 	struct sockaddr_in6 *sin6, tmp;
 	struct epoch_tracker et;
 	int cscov_partial, error, flags, hlen, scope_ambiguous;
 	u_int32_t ulen, plen;
 	uint16_t cscov;
 	u_short fport;
 	uint8_t nxt;
 
 	/* addr6 has been validated in udp6_send(). */
 	sin6 = (struct sockaddr_in6 *)addr6;
 
 	/*
 	 * In contrast to to IPv4 we do not validate the max. packet length
 	 * here due to IPv6 Jumbograms (RFC2675).
 	 */
 
 	scope_ambiguous = 0;
 	if (sin6) {
 		/* Protect *addr6 from overwrites. */
 		tmp = *sin6;
 		sin6 = &tmp;
 
 		/*
 		 * Application should provide a proper zone ID or the use of
 		 * default zone IDs should be enabled.  Unfortunately, some
 		 * applications do not behave as it should, so we need a
 		 * workaround.  Even if an appropriate ID is not determined,
 		 * we'll see if we can determine the outgoing interface.  If we
 		 * can, determine the zone ID based on the interface below.
 		 */
 		if (sin6->sin6_scope_id == 0 && !V_ip6_use_defzone)
 			scope_ambiguous = 1;
 		if ((error = sa6_embedscope(sin6, V_ip6_use_defzone)) != 0) {
 			if (control)
 				m_freem(control);
 			m_freem(m);
 			return (error);
 		}
 	}
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
 	/*
 	 * In the following cases we want a write lock on the inp for either
 	 * local operations or for possible route cache updates in the IPv6
 	 * output path:
 	 * - on connected sockets (sin6 is NULL) for route cache updates,
 	 * - when we are not bound to an address and source port (it is
 	 *   in6_pcbsetport() which will require the write lock).
 	 *
 	 * We check the inp fields before actually locking the inp, so
 	 * here exists a race, and we may WLOCK the inp and end with already
 	 * bound one by other thread. This is fine.
 	 */
 	if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 	    inp->inp_lport == 0))
 		INP_WLOCK(inp);
 	else
 		INP_RLOCK(inp);
 
 	nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ?
 	    IPPROTO_UDP : IPPROTO_UDPLITE;
 
 #ifdef INET
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		int hasv4addr;
 
 		if (sin6 == NULL)
 			hasv4addr = (inp->inp_vflag & INP_IPV4);
 		else
 			hasv4addr = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)
 			    ? 1 : 0;
 		if (hasv4addr) {
 			/*
 			 * XXXRW: We release UDP-layer locks before calling
 			 * udp_send() in order to avoid recursion.  However,
 			 * this does mean there is a short window where inp's
 			 * fields are unstable.  Could this lead to a
 			 * potential race in which the factors causing us to
 			 * select the UDPv4 output routine are invalidated?
 			 */
 			INP_UNLOCK(inp);
 			if (sin6)
 				in6_sin6_2_sin_in_sock((struct sockaddr *)sin6);
 			/* addr will just be freed in sendit(). */
 			return (udp_send(so, flags_arg | PRUS_IPV6, m,
 			    (struct sockaddr *)sin6, control, td));
 		}
 	} else
 #endif
 	if (sin6 && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		/*
 		 * Given this is either an IPv6-only socket or no INET is
 		 * supported we will fail the send if the given destination
 		 * address is a v4mapped address.
 		 *
 		 * XXXGL: do we leak m and control?
 		 */
 		INP_UNLOCK(inp);
 		return (EINVAL);
 	}
 
 	NET_EPOCH_ENTER(et);
 	if (control) {
 		if ((error = ip6_setpktopts(control, &opt,
 		    inp->in6p_outputopts, td->td_ucred, nxt)) != 0) {
 			goto release;
 		}
 		optp = &opt;
 	} else
 		optp = inp->in6p_outputopts;
 
 	if (sin6) {
 		/*
 		 * Since we saw no essential reason for calling in_pcbconnect,
 		 * we get rid of such kind of logic, and call in6_selectsrc
 		 * and in6_pcbsetport in order to fill in the local address
 		 * and the local port.
 		 */
 		if (sin6->sin6_port == 0) {
 			error = EADDRNOTAVAIL;
 			goto release;
 		}
 
 		if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			/* how about ::ffff:0.0.0.0 case? */
 			error = EISCONN;
 			goto release;
 		}
 
 		/*
 		 * Given we handle the v4mapped case in the INET block above
 		 * assert here that it must not happen anymore.
 		 */
 		KASSERT(!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr),
 		    ("%s: sin6(%p)->sin6_addr is v4mapped which we "
 		    "should have handled.", __func__, sin6));
 
 		/* This only requires read-locking. */
 		error = in6_selectsrc_socket(sin6, optp, inp,
 		    td->td_ucred, scope_ambiguous, &in6a, NULL);
 		if (error)
 			goto release;
 		laddr = &in6a;
 
 		if (inp->inp_lport == 0) {
 			struct inpcbinfo *pcbinfo;
 
 			INP_WLOCK_ASSERT(inp);
 
 			pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 			INP_HASH_WLOCK(pcbinfo);
 			error = in6_pcbsetport(laddr, inp, td->td_ucred);
 			INP_HASH_WUNLOCK(pcbinfo);
 			if (error != 0) {
 				/* Undo an address bind that may have occurred. */
 				inp->in6p_laddr = in6addr_any;
 				goto release;
 			}
 		}
 		faddr = &sin6->sin6_addr;
 		fport = sin6->sin6_port; /* allow 0 port */
 
 	} else {
 		if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 			error = ENOTCONN;
 			goto release;
 		}
 		laddr = &inp->in6p_laddr;
 		faddr = &inp->in6p_faddr;
 		fport = inp->inp_fport;
 	}
 
 	ulen = m->m_pkthdr.len;
 	plen = sizeof(struct udphdr) + ulen;
 	hlen = sizeof(struct ip6_hdr);
 
 	/*
 	 * Calculate data length and get a mbuf
 	 * for UDP and IP6 headers.
 	 */
 	M_PREPEND(m, hlen + sizeof(struct udphdr), M_NOWAIT);
 	if (m == NULL) {
 		error = ENOBUFS;
 		goto release;
 	}
 
 	/*
 	 * Stuff checksum and output datagram.
 	 */
 	cscov = cscov_partial = 0;
 	udp6 = (struct udphdr *)(mtod(m, caddr_t) + hlen);
 	udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
 	udp6->uh_dport = fport;
 	if (nxt == IPPROTO_UDPLITE) {
 		struct udpcb *up;
 
 		up = intoudpcb(inp);
 		cscov = up->u_txcslen;
 		if (cscov >= plen)
 			cscov = 0;
 		udp6->uh_ulen = htons(cscov);
 		/*
 		 * For UDP-Lite, checksum coverage length of zero means
 		 * the entire UDPLite packet is covered by the checksum.
 		 */
 		cscov_partial = (cscov == 0) ? 0 : 1;
 	} else if (plen <= 0xffff)
 		udp6->uh_ulen = htons((u_short)plen);
 	else
 		udp6->uh_ulen = 0;
 	udp6->uh_sum = 0;
 
 	ip6 = mtod(m, struct ip6_hdr *);
 	ip6->ip6_flow	= inp->inp_flow & IPV6_FLOWINFO_MASK;
 	ip6->ip6_vfc	&= ~IPV6_VERSION_MASK;
 	ip6->ip6_vfc	|= IPV6_VERSION;
 	ip6->ip6_plen	= htons((u_short)plen);
 	ip6->ip6_nxt	= nxt;
 	ip6->ip6_hlim	= in6_selecthlim(inp, NULL);
 	ip6->ip6_src	= *laddr;
 	ip6->ip6_dst	= *faddr;
 
 #ifdef MAC
 	mac_inpcb_create_mbuf(inp, m);
 #endif
 
 	if (cscov_partial) {
 		if ((udp6->uh_sum = in6_cksum_partial(m, nxt,
 		    sizeof(struct ip6_hdr), plen, cscov)) == 0)
 			udp6->uh_sum = 0xffff;
 	} else {
 		udp6->uh_sum = in6_cksum_pseudo(ip6, plen, nxt, 0);
 		m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 		m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
 	}
 
 	flags = 0;
 #if defined(ROUTE_MPATH) || defined(RSS)
 	if (CALC_FLOWID_OUTBOUND_SENDTO) {
 		uint32_t hash_type, hash_val;
 		uint8_t pr;
 
 		pr = inp->inp_socket->so_proto->pr_protocol;
 
 		hash_val = fib6_calc_packet_hash(laddr, faddr,
 		    inp->inp_lport, fport, pr, &hash_type);
 		m->m_pkthdr.flowid = hash_val;
 		M_HASHTYPE_SET(m, hash_type);
 	}
 	/* do not use inp flowid */
 	flags |= IP_NODEFAULTFLOWID;
 #endif
 
 	UDPSTAT_INC(udps_opackets);
 	if (nxt == IPPROTO_UDPLITE)
 		UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);
 	else
 		UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
 	error = ip6_output(m, optp,
 	    INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags,
 	    inp->in6p_moptions, NULL, inp);
 	INP_UNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	return (error);
 
 release:
 	INP_UNLOCK(inp);
 	NET_EPOCH_EXIT(et);
 	if (control) {
 		ip6_clearpktopts(&opt, -1);
 		m_freem(control);
 	}
 	m_freem(m);
 
 	return (error);
 }
 
 static void
 udp6_abort(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_abort: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		INP_WUNLOCK(inp);
 		udp_abort(so);
 		return;
 	}
 #endif
 
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_HASH_WLOCK(pcbinfo);
 		in6_pcbdisconnect(inp);
 		inp->in6p_laddr = in6addr_any;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp6_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp == NULL, ("udp6_attach: inp != NULL"));
 
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, udp_sendspace, udp_recvspace);
 		if (error)
 			return (error);
 	}
 	error = in_pcballoc(so, pcbinfo);
 	if (error)
 		return (error);
 	inp = (struct inpcb *)so->so_pcb;
 	inp->in6p_cksum = -1;	/* just to be sure */
 	/*
 	 * XXX: ugly!!
 	 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
 	 * because the socket may be bound to an IPv6 wildcard address,
 	 * which may match an IPv4-mapped IPv6 address.
 	 */
 	inp->inp_ip_ttl = V_ip_defttl;
 
 	error = udp_newudpcb(inp);
 	if (error) {
 		in_pcbdetach(inp);
 		in_pcbfree(inp);
 		return (error);
 	}
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 udp6_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	int error;
 	u_char vflagsav;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_bind: inp == NULL"));
 
 	if (nam->sa_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(struct sockaddr_in6))
 		return (EINVAL);
 
 	INP_WLOCK(inp);
 	INP_HASH_WLOCK(pcbinfo);
 	vflagsav = inp->inp_vflag;
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
 		struct sockaddr_in6 *sin6_p;
 
 		sin6_p = (struct sockaddr_in6 *)nam;
 
 		if (IN6_IS_ADDR_UNSPECIFIED(&sin6_p->sin6_addr))
 			inp->inp_vflag |= INP_IPV4;
 #ifdef INET
 		else if (IN6_IS_ADDR_V4MAPPED(&sin6_p->sin6_addr)) {
 			struct sockaddr_in sin;
 
 			in6_sin6_2_sin(&sin, sin6_p);
 			inp->inp_vflag |= INP_IPV4;
 			inp->inp_vflag &= ~INP_IPV6;
 			error = in_pcbbind(inp, (struct sockaddr *)&sin,
 			    td->td_ucred);
 			goto out;
 		}
 #endif
 	}
 
 	error = in6_pcbbind(inp, nam, td->td_ucred);
 #ifdef INET
 out:
 #endif
 	if (error != 0)
 		inp->inp_vflag = vflagsav;
 	INP_HASH_WUNLOCK(pcbinfo);
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp6_close(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_close: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		INP_WUNLOCK(inp);
 		(void)udp_disconnect(so);
 		return;
 	}
 #endif
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_HASH_WLOCK(pcbinfo);
 		in6_pcbdisconnect(inp);
 		inp->in6p_laddr = in6addr_any;
 		INP_HASH_WUNLOCK(pcbinfo);
 		soisdisconnected(so);
 	}
 	INP_WUNLOCK(inp);
 }
 
 static int
 udp6_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 #ifdef INET
 	struct epoch_tracker et;
 #endif
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 	struct sockaddr_in6 *sin6;
 	int error;
 	u_char vflagsav;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_connect: inp == NULL"));
 
 	sin6 = (struct sockaddr_in6 *)nam;
 	if (sin6->sin6_family != AF_INET6)
 		return (EAFNOSUPPORT);
 	if (sin6->sin6_len != sizeof(*sin6))
 		return (EINVAL);
 
 	/*
 	 * XXXRW: Need to clarify locking of v4/v6 flags.
 	 */
 	INP_WLOCK(inp);
 #ifdef INET
 	if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
 		struct sockaddr_in sin;
 
 		if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
 			error = EINVAL;
 			goto out;
 		}
 		if ((inp->inp_vflag & INP_IPV4) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 		if (inp->inp_faddr.s_addr != INADDR_ANY) {
 			error = EISCONN;
 			goto out;
 		}
 		in6_sin6_2_sin(&sin, sin6);
 		error = prison_remote_ip4(td->td_ucred, &sin.sin_addr);
 		if (error != 0)
 			goto out;
 		vflagsav = inp->inp_vflag;
 		inp->inp_vflag |= INP_IPV4;
 		inp->inp_vflag &= ~INP_IPV6;
 		NET_EPOCH_ENTER(et);
 		INP_HASH_WLOCK(pcbinfo);
 		error = in_pcbconnect(inp, (struct sockaddr *)&sin,
 		    td->td_ucred, true);
 		INP_HASH_WUNLOCK(pcbinfo);
 		NET_EPOCH_EXIT(et);
 		/*
 		 * If connect succeeds, mark socket as connected. If
 		 * connect fails and socket is unbound, reset inp_vflag
 		 * field.
 		 */
 		if (error == 0)
 			soisconnected(so);
 		else if (inp->inp_laddr.s_addr == INADDR_ANY &&
 		    inp->inp_lport == 0)
 			inp->inp_vflag = vflagsav;
 		goto out;
 	} else {
 		if ((inp->inp_vflag & INP_IPV6) == 0) {
 			error = EAFNOSUPPORT;
 			goto out;
 		}
 	}
 #endif
 	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		error = EISCONN;
 		goto out;
 	}
 	error = prison_remote_ip6(td->td_ucred, &sin6->sin6_addr);
 	if (error != 0)
 		goto out;
 	vflagsav = inp->inp_vflag;
 	inp->inp_vflag &= ~INP_IPV4;
 	inp->inp_vflag |= INP_IPV6;
 	INP_HASH_WLOCK(pcbinfo);
 	error = in6_pcbconnect(inp, nam, td->td_ucred);
 	INP_HASH_WUNLOCK(pcbinfo);
 	/*
 	 * If connect succeeds, mark socket as connected. If
 	 * connect fails and socket is unbound, reset inp_vflag
 	 * field.
 	 */
 	if (error == 0)
 		soisconnected(so);
 	else if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) &&
 	    inp->inp_lport == 0)
 		inp->inp_vflag = vflagsav;
 out:
 	INP_WUNLOCK(inp);
 	return (error);
 }
 
 static void
 udp6_detach(struct socket *so)
 {
 	struct inpcb *inp;
 	struct udpcb *up;
 
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_detach: inp == NULL"));
 
 	INP_WLOCK(inp);
 	up = intoudpcb(inp);
 	KASSERT(up != NULL, ("%s: up == NULL", __func__));
 	in_pcbdetach(inp);
 	in_pcbfree(inp);
 	udp_discardcb(up);
 }
 
 static int
 udp6_disconnect(struct socket *so)
 {
 	struct inpcb *inp;
 	struct inpcbinfo *pcbinfo;
 
 	pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol);
 	inp = sotoinpcb(so);
 	KASSERT(inp != NULL, ("udp6_disconnect: inp == NULL"));
 
 	INP_WLOCK(inp);
 #ifdef INET
 	if (inp->inp_vflag & INP_IPV4) {
 		INP_WUNLOCK(inp);
 		(void)udp_disconnect(so);
 		return (0);
 	}
 #endif
 
 	if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
 		INP_WUNLOCK(inp);
 		return (ENOTCONN);
 	}
 
 	INP_HASH_WLOCK(pcbinfo);
 	in6_pcbdisconnect(inp);
 	inp->in6p_laddr = in6addr_any;
 	INP_HASH_WUNLOCK(pcbinfo);
 	SOCK_LOCK(so);
 	so->so_state &= ~SS_ISCONNECTED;		/* XXX */
 	SOCK_UNLOCK(so);
 	INP_WUNLOCK(inp);
 	return (0);
 }
 
 static int
 udp6_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *addr, struct mbuf *control, struct thread *td)
 {
 	int error;
 
 	if (addr) {
 		if (addr->sa_len != sizeof(struct sockaddr_in6)) {
 			error = EINVAL;
 			goto bad;
 		}
 		if (addr->sa_family != AF_INET6) {
 			error = EAFNOSUPPORT;
 			goto bad;
 		}
 	}
 
 	return (udp6_output(so, flags, m, addr, control, td));
 
 bad:
 	if (control)
 		m_freem(control);
 	m_freem(m);
 	return (error);
 }
 
-struct pr_usrreqs udp6_usrreqs = {
-	.pru_abort =		udp6_abort,
-	.pru_attach =		udp6_attach,
-	.pru_bind =		udp6_bind,
-	.pru_connect =		udp6_connect,
-	.pru_control =		in6_control,
-	.pru_detach =		udp6_detach,
-	.pru_disconnect =	udp6_disconnect,
-	.pru_peeraddr =		in6_mapped_peeraddr,
-	.pru_send =		udp6_send,
-	.pru_shutdown =		udp_shutdown,
-	.pru_sockaddr =		in6_mapped_sockaddr,
-	.pru_soreceive =	soreceive_dgram,
-	.pru_sosend =		sosend_dgram,
-	.pru_sosetlabel =	in_pcbsosetlabel,
-	.pru_close =		udp6_close
+#define	UDP6_PROTOSW							\
+	.pr_type =		SOCK_DGRAM,				\
+	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_CAPATTACH,		\
+	.pr_ctloutput =		ip6_ctloutput,				\
+	.pr_abort =		udp6_abort,				\
+	.pr_attach =		udp6_attach,				\
+	.pr_bind =		udp6_bind,				\
+	.pr_connect =		udp6_connect,				\
+	.pr_control =		in6_control,				\
+	.pr_detach =		udp6_detach,				\
+	.pr_disconnect =	udp6_disconnect,			\
+	.pr_peeraddr =		in6_mapped_peeraddr,			\
+	.pr_send =		udp6_send,				\
+	.pr_shutdown =		udp_shutdown,				\
+	.pr_sockaddr =		in6_mapped_sockaddr,			\
+	.pr_soreceive =		soreceive_dgram,			\
+	.pr_sosend =		sosend_dgram,				\
+	.pr_sosetlabel =	in_pcbsosetlabel,			\
+	.pr_close =		udp6_close
+
+struct protosw udp6_protosw = {
+	.pr_protocol =		IPPROTO_UDP,
+	UDP6_PROTOSW
+};
+
+struct protosw udplite6_protosw = {
+	.pr_protocol =		IPPROTO_UDPLITE,
+	UDP6_PROTOSW
 };
 
 static void
 udp6_init(void *arg __unused)
 {
 
 	IP6PROTO_REGISTER(IPPROTO_UDP, udp6_input, udp6_ctlinput);
 	IP6PROTO_REGISTER(IPPROTO_UDPLITE, udp6_input, udplite6_ctlinput);
 }
 SYSINIT(udp6_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, udp6_init, NULL);
diff --git a/sys/netipsec/keysock.c b/sys/netipsec/keysock.c
index 8c95bc775c73..d0ab2f6020ec 100644
--- a/sys/netipsec/keysock.c
+++ b/sys/netipsec/keysock.c
@@ -1,356 +1,342 @@
 /*	$FreeBSD$	*/
 /*	$KAME: keysock.c,v 1.25 2001/08/13 20:07:41 itojun Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the project nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include "opt_ipsec.h"
 
 /* This code has derived from sys/net/rtsock.c on FreeBSD2.2.5 */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 
 #include <net/if.h>
 #include <net/vnet.h>
 
 #include <netinet/in.h>
 
 #include <net/pfkeyv2.h>
 #include <netipsec/key.h>
 #include <netipsec/keysock.h>
 #include <netipsec/key_debug.h>
 #include <netipsec/ipsec.h>
 
 #include <machine/stdarg.h>
 
 static struct mtx keysock_mtx;
 MTX_SYSINIT(keysock, &keysock_mtx, "key socket pcb list", MTX_DEF);
 
 #define	KEYSOCK_LOCK()		mtx_lock(&keysock_mtx)
 #define	KEYSOCK_UNLOCK()	mtx_unlock(&keysock_mtx)
 
 VNET_DEFINE_STATIC(LIST_HEAD(, keycb), keycb_list) =
     LIST_HEAD_INITIALIZER(keycb_list);
 #define	V_keycb_list		VNET(keycb_list)
 
 static struct sockaddr key_src = { 2, PF_KEY, };
 
 static int key_sendup0(struct keycb *, struct mbuf *, int);
 
 VNET_PCPUSTAT_DEFINE(struct pfkeystat, pfkeystat);
 VNET_PCPUSTAT_SYSINIT(pfkeystat);
 
 #ifdef VIMAGE
 VNET_PCPUSTAT_SYSUNINIT(pfkeystat);
 #endif /* VIMAGE */
 
 static int
 key_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct sadb_msg *msg;
 	int len, error = 0;
 
 	if ((flags & PRUS_OOB) || control != NULL) {
 		m_freem(m);
 		if (control != NULL)
 			m_freem(control);
 		return (EOPNOTSUPP);
 	}
 
 	PFKEYSTAT_INC(out_total);
 	PFKEYSTAT_ADD(out_bytes, m->m_pkthdr.len);
 
 	len = m->m_pkthdr.len;
 	if (len < sizeof(struct sadb_msg)) {
 		PFKEYSTAT_INC(out_tooshort);
 		error = EINVAL;
 		goto end;
 	}
 
 	if (m->m_len < sizeof(struct sadb_msg)) {
 		if ((m = m_pullup(m, sizeof(struct sadb_msg))) == NULL) {
 			PFKEYSTAT_INC(out_nomem);
 			error = ENOBUFS;
 			goto end;
 		}
 	}
 
 	M_ASSERTPKTHDR(m);
 
 	KEYDBG(KEY_DUMP, kdebug_mbuf(m));
 
 	msg = mtod(m, struct sadb_msg *);
 	PFKEYSTAT_INC(out_msgtype[msg->sadb_msg_type]);
 	if (len != PFKEY_UNUNIT64(msg->sadb_msg_len)) {
 		PFKEYSTAT_INC(out_invlen);
 		error = EINVAL;
 		goto end;
 	}
 
 	error = key_parse(m, so);
 	m = NULL;
 end:
 	if (m)
 		m_freem(m);
 	return error;
 }
 
 /*
  * send message to the socket.
  */
 static int
 key_sendup0(struct keycb *kp, struct mbuf *m, int promisc)
 {
 
 	if (promisc) {
 		struct sadb_msg *pmsg;
 
 		M_PREPEND(m, sizeof(struct sadb_msg), M_NOWAIT);
 		if (m == NULL) {
 			PFKEYSTAT_INC(in_nomem);
 			return (ENOBUFS);
 		}
 		pmsg = mtod(m, struct sadb_msg *);
 		bzero(pmsg, sizeof(*pmsg));
 		pmsg->sadb_msg_version = PF_KEY_V2;
 		pmsg->sadb_msg_type = SADB_X_PROMISC;
 		pmsg->sadb_msg_len = PFKEY_UNIT64(m->m_pkthdr.len);
 		/* pid and seq? */
 
 		PFKEYSTAT_INC(in_msgtype[pmsg->sadb_msg_type]);
 	}
 
 	if (!sbappendaddr(&kp->kp_socket->so_rcv, &key_src, m, NULL)) {
 		PFKEYSTAT_INC(in_nomem);
 		m_freem(m);
 		soroverflow(kp->kp_socket);
 		return ENOBUFS;
 	}
 
 	sorwakeup(kp->kp_socket);
 	return 0;
 }
 
 /* so can be NULL if target != KEY_SENDUP_ONE */
 int
 key_sendup_mbuf(struct socket *so, struct mbuf *m, int target)
 {
 	struct mbuf *n;
 	struct keycb *kp;
 	int error = 0;
 
 	KASSERT(m != NULL, ("NULL mbuf pointer was passed."));
 	KASSERT(so != NULL || target != KEY_SENDUP_ONE,
 	    ("NULL socket pointer was passed."));
 	KASSERT(target == KEY_SENDUP_ONE || target == KEY_SENDUP_ALL ||
 	    target == KEY_SENDUP_REGISTERED, ("Wrong target %d", target));
 
 	PFKEYSTAT_INC(in_total);
 	PFKEYSTAT_ADD(in_bytes, m->m_pkthdr.len);
 	if (m->m_len < sizeof(struct sadb_msg)) {
 		m = m_pullup(m, sizeof(struct sadb_msg));
 		if (m == NULL) {
 			PFKEYSTAT_INC(in_nomem);
 			return ENOBUFS;
 		}
 	}
 	if (m->m_len >= sizeof(struct sadb_msg)) {
 		struct sadb_msg *msg;
 		msg = mtod(m, struct sadb_msg *);
 		PFKEYSTAT_INC(in_msgtype[msg->sadb_msg_type]);
 	}
 	KEYSOCK_LOCK();
 	LIST_FOREACH(kp, &V_keycb_list, kp_next) {
 		/*
 		 * If you are in promiscuous mode, and when you get broadcasted
 		 * reply, you'll get two PF_KEY messages.
 		 * (based on pf_key@inner.net message on 14 Oct 1998)
 		 */
 		if (kp->kp_promisc) {
 			n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 			if (n != NULL)
 				key_sendup0(kp, n, 1);
 			else
 				PFKEYSTAT_INC(in_nomem);
 		}
 
 		/* the exact target will be processed later */
 		if (so != NULL && so->so_pcb == kp)
 			continue;
 
 		if (target == KEY_SENDUP_ONE || (
 		    target == KEY_SENDUP_REGISTERED && kp->kp_registered == 0))
 			continue;
 
 		/* KEY_SENDUP_ALL + KEY_SENDUP_REGISTERED */
 		n = m_copym(m, 0, M_COPYALL, M_NOWAIT);
 		if (n == NULL) {
 			PFKEYSTAT_INC(in_nomem);
 			/* Try send to another socket */
 			continue;
 		}
 
 		if (key_sendup0(kp, n, 0) == 0)
 			PFKEYSTAT_INC(in_msgtarget[target]);
 	}
 
 	if (so)	{ /* KEY_SENDUP_ONE */
 		error = key_sendup0(so->so_pcb, m, 0);
 		if (error == 0)
 			PFKEYSTAT_INC(in_msgtarget[KEY_SENDUP_ONE]);
 	} else {
 		error = 0;
 		m_freem(m);
 	}
 	KEYSOCK_UNLOCK();
 	return (error);
 }
 
 static u_long key_sendspace = 8192;
 SYSCTL_ULONG(_net_key, OID_AUTO, sendspace, CTLFLAG_RW, &key_sendspace, 0,
     "Default key socket send space");
 static u_long key_recvspace = 8192;
 SYSCTL_ULONG(_net_key, OID_AUTO, recvspace, CTLFLAG_RW, &key_recvspace, 0,
     "Default key socket receive space");
 
 static int
 key_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct keycb *kp;
 	int error;
 
 	KASSERT(so->so_pcb == NULL, ("key_attach: so_pcb != NULL"));
 
 	if (td != NULL) {
 		error = priv_check(td, PRIV_NET_RAW);
 		if (error)
 			return error;
 	}
 
 	error = soreserve(so, key_sendspace, key_recvspace);
 	if (error)
 		return (error);
 
 	kp = malloc(sizeof(*kp), M_PCB, M_WAITOK);
 	kp->kp_socket = so;
 	kp->kp_promisc = kp->kp_registered = 0;
 
 	so->so_pcb = kp;
 	so->so_options |= SO_USELOOPBACK;
 
 	KEYSOCK_LOCK();
 	LIST_INSERT_HEAD(&V_keycb_list, kp, kp_next);
 	KEYSOCK_UNLOCK();
 	soisconnected(so);
 
 	return (0);
 }
 
 static void
 key_close(struct socket *so)
 {
 
 	soisdisconnected(so);
 }
 
 static void
 key_detach(struct socket *so)
 {
 	struct keycb *kp = so->so_pcb;
 
 	key_freereg(so);
 	KEYSOCK_LOCK();
 	LIST_REMOVE(kp, kp_next);
 	KEYSOCK_UNLOCK();
 	free(kp, M_PCB);
 	so->so_pcb = NULL;
 }
 
 static int
 key_shutdown(struct socket *so)
 {
 
 	socantsendmore(so);
 	return (0);
 }
 
-struct pr_usrreqs key_usrreqs = {
-	.pru_abort =		key_close,
-	.pru_attach =		key_attach,
-	.pru_detach =		key_detach,
-	.pru_send =		key_send,
-	.pru_shutdown =		key_shutdown,
-	.pru_close =		key_close,
-};
-
-/* sysctl */
 SYSCTL_NODE(_net, PF_KEY, key, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "Key Family");
 
-/*
- * Definitions of protocols supported in the KEY domain.
- */
-
-extern struct domain keydomain;
-
-struct protosw keysw[] = {
-{
+static struct protosw keysw = {
 	.pr_type =		SOCK_RAW,
-	.pr_domain =		&keydomain,
 	.pr_protocol =		PF_KEY_V2,
 	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_usrreqs =		&key_usrreqs
-}
+	.pr_abort =		key_close,
+	.pr_attach =		key_attach,
+	.pr_detach =		key_detach,
+	.pr_send =		key_send,
+	.pr_shutdown =		key_shutdown,
+	.pr_close =		key_close,
 };
 
-struct domain keydomain = {
+static struct domain keydomain = {
 	.dom_family =		PF_KEY,
 	.dom_name =		"key",
-	.dom_protosw =		keysw,
-	.dom_protoswNPROTOSW =	&keysw[nitems(keysw)]
+	.dom_nprotosw =		1,
+	.dom_protosw =		{ &keysw },
 };
 DOMAIN_SET(key);
diff --git a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
index b5de5ed11b00..7a4403488ecf 100644
--- a/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
+++ b/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
@@ -1,1929 +1,1923 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
  *      The Regents of the University of California.  All rights reserved.
  * Copyright (c) 2004 The FreeBSD Foundation.  All rights reserved.
  * Copyright (c) 2004-2008 Robert N. M. Watson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Excerpts taken from tcp_subr.c, tcp_usrreq.c, uipc_socket.c
  */
 
 /*
  *
  * Copyright (c) 2010 Isilon Systems, Inc.
  * Copyright (c) 2010 iX Systems, Inc.
  * Copyright (c) 2010 Panasas, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 
 #include "sdp.h"
 
 #include <net/if.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #include <sys/sysctl.h>
 
 uma_zone_t	sdp_zone;
 struct rwlock	sdp_lock;
 LIST_HEAD(, sdp_sock) sdp_list;
 
 struct workqueue_struct *rx_comp_wq;
 
 RW_SYSINIT(sdplockinit, &sdp_lock, "SDP lock");
 #define	SDP_LIST_WLOCK()	rw_wlock(&sdp_lock)
 #define	SDP_LIST_RLOCK()	rw_rlock(&sdp_lock)
 #define	SDP_LIST_WUNLOCK()	rw_wunlock(&sdp_lock)
 #define	SDP_LIST_RUNLOCK()	rw_runlock(&sdp_lock)
 #define	SDP_LIST_WLOCK_ASSERT()	rw_assert(&sdp_lock, RW_WLOCKED)
 #define	SDP_LIST_RLOCK_ASSERT()	rw_assert(&sdp_lock, RW_RLOCKED)
 #define	SDP_LIST_LOCK_ASSERT()	rw_assert(&sdp_lock, RW_LOCKED)
 
 MALLOC_DEFINE(M_SDP, "sdp", "Sockets Direct Protocol");
 
 static void sdp_stop_keepalive_timer(struct socket *so);
 
 /*
  * SDP protocol interface to socket abstraction.
  */
 /*
  * sdp_sendspace and sdp_recvspace are the default send and receive window
  * sizes, respectively.
  */
 u_long	sdp_sendspace = 1024*32;
 u_long	sdp_recvspace = 1024*64;
 
 static int sdp_count;
 
 /*
  * Disable async. CMA events for sockets which are being torn down.
  */
 static void
 sdp_destroy_cma(struct sdp_sock *ssk)
 {
 
 	if (ssk->id == NULL)
 		return;
 	rdma_destroy_id(ssk->id);
 	ssk->id = NULL;
 }
 
 static int
 sdp_pcbbind(struct sdp_sock *ssk, struct sockaddr *nam, struct ucred *cred)
 {
 	struct sockaddr_in *sin;
 	struct sockaddr_in null;
 	int error;
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	if (ssk->lport != 0 || ssk->laddr != INADDR_ANY)
 		return (EINVAL);
 	/* rdma_bind_addr handles bind races.  */
 	SDP_WUNLOCK(ssk);
 	if (ssk->id == NULL)
 		ssk->id = rdma_create_id(&init_net, sdp_cma_handler, ssk, RDMA_PS_SDP, IB_QPT_RC);
 	if (ssk->id == NULL) {
 		SDP_WLOCK(ssk);
 		return (ENOMEM);
 	}
 	if (nam == NULL) {
 		null.sin_family = AF_INET;
 		null.sin_len = sizeof(null);
 		null.sin_addr.s_addr = INADDR_ANY;
 		null.sin_port = 0;
 		bzero(&null.sin_zero, sizeof(null.sin_zero));
 		nam = (struct sockaddr *)&null;
 	}
 	error = -rdma_bind_addr(ssk->id, nam);
 	SDP_WLOCK(ssk);
 	if (error == 0) {
 		sin = (struct sockaddr_in *)&ssk->id->route.addr.src_addr;
 		ssk->laddr = sin->sin_addr.s_addr;
 		ssk->lport = sin->sin_port;
 	} else
 		sdp_destroy_cma(ssk);
 	return (error);
 }
 
 static void
 sdp_pcbfree(struct sdp_sock *ssk)
 {
 
 	KASSERT(ssk->socket == NULL, ("ssk %p socket still attached", ssk));
 	KASSERT((ssk->flags & SDP_DESTROY) == 0,
 	    ("ssk %p already destroyed", ssk));
 
 	sdp_dbg(ssk->socket, "Freeing pcb");
 	SDP_WLOCK_ASSERT(ssk);
 	ssk->flags |= SDP_DESTROY;
 	SDP_WUNLOCK(ssk);
 	SDP_LIST_WLOCK();
 	sdp_count--;
 	LIST_REMOVE(ssk, list);
 	SDP_LIST_WUNLOCK();
 	crfree(ssk->cred);
 	ssk->qp_active = 0;
 	if (ssk->qp) {
 		ib_destroy_qp(ssk->qp);
 		ssk->qp = NULL;
 	}
 	sdp_tx_ring_destroy(ssk);
 	sdp_rx_ring_destroy(ssk);
 	sdp_destroy_cma(ssk);
 	rw_destroy(&ssk->rx_ring.destroyed_lock);
 	rw_destroy(&ssk->lock);
 	uma_zfree(sdp_zone, ssk);
 }
 
 /*
  * Common routines to return a socket address.
  */
 static struct sockaddr *
 sdp_sockaddr(in_port_t port, struct in_addr *addr_p)
 {
 	struct sockaddr_in *sin;
 
 	sin = malloc(sizeof *sin, M_SONAME,
 		M_WAITOK | M_ZERO);
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = *addr_p;
 	sin->sin_port = port;
 
 	return (struct sockaddr *)sin;
 }
 
 static int
 sdp_getsockaddr(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk;
 	struct in_addr addr;
 	in_port_t port;
 
 	ssk = sdp_sk(so);
 	SDP_RLOCK(ssk);
 	port = ssk->lport;
 	addr.s_addr = ssk->laddr;
 	SDP_RUNLOCK(ssk);
 
 	*nam = sdp_sockaddr(port, &addr);
 	return 0;
 }
 
 static int
 sdp_getpeeraddr(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk;
 	struct in_addr addr;
 	in_port_t port;
 
 	ssk = sdp_sk(so);
 	SDP_RLOCK(ssk);
 	port = ssk->fport;
 	addr.s_addr = ssk->faddr;
 	SDP_RUNLOCK(ssk);
 
 	*nam = sdp_sockaddr(port, &addr);
 	return 0;
 }
 
 #if 0
 static void
 sdp_apply_all(void (*func)(struct sdp_sock *, void *), void *arg)
 {
 	struct sdp_sock *ssk;
 
 	SDP_LIST_RLOCK();
 	LIST_FOREACH(ssk, &sdp_list, list) {
 		SDP_WLOCK(ssk);
 		func(ssk, arg);
 		SDP_WUNLOCK(ssk);
 	}
 	SDP_LIST_RUNLOCK();
 }
 #endif
 
 static void
 sdp_output_reset(struct sdp_sock *ssk)
 {
 	struct rdma_cm_id *id;
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (ssk->id) {
 		id = ssk->id;
 		ssk->qp_active = 0;
 		SDP_WUNLOCK(ssk);
 		rdma_disconnect(id);
 		SDP_WLOCK(ssk);
 	}
 	ssk->state = TCPS_CLOSED;
 }
 
 /*
  * Attempt to close a SDP socket, marking it as dropped, and freeing
  * the socket if we hold the only reference.
  */
 static struct sdp_sock *
 sdp_closed(struct sdp_sock *ssk)
 {
 	struct socket *so;
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	ssk->flags |= SDP_DROPPED;
 	so = ssk->socket;
 	soisdisconnected(so);
 	if (ssk->flags & SDP_SOCKREF) {
 		ssk->flags &= ~SDP_SOCKREF;
 		SDP_WUNLOCK(ssk);
 		sorele(so);
 		return (NULL);
 	}
 	return (ssk);
 }
 
 /*
  * Perform timer based shutdowns which can not operate in
  * callout context.
  */
 static void
 sdp_shutdown_task(void *data, int pending)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	SDP_WLOCK(ssk);
 	/*
 	 * I don't think this can race with another call to pcbfree()
 	 * because SDP_TIMEWAIT protects it.  SDP_DESTROY may be redundant.
 	 */
 	if (ssk->flags & SDP_DESTROY)
 		panic("sdp_shutdown_task: Racing with pcbfree for ssk %p",
 		    ssk);
 	if (ssk->flags & SDP_DISCON)
 		sdp_output_reset(ssk);
 	/* We have to clear this so sdp_detach() will call pcbfree(). */
 	ssk->flags &= ~(SDP_TIMEWAIT | SDP_DREQWAIT);
 	if ((ssk->flags & SDP_DROPPED) == 0 &&
 	    sdp_closed(ssk) == NULL)
 		return;
 	if (ssk->socket == NULL) {
 		sdp_pcbfree(ssk);
 		return;
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * 2msl has expired, schedule the shutdown task.
  */
 static void
 sdp_2msl_timeout(void *data)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	/* Callout canceled. */
         if (!callout_active(&ssk->keep2msl))
 		goto out;
         callout_deactivate(&ssk->keep2msl);
 	/* Should be impossible, defensive programming. */
 	if ((ssk->flags & SDP_TIMEWAIT) == 0)
 		goto out;
 	taskqueue_enqueue(taskqueue_thread, &ssk->shutdown_task);
 out:
 	SDP_WUNLOCK(ssk);
 	return;
 }
 
 /*
  * Schedule the 2msl wait timer.
  */
 static void
 sdp_2msl_wait(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 	ssk->flags |= SDP_TIMEWAIT;
 	ssk->state = TCPS_TIME_WAIT;
 	soisdisconnected(ssk->socket);
 	callout_reset(&ssk->keep2msl, TCPTV_MSL, sdp_2msl_timeout, ssk);
 }
 
 /*
  * Timed out waiting for the final fin/ack from rdma_disconnect().
  */
 static void
 sdp_dreq_timeout(void *data)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	/* Callout canceled. */
         if (!callout_active(&ssk->keep2msl))
 		goto out;
 	/* Callout rescheduled, probably as a different timer. */
 	if (callout_pending(&ssk->keep2msl))
 		goto out;
         callout_deactivate(&ssk->keep2msl);
 	if (ssk->state != TCPS_FIN_WAIT_1 && ssk->state != TCPS_LAST_ACK)
 		goto out;
 	if ((ssk->flags & SDP_DREQWAIT) == 0)
 		goto out;
 	ssk->flags &= ~SDP_DREQWAIT;
 	ssk->flags |= SDP_DISCON;
 	sdp_2msl_wait(ssk);
 	ssk->qp_active = 0;
 out:
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Received the final fin/ack.  Cancel the 2msl.
  */
 void
 sdp_cancel_dreq_wait_timeout(struct sdp_sock *ssk)
 {
 	sdp_dbg(ssk->socket, "cancelling dreq wait timeout\n");
 	ssk->flags &= ~SDP_DREQWAIT;
 	sdp_2msl_wait(ssk);
 }
 
 static int
 sdp_init_sock(struct socket *sk)
 {
 	struct sdp_sock *ssk = sdp_sk(sk);
 
 	sdp_dbg(sk, "%s\n", __func__);
 
 	callout_init_rw(&ssk->keep2msl, &ssk->lock, CALLOUT_RETURNUNLOCKED);
 	TASK_INIT(&ssk->shutdown_task, 0, sdp_shutdown_task, ssk);
 #ifdef SDP_ZCOPY
 	INIT_DELAYED_WORK(&ssk->srcavail_cancel_work, srcavail_cancel_timeout);
 	ssk->zcopy_thresh = -1; /* use global sdp_zcopy_thresh */
 	ssk->tx_ring.rdma_inflight = NULL;
 #endif
 	atomic_set(&ssk->mseq_ack, 0);
 	sdp_rx_ring_init(ssk);
 	ssk->tx_ring.buffer = NULL;
 
 	return 0;
 }
 
 /*
  * Allocate an sdp_sock for the socket and reserve socket buffer space.
  */
 static int
 sdp_attach(struct socket *so, int proto, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	int error;
 
 	ssk = sdp_sk(so);
 	KASSERT(ssk == NULL, ("sdp_attach: ssk already set on so %p", so));
 	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
 		error = soreserve(so, sdp_sendspace, sdp_recvspace);
 		if (error)
 			return (error);
 	}
 	so->so_rcv.sb_flags |= SB_AUTOSIZE;
 	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	ssk = uma_zalloc(sdp_zone, M_NOWAIT | M_ZERO);
 	if (ssk == NULL)
 		return (ENOBUFS);
 	rw_init(&ssk->lock, "sdpsock");
 	ssk->socket = so;
 	ssk->cred = crhold(so->so_cred);
 	so->so_pcb = (caddr_t)ssk;
 	sdp_init_sock(so);
 	ssk->flags = 0;
 	ssk->qp_active = 0;
 	ssk->state = TCPS_CLOSED;
 	mbufq_init(&ssk->rxctlq, INT_MAX);
 	SDP_LIST_WLOCK();
 	LIST_INSERT_HEAD(&sdp_list, ssk, list);
 	sdp_count++;
 	SDP_LIST_WUNLOCK();
 
 	return (0);
 }
 
 /*
  * Detach SDP from the socket, potentially leaving it around for the
  * timewait to expire.
  */
 static void
 sdp_detach(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	KASSERT(ssk->socket != NULL, ("sdp_detach: socket is NULL"));
 	ssk->socket->so_pcb = NULL;
 	ssk->socket = NULL;
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DREQWAIT))
 		SDP_WUNLOCK(ssk);
 	else if (ssk->flags & SDP_DROPPED || ssk->state < TCPS_SYN_SENT)
 		sdp_pcbfree(ssk);
 	else
 		panic("sdp_detach: Unexpected state, ssk %p.\n", ssk);
 }
 
 /*
  * Allocate a local address for the socket.
  */
 static int
 sdp_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 	struct sockaddr_in *sin;
 
 	sin = (struct sockaddr_in *)nam;
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (nam->sa_len != sizeof(*sin))
 		return (EINVAL);
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	error = sdp_pcbbind(ssk, nam, td->td_ucred);
 out:
 	SDP_WUNLOCK(ssk);
 
 	return (error);
 }
 
 /*
  * Prepare to accept connections.
  */
 static int
 sdp_listen(struct socket *so, int backlog, struct thread *td)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = EINVAL;
 		goto out;
 	}
 	if (error == 0 && ssk->lport == 0)
 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
 	SOCK_LOCK(so);
 	if (error == 0)
 		error = solisten_proto_check(so);
 	if (error == 0) {
 		solisten_proto(so, backlog);
 		ssk->state = TCPS_LISTEN;
 	}
 	SOCK_UNLOCK(so);
 
 out:
 	SDP_WUNLOCK(ssk);
 	if (error == 0)
 		error = -rdma_listen(ssk->id, backlog);
 	return (error);
 }
 
 /*
  * Initiate a SDP connection to nam.
  */
 static int
 sdp_start_connect(struct sdp_sock *ssk, struct sockaddr *nam, struct thread *td)
 {
 	struct sockaddr_in src;
 	struct socket *so;
 	int error;
 
 	so = ssk->socket;
 
 	SDP_WLOCK_ASSERT(ssk);
 	if (ssk->lport == 0) {
 		error = sdp_pcbbind(ssk, (struct sockaddr *)0, td->td_ucred);
 		if (error)
 			return error;
 	}
 	src.sin_family = AF_INET;
 	src.sin_len = sizeof(src);
 	bzero(&src.sin_zero, sizeof(src.sin_zero));
 	src.sin_port = ssk->lport;
 	src.sin_addr.s_addr = ssk->laddr;
 	soisconnecting(so);
 	SDP_WUNLOCK(ssk);
 	error = -rdma_resolve_addr(ssk->id, (struct sockaddr *)&src, nam,
 	    SDP_RESOLVE_TIMEOUT);
 	SDP_WLOCK(ssk);
 	if (error == 0)
 		ssk->state = TCPS_SYN_SENT;
 
 	return 0;
 }
 
 /*
  * Initiate SDP connection.
  */
 static int
 sdp_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 	struct sockaddr_in *sin;
 
 	sin = (struct sockaddr_in *)nam;
 	if (nam->sa_len != sizeof(*sin))
 		return (EINVAL);
 	if (sin->sin_family != AF_INET)
 		return (EAFNOSUPPORT);
 	if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
 		return (EAFNOSUPPORT);
 	if ((error = prison_remote_ip4(td->td_ucred, &sin->sin_addr)) != 0)
 		return (error);
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED))
 		error = EINVAL;
 	else
 		error = sdp_start_connect(ssk, nam, td);
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /*
  * Drop a SDP socket, reporting
  * the specified error.  If connection is synchronized,
  * then send a RST to peer.
  */
 static struct sdp_sock *
 sdp_drop(struct sdp_sock *ssk, int errno)
 {
 	struct socket *so;
 
 	SDP_WLOCK_ASSERT(ssk);
 	so = ssk->socket;
 	if (TCPS_HAVERCVDSYN(ssk->state))
 		sdp_output_reset(ssk);
 	if (errno == ETIMEDOUT && ssk->softerror)
 		errno = ssk->softerror;
 	so->so_error = errno;
 	return (sdp_closed(ssk));
 }
 
 /*
  * User issued close, and wish to trail through shutdown states:
  * if never received SYN, just forget it.  If got a SYN from peer,
  * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
  * If already got a FIN from peer, then almost done; go to LAST_ACK
  * state.  In all other cases, have already sent FIN to peer (e.g.
  * after PRU_SHUTDOWN), and just have to play tedious game waiting
  * for peer to send FIN or not respond to keep-alives, etc.
  * We can let the user exit from the close as soon as the FIN is acked.
  */
 static void
 sdp_usrclosed(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	switch (ssk->state) {
 	case TCPS_LISTEN:
 		ssk->state = TCPS_CLOSED;
 		SDP_WUNLOCK(ssk);
 		sdp_destroy_cma(ssk);
 		SDP_WLOCK(ssk);
 		/* FALLTHROUGH */
 	case TCPS_CLOSED:
 		ssk = sdp_closed(ssk);
 		/*
 		 * sdp_closed() should never return NULL here as the socket is
 		 * still open.
 		 */
 		KASSERT(ssk != NULL,
 		    ("sdp_usrclosed: sdp_closed() returned NULL"));
 		break;
 
 	case TCPS_SYN_SENT:
 		/* FALLTHROUGH */
 	case TCPS_SYN_RECEIVED:
 		ssk->flags |= SDP_NEEDFIN;
 		break;
 
 	case TCPS_ESTABLISHED:
 		ssk->flags |= SDP_NEEDFIN;
 		ssk->state = TCPS_FIN_WAIT_1;
 		break;
 
 	case TCPS_CLOSE_WAIT:
 		ssk->state = TCPS_LAST_ACK;
 		break;
 	}
 	if (ssk->state >= TCPS_FIN_WAIT_2) {
 		/* Prevent the connection hanging in FIN_WAIT_2 forever. */
 		if (ssk->state == TCPS_FIN_WAIT_2)
 			sdp_2msl_wait(ssk);
 		else
 			soisdisconnected(ssk->socket);
 	}
 }
 
 static void
 sdp_output_disconnect(struct sdp_sock *ssk)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 	callout_reset(&ssk->keep2msl, SDP_FIN_WAIT_TIMEOUT,
 	    sdp_dreq_timeout, ssk);
 	ssk->flags |= SDP_NEEDFIN | SDP_DREQWAIT;
 	sdp_post_sends(ssk, M_NOWAIT);
 }
 
 /*
  * Initiate or continue a disconnect.
  * If embryonic state, just send reset (once).
  * If in ``let data drain'' option and linger null, just drop.
  * Otherwise (hard), mark socket disconnecting and drop
  * current input data; switch states based on user close, and
  * send segment to peer (with FIN).
  */
 static void
 sdp_start_disconnect(struct sdp_sock *ssk)
 {
 	struct socket *so;
 	int unread;
 
 	so = ssk->socket;
 	SDP_WLOCK_ASSERT(ssk);
 	sdp_stop_keepalive_timer(so);
 	/*
 	 * Neither sdp_closed() nor sdp_drop() should return NULL, as the
 	 * socket is still open.
 	 */
 	if (ssk->state < TCPS_ESTABLISHED) {
 		ssk = sdp_closed(ssk);
 		KASSERT(ssk != NULL,
 		    ("sdp_start_disconnect: sdp_close() returned NULL"));
 	} else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
 		ssk = sdp_drop(ssk, 0);
 		KASSERT(ssk != NULL,
 		    ("sdp_start_disconnect: sdp_drop() returned NULL"));
 	} else {
 		soisdisconnecting(so);
 		unread = sbused(&so->so_rcv);
 		sbflush(&so->so_rcv);
 		sdp_usrclosed(ssk);
 		if (!(ssk->flags & SDP_DROPPED)) {
 			if (unread)
 				sdp_output_reset(ssk);
 			else
 				sdp_output_disconnect(ssk);
 		}
 	}
 }
 
 /*
  * User initiated disconnect.
  */
 static int
 sdp_disconnect(struct socket *so)
 {
 	struct sdp_sock *ssk;
 	int error = 0;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
 	sdp_start_disconnect(ssk);
 out:
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /*
  * Accept a connection.  Essentially all the work is done at higher levels;
  * just return the address of the peer, storing through addr.
  *
  *
  * XXX This is broken XXX
  * 
  * The rationale for acquiring the sdp lock here is somewhat complicated,
  * and is described in detail in the commit log entry for r175612.  Acquiring
  * it delays an accept(2) racing with sonewconn(), which inserts the socket
  * before the address/port fields are initialized.  A better fix would
  * prevent the socket from being placed in the listen queue until all fields
  * are fully initialized.
  */
 static int
 sdp_accept(struct socket *so, struct sockaddr **nam)
 {
 	struct sdp_sock *ssk = NULL;
 	struct in_addr addr;
 	in_port_t port;
 	int error;
 
 	if (so->so_state & SS_ISDISCONNECTED)
 		return (ECONNABORTED);
 
 	port = 0;
 	addr.s_addr = 0;
 	error = 0;
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = ECONNABORTED;
 		goto out;
 	}
 	port = ssk->fport;
 	addr.s_addr = ssk->faddr;
 out:
 	SDP_WUNLOCK(ssk);
 	if (error == 0)
 		*nam = sdp_sockaddr(port, &addr);
 	return error;
 }
 
 /*
  * Mark the connection as being incapable of further output.
  */
 static int
 sdp_shutdown(struct socket *so)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
 	socantsendmore(so);
 	sdp_usrclosed(ssk);
 	if (!(ssk->flags & SDP_DROPPED))
 		sdp_output_disconnect(ssk);
 
 out:
 	SDP_WUNLOCK(ssk);
 
 	return (error);
 }
 
 static void
 sdp_append(struct sdp_sock *ssk, struct sockbuf *sb, struct mbuf *mb, int cnt)
 {
 	struct mbuf *n;
 	int ncnt;
 
 	SOCKBUF_LOCK_ASSERT(sb);
 	SBLASTRECORDCHK(sb);
 	KASSERT(mb->m_flags & M_PKTHDR,
 		("sdp_append: %p Missing packet header.\n", mb));
 	n = sb->sb_lastrecord;
 	/*
 	 * If the queue is empty just set all pointers and proceed.
 	 */
 	if (n == NULL) {
 		sb->sb_lastrecord = sb->sb_mb = sb->sb_sndptr = mb;
 		for (; mb; mb = mb->m_next) {
 	                sb->sb_mbtail = mb;
 			sballoc(sb, mb);
 		}
 		return;
 	}
 	/*
 	 * Count the number of mbufs in the current tail.
 	 */
 	for (ncnt = 0; n->m_next; n = n->m_next)
 		ncnt++;
 	n = sb->sb_lastrecord;
 	/*
 	 * If the two chains can fit in a single sdp packet and
 	 * the last record has not been sent yet (WRITABLE) coalesce
 	 * them.  The lastrecord remains the same but we must strip the
 	 * packet header and then let sbcompress do the hard part.
 	 */
 	if (M_WRITABLE(n) && ncnt + cnt < SDP_MAX_SEND_SGES &&
 	    n->m_pkthdr.len + mb->m_pkthdr.len - SDP_HEAD_SIZE <
 	    ssk->xmit_size_goal) {
 		m_adj(mb, SDP_HEAD_SIZE);
 		n->m_pkthdr.len += mb->m_pkthdr.len;
 		n->m_flags |= mb->m_flags & (M_PUSH | M_URG);
 		m_demote(mb, 1, 0);
 		sbcompress(sb, mb, sb->sb_mbtail);
 		return;
 	}
 	/*
 	 * Not compressible, just append to the end and adjust counters.
 	 */
 	sb->sb_lastrecord->m_flags |= M_PUSH;
 	sb->sb_lastrecord->m_nextpkt = mb;
 	sb->sb_lastrecord = mb;
 	if (sb->sb_sndptr == NULL)
 		sb->sb_sndptr = mb;
 	for (; mb; mb = mb->m_next) {
 		sb->sb_mbtail = mb;
 		sballoc(sb, mb);
 	}
 }
 
 /*
  * Do a send by putting data in output queue and updating urgent
  * marker if URG set.  Possibly send more data.  Unlike the other
  * pru_*() routines, the mbuf chains are our responsibility.  We
  * must either enqueue them or free them.  The other pru_* routines
  * generally are caller-frees.
  *
  * This comes from sendfile, normal sends will come from sdp_sosend().
  */
 static int
 sdp_send(struct socket *so, int flags, struct mbuf *m,
     struct sockaddr *nam, struct mbuf *control, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	struct mbuf *n;
 	int error;
 	int cnt;
 
 	if (nam != NULL) {
 		if (nam->sa_family != AF_INET) {
 			if (control)
 				m_freem(control);
 			m_freem(m);
 			return (EAFNOSUPPORT);
 		}
 		if (nam->sa_len != sizeof(struct sockaddr_in)) {
 			if (control)
 				m_freem(control);
 			m_freem(m);
 			return (EINVAL);
 		}
 	}
 
 	error = 0;
 	ssk = sdp_sk(so);
 	KASSERT(m->m_flags & M_PKTHDR,
 	    ("sdp_send: %p no packet header", m));
 	M_PREPEND(m, SDP_HEAD_SIZE, M_WAITOK);
 	mtod(m, struct sdp_bsdh *)->mid = SDP_MID_DATA; 
 	for (n = m, cnt = 0; n->m_next; n = n->m_next)
 		cnt++;
 	if (cnt > SDP_MAX_SEND_SGES) {
 		n = m_collapse(m, M_WAITOK, SDP_MAX_SEND_SGES);
 		if (n == NULL) {
 			m_freem(m);
 			return (EMSGSIZE);
 		}
 		m = n;
 		for (cnt = 0; n->m_next; n = n->m_next)
 			cnt++;
 	}
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		if (control)
 			m_freem(control);
 		if (m)
 			m_freem(m);
 		error = ECONNRESET;
 		goto out;
 	}
 	if (control) {
 		/* SDP doesn't support control messages. */
 		if (control->m_len) {
 			m_freem(control);
 			if (m)
 				m_freem(m);
 			error = EINVAL;
 			goto out;
 		}
 		m_freem(control);	/* empty control, just free it */
 	}
 	if (!(flags & PRUS_OOB)) {
 		SOCKBUF_LOCK(&so->so_snd);
 		sdp_append(ssk, &so->so_snd, m, cnt);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && ssk->state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected.
 			 */
 			error = sdp_start_connect(ssk, nam, td);
 			if (error)
 				goto out;
 		}
 		if (flags & PRUS_EOF) {
 			/*
 			 * Close the send side of the connection after
 			 * the data is sent.
 			 */
 			socantsendmore(so);
 			sdp_usrclosed(ssk);
 			if (!(ssk->flags & SDP_DROPPED))
 				sdp_output_disconnect(ssk);
 		} else if (!(ssk->flags & SDP_DROPPED) &&
 		    !(flags & PRUS_MORETOCOME))
 			sdp_post_sends(ssk, M_NOWAIT);
 		SDP_WUNLOCK(ssk);
 		return (0);
 	} else {
 		SOCKBUF_LOCK(&so->so_snd);
 		if (sbspace(&so->so_snd) < -512) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			m_freem(m);
 			error = ENOBUFS;
 			goto out;
 		}
 		/*
 		 * According to RFC961 (Assigned Protocols),
 		 * the urgent pointer points to the last octet
 		 * of urgent data.  We continue, however,
 		 * to consider it to indicate the first octet
 		 * of data past the urgent section.
 		 * Otherwise, snd_up should be one lower.
 		 */
 		m->m_flags |= M_URG | M_PUSH;
 		sdp_append(ssk, &so->so_snd, m, cnt);
 		SOCKBUF_UNLOCK(&so->so_snd);
 		if (nam && ssk->state < TCPS_SYN_SENT) {
 			/*
 			 * Do implied connect if not yet connected.
 			 */
 			error = sdp_start_connect(ssk, nam, td);
 			if (error)
 				goto out;
 		}
 		sdp_post_sends(ssk, M_NOWAIT);
 		SDP_WUNLOCK(ssk);
 		return (0);
 	}
 out:
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 /*
  * Send on a socket.  If send must go all at once and message is larger than
  * send buffering, then hard error.  Lock against other senders.  If must go
  * all at once and not enough room now, then inform user that this would
  * block and do nothing.  Otherwise, if nonblocking, send as much as
  * possible.  The data to be sent is described by "uio" if nonzero, otherwise
  * by the mbuf chain "top" (which must be null if uio is not).  Data provided
  * in mbuf chain must be small enough to send all at once.
  *
  * Returns nonzero on error, timeout or signal; callers must check for short
  * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
  * on return.
  */
 static int
 sdp_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
 	struct sdp_sock *ssk;
 	long space, resid;
 	int atomic;
 	int error;
 	int copy;
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
 		resid = top->m_pkthdr.len;
 	atomic = top != NULL;
 	if (control != NULL) {
 		if (control->m_len) {
 			m_freem(control);
 			if (top)
 				m_freem(top);
 			return (EINVAL);
 		}
 		m_freem(control);
 		control = NULL;
 	}
 	/*
 	 * In theory resid should be unsigned.  However, space must be
 	 * signed, as it might be less than 0 if we over-committed, and we
 	 * must use a signed comparison of space and resid.  On the other
 	 * hand, a negative resid causes us to loop sending 0-length
 	 * segments to the protocol.
 	 *
 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 	 * type sockets since that's an error.
 	 */
 	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
 		error = EINVAL;
 		goto out;
 	}
 	if (td != NULL)
 		td->td_ru.ru_msgsnd++;
 
 	ssk = sdp_sk(so);
 	error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		goto out;
 
 restart:
 	do {
 		SOCKBUF_LOCK(&so->so_snd);
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EPIPE;
 			goto release;
 		}
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto release;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0 && addr == NULL) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = ENOTCONN;
 			goto release;
 		}
 		space = sbspace(&so->so_snd);
 		if (flags & MSG_OOB)
 			space += 1024;
 		if (atomic && resid > ssk->xmit_size_goal - SDP_HEAD_SIZE) {
 			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EMSGSIZE;
 			goto release;
 		}
 		if (space < resid &&
 		    (atomic || space < so->so_snd.sb_lowat)) {
 			if ((so->so_state & SS_NBIO) ||
 			    (flags & (MSG_NBIO | MSG_DONTWAIT)) != 0) {
 				SOCKBUF_UNLOCK(&so->so_snd);
 				error = EWOULDBLOCK;
 				goto release;
 			}
 			error = sbwait(so, SO_SND);
 			SOCKBUF_UNLOCK(&so->so_snd);
 			if (error)
 				goto release;
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
 		do {
 			if (uio == NULL) {
 				resid = 0;
 				if (flags & MSG_EOR)
 					top->m_flags |= M_EOR;
 			} else {
 				/*
 				 * Copy the data from userland into a mbuf
 				 * chain.  If no data is to be copied in,
 				 * a single empty mbuf is returned.
 				 */
 				copy = min(space,
 				    ssk->xmit_size_goal - SDP_HEAD_SIZE);
 				top = m_uiotombuf(uio, M_WAITOK, copy,
 				    0, M_PKTHDR |
 				    ((flags & MSG_EOR) ? M_EOR : 0));
 				if (top == NULL) {
 					/* only possible error */
 					error = EFAULT;
 					goto release;
 				}
 				space -= resid - uio->uio_resid;
 				resid = uio->uio_resid;
 			}
 			/*
 			 * XXX all the SBS_CANTSENDMORE checks previously
 			 * done could be out of date after dropping the
 			 * socket lock.
 			 */
 			error = sdp_send(so, (flags & MSG_OOB) ? PRUS_OOB :
 			/*
 			 * Set EOF on the last send if the user specified
 			 * MSG_EOF.
 			 */
 			    ((flags & MSG_EOF) && (resid <= 0)) ? PRUS_EOF :
 			/* If there is more to send set PRUS_MORETOCOME. */
 			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
 			    top, addr, NULL, td);
 			top = NULL;
 			if (error)
 				goto release;
 		} while (resid && space > 0);
 	} while (resid);
 
 release:
 	SOCK_IO_SEND_UNLOCK(so);
 out:
 	if (top != NULL)
 		m_freem(top);
 	return (error);
 }
 
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
  * data from a socket.  For more complete comments, see soreceive(), from
  * which this code originated.
  *
  * Note that soreceive_rcvoob(), unlike the remainder of soreceive(), is
  * unable to return an mbuf chain to the caller.
  */
 static int
 soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 	struct protosw *pr = so->so_proto;
 	struct mbuf *m;
 	int error;
 
 	KASSERT(flags & MSG_OOB, ("soreceive_rcvoob: (flags & MSG_OOB) == 0"));
 
 	m = m_get(M_WAITOK, MT_DATA);
-	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
+	error = pr->pr_rcvoob(so, m, flags & MSG_PEEK);
 	if (error)
 		goto bad;
 	do {
 		error = uiomove(mtod(m, void *),
 		    (int) min(uio->uio_resid, m->m_len), uio);
 		m = m_free(m);
 	} while (uio->uio_resid && error == 0 && m);
 bad:
 	if (m != NULL)
 		m_freem(m);
 	return (error);
 }
 
 /*
  * Optimized version of soreceive() for stream (TCP) sockets.
  */
 static int
 sdp_sorecv(struct socket *so, struct sockaddr **psa, struct uio *uio,
     struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	int len = 0, error = 0, flags, oresid;
 	struct sockbuf *sb;
 	struct mbuf *m, *n = NULL;
 	struct sdp_sock *ssk;
 
 	/* We only do stream sockets. */
 	if (so->so_type != SOCK_STREAM)
 		return (EINVAL);
 	if (psa != NULL)
 		*psa = NULL;
 	if (controlp != NULL)
 		return (EINVAL);
 	if (flagsp != NULL)
 		flags = *flagsp &~ MSG_EOR;
 	else
 		flags = 0;
 	if (flags & MSG_OOB)
 		return (soreceive_rcvoob(so, uio, flags));
 	if (mp0 != NULL)
 		*mp0 = NULL;
 
 	sb = &so->so_rcv;
 	ssk = sdp_sk(so);
 
 	/* Prevent other readers from entering the socket. */
 	error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
 	if (error)
 		return (error);
 	SOCKBUF_LOCK(sb);
 
 	/* Easy one, no space to copyout anything. */
 	if (uio->uio_resid == 0) {
 		error = EINVAL;
 		goto out;
 	}
 	oresid = uio->uio_resid;
 
 	/* We will never ever get anything unless we are connected. */
 	if (!(so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED))) {
 		/* When disconnecting there may be still some data left. */
 		if (sbavail(sb))
 			goto deliver;
 		if (!(so->so_state & SS_ISDISCONNECTED))
 			error = ENOTCONN;
 		goto out;
 	}
 
 	/* Socket buffer is empty and we shall not block. */
 	if (sbavail(sb) == 0 &&
 	    ((so->so_state & SS_NBIO) || (flags & (MSG_DONTWAIT|MSG_NBIO)))) {
 		error = EAGAIN;
 		goto out;
 	}
 
 restart:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 
 	/* Abort if socket has reported problems. */
 	if (so->so_error) {
 		if (sbavail(sb))
 			goto deliver;
 		if (oresid > uio->uio_resid)
 			goto out;
 		error = so->so_error;
 		if (!(flags & MSG_PEEK))
 			so->so_error = 0;
 		goto out;
 	}
 
 	/* Door is closed.  Deliver what is left, if any. */
 	if (sb->sb_state & SBS_CANTRCVMORE) {
 		if (sbavail(sb))
 			goto deliver;
 		else
 			goto out;
 	}
 
 	/* Socket buffer got some data that we shall deliver now. */
 	if (sbavail(sb) && !(flags & MSG_WAITALL) &&
 	    ((so->so_state & SS_NBIO) ||
 	     (flags & (MSG_DONTWAIT|MSG_NBIO)) ||
 	     sbavail(sb) >= sb->sb_lowat ||
 	     sbavail(sb) >= uio->uio_resid ||
 	     sbavail(sb) >= sb->sb_hiwat) ) {
 		goto deliver;
 	}
 
 	/* On MSG_WAITALL we must wait until all data or error arrives. */
 	if ((flags & MSG_WAITALL) &&
 	    (sbavail(sb) >= uio->uio_resid || sbavail(sb) >= sb->sb_lowat))
 		goto deliver;
 
 	/*
 	 * Wait and block until (more) data comes in.
 	 * NB: Drops the sockbuf lock during wait.
 	 */
 	error = sbwait(so, SO_RCV);
 	if (error)
 		goto out;
 	goto restart;
 
 deliver:
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	KASSERT(sbavail(sb), ("%s: sockbuf empty", __func__));
 	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb == NULL", __func__));
 
 	/* Statistics. */
 	if (uio->uio_td)
 		uio->uio_td->td_ru.ru_msgrcv++;
 
 	/* Fill uio until full or current end of socket buffer is reached. */
 	len = min(uio->uio_resid, sbavail(sb));
 	if (mp0 != NULL) {
 		/* Dequeue as many mbufs as possible. */
 		if (!(flags & MSG_PEEK) && len >= sb->sb_mb->m_len) {
 			for (*mp0 = m = sb->sb_mb;
 			     m != NULL && m->m_len <= len;
 			     m = m->m_next) {
 				len -= m->m_len;
 				uio->uio_resid -= m->m_len;
 				sbfree(sb, m);
 				n = m;
 			}
 			sb->sb_mb = m;
 			if (sb->sb_mb == NULL)
 				SB_EMPTY_FIXUP(sb);
 			n->m_next = NULL;
 		}
 		/* Copy the remainder. */
 		if (len > 0) {
 			KASSERT(sb->sb_mb != NULL,
 			    ("%s: len > 0 && sb->sb_mb empty", __func__));
 
 			m = m_copym(sb->sb_mb, 0, len, M_NOWAIT);
 			if (m == NULL)
 				len = 0;	/* Don't flush data from sockbuf. */
 			else
 				uio->uio_resid -= m->m_len;
 			if (*mp0 != NULL)
 				n->m_next = m;
 			else
 				*mp0 = m;
 			if (*mp0 == NULL) {
 				error = ENOBUFS;
 				goto out;
 			}
 		}
 	} else {
 		/* NB: Must unlock socket buffer as uiomove may sleep. */
 		SOCKBUF_UNLOCK(sb);
 		error = m_mbuftouio(uio, sb->sb_mb, len);
 		SOCKBUF_LOCK(sb);
 		if (error)
 			goto out;
 	}
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 
 	/*
 	 * Remove the delivered data from the socket buffer unless we
 	 * were only peeking.
 	 */
 	if (!(flags & MSG_PEEK)) {
 		if (len > 0)
 			sbdrop_locked(sb, len);
 
 		/* Notify protocol that we drained some data. */
 		SOCKBUF_UNLOCK(sb);
 		SDP_WLOCK(ssk);
 		sdp_do_posts(ssk);
 		SDP_WUNLOCK(ssk);
 		SOCKBUF_LOCK(sb);
 	}
 
 	/*
 	 * For MSG_WAITALL we may have to loop again and wait for
 	 * more data to come in.
 	 */
 	if ((flags & MSG_WAITALL) && uio->uio_resid > 0)
 		goto restart;
 out:
 	SBLASTRECORDCHK(sb);
 	SBLASTMBUFCHK(sb);
 	SOCKBUF_UNLOCK(sb);
 	SOCK_IO_RECV_UNLOCK(so);
 	return (error);
 }
 
 /*
  * Abort is used to teardown a connection typically while sitting in
  * the accept queue.
  */
 void
 sdp_abort(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	/*
 	 * If we have not yet dropped, do it now.
 	 */
 	if (!(ssk->flags & SDP_TIMEWAIT) &&
 	    !(ssk->flags & SDP_DROPPED))
 		sdp_drop(ssk, ECONNABORTED);
 	KASSERT(ssk->flags & SDP_DROPPED, ("sdp_abort: %p not dropped 0x%X",
 	    ssk, ssk->flags));
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * Close a SDP socket and initiate a friendly disconnect.
  */
 static void
 sdp_close(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	/*
 	 * If we have not yet dropped, do it now.
 	 */
 	if (!(ssk->flags & SDP_TIMEWAIT) &&
 	    !(ssk->flags & SDP_DROPPED)) 
 		sdp_start_disconnect(ssk);
 
 	/*
 	 * If we've still not dropped let the socket layer know we're
 	 * holding on to the socket and pcb for a while.
 	 */
 	if (!(ssk->flags & SDP_DROPPED)) {
 		ssk->flags |= SDP_SOCKREF;
 		soref(so);
 	}
 	SDP_WUNLOCK(ssk);
 }
 
 /*
  * User requests out-of-band data.
  */
 static int
 sdp_rcvoob(struct socket *so, struct mbuf *m, int flags)
 {
 	int error = 0;
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	SDP_WLOCK(ssk);
 	if (!rx_ring_trylock(&ssk->rx_ring)) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNRESET);
 	}
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		error = ECONNRESET;
 		goto out;
 	}
 	if ((so->so_oobmark == 0 &&
 	     (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
 	    so->so_options & SO_OOBINLINE ||
 	    ssk->oobflags & SDP_HADOOB) {
 		error = EINVAL;
 		goto out;
 	}
 	if ((ssk->oobflags & SDP_HAVEOOB) == 0) {
 		error = EWOULDBLOCK;
 		goto out;
 	}
 	m->m_len = 1;
 	*mtod(m, caddr_t) = ssk->iobc;
 	if ((flags & MSG_PEEK) == 0)
 		ssk->oobflags ^= (SDP_HAVEOOB | SDP_HADOOB);
 out:
 	rx_ring_unlock(&ssk->rx_ring);
 	SDP_WUNLOCK(ssk);
 	return (error);
 }
 
 void
 sdp_urg(struct sdp_sock *ssk, struct mbuf *mb)
 {
 	struct mbuf *m;
 	struct socket *so;
 
 	so = ssk->socket;
 	if (so == NULL)
 		return;
 
 	so->so_oobmark = sbused(&so->so_rcv) + mb->m_pkthdr.len - 1;
 	sohasoutofband(so);
 	ssk->oobflags &= ~(SDP_HAVEOOB | SDP_HADOOB);
 	if (!(so->so_options & SO_OOBINLINE)) {
 		for (m = mb; m->m_next != NULL; m = m->m_next);
 		ssk->iobc = *(mtod(m, char *) + m->m_len - 1);
 		ssk->oobflags |= SDP_HAVEOOB;
 		m->m_len--;
 		mb->m_pkthdr.len--;
 	}
 }
 
 /*
  * Notify a sdp socket of an asynchronous error.
  *
  * Do not wake up user since there currently is no mechanism for
  * reporting soft errors (yet - a kqueue filter may be added).
  */
 struct sdp_sock *
 sdp_notify(struct sdp_sock *ssk, int error)
 {
 
 	SDP_WLOCK_ASSERT(ssk);
 
 	if ((ssk->flags & SDP_TIMEWAIT) ||
 	    (ssk->flags & SDP_DROPPED))
 		return (ssk);
 
 	/*
 	 * Ignore some errors if we are hooked up.
 	 */
 	if (ssk->state == TCPS_ESTABLISHED &&
 	    (error == EHOSTUNREACH || error == ENETUNREACH ||
 	     error == EHOSTDOWN))
 		return (ssk);
 	ssk->softerror = error;
 	return sdp_drop(ssk, error);
 }
 
 static void
 sdp_keepalive_timeout(void *data)
 {
 	struct sdp_sock *ssk;
 
 	ssk = data;
 	/* Callout canceled. */
         if (!callout_active(&ssk->keep2msl))
                 return;
 	/* Callout rescheduled as a different kind of timer. */
 	if (callout_pending(&ssk->keep2msl))
 		goto out;
         callout_deactivate(&ssk->keep2msl);
 	if (ssk->flags & SDP_DROPPED ||
 	    (ssk->socket->so_options & SO_KEEPALIVE) == 0)
 		goto out;
 	sdp_post_keepalive(ssk);
 	callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
 	    sdp_keepalive_timeout, ssk);
 out:
 	SDP_WUNLOCK(ssk);
 }
 
 
 void
 sdp_start_keepalive_timer(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	if (!callout_pending(&ssk->keep2msl))
                 callout_reset(&ssk->keep2msl, SDP_KEEPALIVE_TIME,
                     sdp_keepalive_timeout, ssk);
 }
 
 static void
 sdp_stop_keepalive_timer(struct socket *so)
 {
 	struct sdp_sock *ssk;
 
 	ssk = sdp_sk(so);
 	callout_stop(&ssk->keep2msl);
 }
 
 /*
  * sdp_ctloutput() must drop the inpcb lock before performing copyin on
  * socket option arguments.  When it re-acquires the lock after the copy, it
  * has to revalidate that the connection is still valid for the socket
  * option.
  */
 #define SDP_WLOCK_RECHECK(inp) do {					\
 	SDP_WLOCK(ssk);							\
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {		\
 		SDP_WUNLOCK(ssk);					\
 		return (ECONNRESET);					\
 	}								\
 } while(0)
 
 static int
 sdp_ctloutput(struct socket *so, struct sockopt *sopt)
 {
 	int	error, opt, optval;
 	struct sdp_sock *ssk;
 
 	error = 0;
 	ssk = sdp_sk(so);
 	if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_KEEPALIVE) {
 		SDP_WLOCK(ssk);
 		if (so->so_options & SO_KEEPALIVE)
 			sdp_start_keepalive_timer(so);
 		else
 			sdp_stop_keepalive_timer(so);
 		SDP_WUNLOCK(ssk);
 	}
 	if (sopt->sopt_level != IPPROTO_TCP)
 		return (error);
 
 	SDP_WLOCK(ssk);
 	if (ssk->flags & (SDP_TIMEWAIT | SDP_DROPPED)) {
 		SDP_WUNLOCK(ssk);
 		return (ECONNRESET);
 	}
 
 	switch (sopt->sopt_dir) {
 	case SOPT_SET:
 		switch (sopt->sopt_name) {
 		case TCP_NODELAY:
 			SDP_WUNLOCK(ssk);
 			error = sooptcopyin(sopt, &optval, sizeof optval,
 			    sizeof optval);
 			if (error)
 				return (error);
 
 			SDP_WLOCK_RECHECK(ssk);
 			opt = SDP_NODELAY;
 			if (optval)
 				ssk->flags |= opt;
 			else
 				ssk->flags &= ~opt;
 			sdp_do_posts(ssk);
 			SDP_WUNLOCK(ssk);
 			break;
 
 		default:
 			SDP_WUNLOCK(ssk);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 
 	case SOPT_GET:
 		switch (sopt->sopt_name) {
 		case TCP_NODELAY:
 			optval = ssk->flags & SDP_NODELAY;
 			SDP_WUNLOCK(ssk);
 			error = sooptcopyout(sopt, &optval, sizeof optval);
 			break;
 		default:
 			SDP_WUNLOCK(ssk);
 			error = ENOPROTOOPT;
 			break;
 		}
 		break;
 	}
 	return (error);
 }
 #undef SDP_WLOCK_RECHECK
 
 int sdp_mod_count = 0;
 int sdp_mod_usec = 0;
 
 void
 sdp_set_default_moderation(struct sdp_sock *ssk)
 {
 	if (sdp_mod_count <= 0 || sdp_mod_usec <= 0)
 		return;
 	ib_modify_cq(ssk->rx_ring.cq, sdp_mod_count, sdp_mod_usec);
 }
 
 static void
 sdp_dev_add(struct ib_device *device)
 {
 	struct ib_fmr_pool_param param;
 	struct sdp_device *sdp_dev;
 
 	sdp_dev = malloc(sizeof(*sdp_dev), M_SDP, M_WAITOK | M_ZERO);
 	sdp_dev->pd = ib_alloc_pd(device, 0);
 	if (IS_ERR(sdp_dev->pd))
 		goto out_pd;
 	memset(&param, 0, sizeof param);
 	param.max_pages_per_fmr = SDP_FMR_SIZE;
 	param.page_shift = PAGE_SHIFT;
 	param.access = (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ);
 	param.pool_size = SDP_FMR_POOL_SIZE;
 	param.dirty_watermark = SDP_FMR_DIRTY_SIZE;
 	param.cache = 1;
 	sdp_dev->fmr_pool = ib_create_fmr_pool(sdp_dev->pd, &param);
 	if (IS_ERR(sdp_dev->fmr_pool))
 		goto out_fmr;
 	ib_set_client_data(device, &sdp_client, sdp_dev);
 	return;
 
 out_fmr:
 	ib_dealloc_pd(sdp_dev->pd);
 out_pd:
 	free(sdp_dev, M_SDP);
 }
 
 static void
 sdp_dev_rem(struct ib_device *device, void *client_data)
 {
 	struct sdp_device *sdp_dev;
 	struct sdp_sock *ssk;
 
 	SDP_LIST_WLOCK();
 	LIST_FOREACH(ssk, &sdp_list, list) {
 		if (ssk->ib_device != device)
 			continue;
 		SDP_WLOCK(ssk);
 		if ((ssk->flags & SDP_DESTROY) == 0)
 			ssk = sdp_notify(ssk, ECONNRESET);
 		if (ssk)
 			SDP_WUNLOCK(ssk);
 	}
 	SDP_LIST_WUNLOCK();
 	/*
 	 * XXX Do I need to wait between these two?
 	 */
 	sdp_dev = ib_get_client_data(device, &sdp_client);
 	if (!sdp_dev)
 		return;
 	ib_flush_fmr_pool(sdp_dev->fmr_pool);
 	ib_destroy_fmr_pool(sdp_dev->fmr_pool);
 	ib_dealloc_pd(sdp_dev->pd);
 	free(sdp_dev, M_SDP);
 }
 
 struct ib_client sdp_client =
     { .name = "sdp", .add = sdp_dev_add, .remove = sdp_dev_rem };
 
 
 static int
 sdp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, n, i;
 	struct sdp_sock *ssk;
 	struct xinpgen xig;
 
 	/*
 	 * The process of preparing the TCB list is too time-consuming and
 	 * resource-intensive to repeat twice on every request.
 	 */
 	if (req->oldptr == NULL) {
 		n = sdp_count;
 		n += imax(n / 8, 10);
 		req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xtcpcb);
 		return (0);
 	}
 
 	if (req->newptr != NULL)
 		return (EPERM);
 
 	/*
 	 * OK, now we're committed to doing something.
 	 */
 	SDP_LIST_RLOCK();
 	n = sdp_count;
 	SDP_LIST_RUNLOCK();
 
 	error = sysctl_wire_old_buffer(req, 2 * (sizeof xig)
 		+ n * sizeof(struct xtcpcb));
 	if (error != 0)
 		return (error);
 
 	bzero(&xig, sizeof(xig));
 	xig.xig_len = sizeof xig;
 	xig.xig_count = n;
 	xig.xig_gen = 0;
 	xig.xig_sogen = so_gencnt;
 	error = SYSCTL_OUT(req, &xig, sizeof xig);
 	if (error)
 		return (error);
 
 	SDP_LIST_RLOCK();
 	for (ssk = LIST_FIRST(&sdp_list), i = 0;
 	    ssk != NULL && i < n; ssk = LIST_NEXT(ssk, list)) {
 		struct xtcpcb xt;
 
 		SDP_RLOCK(ssk);
 		if (ssk->flags & SDP_TIMEWAIT) {
 			if (ssk->cred != NULL)
 				error = cr_cansee(req->td->td_ucred,
 				    ssk->cred);
 			else
 				error = EINVAL;	/* Skip this inp. */
 		} else if (ssk->socket)
 			error = cr_canseesocket(req->td->td_ucred,
 			    ssk->socket);
 		else
 			error = EINVAL;
 		if (error) {
 			error = 0;
 			goto next;
 		}
 
 		bzero(&xt, sizeof(xt));
 		xt.xt_len = sizeof xt;
 		xt.xt_inp.inp_gencnt = 0;
 		xt.xt_inp.inp_vflag = INP_IPV4;
 		memcpy(&xt.xt_inp.inp_laddr, &ssk->laddr, sizeof(ssk->laddr));
 		xt.xt_inp.inp_lport = ssk->lport;
 		memcpy(&xt.xt_inp.inp_faddr, &ssk->faddr, sizeof(ssk->faddr));
 		xt.xt_inp.inp_fport = ssk->fport;
 		xt.t_state = ssk->state;
 		if (ssk->socket != NULL)
 			sotoxsocket(ssk->socket, &xt.xt_inp.xi_socket);
 		xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP;
 		SDP_RUNLOCK(ssk);
 		error = SYSCTL_OUT(req, &xt, sizeof xt);
 		if (error)
 			break;
 		i++;
 		continue;
 next:
 		SDP_RUNLOCK(ssk);
 	}
 	if (!error) {
 		/*
 		 * Give the user an updated idea of our state.
 		 * If the generation differs from what we told
 		 * her before, she knows that something happened
 		 * while we were processing this request, and it
 		 * might be necessary to retry.
 		 */
 		xig.xig_gen = 0;
 		xig.xig_sogen = so_gencnt;
 		xig.xig_count = sdp_count;
 		error = SYSCTL_OUT(req, &xig, sizeof xig);
 	}
 	SDP_LIST_RUNLOCK();
 	return (error);
 }
 
 SYSCTL_NODE(_net_inet, -1, sdp, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "SDP");
 
 SYSCTL_PROC(_net_inet_sdp, TCPCTL_PCBLIST, pcblist,
     CTLFLAG_RD | CTLTYPE_STRUCT | CTLFLAG_MPSAFE,
     0, 0, sdp_pcblist, "S,xtcpcb",
     "List of active SDP connections");
 
 static void
 sdp_zone_change(void *tag)
 {
 
 	uma_zone_set_max(sdp_zone, maxsockets);
 }
 
 static void
 sdp_init(void *arg __unused)
 {
 
 	LIST_INIT(&sdp_list);
 	sdp_zone = uma_zcreate("sdp_sock", sizeof(struct sdp_sock),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(sdp_zone, maxsockets);
 	EVENTHANDLER_REGISTER(maxsockets_change, sdp_zone_change, NULL,
 		EVENTHANDLER_PRI_ANY);
 	rx_comp_wq = create_singlethread_workqueue("rx_comp_wq");
 	ib_register_client(&sdp_client);
 }
 SYSINIT(sdp_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_SECOND, sdp_init, NULL);
 
-extern struct domain sdpdomain;
-
-struct pr_usrreqs sdp_usrreqs = {
-	.pru_abort =		sdp_abort,
-	.pru_accept =		sdp_accept,
-	.pru_attach =		sdp_attach,
-	.pru_bind =		sdp_bind,
-	.pru_connect =		sdp_connect,
-	.pru_detach =		sdp_detach,
-	.pru_disconnect =	sdp_disconnect,
-	.pru_listen =		sdp_listen,
-	.pru_peeraddr =		sdp_getpeeraddr,
-	.pru_rcvoob =		sdp_rcvoob,
-	.pru_send =		sdp_send,
-	.pru_sosend =		sdp_sosend,
-	.pru_soreceive =	sdp_sorecv,
-	.pru_shutdown =		sdp_shutdown,
-	.pru_sockaddr =		sdp_getsockaddr,
-	.pru_close =		sdp_close,
-};
-
-struct protosw sdpsw[] = {
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&sdpdomain,
+#define	SDP_PROTOSW							\
+	.pr_type =		SOCK_STREAM,				\
+	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,\
+	.pr_ctloutput =		sdp_ctloutput,				\
+	.pr_abort =		sdp_abort,				\
+	.pr_accept =		sdp_accept,				\
+	.pr_attach =		sdp_attach,				\
+	.pr_bind =		sdp_bind,				\
+	.pr_connect =		sdp_connect,				\
+	.pr_detach =		sdp_detach,				\
+	.pr_disconnect =	sdp_disconnect,				\
+	.pr_listen =		sdp_listen,				\
+	.pr_peeraddr =		sdp_getpeeraddr,			\
+	.pr_rcvoob =		sdp_rcvoob,				\
+	.pr_send =		sdp_send,				\
+	.pr_sosend =		sdp_sosend,				\
+	.pr_soreceive =		sdp_sorecv,				\
+	.pr_shutdown =		sdp_shutdown,				\
+	.pr_sockaddr =		sdp_getsockaddr,			\
+	.pr_close =		sdp_close
+
+
+static struct protosw sdp_ip_protosw = {
 	.pr_protocol =		IPPROTO_IP,
-	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
-	.pr_ctloutput =		sdp_ctloutput,
-	.pr_usrreqs =		&sdp_usrreqs
-},
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&sdpdomain,
+	SDP_PROTOSW
+};
+static struct protosw sdp_tcp_protosw = {
 	.pr_protocol =		IPPROTO_TCP,
-	.pr_flags =		PR_CONNREQUIRED|PR_IMPLOPCL|PR_WANTRCVD,
-	.pr_ctloutput =		sdp_ctloutput,
-	.pr_usrreqs =		&sdp_usrreqs
-},
+	SDP_PROTOSW
 };
 
-struct domain sdpdomain = {
+static struct domain sdpdomain = {
 	.dom_family =		AF_INET_SDP,
 	.dom_name =		"SDP",
-	.dom_protosw =		sdpsw,
-	.dom_protoswNPROTOSW =	&sdpsw[sizeof(sdpsw)/sizeof(sdpsw[0])],
+	.dom_nprotosw =		2,
+	.dom_protosw = {
+		&sdp_ip_protosw,
+		&sdp_tcp_protosw,
+	},
 };
 
 DOMAIN_SET(sdp);
 
 int sdp_debug_level = 1;
 int sdp_data_debug_level = 0;
diff --git a/sys/rpc/rpc_generic.c b/sys/rpc/rpc_generic.c
index d75f23493f90..0e1797272a03 100644
--- a/sys/rpc/rpc_generic.c
+++ b/sys/rpc/rpc_generic.c
@@ -1,993 +1,993 @@
 /*	$NetBSD: rpc_generic.c,v 1.4 2000/09/28 09:07:04 kleink Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009, Sun Microsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions are met:
  * - Redistributions of source code must retain the above copyright notice, 
  *   this list of conditions and the following disclaimer.
  * - Redistributions in binary form must reproduce the above copyright notice, 
  *   this list of conditions and the following disclaimer in the documentation 
  *   and/or other materials provided with the distribution.
  * - Neither the name of Sun Microsystems, Inc. nor the names of its 
  *   contributors may be used to endorse or promote products derived 
  *   from this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
  * POSSIBILITY OF SUCH DAMAGE.
  */
 /*
  * Copyright (c) 1986-1991 by Sun Microsystems Inc. 
  */
 
 /* #pragma ident	"@(#)rpc_generic.c	1.17	94/04/24 SMI" */
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * rpc_generic.c, Miscl routines for RPC.
  *
  */
 
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syslog.h>
 
 #include <net/vnet.h>
 
 #include <rpc/rpc.h>
 #include <rpc/nettype.h>
 #include <rpc/rpcsec_gss.h>
 #include <rpc/rpcsec_tls.h>
 
 #include <rpc/rpc_com.h>
 #include <rpc/krpc.h>
 
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_param.h>
 
 extern	u_long sb_max_adj;	/* not defined in socketvar.h */
 
 /* Provide an entry point hook for the rpcsec_gss module. */
 struct rpc_gss_entries	rpc_gss_entries;
 
 struct handle {
 	NCONF_HANDLE *nhandle;
 	int nflag;		/* Whether NETPATH or NETCONFIG */
 	int nettype;
 };
 
 static const struct _rpcnettype {
 	const char *name;
 	const int type;
 } _rpctypelist[] = {
 	{ "netpath", _RPC_NETPATH },
 	{ "visible", _RPC_VISIBLE },
 	{ "circuit_v", _RPC_CIRCUIT_V },
 	{ "datagram_v", _RPC_DATAGRAM_V },
 	{ "circuit_n", _RPC_CIRCUIT_N },
 	{ "datagram_n", _RPC_DATAGRAM_N },
 	{ "tcp", _RPC_TCP },
 	{ "udp", _RPC_UDP },
 	{ 0, _RPC_NONE }
 };
 
 struct netid_af {
 	const char	*netid;
 	int		af;
 	int		protocol;
 };
 
 static const struct netid_af na_cvt[] = {
 	{ "udp",  AF_INET,  IPPROTO_UDP },
 	{ "tcp",  AF_INET,  IPPROTO_TCP },
 #ifdef INET6
 	{ "udp6", AF_INET6, IPPROTO_UDP },
 	{ "tcp6", AF_INET6, IPPROTO_TCP },
 #endif
 	{ "local", AF_LOCAL, 0 }
 };
 
 struct rpc_createerr rpc_createerr;
 
 /*
  * Find the appropriate buffer size
  */
 u_int
 /*ARGSUSED*/
 __rpc_get_t_size(int af, int proto, int size)
 {
 	int defsize;
 
 	switch (proto) {
 	case IPPROTO_TCP:
 		defsize = 64 * 1024;	/* XXX */
 		break;
 	case IPPROTO_UDP:
 		defsize = UDPMSGSIZE;
 		break;
 	default:
 		defsize = RPC_MAXDATASIZE;
 		break;
 	}
 	if (size == 0)
 		return defsize;
 
 	/* Check whether the value is within the upper max limit */
 	return (size > sb_max_adj ? (u_int)sb_max_adj : (u_int)size);
 }
 
 /*
  * Find the appropriate address buffer size
  */
 u_int
 __rpc_get_a_size(af)
 	int af;
 {
 	switch (af) {
 	case AF_INET:
 		return sizeof (struct sockaddr_in);
 #ifdef INET6
 	case AF_INET6:
 		return sizeof (struct sockaddr_in6);
 #endif
 	case AF_LOCAL:
 		return sizeof (struct sockaddr_un);
 	default:
 		break;
 	}
 	return ((u_int)RPC_MAXADDRSIZE);
 }
 
 #if 0
 
 /*
  * Used to ping the NULL procedure for clnt handle.
  * Returns NULL if fails, else a non-NULL pointer.
  */
 void *
 rpc_nullproc(clnt)
 	CLIENT *clnt;
 {
 	struct timeval TIMEOUT = {25, 0};
 
 	if (clnt_call(clnt, NULLPROC, (xdrproc_t) xdr_void, NULL,
 		(xdrproc_t) xdr_void, NULL, TIMEOUT) != RPC_SUCCESS) {
 		return (NULL);
 	}
 	return ((void *) clnt);
 }
 
 #endif
 
 int
 __rpc_socket2sockinfo(struct socket *so, struct __rpc_sockinfo *sip)
 {
 	int type, proto;
 	struct sockaddr *sa;
 	sa_family_t family;
 	struct sockopt opt;
 	int error;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	CURVNET_RESTORE();
 	if (error)
 		return 0;
 
 	sip->si_alen = sa->sa_len;
 	family = sa->sa_family;
 	free(sa, M_SONAME);
 
 	opt.sopt_dir = SOPT_GET;
 	opt.sopt_level = SOL_SOCKET;
 	opt.sopt_name = SO_TYPE;
 	opt.sopt_val = &type;
 	opt.sopt_valsize = sizeof type;
 	opt.sopt_td = NULL;
 	error = sogetopt(so, &opt);
 	if (error)
 		return 0;
 
 	/* XXX */
 	if (family != AF_LOCAL) {
 		if (type == SOCK_STREAM)
 			proto = IPPROTO_TCP;
 		else if (type == SOCK_DGRAM)
 			proto = IPPROTO_UDP;
 		else
 			return 0;
 	} else
 		proto = 0;
 
 	sip->si_af = family;
 	sip->si_proto = proto;
 	sip->si_socktype = type;
 
 	return 1;
 }
 
 /*
  * Linear search, but the number of entries is small.
  */
 int
 __rpc_nconf2sockinfo(const struct netconfig *nconf, struct __rpc_sockinfo *sip)
 {
 	int i;
 
 	for (i = 0; i < (sizeof na_cvt) / (sizeof (struct netid_af)); i++)
 		if (strcmp(na_cvt[i].netid, nconf->nc_netid) == 0 || (
 		    strcmp(nconf->nc_netid, "unix") == 0 &&
 		    strcmp(na_cvt[i].netid, "local") == 0)) {
 			sip->si_af = na_cvt[i].af;
 			sip->si_proto = na_cvt[i].protocol;
 			sip->si_socktype =
 			    __rpc_seman2socktype((int)nconf->nc_semantics);
 			if (sip->si_socktype == -1)
 				return 0;
 			sip->si_alen = __rpc_get_a_size(sip->si_af);
 			return 1;
 		}
 
 	return 0;
 }
 
 struct socket *
 __rpc_nconf2socket(const struct netconfig *nconf)
 {
 	struct __rpc_sockinfo si;
 	struct socket *so;
 	int error;
 
 	if (!__rpc_nconf2sockinfo(nconf, &si))
 		return 0;
 
 	so = NULL;
 	error =  socreate(si.si_af, &so, si.si_socktype, si.si_proto,
 	    curthread->td_ucred, curthread);
 
 	if (error)
 		return NULL;
 	else
 		return so;
 }
 
 char *
 taddr2uaddr(const struct netconfig *nconf, const struct netbuf *nbuf)
 {
 	struct __rpc_sockinfo si;
 
 	if (!__rpc_nconf2sockinfo(nconf, &si))
 		return NULL;
 	return __rpc_taddr2uaddr_af(si.si_af, nbuf);
 }
 
 struct netbuf *
 uaddr2taddr(const struct netconfig *nconf, const char *uaddr)
 {
 	struct __rpc_sockinfo si;
 	
 	if (!__rpc_nconf2sockinfo(nconf, &si))
 		return NULL;
 	return __rpc_uaddr2taddr_af(si.si_af, uaddr);
 }
 
 char *
 __rpc_taddr2uaddr_af(int af, const struct netbuf *nbuf)
 {
 	char *ret;
 	struct sbuf sb;
 	struct sockaddr_in *sin;
 	struct sockaddr_un *sun;
 	char namebuf[INET_ADDRSTRLEN];
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 	char namebuf6[INET6_ADDRSTRLEN];
 #endif
 	uint16_t port;
 
 	sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND);
 
 	switch (af) {
 	case AF_INET:
 		if (nbuf->len < sizeof(*sin))
 			return NULL;
 		sin = nbuf->buf;
 		if (inet_ntop(af, &sin->sin_addr, namebuf, sizeof namebuf)
 		    == NULL)
 			return NULL;
 		port = ntohs(sin->sin_port);
 		if (sbuf_printf(&sb, "%s.%u.%u", namebuf,
 			((uint32_t)port) >> 8,
 			port & 0xff) < 0)
 			return NULL;
 		break;
 #ifdef INET6
 	case AF_INET6:
 		if (nbuf->len < sizeof(*sin6))
 			return NULL;
 		sin6 = nbuf->buf;
 		if (inet_ntop(af, &sin6->sin6_addr, namebuf6, sizeof namebuf6)
 		    == NULL)
 			return NULL;
 		port = ntohs(sin6->sin6_port);
 		if (sbuf_printf(&sb, "%s.%u.%u", namebuf6,
 			((uint32_t)port) >> 8,
 			port & 0xff) < 0)
 			return NULL;
 		break;
 #endif
 	case AF_LOCAL:
 		sun = nbuf->buf;
 		if (sbuf_printf(&sb, "%.*s", (int)(sun->sun_len -
 			    offsetof(struct sockaddr_un, sun_path)),
 			sun->sun_path) < 0)
 			return (NULL);
 		break;
 	default:
 		return NULL;
 	}
 
 	sbuf_finish(&sb);
 	ret = strdup(sbuf_data(&sb), M_RPC);
 	sbuf_delete(&sb);
 
 	return ret;
 }
 
 struct netbuf *
 __rpc_uaddr2taddr_af(int af, const char *uaddr)
 {
 	struct netbuf *ret = NULL;
 	char *addrstr, *p;
 	unsigned port, portlo, porthi;
 	struct sockaddr_in *sin;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	struct sockaddr_un *sun;
 
 	port = 0;
 	sin = NULL;
 
 	if (uaddr == NULL)
 		return NULL;
 
 	addrstr = strdup(uaddr, M_RPC);
 	if (addrstr == NULL)
 		return NULL;
 
 	/*
 	 * AF_LOCAL addresses are expected to be absolute
 	 * pathnames, anything else will be AF_INET or AF_INET6.
 	 */
 	if (*addrstr != '/') {
 		p = strrchr(addrstr, '.');
 		if (p == NULL)
 			goto out;
 		portlo = (unsigned)strtol(p + 1, NULL, 10);
 		*p = '\0';
 
 		p = strrchr(addrstr, '.');
 		if (p == NULL)
 			goto out;
 		porthi = (unsigned)strtol(p + 1, NULL, 10);
 		*p = '\0';
 		port = (porthi << 8) | portlo;
 	}
 
 	ret = (struct netbuf *)malloc(sizeof *ret, M_RPC, M_WAITOK);
 	
 	switch (af) {
 	case AF_INET:
 		sin = (struct sockaddr_in *)malloc(sizeof *sin, M_RPC,
 		    M_WAITOK);
 		memset(sin, 0, sizeof *sin);
 		sin->sin_family = AF_INET;
 		sin->sin_port = htons(port);
 		if (inet_pton(AF_INET, addrstr, &sin->sin_addr) <= 0) {
 			free(sin, M_RPC);
 			free(ret, M_RPC);
 			ret = NULL;
 			goto out;
 		}
 		sin->sin_len = ret->maxlen = ret->len = sizeof *sin;
 		ret->buf = sin;
 		break;
 #ifdef INET6
 	case AF_INET6:
 		sin6 = (struct sockaddr_in6 *)malloc(sizeof *sin6, M_RPC,
 		    M_WAITOK);
 		memset(sin6, 0, sizeof *sin6);
 		sin6->sin6_family = AF_INET6;
 		sin6->sin6_port = htons(port);
 		if (inet_pton(AF_INET6, addrstr, &sin6->sin6_addr) <= 0) {
 			free(sin6, M_RPC);
 			free(ret, M_RPC);
 			ret = NULL;
 			goto out;
 		}
 		sin6->sin6_len = ret->maxlen = ret->len = sizeof *sin6;
 		ret->buf = sin6;
 		break;
 #endif
 	case AF_LOCAL:
 		sun = (struct sockaddr_un *)malloc(sizeof *sun, M_RPC,
 		    M_WAITOK);
 		memset(sun, 0, sizeof *sun);
 		sun->sun_family = AF_LOCAL;
 		strncpy(sun->sun_path, addrstr, sizeof(sun->sun_path) - 1);
 		ret->len = ret->maxlen = sun->sun_len = SUN_LEN(sun);
 		ret->buf = sun;
 		break;
 	default:
 		break;
 	}
 out:
 	free(addrstr, M_RPC);
 	return ret;
 }
 
 int
 __rpc_seman2socktype(int semantics)
 {
 	switch (semantics) {
 	case NC_TPI_CLTS:
 		return SOCK_DGRAM;
 	case NC_TPI_COTS_ORD:
 		return SOCK_STREAM;
 	case NC_TPI_RAW:
 		return SOCK_RAW;
 	default:
 		break;
 	}
 
 	return -1;
 }
 
 int
 __rpc_socktype2seman(int socktype)
 {
 	switch (socktype) {
 	case SOCK_DGRAM:
 		return NC_TPI_CLTS;
 	case SOCK_STREAM:
 		return NC_TPI_COTS_ORD;
 	case SOCK_RAW:
 		return NC_TPI_RAW;
 	default:
 		break;
 	}
 
 	return -1;
 }
 
 /*
  * Returns the type of the network as defined in <rpc/nettype.h>
  * If nettype is NULL, it defaults to NETPATH.
  */
 static int
 getnettype(const char *nettype)
 {
 	int i;
 
 	if ((nettype == NULL) || (nettype[0] == 0)) {
 		return (_RPC_NETPATH);	/* Default */
 	}
 
 #if 0
 	nettype = strlocase(nettype);
 #endif
 	for (i = 0; _rpctypelist[i].name; i++)
 		if (strcasecmp(nettype, _rpctypelist[i].name) == 0) {
 			return (_rpctypelist[i].type);
 		}
 	return (_rpctypelist[i].type);
 }
 
 /*
  * For the given nettype (tcp or udp only), return the first structure found.
  * This should be freed by calling freenetconfigent()
  */
 struct netconfig *
 __rpc_getconfip(const char *nettype)
 {
 	char *netid;
 	static char *netid_tcp = (char *) NULL;
 	static char *netid_udp = (char *) NULL;
 	struct netconfig *dummy;
 
 	if (!netid_udp && !netid_tcp) {
 		struct netconfig *nconf;
 		void *confighandle;
 
 		if (!(confighandle = setnetconfig())) {
 			log(LOG_ERR, "rpc: failed to open " NETCONFIG);
 			return (NULL);
 		}
 		while ((nconf = getnetconfig(confighandle)) != NULL) {
 			if (strcmp(nconf->nc_protofmly, NC_INET) == 0) {
 				if (strcmp(nconf->nc_proto, NC_TCP) == 0) {
 					netid_tcp = strdup(nconf->nc_netid,
 					    M_RPC);
 				} else
 				if (strcmp(nconf->nc_proto, NC_UDP) == 0) {
 					netid_udp = strdup(nconf->nc_netid,
 					    M_RPC);
 				}
 			}
 		}
 		endnetconfig(confighandle);
 	}
 	if (strcmp(nettype, "udp") == 0)
 		netid = netid_udp;
 	else if (strcmp(nettype, "tcp") == 0)
 		netid = netid_tcp;
 	else {
 		return (NULL);
 	}
 	if ((netid == NULL) || (netid[0] == 0)) {
 		return (NULL);
 	}
 	dummy = getnetconfigent(netid);
 	return (dummy);
 }
 
 /*
  * Returns the type of the nettype, which should then be used with
  * __rpc_getconf().
  *
  * For simplicity in the kernel, we don't support the NETPATH
  * environment variable. We behave as userland would then NETPATH is
  * unset, i.e. iterate over all visible entries in netconfig.
  */
 void *
 __rpc_setconf(nettype)
 	const char *nettype;
 {
 	struct handle *handle;
 
 	handle = (struct handle *) malloc(sizeof (struct handle),
 	    M_RPC, M_WAITOK);
 	switch (handle->nettype = getnettype(nettype)) {
 	case _RPC_NETPATH:
 	case _RPC_CIRCUIT_N:
 	case _RPC_DATAGRAM_N:
 		if (!(handle->nhandle = setnetconfig()))
 			goto failed;
 		handle->nflag = TRUE;
 		break;
 	case _RPC_VISIBLE:
 	case _RPC_CIRCUIT_V:
 	case _RPC_DATAGRAM_V:
 	case _RPC_TCP:
 	case _RPC_UDP:
 		if (!(handle->nhandle = setnetconfig())) {
 		        log(LOG_ERR, "rpc: failed to open " NETCONFIG);
 			goto failed;
 		}
 		handle->nflag = FALSE;
 		break;
 	default:
 		goto failed;
 	}
 
 	return (handle);
 
 failed:
 	free(handle, M_RPC);
 	return (NULL);
 }
 
 /*
  * Returns the next netconfig struct for the given "net" type.
  * __rpc_setconf() should have been called previously.
  */
 struct netconfig *
 __rpc_getconf(void *vhandle)
 {
 	struct handle *handle;
 	struct netconfig *nconf;
 
 	handle = (struct handle *)vhandle;
 	if (handle == NULL) {
 		return (NULL);
 	}
 	for (;;) {
 		if (handle->nflag) {
 			nconf = getnetconfig(handle->nhandle);
 			if (nconf && !(nconf->nc_flag & NC_VISIBLE))
 				continue;
 		} else {
 			nconf = getnetconfig(handle->nhandle);
 		}
 		if (nconf == NULL)
 			break;
 		if ((nconf->nc_semantics != NC_TPI_CLTS) &&
 			(nconf->nc_semantics != NC_TPI_COTS) &&
 			(nconf->nc_semantics != NC_TPI_COTS_ORD))
 			continue;
 		switch (handle->nettype) {
 		case _RPC_VISIBLE:
 			if (!(nconf->nc_flag & NC_VISIBLE))
 				continue;
 			/* FALLTHROUGH */
 		case _RPC_NETPATH:	/* Be happy */
 			break;
 		case _RPC_CIRCUIT_V:
 			if (!(nconf->nc_flag & NC_VISIBLE))
 				continue;
 			/* FALLTHROUGH */
 		case _RPC_CIRCUIT_N:
 			if ((nconf->nc_semantics != NC_TPI_COTS) &&
 				(nconf->nc_semantics != NC_TPI_COTS_ORD))
 				continue;
 			break;
 		case _RPC_DATAGRAM_V:
 			if (!(nconf->nc_flag & NC_VISIBLE))
 				continue;
 			/* FALLTHROUGH */
 		case _RPC_DATAGRAM_N:
 			if (nconf->nc_semantics != NC_TPI_CLTS)
 				continue;
 			break;
 		case _RPC_TCP:
 			if (((nconf->nc_semantics != NC_TPI_COTS) &&
 				(nconf->nc_semantics != NC_TPI_COTS_ORD)) ||
 				(strcmp(nconf->nc_protofmly, NC_INET)
 #ifdef INET6
 				 && strcmp(nconf->nc_protofmly, NC_INET6))
 #else
 				)
 #endif
 				||
 				strcmp(nconf->nc_proto, NC_TCP))
 				continue;
 			break;
 		case _RPC_UDP:
 			if ((nconf->nc_semantics != NC_TPI_CLTS) ||
 				(strcmp(nconf->nc_protofmly, NC_INET)
 #ifdef INET6
 				&& strcmp(nconf->nc_protofmly, NC_INET6))
 #else
 				)
 #endif
 				||
 				strcmp(nconf->nc_proto, NC_UDP))
 				continue;
 			break;
 		}
 		break;
 	}
 	return (nconf);
 }
 
 void
 __rpc_endconf(vhandle)
 	void * vhandle;
 {
 	struct handle *handle;
 
 	handle = (struct handle *) vhandle;
 	if (handle == NULL) {
 		return;
 	}
 	endnetconfig(handle->nhandle);
 	free(handle, M_RPC);
 }
 
 int
 __rpc_sockisbound(struct socket *so)
 {
 	struct sockaddr *sa;
 	int error, bound;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	CURVNET_RESTORE();
 	if (error)
 		return (0);
 
 	switch (sa->sa_family) {
 		case AF_INET:
 			bound = (((struct sockaddr_in *) sa)->sin_port != 0);
 			break;
 #ifdef INET6
 		case AF_INET6:
 			bound = (((struct sockaddr_in6 *) sa)->sin6_port != 0);
 			break;
 #endif
 		case AF_LOCAL:
 			/* XXX check this */
 			bound = (((struct sockaddr_un *) sa)->sun_path[0] != '\0');
 			break;
 		default:
 			bound = FALSE;
 			break;
 	}
 
 	free(sa, M_SONAME);
 
 	return bound;
 }
 
 /*
  * Implement XDR-style API for RPC call.
  */
 enum clnt_stat
 clnt_call_private(
 	CLIENT		*cl,		/* client handle */
 	struct rpc_callextra *ext,	/* call metadata */
 	rpcproc_t	proc,		/* procedure number */
 	xdrproc_t	xargs,		/* xdr routine for args */
 	void		*argsp,		/* pointer to args */
 	xdrproc_t	xresults,	/* xdr routine for results */
 	void		*resultsp,	/* pointer to results */
 	struct timeval	utimeout)	/* seconds to wait before giving up */
 {
 	XDR xdrs;
 	struct mbuf *mreq;
 	struct mbuf *mrep;
 	enum clnt_stat stat;
 
 	mreq = m_getcl(M_WAITOK, MT_DATA, 0);
 
 	xdrmbuf_create(&xdrs, mreq, XDR_ENCODE);
 	if (!xargs(&xdrs, argsp)) {
 		m_freem(mreq);
 		return (RPC_CANTENCODEARGS);
 	}
 	XDR_DESTROY(&xdrs);
 
 	stat = CLNT_CALL_MBUF(cl, ext, proc, mreq, &mrep, utimeout);
 	m_freem(mreq);
 
 	if (stat == RPC_SUCCESS) {
 		xdrmbuf_create(&xdrs, mrep, XDR_DECODE);
 		if (!xresults(&xdrs, resultsp)) {
 			XDR_DESTROY(&xdrs);
 			return (RPC_CANTDECODERES);
 		}
 		XDR_DESTROY(&xdrs);
 	}
 
 	return (stat);
 }
 
 /*
  * Bind a socket to a privileged IP port
  */
 int
 bindresvport(struct socket *so, struct sockaddr *sa)
 {
 	int old, error, af;
 	bool_t freesa = FALSE;
 	struct sockaddr_in *sin;
 #ifdef INET6
 	struct sockaddr_in6 *sin6;
 #endif
 	struct sockopt opt;
 	int proto, portrange, portlow;
 	uint16_t *portp;
 	socklen_t salen;
 
 	if (sa == NULL) {
 		CURVNET_SET(so->so_vnet);
-		error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+		error = so->so_proto->pr_sockaddr(so, &sa);
 		CURVNET_RESTORE();
 		if (error)
 			return (error);
 		freesa = TRUE;
 		af = sa->sa_family;
 		salen = sa->sa_len;
 		memset(sa, 0, sa->sa_len);
 	} else {
 		af = sa->sa_family;
 		salen = sa->sa_len;
 	}
 
 	switch (af) {
 	case AF_INET:
 		proto = IPPROTO_IP;
 		portrange = IP_PORTRANGE;
 		portlow = IP_PORTRANGE_LOW;
 		sin = (struct sockaddr_in *)sa;
 		portp = &sin->sin_port;
 		break;
 #ifdef INET6
 	case AF_INET6:
 		proto = IPPROTO_IPV6;
 		portrange = IPV6_PORTRANGE;
 		portlow = IPV6_PORTRANGE_LOW;
 		sin6 = (struct sockaddr_in6 *)sa;
 		portp = &sin6->sin6_port;
 		break;
 #endif
 	default:
 		return (EPFNOSUPPORT);
 	}
 
 	sa->sa_family = af;
 	sa->sa_len = salen;
 
 	if (*portp == 0) {
 		bzero(&opt, sizeof(opt));
 		opt.sopt_dir = SOPT_GET;
 		opt.sopt_level = proto;
 		opt.sopt_name = portrange;
 		opt.sopt_val = &old;
 		opt.sopt_valsize = sizeof(old);
 		error = sogetopt(so, &opt);
 		if (error) {
 			goto out;
 		}
 
 		opt.sopt_dir = SOPT_SET;
 		opt.sopt_val = &portlow;
 		error = sosetopt(so, &opt);
 		if (error)
 			goto out;
 	}
 
 	error = sobind(so, sa, curthread);
 
 	if (*portp == 0) {
 		if (error) {
 			opt.sopt_dir = SOPT_SET;
 			opt.sopt_val = &old;
 			sosetopt(so, &opt);
 		}
 	}
 out:
 	if (freesa)
 		free(sa, M_SONAME);
 
 	return (error);
 }
 
 /*
  * Make sure an mbuf list is made up entirely of ext_pgs mbufs.
  * This is needed for sosend() when KERN_TLS is being used.
  * (There might also be a performance improvement for certain
  *  network interfaces that handle ext_pgs mbufs efficiently.)
  * It expects at least one non-ext_pgs mbuf followed by zero
  * or more ext_pgs mbufs.  It does not handle the case where
  * non-ext_pgs mbuf(s) follow ext_pgs ones.
  * It also performs sanity checks on the resultant list.
  * The "mp" argument list is consumed.
  * The "maxextsiz" argument is the upper bound on the data
  * size for each mbuf (usually 16K for KERN_TLS).
  */
 struct mbuf *
 _rpc_copym_into_ext_pgs(struct mbuf *mp, int maxextsiz)
 {
 	struct mbuf *m, *m2, *m3, *mhead;
 	int tlen;
 
 	KASSERT((mp->m_flags & (M_EXT | M_EXTPG)) !=
 	    (M_EXT | M_EXTPG), ("_rpc_copym_into_ext_pgs:"
 	    " first mbuf is an ext_pgs"));
 	/*
 	 * Find the last non-ext_pgs mbuf and the total
 	 * length of the non-ext_pgs mbuf(s).
 	 * The first mbuf must always be a non-ext_pgs
 	 * mbuf.
 	 */
 	tlen = mp->m_len;
 	m2 = mp;
 	for (m = mp->m_next; m != NULL; m = m->m_next) {
 		if ((m->m_flags & M_EXTPG) != 0)
 			break;
 		tlen += m->m_len;
 		m2 = m;
 	}
 
 	/*
 	 * Copy the non-ext_pgs mbuf(s) into an ext_pgs
 	 * mbuf list.
 	 */
 	m2->m_next = NULL;
 	mhead = mb_mapped_to_unmapped(mp, tlen, maxextsiz,
 	    M_WAITOK, &m2);
 
 	/*
 	 * Link the ext_pgs list onto the newly copied
 	 * list and free up the non-ext_pgs mbuf(s).
 	 */
 	m2->m_next = m;
 	m_freem(mp);
 
 	/*
 	 * Sanity check the resultant mbuf list.  Check for and
 	 * remove any 0 length mbufs in the list, since the
 	 * KERN_TLS code does not expect any 0 length mbuf(s)
 	 * in the list.
 	 */
 	m3 = NULL;
 	m2 = mhead;
 	tlen = 0;
 	while (m2 != NULL) {
 		KASSERT(m2->m_len >= 0, ("_rpc_copym_into_ext_pgs:"
 		    " negative m_len"));
 		KASSERT((m2->m_flags & (M_EXT | M_EXTPG)) ==
 		    (M_EXT | M_EXTPG), ("_rpc_copym_into_ext_pgs:"
 			    " non-nomap mbuf in list"));
 		if (m2->m_len == 0) {
 			if (m3 != NULL)
 				m3->m_next = m2->m_next;
 			else
 				m = m2->m_next;
 			m2->m_next = NULL;
 			m_free(m2);
 			if (m3 != NULL)
 				m2 = m3->m_next;
 			else
 				m2 = m;
 		} else {
 			MBUF_EXT_PGS_ASSERT_SANITY(m2);
 			m3 = m2;
 			tlen += m2->m_len;
 			m2 = m2->m_next;
 		}
 	}
 	return (mhead);
 }
 
 /*
  * Kernel module glue
  */
 static int
 krpc_modevent(module_t mod, int type, void *data)
 {
 	int error = 0;
 
 	switch (type) {
 	case MOD_LOAD:
 		error = rpctls_init();
 		break;
 	case MOD_UNLOAD:
 		/*
 		 * Cannot be unloaded, since the rpctlssd or rpctlscd daemons
 		 * might be performing a rpctls syscall.
 		 */
 		/* FALLTHROUGH */
 	default:
 		error = EOPNOTSUPP;
 	}
 	return (error);
 }
 static moduledata_t krpc_mod = {
 	"krpc",
 	krpc_modevent,
 	NULL,
 };
 DECLARE_MODULE(krpc, krpc_mod, SI_SUB_VFS, SI_ORDER_ANY);
 
 /* So that loader and kldload(2) can find us, wherever we are.. */
 MODULE_VERSION(krpc, 1);
 MODULE_DEPEND(krpc, xdr, 1, 1, 1);
diff --git a/sys/rpc/svc_dg.c b/sys/rpc/svc_dg.c
index db1928655618..4ab86bb5904a 100644
--- a/sys/rpc/svc_dg.c
+++ b/sys/rpc/svc_dg.c
@@ -1,307 +1,307 @@
 /*	$NetBSD: svc_dg.c,v 1.4 2000/07/06 03:10:35 christos Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009, Sun Microsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions are met:
  * - Redistributions of source code must retain the above copyright notice, 
  *   this list of conditions and the following disclaimer.
  * - Redistributions in binary form must reproduce the above copyright notice, 
  *   this list of conditions and the following disclaimer in the documentation 
  *   and/or other materials provided with the distribution.
  * - Neither the name of Sun Microsystems, Inc. nor the names of its 
  *   contributors may be used to endorse or promote products derived 
  *   from this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 /*
  * Copyright (c) 1986-1991 by Sun Microsystems Inc.
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
 #ident	"@(#)svc_dg.c	1.17	94/04/24 SMI"
 #endif
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * svc_dg.c, Server side for connectionless RPC.
  */
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
 #include <net/vnet.h>
 
 #include <rpc/rpc.h>
 
 #include <rpc/rpc_com.h>
 
 static enum xprt_stat svc_dg_stat(SVCXPRT *);
 static bool_t svc_dg_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static bool_t svc_dg_reply(SVCXPRT *, struct rpc_msg *,
     struct sockaddr *, struct mbuf *, uint32_t *);
 static void svc_dg_destroy(SVCXPRT *);
 static bool_t svc_dg_control(SVCXPRT *, const u_int, void *);
 static int svc_dg_soupcall(struct socket *so, void *arg, int waitflag);
 
 static const struct xp_ops svc_dg_ops = {
 	.xp_recv =	svc_dg_recv,
 	.xp_stat =	svc_dg_stat,
 	.xp_reply =	svc_dg_reply,
 	.xp_destroy =	svc_dg_destroy,
 	.xp_control =	svc_dg_control,
 };
 
 /*
  * Usage:
  *	xprt = svc_dg_create(sock, sendsize, recvsize);
  * Does other connectionless specific initializations.
  * Once *xprt is initialized, it is registered.
  * see (svc.h, xprt_register). If recvsize or sendsize are 0 suitable
  * system defaults are chosen.
  * The routines returns NULL if a problem occurred.
  */
 static const char svc_dg_str[] = "svc_dg_create: %s";
 static const char svc_dg_err1[] = "could not get transport information";
 static const char svc_dg_err2[] = "transport does not support data transfer";
 static const char __no_mem_str[] = "out of memory";
 
 SVCXPRT *
 svc_dg_create(SVCPOOL *pool, struct socket *so, size_t sendsize,
     size_t recvsize)
 {
 	SVCXPRT *xprt;
 	struct __rpc_sockinfo si;
 	struct sockaddr* sa;
 	int error;
 
 	if (!__rpc_socket2sockinfo(so, &si)) {
 		printf(svc_dg_str, svc_dg_err1);
 		return (NULL);
 	}
 	/*
 	 * Find the receive and the send size
 	 */
 	sendsize = __rpc_get_t_size(si.si_af, si.si_proto, (int)sendsize);
 	recvsize = __rpc_get_t_size(si.si_af, si.si_proto, (int)recvsize);
 	if ((sendsize == 0) || (recvsize == 0)) {
 		printf(svc_dg_str, svc_dg_err2);
 		return (NULL);
 	}
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_pool = pool;
 	xprt->xp_socket = so;
 	xprt->xp_p1 = NULL;
 	xprt->xp_p2 = NULL;
 	xprt->xp_ops = &svc_dg_ops;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	CURVNET_RESTORE();
 	if (error)
 		goto freedata;
 
 	memcpy(&xprt->xp_ltaddr, sa, sa->sa_len);
 	free(sa, M_SONAME);
 
 	xprt_register(xprt);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	soupcall_set(so, SO_RCV, svc_dg_soupcall, xprt);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	return (xprt);
 freedata:
 	(void) printf(svc_dg_str, __no_mem_str);
 	svc_xprt_free(xprt);
 
 	return (NULL);
 }
 
 /*ARGSUSED*/
 static enum xprt_stat
 svc_dg_stat(SVCXPRT *xprt)
 {
 
 	if (soreadable(xprt->xp_socket))
 		return (XPRT_MOREREQS);
 
 	return (XPRT_IDLE);
 }
 
 static bool_t
 svc_dg_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct uio uio;
 	struct sockaddr *raddr;
 	struct mbuf *mreq;
 	XDR xdrs;
 	int error, rcvflag;
 
 	/*
 	 * Serialise access to the socket.
 	 */
 	sx_xlock(&xprt->xp_lock);
 
 	/*
 	 * The socket upcall calls xprt_active() which will eventually
 	 * cause the server to call us here. We attempt to read a
 	 * packet from the socket and process it. If the read fails,
 	 * we have drained all pending requests so we call
 	 * xprt_inactive().
 	 */
 	uio.uio_resid = 1000000000;
 	uio.uio_td = curthread;
 	mreq = NULL;
 	rcvflag = MSG_DONTWAIT;
 	error = soreceive(xprt->xp_socket, &raddr, &uio, &mreq, NULL, &rcvflag);
 
 	if (error == EWOULDBLOCK) {
 		/*
 		 * We must re-test for readability after taking the
 		 * lock to protect us in the case where a new packet
 		 * arrives on the socket after our call to soreceive
 		 * fails with EWOULDBLOCK. The pool lock protects us
 		 * from racing the upcall after our soreadable() call
 		 * returns false.
 		 */
 		SOCKBUF_LOCK(&xprt->xp_socket->so_rcv);
 		if (!soreadable(xprt->xp_socket))
 			xprt_inactive_self(xprt);
 		SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	if (error) {
 		SOCKBUF_LOCK(&xprt->xp_socket->so_rcv);
 		soupcall_clear(xprt->xp_socket, SO_RCV);
 		SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv);
 		xprt_inactive_self(xprt);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	sx_xunlock(&xprt->xp_lock);
 
 	xdrmbuf_create(&xdrs, mreq, XDR_DECODE);
 	if (! xdr_callmsg(&xdrs, msg)) {
 		XDR_DESTROY(&xdrs);
 		return (FALSE);
 	}
 
 	*addrp = raddr;
 	*mp = xdrmbuf_getall(&xdrs);
 	XDR_DESTROY(&xdrs);
 
 	return (TRUE);
 }
 
 static bool_t
 svc_dg_reply(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr *addr, struct mbuf *m, uint32_t *seq)
 {
 	XDR xdrs;
 	struct mbuf *mrep;
 	bool_t stat = TRUE;
 	int error;
 
 	mrep = m_gethdr(M_WAITOK, MT_DATA);
 
 	xdrmbuf_create(&xdrs, mrep, XDR_ENCODE);
 
 	if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
 	    msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
 		if (!xdr_replymsg(&xdrs, msg))
 			stat = FALSE;
 		else
 			xdrmbuf_append(&xdrs, m);
 	} else {
 		stat = xdr_replymsg(&xdrs, msg);
 	}
 
 	if (stat) {
 		m_fixhdr(mrep);
 		error = sosend(xprt->xp_socket, addr, NULL, mrep, NULL,
 		    0, curthread);
 		if (!error) {
 			stat = TRUE;
 		}
 	} else {
 		m_freem(mrep);
 	}
 
 	XDR_DESTROY(&xdrs);
 	xprt->xp_p2 = NULL;
 
 	return (stat);
 }
 
 static void
 svc_dg_destroy(SVCXPRT *xprt)
 {
 
 	SOCKBUF_LOCK(&xprt->xp_socket->so_rcv);
 	soupcall_clear(xprt->xp_socket, SO_RCV);
 	SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv);
 
 	sx_destroy(&xprt->xp_lock);
 	if (xprt->xp_socket)
 		(void)soclose(xprt->xp_socket);
 
 	if (xprt->xp_netid)
 		(void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1);
 	svc_xprt_free(xprt);
 }
 
 static bool_t
 /*ARGSUSED*/
 svc_dg_control(xprt, rq, in)
 	SVCXPRT *xprt;
 	const u_int	rq;
 	void		*in;
 {
 
 	return (FALSE);
 }
 
 static int
 svc_dg_soupcall(struct socket *so, void *arg, int waitflag)
 {
 	SVCXPRT *xprt = (SVCXPRT *) arg;
 
 	xprt_active(xprt);
 	return (SU_OK);
 }
diff --git a/sys/rpc/svc_vc.c b/sys/rpc/svc_vc.c
index b8b22fb7f4cf..8b11cdf82e8b 100644
--- a/sys/rpc/svc_vc.c
+++ b/sys/rpc/svc_vc.c
@@ -1,1126 +1,1126 @@
 /*	$NetBSD: svc_vc.c,v 1.7 2000/08/03 00:01:53 fvdl Exp $	*/
 
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 2009, Sun Microsystems, Inc.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without 
  * modification, are permitted provided that the following conditions are met:
  * - Redistributions of source code must retain the above copyright notice, 
  *   this list of conditions and the following disclaimer.
  * - Redistributions in binary form must reproduce the above copyright notice, 
  *   this list of conditions and the following disclaimer in the documentation 
  *   and/or other materials provided with the distribution.
  * - Neither the name of Sun Microsystems, Inc. nor the names of its 
  *   contributors may be used to endorse or promote products derived 
  *   from this software without specific prior written permission.
  * 
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
  * POSSIBILITY OF SUCH DAMAGE.
  */
 
 #if defined(LIBC_SCCS) && !defined(lint)
 static char *sccsid2 = "@(#)svc_tcp.c 1.21 87/08/11 Copyr 1984 Sun Micro";
 static char *sccsid = "@(#)svc_tcp.c	2.2 88/08/01 4.0 RPCSRC";
 #endif
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  * svc_vc.c, Server side for Connection Oriented based RPC. 
  *
  * Actually implements two flavors of transporter -
  * a tcp rendezvouser (a listner and connection establisher)
  * and a record/tcp stream.
  */
 
 #include "opt_kern_tls.h"
 
 #include <sys/param.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/queue.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/uio.h>
 
 #include <net/vnet.h>
 
 #include <netinet/tcp.h>
 
 #include <rpc/rpc.h>
 #include <rpc/rpcsec_tls.h>
 
 #include <rpc/krpc.h>
 #include <rpc/rpc_com.h>
 
 #include <security/mac/mac_framework.h>
 
 static bool_t svc_vc_rendezvous_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static enum xprt_stat svc_vc_rendezvous_stat(SVCXPRT *);
 static void svc_vc_rendezvous_destroy(SVCXPRT *);
 static bool_t svc_vc_null(void);
 static void svc_vc_destroy(SVCXPRT *);
 static enum xprt_stat svc_vc_stat(SVCXPRT *);
 static bool_t svc_vc_ack(SVCXPRT *, uint32_t *);
 static bool_t svc_vc_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static bool_t svc_vc_reply(SVCXPRT *, struct rpc_msg *,
     struct sockaddr *, struct mbuf *, uint32_t *seq);
 static bool_t svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in);
 static bool_t svc_vc_rendezvous_control (SVCXPRT *xprt, const u_int rq,
     void *in);
 static void svc_vc_backchannel_destroy(SVCXPRT *);
 static enum xprt_stat svc_vc_backchannel_stat(SVCXPRT *);
 static bool_t svc_vc_backchannel_recv(SVCXPRT *, struct rpc_msg *,
     struct sockaddr **, struct mbuf **);
 static bool_t svc_vc_backchannel_reply(SVCXPRT *, struct rpc_msg *,
     struct sockaddr *, struct mbuf *, uint32_t *);
 static bool_t svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq,
     void *in);
 static SVCXPRT *svc_vc_create_conn(SVCPOOL *pool, struct socket *so,
     struct sockaddr *raddr);
 static int svc_vc_accept(struct socket *head, struct socket **sop);
 static int svc_vc_soupcall(struct socket *so, void *arg, int waitflag);
 static int svc_vc_rendezvous_soupcall(struct socket *, void *, int);
 
 static const struct xp_ops svc_vc_rendezvous_ops = {
 	.xp_recv =	svc_vc_rendezvous_recv,
 	.xp_stat =	svc_vc_rendezvous_stat,
 	.xp_reply =	(bool_t (*)(SVCXPRT *, struct rpc_msg *,
 		struct sockaddr *, struct mbuf *, uint32_t *))svc_vc_null,
 	.xp_destroy =	svc_vc_rendezvous_destroy,
 	.xp_control =	svc_vc_rendezvous_control
 };
 
 static const struct xp_ops svc_vc_ops = {
 	.xp_recv =	svc_vc_recv,
 	.xp_stat =	svc_vc_stat,
 	.xp_ack =	svc_vc_ack,
 	.xp_reply =	svc_vc_reply,
 	.xp_destroy =	svc_vc_destroy,
 	.xp_control =	svc_vc_control
 };
 
 static const struct xp_ops svc_vc_backchannel_ops = {
 	.xp_recv =	svc_vc_backchannel_recv,
 	.xp_stat =	svc_vc_backchannel_stat,
 	.xp_reply =	svc_vc_backchannel_reply,
 	.xp_destroy =	svc_vc_backchannel_destroy,
 	.xp_control =	svc_vc_backchannel_control
 };
 
 /*
  * Usage:
  *	xprt = svc_vc_create(sock, send_buf_size, recv_buf_size);
  *
  * Creates, registers, and returns a (rpc) tcp based transporter.
  * Once *xprt is initialized, it is registered as a transporter
  * see (svc.h, xprt_register).  This routine returns
  * a NULL if a problem occurred.
  *
  * The filedescriptor passed in is expected to refer to a bound, but
  * not yet connected socket.
  *
  * Since streams do buffered io similar to stdio, the caller can specify
  * how big the send and receive buffers are via the second and third parms;
  * 0 => use the system default.
  */
 SVCXPRT *
 svc_vc_create(SVCPOOL *pool, struct socket *so, size_t sendsize,
     size_t recvsize)
 {
 	SVCXPRT *xprt;
 	struct sockaddr* sa;
 	int error;
 
 	SOCK_LOCK(so);
 	if (so->so_state & (SS_ISCONNECTED|SS_ISDISCONNECTED)) {
 		SOCK_UNLOCK(so);
 		CURVNET_SET(so->so_vnet);
-		error = so->so_proto->pr_usrreqs->pru_peeraddr(so, &sa);
+		error = so->so_proto->pr_peeraddr(so, &sa);
 		CURVNET_RESTORE();
 		if (error)
 			return (NULL);
 		xprt = svc_vc_create_conn(pool, so, sa);
 		free(sa, M_SONAME);
 		return (xprt);
 	}
 	SOCK_UNLOCK(so);
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_pool = pool;
 	xprt->xp_socket = so;
 	xprt->xp_p1 = NULL;
 	xprt->xp_p2 = NULL;
 	xprt->xp_ops = &svc_vc_rendezvous_ops;
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	CURVNET_RESTORE();
 	if (error) {
 		goto cleanup_svc_vc_create;
 	}
 
 	memcpy(&xprt->xp_ltaddr, sa, sa->sa_len);
 	free(sa, M_SONAME);
 
 	xprt_register(xprt);
 
 	solisten(so, -1, curthread);
 
 	SOLISTEN_LOCK(so);
 	xprt->xp_upcallset = 1;
 	solisten_upcall_set(so, svc_vc_rendezvous_soupcall, xprt);
 	SOLISTEN_UNLOCK(so);
 
 	return (xprt);
 
 cleanup_svc_vc_create:
 	sx_destroy(&xprt->xp_lock);
 	svc_xprt_free(xprt);
 
 	return (NULL);
 }
 
 /*
  * Create a new transport for a socket optained via soaccept().
  */
 SVCXPRT *
 svc_vc_create_conn(SVCPOOL *pool, struct socket *so, struct sockaddr *raddr)
 {
 	SVCXPRT *xprt;
 	struct cf_conn *cd;
 	struct sockaddr* sa = NULL;
 	struct sockopt opt;
 	int one = 1;
 	int error;
 
 	bzero(&opt, sizeof(struct sockopt));
 	opt.sopt_dir = SOPT_SET;
 	opt.sopt_level = SOL_SOCKET;
 	opt.sopt_name = SO_KEEPALIVE;
 	opt.sopt_val = &one;
 	opt.sopt_valsize = sizeof(one);
 	error = sosetopt(so, &opt);
 	if (error) {
 		return (NULL);
 	}
 
 	if (so->so_proto->pr_protocol == IPPROTO_TCP) {
 		bzero(&opt, sizeof(struct sockopt));
 		opt.sopt_dir = SOPT_SET;
 		opt.sopt_level = IPPROTO_TCP;
 		opt.sopt_name = TCP_NODELAY;
 		opt.sopt_val = &one;
 		opt.sopt_valsize = sizeof(one);
 		error = sosetopt(so, &opt);
 		if (error) {
 			return (NULL);
 		}
 	}
 
 	cd = mem_alloc(sizeof(*cd));
 	cd->strm_stat = XPRT_IDLE;
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_pool = pool;
 	xprt->xp_socket = so;
 	xprt->xp_p1 = cd;
 	xprt->xp_p2 = NULL;
 	xprt->xp_ops = &svc_vc_ops;
 
 	/*
 	 * See http://www.connectathon.org/talks96/nfstcp.pdf - client
 	 * has a 5 minute timer, server has a 6 minute timer.
 	 */
 	xprt->xp_idletimeout = 6 * 60;
 
 	memcpy(&xprt->xp_rtaddr, raddr, raddr->sa_len);
 
 	CURVNET_SET(so->so_vnet);
-	error = so->so_proto->pr_usrreqs->pru_sockaddr(so, &sa);
+	error = so->so_proto->pr_sockaddr(so, &sa);
 	CURVNET_RESTORE();
 	if (error)
 		goto cleanup_svc_vc_create;
 
 	memcpy(&xprt->xp_ltaddr, sa, sa->sa_len);
 	free(sa, M_SONAME);
 
 	xprt_register(xprt);
 
 	SOCKBUF_LOCK(&so->so_rcv);
 	xprt->xp_upcallset = 1;
 	soupcall_set(so, SO_RCV, svc_vc_soupcall, xprt);
 	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	/*
 	 * Throw the transport into the active list in case it already
 	 * has some data buffered.
 	 */
 	sx_xlock(&xprt->xp_lock);
 	xprt_active(xprt);
 	sx_xunlock(&xprt->xp_lock);
 
 	return (xprt);
 cleanup_svc_vc_create:
 	sx_destroy(&xprt->xp_lock);
 	svc_xprt_free(xprt);
 	mem_free(cd, sizeof(*cd));
 
 	return (NULL);
 }
 
 /*
  * Create a new transport for a backchannel on a clnt_vc socket.
  */
 SVCXPRT *
 svc_vc_create_backchannel(SVCPOOL *pool)
 {
 	SVCXPRT *xprt = NULL;
 	struct cf_conn *cd = NULL;
 
 	cd = mem_alloc(sizeof(*cd));
 	cd->strm_stat = XPRT_IDLE;
 
 	xprt = svc_xprt_alloc();
 	sx_init(&xprt->xp_lock, "xprt->xp_lock");
 	xprt->xp_pool = pool;
 	xprt->xp_socket = NULL;
 	xprt->xp_p1 = cd;
 	xprt->xp_p2 = NULL;
 	xprt->xp_ops = &svc_vc_backchannel_ops;
 	return (xprt);
 }
 
 /*
  * This does all of the accept except the final call to soaccept. The
  * caller will call soaccept after dropping its locks (soaccept may
  * call malloc).
  */
 int
 svc_vc_accept(struct socket *head, struct socket **sop)
 {
 	struct socket *so;
 	int error = 0;
 	short nbio;
 
 	KASSERT(SOLISTENING(head),
 	    ("%s: socket %p is not listening", __func__, head));
 
 #ifdef MAC
 	error = mac_socket_check_accept(curthread->td_ucred, head);
 	if (error != 0)
 		goto done;
 #endif
 	/*
 	 * XXXGL: we want non-blocking semantics.  The socket could be a
 	 * socket created by kernel as well as socket shared with userland,
 	 * so we can't be sure about presense of SS_NBIO.  We also shall not
 	 * toggle it on the socket, since that may surprise userland.  So we
 	 * set SS_NBIO only temporarily.
 	 */
 	SOLISTEN_LOCK(head);
 	nbio = head->so_state & SS_NBIO;
 	head->so_state |= SS_NBIO;
 	error = solisten_dequeue(head, &so, 0);
 	head->so_state &= (nbio & ~SS_NBIO);
 	if (error)
 		goto done;
 
 	so->so_state |= nbio;
 	*sop = so;
 
 	/* connection has been removed from the listen queue */
 	KNOTE_UNLOCKED(&head->so_rdsel.si_note, 0);
 done:
 	return (error);
 }
 
 /*ARGSUSED*/
 static bool_t
 svc_vc_rendezvous_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct socket *so = NULL;
 	struct sockaddr *sa = NULL;
 	int error;
 	SVCXPRT *new_xprt;
 
 	/*
 	 * The socket upcall calls xprt_active() which will eventually
 	 * cause the server to call us here. We attempt to accept a
 	 * connection from the socket and turn it into a new
 	 * transport. If the accept fails, we have drained all pending
 	 * connections so we call xprt_inactive().
 	 */
 	sx_xlock(&xprt->xp_lock);
 
 	error = svc_vc_accept(xprt->xp_socket, &so);
 
 	if (error == EWOULDBLOCK) {
 		/*
 		 * We must re-test for new connections after taking
 		 * the lock to protect us in the case where a new
 		 * connection arrives after our call to accept fails
 		 * with EWOULDBLOCK.
 		 */
 		SOLISTEN_LOCK(xprt->xp_socket);
 		if (TAILQ_EMPTY(&xprt->xp_socket->sol_comp))
 			xprt_inactive_self(xprt);
 		SOLISTEN_UNLOCK(xprt->xp_socket);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	if (error) {
 		SOLISTEN_LOCK(xprt->xp_socket);
 		if (xprt->xp_upcallset) {
 			xprt->xp_upcallset = 0;
 			soupcall_clear(xprt->xp_socket, SO_RCV);
 		}
 		SOLISTEN_UNLOCK(xprt->xp_socket);
 		xprt_inactive_self(xprt);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 
 	sx_xunlock(&xprt->xp_lock);
 
 	sa = NULL;
 	error = soaccept(so, &sa);
 
 	if (error) {
 		/*
 		 * XXX not sure if I need to call sofree or soclose here.
 		 */
 		if (sa)
 			free(sa, M_SONAME);
 		return (FALSE);
 	}
 
 	/*
 	 * svc_vc_create_conn will call xprt_register - we don't need
 	 * to do anything with the new connection except derefence it.
 	 */
 	new_xprt = svc_vc_create_conn(xprt->xp_pool, so, sa);
 	if (!new_xprt) {
 		soclose(so);
 	} else {
 		SVC_RELEASE(new_xprt);
 	}
 
 	free(sa, M_SONAME);
 
 	return (FALSE); /* there is never an rpc msg to be processed */
 }
 
 /*ARGSUSED*/
 static enum xprt_stat
 svc_vc_rendezvous_stat(SVCXPRT *xprt)
 {
 
 	return (XPRT_IDLE);
 }
 
 static void
 svc_vc_destroy_common(SVCXPRT *xprt)
 {
 	uint32_t reterr;
 
 	if (xprt->xp_socket) {
 		if ((xprt->xp_tls & (RPCTLS_FLAGS_HANDSHAKE |
 		    RPCTLS_FLAGS_HANDSHFAIL)) != 0) {
 			if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) {
 				/*
 				 * If the upcall fails, the socket has
 				 * probably been closed via the rpctlssd
 				 * daemon having crashed or been
 				 * restarted, so just ignore returned stat.
 				 */
 				rpctls_srv_disconnect(xprt->xp_sslsec,
 				    xprt->xp_sslusec, xprt->xp_sslrefno,
 				    &reterr);
 			}
 			/* Must sorele() to get rid of reference. */
 			CURVNET_SET(xprt->xp_socket->so_vnet);
 			sorele(xprt->xp_socket);
 			CURVNET_RESTORE();
 		} else
 			(void)soclose(xprt->xp_socket);
 	}
 
 	if (xprt->xp_netid)
 		(void) mem_free(xprt->xp_netid, strlen(xprt->xp_netid) + 1);
 	svc_xprt_free(xprt);
 }
 
 static void
 svc_vc_rendezvous_destroy(SVCXPRT *xprt)
 {
 
 	SOLISTEN_LOCK(xprt->xp_socket);
 	if (xprt->xp_upcallset) {
 		xprt->xp_upcallset = 0;
 		solisten_upcall_set(xprt->xp_socket, NULL, NULL);
 	}
 	SOLISTEN_UNLOCK(xprt->xp_socket);
 
 	svc_vc_destroy_common(xprt);
 }
 
 static void
 svc_vc_destroy(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1;
 	CLIENT *cl = (CLIENT *)xprt->xp_p2;
 
 	SOCKBUF_LOCK(&xprt->xp_socket->so_rcv);
 	if (xprt->xp_upcallset) {
 		xprt->xp_upcallset = 0;
 		if (xprt->xp_socket->so_rcv.sb_upcall != NULL)
 			soupcall_clear(xprt->xp_socket, SO_RCV);
 	}
 	SOCKBUF_UNLOCK(&xprt->xp_socket->so_rcv);
 
 	if (cl != NULL)
 		CLNT_RELEASE(cl);
 
 	svc_vc_destroy_common(xprt);
 
 	if (cd->mreq)
 		m_freem(cd->mreq);
 	if (cd->mpending)
 		m_freem(cd->mpending);
 	mem_free(cd, sizeof(*cd));
 }
 
 static void
 svc_vc_backchannel_destroy(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *)xprt->xp_p1;
 	struct mbuf *m, *m2;
 
 	svc_xprt_free(xprt);
 	m = cd->mreq;
 	while (m != NULL) {
 		m2 = m;
 		m = m->m_nextpkt;
 		m_freem(m2);
 	}
 	mem_free(cd, sizeof(*cd));
 }
 
 /*ARGSUSED*/
 static bool_t
 svc_vc_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 	return (FALSE);
 }
 
 static bool_t
 svc_vc_rendezvous_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 
 	return (FALSE);
 }
 
 static bool_t
 svc_vc_backchannel_control(SVCXPRT *xprt, const u_int rq, void *in)
 {
 
 	return (FALSE);
 }
 
 static enum xprt_stat
 svc_vc_stat(SVCXPRT *xprt)
 {
 	struct cf_conn *cd;
 
 	cd = (struct cf_conn *)(xprt->xp_p1);
 
 	if (cd->strm_stat == XPRT_DIED)
 		return (XPRT_DIED);
 
 	if (cd->mreq != NULL && cd->resid == 0 && cd->eor)
 		return (XPRT_MOREREQS);
 
 	if (soreadable(xprt->xp_socket))
 		return (XPRT_MOREREQS);
 
 	return (XPRT_IDLE);
 }
 
 static bool_t
 svc_vc_ack(SVCXPRT *xprt, uint32_t *ack)
 {
 
 	*ack = atomic_load_acq_32(&xprt->xp_snt_cnt);
 	*ack -= sbused(&xprt->xp_socket->so_snd);
 	return (TRUE);
 }
 
 static enum xprt_stat
 svc_vc_backchannel_stat(SVCXPRT *xprt)
 {
 	struct cf_conn *cd;
 
 	cd = (struct cf_conn *)(xprt->xp_p1);
 
 	if (cd->mreq != NULL)
 		return (XPRT_MOREREQS);
 
 	return (XPRT_IDLE);
 }
 
 /*
  * If we have an mbuf chain in cd->mpending, try to parse a record from it,
  * leaving the result in cd->mreq. If we don't have a complete record, leave
  * the partial result in cd->mreq and try to read more from the socket.
  */
 static int
 svc_vc_process_pending(SVCXPRT *xprt)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct socket *so = xprt->xp_socket;
 	struct mbuf *m;
 
 	/*
 	 * If cd->resid is non-zero, we have part of the
 	 * record already, otherwise we are expecting a record
 	 * marker.
 	 */
 	if (!cd->resid && cd->mpending) {
 		/*
 		 * See if there is enough data buffered to
 		 * make up a record marker. Make sure we can
 		 * handle the case where the record marker is
 		 * split across more than one mbuf.
 		 */
 		size_t n = 0;
 		uint32_t header;
 
 		m = cd->mpending;
 		while (n < sizeof(uint32_t) && m) {
 			n += m->m_len;
 			m = m->m_next;
 		}
 		if (n < sizeof(uint32_t)) {
 			so->so_rcv.sb_lowat = sizeof(uint32_t) - n;
 			return (FALSE);
 		}
 		m_copydata(cd->mpending, 0, sizeof(header),
 		    (char *)&header);
 		header = ntohl(header);
 		cd->eor = (header & 0x80000000) != 0;
 		cd->resid = header & 0x7fffffff;
 		m_adj(cd->mpending, sizeof(uint32_t));
 	}
 
 	/*
 	 * Start pulling off mbufs from cd->mpending
 	 * until we either have a complete record or
 	 * we run out of data. We use m_split to pull
 	 * data - it will pull as much as possible and
 	 * split the last mbuf if necessary.
 	 */
 	while (cd->mpending && cd->resid) {
 		m = cd->mpending;
 		if (cd->mpending->m_next
 		    || cd->mpending->m_len > cd->resid)
 			cd->mpending = m_split(cd->mpending,
 			    cd->resid, M_WAITOK);
 		else
 			cd->mpending = NULL;
 		if (cd->mreq)
 			m_last(cd->mreq)->m_next = m;
 		else
 			cd->mreq = m;
 		while (m) {
 			cd->resid -= m->m_len;
 			m = m->m_next;
 		}
 	}
 
 	/*
 	 * Block receive upcalls if we have more data pending,
 	 * otherwise report our need.
 	 */
 	if (cd->mpending)
 		so->so_rcv.sb_lowat = INT_MAX;
 	else
 		so->so_rcv.sb_lowat =
 		    imax(1, imin(cd->resid, so->so_rcv.sb_hiwat / 2));
 	return (TRUE);
 }
 
 static bool_t
 svc_vc_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct uio uio;
 	struct mbuf *m, *ctrl;
 	struct socket* so = xprt->xp_socket;
 	XDR xdrs;
 	int error, rcvflag;
 	uint32_t reterr, xid_plus_direction[2];
 	struct cmsghdr *cmsg;
 	struct tls_get_record tgr;
 	enum clnt_stat ret;
 
 	/*
 	 * Serialise access to the socket and our own record parsing
 	 * state.
 	 */
 	sx_xlock(&xprt->xp_lock);
 
 	for (;;) {
 		/* If we have no request ready, check pending queue. */
 		while (cd->mpending &&
 		    (cd->mreq == NULL || cd->resid != 0 || !cd->eor)) {
 			if (!svc_vc_process_pending(xprt))
 				break;
 		}
 
 		/* Process and return complete request in cd->mreq. */
 		if (cd->mreq != NULL && cd->resid == 0 && cd->eor) {
 
 			/*
 			 * Now, check for a backchannel reply.
 			 * The XID is in the first uint32_t of the reply
 			 * and the message direction is the second one.
 			 */
 			if ((cd->mreq->m_len >= sizeof(xid_plus_direction) ||
 			    m_length(cd->mreq, NULL) >=
 			    sizeof(xid_plus_direction)) &&
 			    xprt->xp_p2 != NULL) {
 				m_copydata(cd->mreq, 0,
 				    sizeof(xid_plus_direction),
 				    (char *)xid_plus_direction);
 				xid_plus_direction[0] =
 				    ntohl(xid_plus_direction[0]);
 				xid_plus_direction[1] =
 				    ntohl(xid_plus_direction[1]);
 				/* Check message direction. */
 				if (xid_plus_direction[1] == REPLY) {
 					clnt_bck_svccall(xprt->xp_p2,
 					    cd->mreq,
 					    xid_plus_direction[0]);
 					cd->mreq = NULL;
 					continue;
 				}
 			}
 
 			xdrmbuf_create(&xdrs, cd->mreq, XDR_DECODE);
 			cd->mreq = NULL;
 
 			/* Check for next request in a pending queue. */
 			svc_vc_process_pending(xprt);
 			if (cd->mreq == NULL || cd->resid != 0) {
 				SOCKBUF_LOCK(&so->so_rcv);
 				if (!soreadable(so))
 					xprt_inactive_self(xprt);
 				SOCKBUF_UNLOCK(&so->so_rcv);
 			}
 
 			sx_xunlock(&xprt->xp_lock);
 
 			if (! xdr_callmsg(&xdrs, msg)) {
 				XDR_DESTROY(&xdrs);
 				return (FALSE);
 			}
 
 			*addrp = NULL;
 			*mp = xdrmbuf_getall(&xdrs);
 			XDR_DESTROY(&xdrs);
 
 			return (TRUE);
 		}
 
 		/*
 		 * If receiving is disabled so that a TLS handshake can be
 		 * done by the rpctlssd daemon, return FALSE here.
 		 */
 		rcvflag = MSG_DONTWAIT;
 		if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0)
 			rcvflag |= MSG_TLSAPPDATA;
 tryagain:
 		if (xprt->xp_dontrcv) {
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		/*
 		 * The socket upcall calls xprt_active() which will eventually
 		 * cause the server to call us here. We attempt to
 		 * read as much as possible from the socket and put
 		 * the result in cd->mpending. If the read fails,
 		 * we have drained both cd->mpending and the socket so
 		 * we can call xprt_inactive().
 		 */
 		uio.uio_resid = 1000000000;
 		uio.uio_td = curthread;
 		ctrl = m = NULL;
 		error = soreceive(so, NULL, &uio, &m, &ctrl, &rcvflag);
 
 		if (error == EWOULDBLOCK) {
 			/*
 			 * We must re-test for readability after
 			 * taking the lock to protect us in the case
 			 * where a new packet arrives on the socket
 			 * after our call to soreceive fails with
 			 * EWOULDBLOCK.
 			 */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (!soreadable(so))
 				xprt_inactive_self(xprt);
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		/*
 		 * A return of ENXIO indicates that there is an
 		 * alert record at the head of the
 		 * socket's receive queue, for TLS connections.
 		 * This record needs to be handled in userland
 		 * via an SSL_read() call, so do an upcall to the daemon.
 		 */
 		if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0 &&
 		    error == ENXIO) {
 			/* Disable reception. */
 			xprt->xp_dontrcv = TRUE;
 			sx_xunlock(&xprt->xp_lock);
 			ret = rpctls_srv_handlerecord(xprt->xp_sslsec,
 			    xprt->xp_sslusec, xprt->xp_sslrefno,
 			    &reterr);
 			sx_xlock(&xprt->xp_lock);
 			xprt->xp_dontrcv = FALSE;
 			if (ret != RPC_SUCCESS || reterr != RPCTLSERR_OK) {
 				/*
 				 * All we can do is soreceive() it and
 				 * then toss it.
 				 */
 				rcvflag = MSG_DONTWAIT;
 				goto tryagain;
 			}
 			sx_xunlock(&xprt->xp_lock);
 			xprt_active(xprt);   /* Harmless if already active. */
 			return (FALSE);
 		}
 
 		if (error) {
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (xprt->xp_upcallset) {
 				xprt->xp_upcallset = 0;
 				soupcall_clear(so, SO_RCV);
 			}
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			xprt_inactive_self(xprt);
 			cd->strm_stat = XPRT_DIED;
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		if (!m) {
 			/*
 			 * EOF - the other end has closed the socket.
 			 */
 			xprt_inactive_self(xprt);
 			cd->strm_stat = XPRT_DIED;
 			sx_xunlock(&xprt->xp_lock);
 			return (FALSE);
 		}
 
 		/* Process any record header(s). */
 		if (ctrl != NULL) {
 			cmsg = mtod(ctrl, struct cmsghdr *);
 			if (cmsg->cmsg_type == TLS_GET_RECORD &&
 			    cmsg->cmsg_len == CMSG_LEN(sizeof(tgr))) {
 				memcpy(&tgr, CMSG_DATA(cmsg), sizeof(tgr));
 				/*
 				 * TLS_RLTYPE_ALERT records should be handled
 				 * since soreceive() would have returned
 				 * ENXIO.  Just throw any other
 				 * non-TLS_RLTYPE_APP records away.
 				 */
 				if (tgr.tls_type != TLS_RLTYPE_APP) {
 					m_freem(m);
 					m_free(ctrl);
 					rcvflag = MSG_DONTWAIT | MSG_TLSAPPDATA;
 					goto tryagain;
 				}
 			}
 			m_free(ctrl);
 		}
 
 		if (cd->mpending)
 			m_last(cd->mpending)->m_next = m;
 		else
 			cd->mpending = m;
 	}
 }
 
 static bool_t
 svc_vc_backchannel_recv(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr **addrp, struct mbuf **mp)
 {
 	struct cf_conn *cd = (struct cf_conn *) xprt->xp_p1;
 	struct ct_data *ct;
 	struct mbuf *m;
 	XDR xdrs;
 
 	sx_xlock(&xprt->xp_lock);
 	ct = (struct ct_data *)xprt->xp_p2;
 	if (ct == NULL) {
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 	mtx_lock(&ct->ct_lock);
 	m = cd->mreq;
 	if (m == NULL) {
 		xprt_inactive_self(xprt);
 		mtx_unlock(&ct->ct_lock);
 		sx_xunlock(&xprt->xp_lock);
 		return (FALSE);
 	}
 	cd->mreq = m->m_nextpkt;
 	mtx_unlock(&ct->ct_lock);
 	sx_xunlock(&xprt->xp_lock);
 
 	xdrmbuf_create(&xdrs, m, XDR_DECODE);
 	if (! xdr_callmsg(&xdrs, msg)) {
 		XDR_DESTROY(&xdrs);
 		return (FALSE);
 	}
 	*addrp = NULL;
 	*mp = xdrmbuf_getall(&xdrs);
 	XDR_DESTROY(&xdrs);
 	return (TRUE);
 }
 
 static bool_t
 svc_vc_reply(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr *addr, struct mbuf *m, uint32_t *seq)
 {
 	XDR xdrs;
 	struct mbuf *mrep;
 	bool_t stat = TRUE;
 	int error, len, maxextsiz;
 #ifdef KERN_TLS
 	u_int maxlen;
 #endif
 
 	/*
 	 * Leave space for record mark.
 	 */
 	mrep = m_gethdr(M_WAITOK, MT_DATA);
 	mrep->m_data += sizeof(uint32_t);
 
 	xdrmbuf_create(&xdrs, mrep, XDR_ENCODE);
 
 	if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
 	    msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
 		if (!xdr_replymsg(&xdrs, msg))
 			stat = FALSE;
 		else
 			xdrmbuf_append(&xdrs, m);
 	} else {
 		stat = xdr_replymsg(&xdrs, msg);
 	}
 
 	if (stat) {
 		m_fixhdr(mrep);
 
 		/*
 		 * Prepend a record marker containing the reply length.
 		 */
 		M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK);
 		len = mrep->m_pkthdr.len;
 		*mtod(mrep, uint32_t *) =
 			htonl(0x80000000 | (len - sizeof(uint32_t)));
 
 		/* For RPC-over-TLS, copy mrep to a chain of ext_pgs. */
 		if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) {
 			/*
 			 * Copy the mbuf chain to a chain of
 			 * ext_pgs mbuf(s) as required by KERN_TLS.
 			 */
 			maxextsiz = TLS_MAX_MSG_SIZE_V10_2;
 #ifdef KERN_TLS
 			if (rpctls_getinfo(&maxlen, false, false))
 				maxextsiz = min(maxextsiz, maxlen);
 #endif
 			mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz);
 		}
 		atomic_add_32(&xprt->xp_snd_cnt, len);
 		/*
 		 * sosend consumes mreq.
 		 */
 		error = sosend(xprt->xp_socket, NULL, NULL, mrep, NULL,
 		    0, curthread);
 		if (!error) {
 			atomic_add_rel_32(&xprt->xp_snt_cnt, len);
 			if (seq)
 				*seq = xprt->xp_snd_cnt;
 			stat = TRUE;
 		} else
 			atomic_subtract_32(&xprt->xp_snd_cnt, len);
 	} else {
 		m_freem(mrep);
 	}
 
 	XDR_DESTROY(&xdrs);
 
 	return (stat);
 }
 
 static bool_t
 svc_vc_backchannel_reply(SVCXPRT *xprt, struct rpc_msg *msg,
     struct sockaddr *addr, struct mbuf *m, uint32_t *seq)
 {
 	struct ct_data *ct;
 	XDR xdrs;
 	struct mbuf *mrep;
 	bool_t stat = TRUE;
 	int error, maxextsiz;
 #ifdef KERN_TLS
 	u_int maxlen;
 #endif
 
 	/*
 	 * Leave space for record mark.
 	 */
 	mrep = m_gethdr(M_WAITOK, MT_DATA);
 	mrep->m_data += sizeof(uint32_t);
 
 	xdrmbuf_create(&xdrs, mrep, XDR_ENCODE);
 
 	if (msg->rm_reply.rp_stat == MSG_ACCEPTED &&
 	    msg->rm_reply.rp_acpt.ar_stat == SUCCESS) {
 		if (!xdr_replymsg(&xdrs, msg))
 			stat = FALSE;
 		else
 			xdrmbuf_append(&xdrs, m);
 	} else {
 		stat = xdr_replymsg(&xdrs, msg);
 	}
 
 	if (stat) {
 		m_fixhdr(mrep);
 
 		/*
 		 * Prepend a record marker containing the reply length.
 		 */
 		M_PREPEND(mrep, sizeof(uint32_t), M_WAITOK);
 		*mtod(mrep, uint32_t *) =
 			htonl(0x80000000 | (mrep->m_pkthdr.len
 				- sizeof(uint32_t)));
 
 		/* For RPC-over-TLS, copy mrep to a chain of ext_pgs. */
 		if ((xprt->xp_tls & RPCTLS_FLAGS_HANDSHAKE) != 0) {
 			/*
 			 * Copy the mbuf chain to a chain of
 			 * ext_pgs mbuf(s) as required by KERN_TLS.
 			 */
 			maxextsiz = TLS_MAX_MSG_SIZE_V10_2;
 #ifdef KERN_TLS
 			if (rpctls_getinfo(&maxlen, false, false))
 				maxextsiz = min(maxextsiz, maxlen);
 #endif
 			mrep = _rpc_copym_into_ext_pgs(mrep, maxextsiz);
 		}
 		sx_xlock(&xprt->xp_lock);
 		ct = (struct ct_data *)xprt->xp_p2;
 		if (ct != NULL)
 			error = sosend(ct->ct_socket, NULL, NULL, mrep, NULL,
 			    0, curthread);
 		else
 			error = EPIPE;
 		sx_xunlock(&xprt->xp_lock);
 		if (!error) {
 			stat = TRUE;
 		}
 	} else {
 		m_freem(mrep);
 	}
 
 	XDR_DESTROY(&xdrs);
 
 	return (stat);
 }
 
 static bool_t
 svc_vc_null(void)
 {
 
 	return (FALSE);
 }
 
 static int
 svc_vc_soupcall(struct socket *so, void *arg, int waitflag)
 {
 	SVCXPRT *xprt = (SVCXPRT *) arg;
 
 	if (soreadable(xprt->xp_socket))
 		xprt_active(xprt);
 	return (SU_OK);
 }
 
 static int
 svc_vc_rendezvous_soupcall(struct socket *head, void *arg, int waitflag)
 {
 	SVCXPRT *xprt = (SVCXPRT *) arg;
 
 	if (!TAILQ_EMPTY(&head->sol_comp))
 		xprt_active(xprt);
 	return (SU_OK);
 }
 
 #if 0
 /*
  * Get the effective UID of the sending process. Used by rpcbind, keyserv
  * and rpc.yppasswdd on AF_LOCAL.
  */
 int
 __rpc_get_local_uid(SVCXPRT *transp, uid_t *uid) {
 	int sock, ret;
 	gid_t egid;
 	uid_t euid;
 	struct sockaddr *sa;
 
 	sock = transp->xp_fd;
 	sa = (struct sockaddr *)transp->xp_rtaddr;
 	if (sa->sa_family == AF_LOCAL) {
 		ret = getpeereid(sock, &euid, &egid);
 		if (ret == 0)
 			*uid = euid;
 		return (ret);
 	} else
 		return (-1);
 }
 #endif
diff --git a/sys/security/mac/mac_socket.c b/sys/security/mac/mac_socket.c
index d55487897522..4b3d6ccfb90c 100644
--- a/sys/security/mac/mac_socket.c
+++ b/sys/security/mac/mac_socket.c
@@ -1,633 +1,633 @@
 /*-
  * Copyright (c) 1999-2002, 2009 Robert N. M. Watson
  * Copyright (c) 2001 Ilmar S. Habibulin
  * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
  * Copyright (c) 2005-2006 SPARTA, Inc.
  * Copyright (c) 2008 Apple Inc.
  * All rights reserved.
  *
  * This software was developed by Robert Watson and Ilmar Habibulin for the
  * TrustedBSD Project.
  *
  * This software was developed for the FreeBSD Project in part by McAfee
  * Research, the Technology Research Division of Network Associates, Inc.
  * under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the
  * DARPA CHATS research program.
  *
  * This software was enhanced by SPARTA ISSO under SPAWAR contract
  * N66001-04-C-6019 ("SEFOS").
  *
  * This software was developed at the University of Cambridge Computer
  * Laboratory with support from a grant from Google, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/mac.h>
 #include <sys/sbuf.h>
 #include <sys/sdt.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/file.h>
 #include <sys/namei.h>
 #include <sys/protosw.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 
 #include <net/bpfdesc.h>
 #include <net/if.h>
 #include <net/if_var.h>
 
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 
 #include <security/mac/mac_framework.h>
 #include <security/mac/mac_internal.h>
 #include <security/mac/mac_policy.h>
 
 /*
  * Currently, sockets hold two labels: the label of the socket itself, and a
  * peer label, which may be used by policies to hold a copy of the label of
  * any remote endpoint.
  *
  * Possibly, this peer label should be maintained at the protocol layer
  * (inpcb, unpcb, etc), as this would allow protocol-aware code to maintain
  * the label consistently.  For example, it might be copied live from a
  * remote socket for UNIX domain sockets rather than keeping a local copy on
  * this endpoint, but be cached and updated based on packets received for
  * TCP/IP.
  *
  * Unlike with many other object types, the lock protecting MAC labels on
  * sockets (the socket lock) is not frequently held at the points in code
  * where socket-related checks are called.  The MAC Framework acquires the
  * lock over some entry points in order to enforce atomicity (such as label
  * copies) but in other cases the policy modules will have to acquire the
  * lock themselves if they use labels.  This approach (a) avoids lock
  * acquisitions when policies don't require labels and (b) solves a number of
  * potential lock order issues when multiple sockets are used in the same
  * entry point.
  */
 
 struct label *
 mac_socket_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(socket_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(socket_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(socket_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 static struct label *
 mac_socketpeer_label_alloc(int flag)
 {
 	struct label *label;
 	int error;
 
 	label = mac_labelzone_alloc(flag);
 	if (label == NULL)
 		return (NULL);
 
 	if (flag & M_WAITOK)
 		MAC_POLICY_CHECK(socketpeer_init_label, label, flag);
 	else
 		MAC_POLICY_CHECK_NOSLEEP(socketpeer_init_label, label, flag);
 	if (error) {
 		MAC_POLICY_PERFORM_NOSLEEP(socketpeer_destroy_label, label);
 		mac_labelzone_free(label);
 		return (NULL);
 	}
 	return (label);
 }
 
 int
 mac_socket_init(struct socket *so, int flag)
 {
 
 	if (mac_labeled & MPC_OBJECT_SOCKET) {
 		so->so_label = mac_socket_label_alloc(flag);
 		if (so->so_label == NULL)
 			return (ENOMEM);
 		so->so_peerlabel = mac_socketpeer_label_alloc(flag);
 		if (so->so_peerlabel == NULL) {
 			mac_socket_label_free(so->so_label);
 			so->so_label = NULL;
 			return (ENOMEM);
 		}
 	} else {
 		so->so_label = NULL;
 		so->so_peerlabel = NULL;
 	}
 	return (0);
 }
 
 void
 mac_socket_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 static void
 mac_socketpeer_label_free(struct label *label)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_destroy_label, label);
 	mac_labelzone_free(label);
 }
 
 void
 mac_socket_destroy(struct socket *so)
 {
 
 	if (so->so_label != NULL) {
 		mac_socket_label_free(so->so_label);
 		so->so_label = NULL;
 		mac_socketpeer_label_free(so->so_peerlabel);
 		so->so_peerlabel = NULL;
 	}
 }
 
 void
 mac_socket_copy_label(struct label *src, struct label *dest)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_copy_label, src, dest);
 }
 
 int
 mac_socket_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(socket, label, elements, outbuf, outbuflen);
 
 	return (error);
 }
 
 static int
 mac_socketpeer_externalize_label(struct label *label, char *elements,
     char *outbuf, size_t outbuflen)
 {
 	int error;
 
 	MAC_POLICY_EXTERNALIZE(socketpeer, label, elements, outbuf,
 	    outbuflen);
 
 	return (error);
 }
 
 int
 mac_socket_internalize_label(struct label *label, char *string)
 {
 	int error;
 
 	MAC_POLICY_INTERNALIZE(socket, label, string);
 
 	return (error);
 }
 
 void
 mac_socket_create(struct ucred *cred, struct socket *so)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_create, cred, so, so->so_label);
 }
 
 void
 mac_socket_newconn(struct socket *oldso, struct socket *newso)
 {
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_newconn, oldso, oldso->so_label,
 	    newso, newso->so_label);
 }
 
 static void
 mac_socket_relabel(struct ucred *cred, struct socket *so,
     struct label *newlabel)
 {
 
 	SOCK_LOCK_ASSERT(so);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_relabel, cred, so, so->so_label,
 	    newlabel);
 }
 
 void
 mac_socketpeer_set_from_mbuf(struct mbuf *m, struct socket *so)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_set_from_mbuf, m, label, so,
 	    so->so_peerlabel);
 }
 
 void
 mac_socketpeer_set_from_socket(struct socket *oldso, struct socket *newso)
 {
 
 	if (mac_policy_count == 0)
 		return;
 
 	MAC_POLICY_PERFORM_NOSLEEP(socketpeer_set_from_socket, oldso,
 	    oldso->so_label, newso, newso->so_peerlabel);
 }
 
 void
 mac_socket_create_mbuf(struct socket *so, struct mbuf *m)
 {
 	struct label *label;
 
 	if (mac_policy_count == 0)
 		return;
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_PERFORM_NOSLEEP(socket_create_mbuf, so, so->so_label, m,
 	    label);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_accept, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_accept(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_accept, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_accept, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_bind, "struct ucred *",
     "struct socket *", "struct sockaddr *");
 
 int
 mac_socket_check_bind(struct ucred *cred, struct socket *so,
     struct sockaddr *sa)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_bind, cred, so, so->so_label,
 	    sa);
 	MAC_CHECK_PROBE3(socket_check_bind, error, cred, so, sa);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_connect, "struct ucred *",
     "struct socket *", "struct sockaddr *");
 
 int
 mac_socket_check_connect(struct ucred *cred, struct socket *so,
     struct sockaddr *sa)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_connect, cred, so,
 	    so->so_label, sa);
 	MAC_CHECK_PROBE3(socket_check_connect, error, cred, so, sa);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE4(socket_check_create, "struct ucred *", "int", "int",
     "int");
 
 int
 mac_socket_check_create(struct ucred *cred, int domain, int type, int proto)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_create, cred, domain, type,
 	    proto);
 	MAC_CHECK_PROBE4(socket_check_create, error, cred, domain, type,
 	    proto);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_deliver, "struct socket *",
     "struct mbuf *");
 
 int
 mac_socket_check_deliver(struct socket *so, struct mbuf *m)
 {
 	struct label *label;
 	int error;
 
 	if (mac_policy_count == 0)
 		return (0);
 
 	label = mac_mbuf_to_label(m);
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_deliver, so, so->so_label, m,
 	    label);
 	MAC_CHECK_PROBE2(socket_check_deliver, error, so, m);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_listen, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_listen(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_listen, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_listen, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_poll, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_poll(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_poll, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_poll, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_receive, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_receive(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_receive, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_receive, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE3(socket_check_relabel, "struct ucred *",
     "struct socket *", "struct label *");
 
 static int
 mac_socket_check_relabel(struct ucred *cred, struct socket *so,
     struct label *newlabel)
 {
 	int error;
 
 	SOCK_LOCK_ASSERT(so);
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_relabel, cred, so,
 	    so->so_label, newlabel);
 	MAC_CHECK_PROBE3(socket_check_relabel, error, cred, so, newlabel);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_send, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_send(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_send, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_send, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_stat, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_stat(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_stat, cred, so, so->so_label);
 	MAC_CHECK_PROBE2(socket_check_stat, error, cred, so);
 
 	return (error);
 }
 
 MAC_CHECK_PROBE_DEFINE2(socket_check_visible, "struct ucred *",
     "struct socket *");
 
 int
 mac_socket_check_visible(struct ucred *cred, struct socket *so)
 {
 	int error;
 
 	MAC_POLICY_CHECK_NOSLEEP(socket_check_visible, cred, so,
 	    so->so_label);
 	MAC_CHECK_PROBE2(socket_check_visible, error, cred, so);
 
 	return (error);
 }
 
 int
 mac_socket_label_set(struct ucred *cred, struct socket *so,
     struct label *label)
 {
 	int error;
 
 	/*
 	 * We acquire the socket lock when we perform the test and set, but
 	 * have to release it as the pcb code needs to acquire the pcb lock,
 	 * which will precede the socket lock in the lock order.  However,
 	 * this is fine, as any race will simply result in the inpcb being
 	 * refreshed twice, but still consistently, as the inpcb code will
 	 * acquire the socket lock before refreshing, holding both locks.
 	 */
 	SOCK_LOCK(so);
 	error = mac_socket_check_relabel(cred, so, label);
 	if (error) {
 		SOCK_UNLOCK(so);
 		return (error);
 	}
 
 	mac_socket_relabel(cred, so, label);
 	SOCK_UNLOCK(so);
 
 	/*
 	 * If the protocol has expressed interest in socket layer changes,
 	 * such as if it needs to propagate changes to a cached pcb label
 	 * from the socket, notify it of the label change while holding the
 	 * socket lock.
 	 */
-	if (so->so_proto->pr_usrreqs->pru_sosetlabel != NULL)
-		(so->so_proto->pr_usrreqs->pru_sosetlabel)(so);
+	if (so->so_proto->pr_sosetlabel != NULL)
+		so->so_proto->pr_sosetlabel(so);
 
 	return (0);
 }
 
 int
 mac_setsockopt_label(struct ucred *cred, struct socket *so, struct mac *mac)
 {
 	struct label *intlabel;
 	char *buffer;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_SOCKET))
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error)
 		return (error);
 
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, buffer, mac->m_buflen, NULL);
 	if (error) {
 		free(buffer, M_MACTEMP);
 		return (error);
 	}
 
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	error = mac_socket_internalize_label(intlabel, buffer);
 	free(buffer, M_MACTEMP);
 	if (error)
 		goto out;
 
 	error = mac_socket_label_set(cred, so, intlabel);
 out:
 	mac_socket_label_free(intlabel);
 	return (error);
 }
 
 int
 mac_getsockopt_label(struct ucred *cred, struct socket *so, struct mac *mac)
 {
 	char *buffer, *elements;
 	struct label *intlabel;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_SOCKET))
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, elements, mac->m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	SOCK_LOCK(so);
 	mac_socket_copy_label(so->so_label, intlabel);
 	SOCK_UNLOCK(so);
 	error = mac_socket_externalize_label(intlabel, elements, buffer,
 	    mac->m_buflen);
 	mac_socket_label_free(intlabel);
 	if (error == 0)
 		error = copyout(buffer, mac->m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
 
 int
 mac_getsockopt_peerlabel(struct ucred *cred, struct socket *so,
     struct mac *mac)
 {
 	char *elements, *buffer;
 	struct label *intlabel;
 	int error;
 
 	if (!(mac_labeled & MPC_OBJECT_SOCKET))
 		return (EINVAL);
 
 	error = mac_check_structmac_consistent(mac);
 	if (error)
 		return (error);
 
 	elements = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK);
 	error = copyinstr(mac->m_string, elements, mac->m_buflen, NULL);
 	if (error) {
 		free(elements, M_MACTEMP);
 		return (error);
 	}
 
 	buffer = malloc(mac->m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
 	intlabel = mac_socket_label_alloc(M_WAITOK);
 	SOCK_LOCK(so);
 	mac_socket_copy_label(so->so_peerlabel, intlabel);
 	SOCK_UNLOCK(so);
 	error = mac_socketpeer_externalize_label(intlabel, elements, buffer,
 	    mac->m_buflen);
 	mac_socket_label_free(intlabel);
 	if (error == 0)
 		error = copyout(buffer, mac->m_string, strlen(buffer)+1);
 
 	free(buffer, M_MACTEMP);
 	free(elements, M_MACTEMP);
 
 	return (error);
 }
diff --git a/sys/sys/domain.h b/sys/sys/domain.h
index 3a1c30e163ed..af4dbf0be9ee 100644
--- a/sys/sys/domain.h
+++ b/sys/sys/domain.h
@@ -1,96 +1,97 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)domain.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_DOMAIN_H_
 #define _SYS_DOMAIN_H_
 
 /*
  * Structure per communications domain.
  */
 
 /*
  * Forward structure declarations for function prototypes [sic].
  */
 struct	mbuf;
 struct	ifnet;
 struct	socket;
 struct	rib_head;
 
 struct domain {
 	int	dom_family;		/* AF_xxx */
+	u_int	dom_nprotosw;		/* length of dom_protosw[] */
 	char	*dom_name;
 	int	dom_flags;
 	int	(*dom_probe)(void);	/* check for support (optional) */
 	int	(*dom_externalize)	/* externalize access rights */
 		(struct mbuf *, struct mbuf **, int);
 	void	(*dom_dispose)		/* dispose of internalized rights */
 		(struct socket *);
-	struct	protosw *dom_protosw, *dom_protoswNPROTOSW;
 	struct	domain *dom_next;
 	struct rib_head *(*dom_rtattach)	/* initialize routing table */
 		(uint32_t);
 	void	(*dom_rtdetach)		/* clean up routing table */
 		(struct rib_head *);
 	void	*(*dom_ifattach)(struct ifnet *);
 	void	(*dom_ifdetach)(struct ifnet *, void *);
 	int	(*dom_ifmtu)(struct ifnet *);
 					/* af-dependent data on ifnet */
+	struct	protosw *dom_protosw[];
 };
 
 /* dom_flags */
 #define	DOMF_SUPPORTED	0x0001	/* System supports this domain. */
 #define	DOMF_INITED	0x0002	/* Initialized in the default vnet. */
 #define	DOMF_UNLOADABLE	0x0004	/* Can be unloaded */
 
 #ifdef _KERNEL
 extern int	domain_init_status;
 extern struct	domain *domains;
 void		domain_add(void *);
 void		domain_remove(void *);
 void		domain_init(void *);
 #ifdef VIMAGE
 void		vnet_domain_init(void *);
 void		vnet_domain_uninit(void *);
 #endif
 
 #define	DOMAIN_SET(name)						\
 	SYSINIT(domain_add_ ## name, SI_SUB_PROTO_DOMAIN,		\
 	    SI_ORDER_FIRST, domain_add, & name ## domain);		\
 	SYSUNINIT(domain_remove_ ## name, SI_SUB_PROTO_DOMAIN,		\
 	    SI_ORDER_FIRST, domain_remove, & name ## domain);		\
 	SYSINIT(domain_init_ ## name, SI_SUB_PROTO_DOMAIN,		\
 	    SI_ORDER_SECOND, domain_init, & name ## domain);
 #endif /* _KERNEL */
 
 #endif /* !_SYS_DOMAIN_H_ */
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 965c363afce1..f6907505178e 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -1,305 +1,249 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)protosw.h	8.1 (Berkeley) 6/2/93
  * $FreeBSD$
  */
 
 #ifndef _SYS_PROTOSW_H_
 #define _SYS_PROTOSW_H_
 
 #include <sys/queue.h>
 
 /* Forward declare these structures referenced from prototypes below. */
 struct kaiocb;
 struct mbuf;
 struct thread;
 struct sockaddr;
 struct socket;
 struct sockopt;
 
 /*#ifdef _KERNEL*/
 /*
  * Protocol switch table.
  *
  * Each protocol has a handle initializing one of these structures,
  * which is used for protocol-protocol and system-protocol communication.
  *
  * In retrospect, it would be a lot nicer to use an interface
  * similar to the vnode VOP interface.
  */
 struct ifnet;
 struct stat;
 struct ucred;
 struct uio;
 
 /* USE THESE FOR YOUR PROTOTYPES ! */
 typedef int	pr_ctloutput_t(struct socket *, struct sockopt *);
 typedef void	pr_abort_t(struct socket *);
 typedef int	pr_accept_t(struct socket *, struct sockaddr **);
 typedef int	pr_attach_t(struct socket *, int, struct thread *);
 typedef int	pr_bind_t(struct socket *, struct sockaddr *, struct thread *);
 typedef int	pr_connect_t(struct socket *, struct sockaddr *,
 		    struct thread *);
 typedef int	pr_connect2_t(struct socket *, struct socket *);
 typedef int	pr_control_t(struct socket *, unsigned long, void *,
 		    struct ifnet *, struct thread *);
 typedef void	pr_detach_t(struct socket *);
 typedef int	pr_disconnect_t(struct socket *);
 typedef int	pr_listen_t(struct socket *, int, struct thread *);
 typedef int	pr_peeraddr_t(struct socket *, struct sockaddr **);
 typedef int	pr_rcvd_t(struct socket *, int);
 typedef int	pr_rcvoob_t(struct socket *, struct mbuf *, int);
 typedef enum {
 	PRUS_OOB =		0x1,
 	PRUS_EOF =		0x2,
 	PRUS_MORETOCOME =	0x4,
 	PRUS_NOTREADY =		0x8,
 	PRUS_IPV6 =		0x10,
 } pr_send_flags_t;
 typedef int	pr_send_t(struct socket *, int, struct mbuf *,
 		    struct sockaddr *, struct mbuf *, struct thread *);
 typedef int	pr_ready_t(struct socket *, struct mbuf *, int);
 typedef int	pr_sense_t(struct socket *, struct stat *);
 typedef int	pr_shutdown_t(struct socket *);
 typedef int	pr_flush_t(struct socket *, int);
 typedef int	pr_sockaddr_t(struct socket *, struct sockaddr **);
 typedef int	pr_sosend_t(struct socket *, struct sockaddr *, struct uio *,
 		    struct mbuf *, struct mbuf *, int, struct thread *);
 typedef int	pr_soreceive_t(struct socket *, struct sockaddr **,
 		    struct uio *, struct mbuf **, struct mbuf **, int *);
 typedef int	pr_sopoll_t(struct socket *, int, struct ucred *,
 		    struct thread *);
 typedef void	pr_sosetlabel_t(struct socket *);
 typedef void	pr_close_t(struct socket *);
 typedef int	pr_bindat_t(int, struct socket *, struct sockaddr *,
 		    struct thread *);
 typedef int	pr_connectat_t(int, struct socket *, struct sockaddr *,
 		    struct thread *);
 typedef int	pr_aio_queue_t(struct socket *, struct kaiocb *);
 
 struct protosw {
 	short	pr_type;		/* socket type used for */
-	struct	domain *pr_domain;	/* domain protocol a member of */
 	short	pr_protocol;		/* protocol number */
 	short	pr_flags;		/* see below */
-/* protocol-protocol hooks */
-	pr_ctloutput_t *pr_ctloutput;	/* control output (from above) */
-/* utility hooks */
-
-	struct	pr_usrreqs *pr_usrreqs;	/* user-protocol hook */
+	short	pr_unused;
+	struct	domain	*pr_domain;	/* domain protocol a member of */
+
+	pr_soreceive_t	*pr_soreceive;	/* recv(2) */
+	pr_rcvd_t	*pr_rcvd;	/* soreceive_generic() if PR_WANTRCVD */
+	pr_sosend_t	*pr_sosend;	/* send(2) */
+	pr_send_t	*pr_send;	/* send(2) via sosend_generic() */
+	pr_ready_t	*pr_ready;	/* sendfile/ktls readyness */
+	pr_sopoll_t	*pr_sopoll;	/* poll(2) */
+/* Cache line #2 */
+	pr_attach_t	*pr_attach;	/* creation: socreate(), sonewconn() */
+	pr_detach_t	*pr_detach;	/* destruction: sofree() */
+	pr_connect_t	*pr_connect;	/* connect(2) */
+	pr_disconnect_t	*pr_disconnect;	/* sodisconnect() */
+	pr_close_t	*pr_close;	/* close(2) */
+	pr_shutdown_t	*pr_shutdown;	/* shutdown(2) */
+	pr_abort_t	*pr_abort;	/* abrupt tear down: soabort() */
+	pr_aio_queue_t	*pr_aio_queue;	/* aio(9) */
+/* Cache line #3 */
+	pr_bind_t	*pr_bind;	/* bind(2) */
+	pr_bindat_t	*pr_bindat;	/* bindat(2) */
+	pr_listen_t	*pr_listen;	/* listen(2) */
+	pr_accept_t	*pr_accept;	/* accept(2) */
+	pr_connectat_t	*pr_connectat;	/* connectat(2) */
+	pr_connect2_t	*pr_connect2;	/* socketpair(2) */
+	pr_control_t	*pr_control;	/* ioctl(2) */
+	pr_rcvoob_t	*pr_rcvoob;	/* soreceive_rcvoob() */
+/* Cache line #4 */
+	pr_ctloutput_t	*pr_ctloutput;	/* control output (from above) */
+	pr_peeraddr_t	*pr_peeraddr;	/* getpeername(2) */
+	pr_sockaddr_t	*pr_sockaddr;	/* getsockname(2) */
+	pr_sense_t	*pr_sense;	/* stat(2) */
+	pr_flush_t	*pr_flush;	/* XXXGL: merge with pr_shutdown_t! */
+	pr_sosetlabel_t	*pr_sosetlabel;	/* MAC, XXXGL: remove */
 };
 /*#endif*/
 
-/*
- * This number should be defined again within each protocol family to avoid
- * confusion.
- */
-#define	PROTO_SPACER	32767		/* spacer for loadable protocols */
-
 /*
  * Values for pr_flags.
  * PR_ADDR requires PR_ATOMIC;
  * PR_ADDR and PR_CONNREQUIRED are mutually exclusive.
  * PR_IMPLOPCL means that the protocol allows sendto without prior connect,
  *	and the protocol understands the MSG_EOF flag.  The first property is
  *	is only relevant if PR_CONNREQUIRED is set (otherwise sendto is allowed
  *	anyhow).
  * PR_SOCKBUF requires protocol to initialize and destroy its socket buffers
  * in its pr_attach and pr_detach.
  */
 #define	PR_ATOMIC	0x01		/* exchange atomic messages only */
 #define	PR_ADDR		0x02		/* addresses given with messages */
 #define	PR_CONNREQUIRED	0x04		/* connection required by protocol */
 #define	PR_WANTRCVD	0x08		/* want PRU_RCVD calls */
 #define	PR_RIGHTS	0x10		/* passes capabilities */
 #define PR_IMPLOPCL	0x20		/* implied open/close */
 /* was	PR_LASTHDR	0x40		   enforce ipsec policy; last header */
 #define	PR_CAPATTACH	0x80		/* socket can attach in cap mode */
 #define	PR_SOCKBUF	0x100		/* private implementation of buffers */
 
-#ifdef	_KERNEL			/* users shouldn't see this decl */
-
-struct ifnet;
-struct stat;
-struct ucred;
-struct uio;
-
-/*
- * If the ordering here looks odd, that's because it's alphabetical.  These
- * should eventually be merged back into struct protosw.
- *
- * Some fields initialized to defaults if they are NULL.
- */
-struct pr_usrreqs {
-	pr_abort_t	*pru_abort;
-	pr_accept_t	*pru_accept;
-	pr_attach_t	*pru_attach;
-	pr_bind_t	*pru_bind;
-	pr_connect_t	*pru_connect;
-	pr_connect2_t	*pru_connect2;
-	pr_control_t	*pru_control;
-	pr_detach_t	*pru_detach;
-	pr_disconnect_t	*pru_disconnect;
-	pr_listen_t	*pru_listen;
-	pr_peeraddr_t	*pru_peeraddr;
-	pr_rcvd_t	*pru_rcvd;
-	pr_rcvoob_t	*pru_rcvoob;
-	pr_send_t	*pru_send;
-	pr_ready_t	*pru_ready;
-	pr_sense_t	*pru_sense;
-	pr_shutdown_t	*pru_shutdown;
-	pr_flush_t	*pru_flush;
-	pr_sockaddr_t	*pru_sockaddr;
-	pr_sosend_t	*pru_sosend;
-	pr_soreceive_t	*pru_soreceive;
-	pr_sopoll_t	*pru_sopoll;
-	pr_sosetlabel_t	*pru_sosetlabel;
-	pr_close_t	*pru_close;
-	pr_bindat_t	*pru_bindat;
-	pr_connectat_t	*pru_connectat;
-	pr_aio_queue_t	*pru_aio_queue;
-};
-
-/*
- * All nonvoid pru_*() functions below return EOPNOTSUPP.
- */
-int	pru_accept_notsupp(struct socket *so, struct sockaddr **nam);
-int	pru_aio_queue_notsupp(struct socket *so, struct kaiocb *job);
-int	pru_attach_notsupp(struct socket *so, int proto, struct thread *td);
-int	pru_bind_notsupp(struct socket *so, struct sockaddr *nam,
-	    struct thread *td);
-int	pru_bindat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
-	    struct thread *td);
-int	pru_connect_notsupp(struct socket *so, struct sockaddr *nam,
-	    struct thread *td);
-int	pru_connectat_notsupp(int fd, struct socket *so, struct sockaddr *nam,
-	    struct thread *td);
-int	pru_connect2_notsupp(struct socket *so1, struct socket *so2);
-int	pru_control_notsupp(struct socket *so, u_long cmd, void *data,
-	    struct ifnet *ifp, struct thread *td);
-int	pru_disconnect_notsupp(struct socket *so);
-int	pru_listen_notsupp(struct socket *so, int backlog, struct thread *td);
-int	pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam);
-int	pru_rcvd_notsupp(struct socket *so, int flags);
-int	pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags);
-int	pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
-	    struct sockaddr *addr, struct mbuf *control, struct thread *td);
-int	pru_ready_notsupp(struct socket *so, struct mbuf *m, int count);
-int	pru_sense_null(struct socket *so, struct stat *sb);
-int	pru_shutdown_notsupp(struct socket *so);
-int	pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam);
-int	pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
-	    struct uio *uio, struct mbuf *top, struct mbuf *control, int flags,
-	    struct thread *td);
-int	pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
-	    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
-	    int *flagsp);
-int	pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
-	    struct thread *td);
-
-#endif /* _KERNEL */
-
 /*
  * The arguments to the ctlinput routine are
  *	(*protosw[].pr_ctlinput)(cmd, sa, arg);
  * where cmd is one of the commands below, sa is a pointer to a sockaddr,
  * and arg is a `void *' argument used within a protocol family.
  */
 #define	PRC_ROUTEDEAD		1	/* select new route if possible ??? */
 /* was	PRC_QUENCH2		3	DEC congestion bit says slow down */
 /* was	PRC_QUENCH		4	Deprecated by RFC 6633 */
 #define	PRC_MSGSIZE		5	/* message size forced drop */
 #define	PRC_HOSTDEAD		6	/* host appears to be down */
 #define	PRC_HOSTUNREACH		7	/* deprecated (use PRC_UNREACH_HOST) */
 #define	PRC_UNREACH_NET		8	/* no route to network */
 #define	PRC_UNREACH_HOST	9	/* no route to host */
 #define	PRC_UNREACH_PROTOCOL	10	/* dst says bad protocol */
 #define	PRC_UNREACH_PORT	11	/* bad port # */
 /* was	PRC_UNREACH_NEEDFRAG	12	   (use PRC_MSGSIZE) */
 #define	PRC_UNREACH_SRCFAIL	13	/* source route failed */
 #define	PRC_REDIRECT_NET	14	/* net routing redirect */
 #define	PRC_REDIRECT_HOST	15	/* host routing redirect */
 #define	PRC_REDIRECT_TOSNET	16	/* redirect for type of service & net */
 #define	PRC_REDIRECT_TOSHOST	17	/* redirect for tos & host */
 #define	PRC_TIMXCEED_INTRANS	18	/* packet lifetime expired in transit */
 #define	PRC_TIMXCEED_REASS	19	/* lifetime expired on reass q */
 #define	PRC_PARAMPROB		20	/* header incorrect */
 #define	PRC_UNREACH_ADMIN_PROHIB	21	/* packet administrativly prohibited */
 
 #define	PRC_NCMDS		22
 
 #define	PRC_IS_REDIRECT(cmd)	\
 	((cmd) >= PRC_REDIRECT_NET && (cmd) <= PRC_REDIRECT_TOSHOST)
 
 #ifdef PRCREQUESTS
 char	*prcrequests[] = {
 	"IFDOWN", "ROUTEDEAD", "IFUP", "DEC-BIT-QUENCH2",
 	"QUENCH", "MSGSIZE", "HOSTDEAD", "#7",
 	"NET-UNREACH", "HOST-UNREACH", "PROTO-UNREACH", "PORT-UNREACH",
 	"#12", "SRCFAIL-UNREACH", "NET-REDIRECT", "HOST-REDIRECT",
 	"TOSNET-REDIRECT", "TOSHOST-REDIRECT", "TX-INTRANS", "TX-REASS",
 	"PARAMPROB", "ADMIN-UNREACH"
 };
 #endif
 
 /*
  * The arguments to ctloutput are:
  *	(*protosw[].pr_ctloutput)(req, so, level, optname, optval, p);
  * req is one of the actions listed below, so is a (struct socket *),
  * level is an indication of which protocol layer the option is intended.
  * optname is a protocol dependent socket option request,
  * optval is a pointer to a mbuf-chain pointer, for value-return results.
  * The protocol is responsible for disposal of the mbuf chain *optval
  * if supplied,
  * the caller is responsible for any space held by *optval, when returned.
  * A non-zero return from ctloutput gives an
  * UNIX error number which should be passed to higher level software.
  */
 #define	PRCO_GETOPT	0
 #define	PRCO_SETOPT	1
 
 #define	PRCO_NCMDS	2
 
 #ifdef PRCOREQUESTS
 char	*prcorequests[] = {
 	"GETOPT", "SETOPT",
 };
 #endif
 
 #ifdef _KERNEL
 struct domain *pffinddomain(int family);
 struct protosw *pffindproto(int family, int protocol, int type);
 struct protosw *pffindtype(int family, int type);
-int	pf_proto_register(int family, struct protosw *npr);
-int	pf_proto_unregister(int family, int protocol, int type);
+int protosw_register(struct domain *, struct protosw *);
+int protosw_unregister(struct protosw *);
+
+/* Domains that are known to be avaliable for protosw_register(). */
+extern struct domain inetdomain;
+extern struct domain inet6domain;
 #endif
 
 #endif